From fe8acd5d81bc344411436a956eac55ee2b7e58e2 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Wed, 1 Nov 2017 14:36:09 +0100 Subject: [PATCH 01/34] Merging master --- .../schedoscope/export/jdbc/outputschema/SchemaUtils.java | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/jdbc/outputschema/SchemaUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/jdbc/outputschema/SchemaUtils.java index 47640e849..85a43799d 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/jdbc/outputschema/SchemaUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/jdbc/outputschema/SchemaUtils.java @@ -82,12 +82,7 @@ public static String[] getColumnTypesFromHcatSchema(HCatSchema inputSchema, if (!inputSchema.get(i).isComplex()) { - if (inputSchema.get(i).getTypeString() - .toLowerCase(Locale.getDefault()) == null - || inputSchema.get(i).getTypeString() - .toLowerCase(Locale.getDefault()) - .equals("null")) { - + if (inputSchema.get(i).getTypeString().toLowerCase(Locale.getDefault()).equals("null")) { type = columnTypeMapping.get("tinyint"); } else if (anonFields.contains(inputSchema.get(i).getName())) { From 5873e3c0537601d93d76eb937da32233bd22a4bd Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 3 Nov 2017 08:53:22 +0100 Subject: [PATCH 02/34] Initial rigging --- .../src/test/resources/log4j.properties | 2 +- schedoscope-export/pom.xml | 15 ++++ .../bigquery/outputschema/BigQuerySchema.java | 34 +++++++++ .../outputschema/BigQuerySchemaTest.java | 75 +++++++++++++++++++ .../src/test/resources/log4j.properties | 9 +++ 5 files changed, 134 insertions(+), 1 deletion(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java create mode 100644 schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java create mode 100644 schedoscope-export/src/test/resources/log4j.properties diff --git a/schedoscope-core/src/test/resources/log4j.properties b/schedoscope-core/src/test/resources/log4j.properties index c5e48adeb..2765c8328 100644 --- a/schedoscope-core/src/test/resources/log4j.properties +++ b/schedoscope-core/src/test/resources/log4j.properties @@ -1 +1 @@ -log4j.rootLogger=OFF \ No newline at end of file +#log4j.rootLogger=OFF diff --git a/schedoscope-export/pom.xml b/schedoscope-export/pom.xml index d4ebeba96..480b8f5a9 100644 --- a/schedoscope-export/pom.xml +++ b/schedoscope-export/pom.xml @@ -18,6 +18,11 @@ slf4j-api 1.7.13 + + com.google.guava + guava + 19.0 + com.twitter parquet-hadoop-bundle @@ -219,6 +224,16 @@ + + com.google.api-client + google-api-client + 1.23.0 + + + com.google.cloud + google-cloud-bigquery + 0.20.0-beta + com.101tec zkclient diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java new file mode 100644 index 000000000..0ae5c34f0 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java @@ -0,0 +1,34 @@ +package org.schedoscope.export.bigquery.outputschema; + +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.StandardTableDefinition; +import com.google.cloud.bigquery.TableId; +import com.google.cloud.bigquery.TableInfo; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import 
org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.io.IOException; + +public class BigQuerySchema { + + private static final Log LOG = LogFactory.getLog(BigQuerySchema.class); + + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { + + LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); + + TableId tableId = TableId.of(database, table); + + Schema tableFields = Schema.of(); + + StandardTableDefinition tableDefinition = StandardTableDefinition.of(tableFields); + + TableInfo tableInfo = TableInfo.of(tableId, tableDefinition); + + LOG.info("Converted BigQuery schema: " + tableInfo); + + return tableInfo; + } + +} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java new file mode 100644 index 000000000..df9b746b9 --- /dev/null +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java @@ -0,0 +1,75 @@ +package org.schedoscope.export.bigquery.outputschema; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.BigQueryOptions; +import com.google.cloud.bigquery.DatasetId; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.common.HCatException; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.junit.Before; +import org.junit.Test; + +import java.io.IOException; +import java.util.Arrays; + +public class BigQuerySchemaTest { + + private BigQuerySchema bigQuerySchema = new BigQuerySchema(); + + private BigQuery bigQuery; + + private HCatSchema flatTable; + + @Before + public void setUp() throws HCatException { + + PrimitiveTypeInfo hcatStringType = new PrimitiveTypeInfo(); + hcatStringType.setTypeName("string"); + PrimitiveTypeInfo hcatIntType = new PrimitiveTypeInfo(); + hcatIntType.setTypeName("int"); + PrimitiveTypeInfo hcatLongType = new PrimitiveTypeInfo(); + hcatLongType.setTypeName("bigint"); + PrimitiveTypeInfo hcatByteType = new PrimitiveTypeInfo(); + hcatByteType.setTypeName("tinyint"); + PrimitiveTypeInfo hcatBooleanType = new PrimitiveTypeInfo(); + hcatBooleanType.setTypeName("boolean"); + PrimitiveTypeInfo hcatDoubleType = new PrimitiveTypeInfo(); + hcatDoubleType.setTypeName("double"); + PrimitiveTypeInfo hcatFloatType = new PrimitiveTypeInfo(); + hcatFloatType.setTypeName("float"); + + HCatFieldSchema hcatStringField = new HCatFieldSchema("aString", hcatStringType, "a string field"); + HCatFieldSchema hcatIntField = new HCatFieldSchema("anInt", hcatIntType, "an int field"); + HCatFieldSchema hcatLongField = new HCatFieldSchema("aLong", hcatLongType, "a long field"); + HCatFieldSchema hcatByteField = new HCatFieldSchema("aByte", hcatByteType, "a byte field"); + HCatFieldSchema hcatBooleanField = new HCatFieldSchema("aBoolean", hcatBooleanType, "a boolean field"); + HCatFieldSchema hcatDoubleField = new HCatFieldSchema("aDouble", hcatDoubleType, "a double field"); + HCatFieldSchema hcatFloatField = new HCatFieldSchema("aFloat", hcatFloatType, "a float field"); + + flatTable = new HCatSchema( + Arrays.asList( + hcatStringField, + hcatIntField, + hcatLongField, + hcatByteField, + hcatBooleanField, + hcatDoubleField, + hcatFloatField + ) + ); + + + bigQuery = BigQueryOptions.getDefaultInstance().getService(); + + DatasetId 
datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); + bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); + + + } + + @Test + public void figuringOutApi() throws IOException { + bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatTable); + } +} diff --git a/schedoscope-export/src/test/resources/log4j.properties b/schedoscope-export/src/test/resources/log4j.properties new file mode 100644 index 000000000..827c48b90 --- /dev/null +++ b/schedoscope-export/src/test/resources/log4j.properties @@ -0,0 +1,9 @@ +#log4j.rootLogger=OFF +# Root logger option +log4j.rootLogger=INFO, stdout + +# Direct log messages to stdout +log4j.appender.stdout=org.apache.log4j.ConsoleAppender +log4j.appender.stdout.Target=System.out +log4j.appender.stdout.layout=org.apache.log4j.PatternLayout +log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n \ No newline at end of file From 5e1d02773b473f78a934680f942edde9dcc36801 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Tue, 14 Nov 2017 10:18:08 +0100 Subject: [PATCH 03/34] Initial schema conversion for flat tables --- .../bigquery/outputschema/BigQuerySchema.java | 54 +++++++++++++++++-- .../outputschema/BigQuerySchemaTest.java | 43 +++++++++++---- 2 files changed, 82 insertions(+), 15 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java index 0ae5c34f0..d4dc1069f 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java @@ -1,26 +1,70 @@ package org.schedoscope.export.bigquery.outputschema; -import com.google.cloud.bigquery.Schema; -import com.google.cloud.bigquery.StandardTableDefinition; -import com.google.cloud.bigquery.TableId; -import com.google.cloud.bigquery.TableInfo; +import com.google.cloud.bigquery.*; import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; import java.io.IOException; +import java.util.LinkedList; public class BigQuerySchema { private static final Log LOG = LogFactory.getLog(BigQuerySchema.class); + public Field convertFieldSchemaToField(HCatFieldSchema fieldSchema) { + + String fieldName = fieldSchema.getName(); + String fieldDescription = fieldSchema.getComment(); + PrimitiveTypeInfo fieldType = fieldSchema.getTypeInfo(); + + Field.Type bigQueryType = null; + + switch (fieldType.getTypeName()) { + case "string": + bigQueryType = Field.Type.string(); + break; + case "int": + bigQueryType = Field.Type.integer(); + break; + case "bigint": + bigQueryType = Field.Type.integer(); + break; + case "tinyint": + bigQueryType = Field.Type.integer(); + break; + case "boolean": + bigQueryType = Field.Type.bool(); + break; + case "float": + bigQueryType = Field.Type.floatingPoint(); + break; + case "double": + bigQueryType = Field.Type.floatingPoint(); + break; + default: + bigQueryType = Field.Type.string(); + } + + return Field.newBuilder(fieldName, bigQueryType).setDescription(fieldDescription).setMode(Field.Mode.NULLABLE).build(); + + } + public TableInfo convertSchemaToTableInfo(String 
database, String table, HCatSchema hcatSchema) throws IOException { LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); TableId tableId = TableId.of(database, table); - Schema tableFields = Schema.of(); + LinkedList biqQueryFields = new LinkedList<>(); + + for (HCatFieldSchema field : hcatSchema.getFields()) { + biqQueryFields.add(convertFieldSchemaToField(field)); + } + + Schema tableFields = Schema.of(biqQueryFields); StandardTableDefinition tableDefinition = StandardTableDefinition.of(tableFields); diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java index df9b746b9..f700a4721 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java @@ -1,12 +1,11 @@ package org.schedoscope.export.bigquery.outputschema; -import com.google.cloud.bigquery.BigQuery; -import com.google.cloud.bigquery.BigQueryOptions; -import com.google.cloud.bigquery.DatasetId; +import com.google.cloud.bigquery.*; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.junit.After; import org.junit.Before; import org.junit.Test; @@ -19,11 +18,17 @@ public class BigQuerySchemaTest { private BigQuery bigQuery; - private HCatSchema flatTable; + private HCatSchema flatHcatSchema; + + private Schema flatBigQuerySchema; @Before public void setUp() throws HCatException { + bigQuery = BigQueryOptions.getDefaultInstance().getService(); + + createBigQueryDataSet(); + PrimitiveTypeInfo hcatStringType = new PrimitiveTypeInfo(); hcatStringType.setTypeName("string"); PrimitiveTypeInfo hcatIntType = new PrimitiveTypeInfo(); @@ -47,7 +52,7 @@ public void setUp() throws HCatException { HCatFieldSchema hcatDoubleField = new HCatFieldSchema("aDouble", hcatDoubleType, "a double field"); HCatFieldSchema hcatFloatField = new HCatFieldSchema("aFloat", hcatFloatType, "a float field"); - flatTable = new HCatSchema( + flatHcatSchema = new HCatSchema( Arrays.asList( hcatStringField, hcatIntField, @@ -59,17 +64,35 @@ public void setUp() throws HCatException { ) ); + flatBigQuerySchema = Schema.of( + Field.newBuilder("aString", Field.Type.string()).setDescription("a string field").build(), + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").build(), + Field.newBuilder("aLong", Field.Type.integer()).setDescription("a long field").build(), + Field.newBuilder("aByte", Field.Type.integer()).setDescription("a byte field").build(), + Field.newBuilder("aBoolean", Field.Type.bool()).setDescription("a boolean field").build(), + Field.newBuilder("aDouble", Field.Type.floatingPoint()).setDescription("a double field").build(), + Field.newBuilder("aFloat", Field.Type.floatingPoint()).setDescription("a float field").build() + ); - bigQuery = BigQueryOptions.getDefaultInstance().getService(); - DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); - bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); + } + public void createBigQueryDataSet() { + dropBigQueryDataSets(); + DatasetInfo datasetInfo = 
DatasetInfo.newBuilder("schedoscope_export_big_query_schema_test").build(); + bigQuery.create(datasetInfo); + } + //@After + public void dropBigQueryDataSets() { + DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); + bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); } @Test - public void figuringOutApi() throws IOException { - bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatTable); + public void testFlatTableConversion() throws IOException { + bigQuery.create( + bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema) + ); } } From f64b3a494f9b0dff40cdbc011a24dd4ea13f6e5d Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Tue, 14 Nov 2017 11:11:55 +0100 Subject: [PATCH 04/34] Implemented test checks --- .../outputschema/BigQuerySchemaTest.java | 57 +++++++++++++++++-- 1 file changed, 53 insertions(+), 4 deletions(-) diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java index f700a4721..f626fb92d 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java @@ -1,6 +1,7 @@ package org.schedoscope.export.bigquery.outputschema; import com.google.cloud.bigquery.*; +import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; @@ -12,6 +13,9 @@ import java.io.IOException; import java.util.Arrays; +import static org.junit.Assert.assertEquals; + + public class BigQuerySchemaTest { private BigQuerySchema bigQuerySchema = new BigQuerySchema(); @@ -83,16 +87,61 @@ public void createBigQueryDataSet() { bigQuery.create(datasetInfo); } - //@After + @After public void dropBigQueryDataSets() { DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); } + private void assertHcatSchemaEqualsBigQueryTable(TableInfo bigQueryTable, String databaseName, String tableName, HCatSchema hCatSchema) { + + assertEquals("schedoscope_export_big_query_schema_test", bigQueryTable.getTableId().getDataset()); + assertEquals("flat_table", bigQueryTable.getTableId().getTable()); + + for (int h = 0; h < hCatSchema.getFields().size(); h++) { + HCatFieldSchema hcatFieldSchema = hCatSchema.getFields().get(h); + Field bigQueryField = bigQueryTable.getDefinition().getSchema().getFields().get(h); + + assertEquals(hcatFieldSchema.getName(), bigQueryField.getName()); + assertEquals(hcatFieldSchema.getComment(), bigQueryField.getDescription()); + + if (hcatFieldSchema.getTypeInfo().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) { + assertEquals(Field.Mode.NULLABLE, bigQueryField.getMode()); + } + + switch (hcatFieldSchema.getTypeInfo().getTypeName()) { + case "string": + assertEquals(Field.Type.string(), bigQueryField.getType()); + break; + case "int": + assertEquals(Field.Type.integer(), bigQueryField.getType()); + break; + case "bigint": + assertEquals(Field.Type.integer(), bigQueryField.getType()); + break; + case "tinyint": + 
assertEquals(Field.Type.integer(), bigQueryField.getType()); + break; + case "boolean": + assertEquals(Field.Type.bool(), bigQueryField.getType()); + break; + case "float": + assertEquals(Field.Type.floatingPoint(), bigQueryField.getType()); + break; + case "double": + assertEquals(Field.Type.floatingPoint(), bigQueryField.getType()); + break; + default: + assertEquals(Field.Type.string(), bigQueryField.getType()); + } + } + + + } + @Test public void testFlatTableConversion() throws IOException { - bigQuery.create( - bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema) - ); + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); + assertHcatSchemaEqualsBigQueryTable(converted, "schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); } } From bdaa1b3b1e8b928ebee71dfcd5369587003951f6 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Tue, 14 Nov 2017 14:10:20 +0100 Subject: [PATCH 05/34] arrays of primitive types --- .../bigquery/outputschema/BigQuerySchema.java | 56 +++++++-- .../outputschema/BigQuerySchemaTest.java | 118 +++++++----------- 2 files changed, 93 insertions(+), 81 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java index d4dc1069f..e5e9c85fd 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java @@ -4,6 +4,7 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; @@ -14,15 +15,10 @@ public class BigQuerySchema { private static final Log LOG = LogFactory.getLog(BigQuerySchema.class); - public Field convertFieldSchemaToField(HCatFieldSchema fieldSchema) { - - String fieldName = fieldSchema.getName(); - String fieldDescription = fieldSchema.getComment(); - PrimitiveTypeInfo fieldType = fieldSchema.getTypeInfo(); - - Field.Type bigQueryType = null; + public Field.Type convertTypeInfoToFieldType(PrimitiveTypeInfo typeInfo) { + Field.Type bigQueryType; - switch (fieldType.getTypeName()) { + switch (typeInfo.getTypeName()) { case "string": bigQueryType = Field.Type.string(); break; @@ -48,7 +44,49 @@ public Field convertFieldSchemaToField(HCatFieldSchema fieldSchema) { bigQueryType = Field.Type.string(); } - return Field.newBuilder(fieldName, bigQueryType).setDescription(fieldDescription).setMode(Field.Mode.NULLABLE).build(); + return bigQueryType; + } + + public Field convertFieldSchemaToField(HCatFieldSchema fieldSchema) { + + String fieldName = fieldSchema.getName(); + String fieldDescription = fieldSchema.getComment(); + + PrimitiveTypeInfo fieldType; + Field.Mode mode = Field.Mode.NULLABLE; + + if (fieldSchema.getCategory().equals(HCatFieldSchema.Category.ARRAY)) { + + HCatFieldSchema elementSchema = null; + + try { + elementSchema = fieldSchema.getArrayElementSchema().get(0); + } catch (HCatException e) { + // not going to happen + } + + if (elementSchema.getCategory().equals(HCatFieldSchema.Category.PRIMITIVE)) { + + mode = 
Field.Mode.REPEATED; + fieldType = elementSchema.getTypeInfo(); + + } else { + + fieldType = elementSchema.getTypeInfo(); + + } + + } else { + fieldType = fieldSchema.getTypeInfo(); + } + + Field.Type bigQueryType = convertTypeInfoToFieldType(fieldType); + + return Field + .newBuilder(fieldName, bigQueryType) + .setDescription(fieldDescription) + .setMode(mode) + .build(); } diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java index f626fb92d..c907529a7 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java @@ -1,7 +1,6 @@ package org.schedoscope.export.bigquery.outputschema; import com.google.cloud.bigquery.*; -import org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; @@ -22,9 +21,9 @@ public class BigQuerySchemaTest { private BigQuery bigQuery; - private HCatSchema flatHcatSchema; + private HCatSchema flatHcatSchema, hcatSchemaWithPrimitiveList; - private Schema flatBigQuerySchema; + private Schema flatBigQuerySchema, bigQuerySchemaWithPrimitiveList; @Before public void setUp() throws HCatException { @@ -48,36 +47,43 @@ public void setUp() throws HCatException { PrimitiveTypeInfo hcatFloatType = new PrimitiveTypeInfo(); hcatFloatType.setTypeName("float"); - HCatFieldSchema hcatStringField = new HCatFieldSchema("aString", hcatStringType, "a string field"); - HCatFieldSchema hcatIntField = new HCatFieldSchema("anInt", hcatIntType, "an int field"); - HCatFieldSchema hcatLongField = new HCatFieldSchema("aLong", hcatLongType, "a long field"); - HCatFieldSchema hcatByteField = new HCatFieldSchema("aByte", hcatByteType, "a byte field"); - HCatFieldSchema hcatBooleanField = new HCatFieldSchema("aBoolean", hcatBooleanType, "a boolean field"); - HCatFieldSchema hcatDoubleField = new HCatFieldSchema("aDouble", hcatDoubleType, "a double field"); - HCatFieldSchema hcatFloatField = new HCatFieldSchema("aFloat", hcatFloatType, "a float field"); - flatHcatSchema = new HCatSchema( Arrays.asList( - hcatStringField, - hcatIntField, - hcatLongField, - hcatByteField, - hcatBooleanField, - hcatDoubleField, - hcatFloatField + new HCatFieldSchema("aString", hcatStringType, "a string field"), + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("aLong", hcatLongType, "a long field"), + new HCatFieldSchema("aByte", hcatByteType, "a byte field"), + new HCatFieldSchema("aBoolean", hcatBooleanType, "a boolean field"), + new HCatFieldSchema("aDouble", hcatDoubleType, "a double field"), + new HCatFieldSchema("aFloat", hcatFloatType, "a float field") ) ); flatBigQuerySchema = Schema.of( - Field.newBuilder("aString", Field.Type.string()).setDescription("a string field").build(), - Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").build(), - Field.newBuilder("aLong", Field.Type.integer()).setDescription("a long field").build(), - Field.newBuilder("aByte", Field.Type.integer()).setDescription("a byte field").build(), - Field.newBuilder("aBoolean", Field.Type.bool()).setDescription("a boolean field").build(), - Field.newBuilder("aDouble", 
Field.Type.floatingPoint()).setDescription("a double field").build(), - Field.newBuilder("aFloat", Field.Type.floatingPoint()).setDescription("a float field").build() + Field.newBuilder("aString", Field.Type.string()).setDescription("a string field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aLong", Field.Type.integer()).setDescription("a long field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aByte", Field.Type.integer()).setDescription("a byte field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aBoolean", Field.Type.bool()).setDescription("a boolean field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aDouble", Field.Type.floatingPoint()).setDescription("a double field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aFloat", Field.Type.floatingPoint()).setDescription("a float field").setMode(Field.Mode.NULLABLE).build() + ); + + hcatSchemaWithPrimitiveList = new HCatSchema( + Arrays.asList( + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("listOfInts", + HCatFieldSchema.Type.ARRAY, + new HCatSchema(Arrays.asList(new HCatFieldSchema(null, hcatIntType, null))), + "a list of ints field") + + ) ); + bigQuerySchemaWithPrimitiveList = Schema.of( + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("listOfInts", Field.Type.integer()).setDescription("a list of ints field").setMode(Field.Mode.REPEATED).build() + ); } @@ -87,61 +93,29 @@ public void createBigQueryDataSet() { bigQuery.create(datasetInfo); } - @After + //@After public void dropBigQueryDataSets() { DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); } - private void assertHcatSchemaEqualsBigQueryTable(TableInfo bigQueryTable, String databaseName, String tableName, HCatSchema hCatSchema) { - - assertEquals("schedoscope_export_big_query_schema_test", bigQueryTable.getTableId().getDataset()); - assertEquals("flat_table", bigQueryTable.getTableId().getTable()); - - for (int h = 0; h < hCatSchema.getFields().size(); h++) { - HCatFieldSchema hcatFieldSchema = hCatSchema.getFields().get(h); - Field bigQueryField = bigQueryTable.getDefinition().getSchema().getFields().get(h); - - assertEquals(hcatFieldSchema.getName(), bigQueryField.getName()); - assertEquals(hcatFieldSchema.getComment(), bigQueryField.getDescription()); - - if (hcatFieldSchema.getTypeInfo().getCategory().equals(ObjectInspector.Category.PRIMITIVE)) { - assertEquals(Field.Mode.NULLABLE, bigQueryField.getMode()); - } - - switch (hcatFieldSchema.getTypeInfo().getTypeName()) { - case "string": - assertEquals(Field.Type.string(), bigQueryField.getType()); - break; - case "int": - assertEquals(Field.Type.integer(), bigQueryField.getType()); - break; - case "bigint": - assertEquals(Field.Type.integer(), bigQueryField.getType()); - break; - case "tinyint": - assertEquals(Field.Type.integer(), bigQueryField.getType()); - break; - case "boolean": - assertEquals(Field.Type.bool(), bigQueryField.getType()); - break; - case "float": - assertEquals(Field.Type.floatingPoint(), bigQueryField.getType()); - break; - case "double": - assertEquals(Field.Type.floatingPoint(), bigQueryField.getType()); - break; - default: - assertEquals(Field.Type.string(), bigQueryField.getType()); - } - } - 
+ @Test + public void testFlatTableConversion() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("flat_table", converted.getTableId().getTable()); + assertEquals(flatBigQuerySchema, converted.getDefinition().getSchema()); } @Test - public void testFlatTableConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); - assertHcatSchemaEqualsBigQueryTable(converted, "schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); + public void testTableWithPrimitiveListConversion() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("table_with_primitive_list", converted.getTableId().getTable()); + assertEquals(bigQuerySchemaWithPrimitiveList, converted.getDefinition().getSchema()); + + bigQuery.create(converted); } } From e1cbe3ad1adfbfc0152d73d11f04afaad233fb7a Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Thu, 16 Nov 2017 15:36:14 +0100 Subject: [PATCH 06/34] Finished schema generation --- schedoscope-export/pom.xml | 4 +- .../bigquery/outputschema/BigQuerySchema.java | 124 ++++++--- .../TemporalPartitioningScheme.java | 37 +++ .../export/bigquery/BigQueryBaseTest.java | 56 ++++ .../outputschema/BigQuerySchemaTest.java | 241 ++++++++++++++++-- 5 files changed, 407 insertions(+), 55 deletions(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java create mode 100644 schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java diff --git a/schedoscope-export/pom.xml b/schedoscope-export/pom.xml index 480b8f5a9..e42b942fd 100644 --- a/schedoscope-export/pom.xml +++ b/schedoscope-export/pom.xml @@ -337,8 +337,8 @@ maven-compiler-plugin 3.5.1 - 1.7 - 1.7 + 1.8 + 1.8 diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java index e5e9c85fd..68e4cca83 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java @@ -15,7 +15,18 @@ public class BigQuerySchema { private static final Log LOG = LogFactory.getLog(BigQuerySchema.class); - public Field.Type convertTypeInfoToFieldType(PrimitiveTypeInfo typeInfo) { + static private PrimitiveTypeInfo stringTypeInfo; + + private PrimitiveTypeInfo stringTypeInfo() { + if (stringTypeInfo == null) { + stringTypeInfo = new PrimitiveTypeInfo(); + stringTypeInfo.setTypeName("string"); + } + + return stringTypeInfo; + } + + private Field.Type convertPrimitiveTypeInfoToFieldType(PrimitiveTypeInfo typeInfo) { Field.Type bigQueryType; switch (typeInfo.getTypeName()) { @@ -47,70 +58,121 @@ public Field.Type convertTypeInfoToFieldType(PrimitiveTypeInfo typeInfo) { return bigQueryType; } - public Field convertFieldSchemaToField(HCatFieldSchema fieldSchema) { + private Field 
convertPrimitiveSchemaToField(HCatFieldSchema fieldSchema) { - String fieldName = fieldSchema.getName(); - String fieldDescription = fieldSchema.getComment(); + return Field + .newBuilder(fieldSchema.getName(), convertPrimitiveTypeInfoToFieldType(fieldSchema.getTypeInfo())) + .setDescription(fieldSchema.getComment()) + .setMode(Field.Mode.NULLABLE) + .build(); - PrimitiveTypeInfo fieldType; - Field.Mode mode = Field.Mode.NULLABLE; + } - if (fieldSchema.getCategory().equals(HCatFieldSchema.Category.ARRAY)) { + private Field convertStructSchemaField(HCatFieldSchema fieldSchema) { - HCatFieldSchema elementSchema = null; + HCatSchema structSchema = null; - try { - elementSchema = fieldSchema.getArrayElementSchema().get(0); - } catch (HCatException e) { - // not going to happen - } + try { + structSchema = fieldSchema.getStructSubSchema(); + } catch (HCatException e) { + // not going to happen + } - if (elementSchema.getCategory().equals(HCatFieldSchema.Category.PRIMITIVE)) { + Schema recordSchema = convertSchemaToTableFields(structSchema); - mode = Field.Mode.REPEATED; - fieldType = elementSchema.getTypeInfo(); + return Field + .newBuilder(fieldSchema.getName(), Field.Type.record(recordSchema.getFields())) + .setDescription(fieldSchema.getComment()) + .setMode(Field.Mode.NULLABLE) + .build(); - } else { + } - fieldType = elementSchema.getTypeInfo(); + private Field convertArraySchemaField(HCatFieldSchema fieldSchema) { - } + HCatFieldSchema elementSchema = null; - } else { - fieldType = fieldSchema.getTypeInfo(); + try { + elementSchema = fieldSchema.getArrayElementSchema().get(0); + } catch (HCatException e) { + // not going to happen } - Field.Type bigQueryType = convertTypeInfoToFieldType(fieldType); + Field.Type arrayFieldType = null; + + if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) + arrayFieldType = convertPrimitiveTypeInfoToFieldType(elementSchema.getTypeInfo()); + else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) + arrayFieldType = Field.Type.string(); + else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) + arrayFieldType = Field.Type.string(); + else + try { + arrayFieldType = Field.Type.record(convertSchemaToTableFields(elementSchema.getStructSubSchema()).getFields()); + } catch (HCatException e) { + // not going to happen + } + return Field - .newBuilder(fieldName, bigQueryType) - .setDescription(fieldDescription) - .setMode(mode) + .newBuilder(fieldSchema.getName(), arrayFieldType) + .setDescription(fieldSchema.getComment()) + .setMode(Field.Mode.REPEATED) .build(); } - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { + private Field convertFieldSchemaToField(HCatFieldSchema fieldSchema) { - LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); + if (HCatFieldSchema.Category.ARRAY == fieldSchema.getCategory()) + return convertArraySchemaField(fieldSchema); + else if (HCatFieldSchema.Category.STRUCT == fieldSchema.getCategory()) + return convertStructSchemaField(fieldSchema); + else if (HCatFieldSchema.Category.MAP == fieldSchema.getCategory()) + try { + return convertPrimitiveSchemaToField(new HCatFieldSchema(fieldSchema.getName(), stringTypeInfo(), fieldSchema.getComment())); + } catch (HCatException e) { + // not going to happen + return null; + } + else + return convertPrimitiveSchemaToField(fieldSchema); - TableId tableId = TableId.of(database, table); + } + private Schema convertSchemaToTableFields(HCatSchema hcatSchema) 
{ LinkedList biqQueryFields = new LinkedList<>(); for (HCatFieldSchema field : hcatSchema.getFields()) { biqQueryFields.add(convertFieldSchemaToField(field)); } - Schema tableFields = Schema.of(biqQueryFields); + return Schema.of(biqQueryFields); + } - StandardTableDefinition tableDefinition = StandardTableDefinition.of(tableFields); + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, TemporalPartitioningScheme partitioning) throws IOException { - TableInfo tableInfo = TableInfo.of(tableId, tableDefinition); + LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); + + TableId tableId = TableId.of(database, table); + + StandardTableDefinition.Builder tableDefinitionBuilder = StandardTableDefinition + .newBuilder() + .setSchema(convertSchemaToTableFields(hcatSchema)); + + if (partitioning.isDefined()) { + tableDefinitionBuilder.setTimePartitioning(TimePartitioning.of(TimePartitioning.Type.DAY)); + } + + TableInfo tableInfo = TableInfo.of(tableId, tableDefinitionBuilder.build()); LOG.info("Converted BigQuery schema: " + tableInfo); return tableInfo; } + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { + return convertSchemaToTableInfo(database, table, hcatSchema, new TemporalPartitioningScheme()); + } + } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java new file mode 100644 index 000000000..0babc5909 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java @@ -0,0 +1,37 @@ +package org.schedoscope.export.bigquery.outputschema; + +import java.util.Optional; + +import static java.util.Optional.empty; + +public class TemporalPartitioningScheme { + + public enum Granularity { + DAILY, MONTHLY + } + + public Optional getTemporalColumn() { + return temporalColumn; + } + + public Optional getGranularity() { + return granularity; + } + + private Optional temporalColumn = empty(); + + private Optional granularity = empty(); + + public boolean isDefined() { + return getTemporalColumn().isPresent() && getGranularity().isPresent(); + } + + public TemporalPartitioningScheme(String temporalColumn, Granularity granularity) { + this.granularity = Optional.of(granularity); + this.temporalColumn = Optional.of(temporalColumn); + } + + public TemporalPartitioningScheme() { + } + +} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java new file mode 100644 index 000000000..a3daa4a1c --- /dev/null +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -0,0 +1,56 @@ +package org.schedoscope.export.bigquery; + +import com.google.cloud.bigquery.*; +import org.junit.AfterClass; +import org.junit.BeforeClass; + +public abstract class BigQueryBaseTest { + + private static boolean CALL_BIG_QUERY = false; + + private static boolean CLEAN_UP_BIG_QUERY = true; + + private static BigQuery bigQuery; + + public void createTable(TableInfo tableInfo) { + + if (CALL_BIG_QUERY) { + + if (bigQuery.getTable(tableInfo.getTableId()) != null) + bigQuery.delete(tableInfo.getTableId()); + + bigQuery.create(tableInfo); + + try { + Thread.currentThread().sleep(500); + } catch 
(InterruptedException e) { + } + + } + } + + @BeforeClass + public static void createBigQueryDataSet() { + if (!CALL_BIG_QUERY) + return; + + bigQuery = BigQueryOptions.getDefaultInstance().getService(); + + DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); + bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); + + DatasetInfo datasetInfo = DatasetInfo.newBuilder("schedoscope_export_big_query_schema_test").build(); + bigQuery.create(datasetInfo); + } + + @AfterClass + public static void dropBigQueryDataSets() { + if (!CALL_BIG_QUERY || !CLEAN_UP_BIG_QUERY) + return; + + DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); + bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); + } + + +} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java index c907529a7..869606d18 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java @@ -1,13 +1,15 @@ package org.schedoscope.export.bigquery.outputschema; -import com.google.cloud.bigquery.*; +import com.google.cloud.bigquery.Field; +import com.google.cloud.bigquery.Schema; +import com.google.cloud.bigquery.TableInfo; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; -import org.junit.After; import org.junit.Before; import org.junit.Test; +import org.schedoscope.export.bigquery.BigQueryBaseTest; import java.io.IOException; import java.util.Arrays; @@ -15,23 +17,17 @@ import static org.junit.Assert.assertEquals; -public class BigQuerySchemaTest { +public class BigQuerySchemaTest extends BigQueryBaseTest { private BigQuerySchema bigQuerySchema = new BigQuerySchema(); - private BigQuery bigQuery; + private HCatSchema flatHcatSchema, hcatSchemaWithPrimitiveList, hcatSchemaWithStruct, hcatSchemaWithListOfStruct, hcatSchemaWithListOfList, hcatSchemaWithMap, hcatSchemaWithListOfMaps; - private HCatSchema flatHcatSchema, hcatSchemaWithPrimitiveList; - - private Schema flatBigQuerySchema, bigQuerySchemaWithPrimitiveList; + private Schema flatBigQuerySchema, bigQuerySchemaWithPrimitiveList, bigQuerySchemaWithRecord, bigQuerySchemaWithListOfRecord, bigQuerySchemaWithListOfList, bigQuerySchemaWithMap, bigQuerySchemaWithListOfMaps; @Before public void setUp() throws HCatException { - bigQuery = BigQueryOptions.getDefaultInstance().getService(); - - createBigQueryDataSet(); - PrimitiveTypeInfo hcatStringType = new PrimitiveTypeInfo(); hcatStringType.setTypeName("string"); PrimitiveTypeInfo hcatIntType = new PrimitiveTypeInfo(); @@ -85,18 +81,161 @@ public void setUp() throws HCatException { Field.newBuilder("listOfInts", Field.Type.integer()).setDescription("a list of ints field").setMode(Field.Mode.REPEATED).build() ); - } + hcatSchemaWithStruct = new HCatSchema( + Arrays.asList( + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("aStruct", + HCatFieldSchema.Type.STRUCT, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema("aString", hcatStringType, "a string field"), + new HCatFieldSchema("anInt", hcatIntType, "an int 
field"), + new HCatFieldSchema("aLong", hcatLongType, "a long field"), + new HCatFieldSchema("aByte", hcatByteType, "a byte field"), + new HCatFieldSchema("aBoolean", hcatBooleanType, "a boolean field"), + new HCatFieldSchema("aDouble", hcatDoubleType, "a double field"), + new HCatFieldSchema("aFloat", hcatFloatType, "a float field"), + new HCatFieldSchema( + "aNestedStruct", + HCatFieldSchema.Type.STRUCT, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema("aString", hcatStringType, "a string field") + ) + ), + "a nested struct field" + ) + ) + ), + "a struct field") - public void createBigQueryDataSet() { - dropBigQueryDataSets(); - DatasetInfo datasetInfo = DatasetInfo.newBuilder("schedoscope_export_big_query_schema_test").build(); - bigQuery.create(datasetInfo); - } + ) + ); + + bigQuerySchemaWithRecord = Schema.of( + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aStruct", + Field.Type.record( + Field.newBuilder("aString", Field.Type.string()).setDescription("a string field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aLong", Field.Type.integer()).setDescription("a long field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aByte", Field.Type.integer()).setDescription("a byte field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aBoolean", Field.Type.bool()).setDescription("a boolean field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aDouble", Field.Type.floatingPoint()).setDescription("a double field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aFloat", Field.Type.floatingPoint()).setDescription("a float field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aNestedStruct", + Field.Type.record( + Field.newBuilder("aString", Field.Type.string()).setDescription("a string field").setMode(Field.Mode.NULLABLE).build() + ) + ).setDescription("a nested struct field").setMode(Field.Mode.NULLABLE).build() + ) + ).setDescription("a struct field").setMode(Field.Mode.NULLABLE).build() + ); + + + hcatSchemaWithListOfStruct = new HCatSchema( + Arrays.asList( + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("listOfStructs", + HCatFieldSchema.Type.ARRAY, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema( + null, + HCatFieldSchema.Type.STRUCT, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema("aString", hcatStringType, "a string field") + ) + ), + null + ) + ) + ), + "a list of structs field") + ) + ); + + bigQuerySchemaWithListOfRecord = Schema.of( + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("listOfStructs", + Field.Type.record( + Field.newBuilder("aString", Field.Type.string()).setDescription("a string field").setMode(Field.Mode.NULLABLE).build() + ) + ).setDescription("a list of structs field").setMode(Field.Mode.REPEATED).build() + ); + + hcatSchemaWithListOfList = new HCatSchema( + Arrays.asList( + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("listOfList", + HCatFieldSchema.Type.ARRAY, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema(null, + HCatFieldSchema.Type.ARRAY, + new HCatSchema(Arrays.asList(new HCatFieldSchema(null, hcatIntType, null))), + null) + ) + ), + "a list of lists field") + + ) + ); + + 
bigQuerySchemaWithListOfList = Schema.of( + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("listOfList", Field.Type.string()).setDescription("a list of lists field").setMode(Field.Mode.REPEATED).build() + ); + + hcatSchemaWithMap = new HCatSchema( + Arrays.asList( + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("aMap", + HCatFieldSchema.Type.MAP, + HCatFieldSchema.Type.STRING, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema(null, hcatStringType, null) + ) + ), + "a map field") + + ) + ); + + bigQuerySchemaWithMap = Schema.of( + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("aMap", Field.Type.string()).setDescription("a map field").setMode(Field.Mode.NULLABLE).build() + ); - //@After - public void dropBigQueryDataSets() { - DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); - bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); + hcatSchemaWithListOfMaps = new HCatSchema( + Arrays.asList( + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("listOfMap", + HCatFieldSchema.Type.ARRAY, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema(null, + HCatFieldSchema.Type.MAP, + HCatFieldSchema.Type.STRING, + new HCatSchema( + Arrays.asList( + new HCatFieldSchema(null, hcatStringType, null) + ) + ), + null) + ) + ), + "a list of maps field") + + ) + ); + + bigQuerySchemaWithListOfMaps = Schema.of( + Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), + Field.newBuilder("listOfMap", Field.Type.string()).setDescription("a list of maps field").setMode(Field.Mode.REPEATED).build() + ); } @Test @@ -106,6 +245,8 @@ public void testFlatTableConversion() throws IOException { assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table", converted.getTableId().getTable()); assertEquals(flatBigQuerySchema, converted.getDefinition().getSchema()); + + createTable(converted); } @Test @@ -116,6 +257,62 @@ public void testTableWithPrimitiveListConversion() throws IOException { assertEquals("table_with_primitive_list", converted.getTableId().getTable()); assertEquals(bigQuerySchemaWithPrimitiveList, converted.getDefinition().getSchema()); - bigQuery.create(converted); + createTable(converted); + } + + @Test + public void testTableWithStructConversion() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_struct", hcatSchemaWithStruct); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("table_with_struct", converted.getTableId().getTable()); + assertEquals(bigQuerySchemaWithRecord, converted.getDefinition().getSchema()); + + createTable(converted); + } + + @Test + public void testTableWithListStructConversion() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_struct", hcatSchemaWithListOfStruct); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("table_with_list_struct", converted.getTableId().getTable()); + assertEquals(bigQuerySchemaWithListOfRecord, 
converted.getDefinition().getSchema()); + + createTable(converted); + } + + @Test + public void testTableWithListOfListsConversion() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_lists", hcatSchemaWithListOfList); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("table_with_list_of_lists", converted.getTableId().getTable()); + assertEquals(bigQuerySchemaWithListOfList, converted.getDefinition().getSchema()); + + createTable(converted); + } + + + @Test + public void testTableWithMapConversion() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_map", hcatSchemaWithMap); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("table_with_map", converted.getTableId().getTable()); + assertEquals(bigQuerySchemaWithMap, converted.getDefinition().getSchema()); + + createTable(converted); + } + + @Test + public void testTableWithListOfMapConversion() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_map", hcatSchemaWithListOfMaps); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("table_with_list_of_map", converted.getTableId().getTable()); + assertEquals(bigQuerySchemaWithListOfMaps, converted.getDefinition().getSchema()); + + createTable(converted); } } From 6cc4ca64ddf8b184653ca10f5636a83bffc0ce4e Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Thu, 16 Nov 2017 17:36:23 +0100 Subject: [PATCH 07/34] Added _USED_HCAT_FILTER column --- .../bigquery/outputschema/BigQuerySchema.java | 9 ++++++++- .../outputschema/LogicalPartitioningScheme.java | 17 +++++++++++++++++ .../outputschema/BigQuerySchemaTest.java | 7 +++++++ 3 files changed, 32 insertions(+), 1 deletion(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java index 68e4cca83..c272b526b 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java @@ -10,6 +10,7 @@ import java.io.IOException; import java.util.LinkedList; +import java.util.List; public class BigQuerySchema { @@ -26,6 +27,8 @@ private PrimitiveTypeInfo stringTypeInfo() { return stringTypeInfo; } + private Field usedFilterField = Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); + private Field.Type convertPrimitiveTypeInfoToFieldType(PrimitiveTypeInfo typeInfo) { Field.Type bigQueryType; @@ -156,9 +159,13 @@ public TableInfo convertSchemaToTableInfo(String database, String table, HCatSch TableId tableId = TableId.of(database, table); + List fields = new LinkedList<>(); + fields.add(usedFilterField); + fields.addAll(convertSchemaToTableFields(hcatSchema).getFields()); + StandardTableDefinition.Builder tableDefinitionBuilder = StandardTableDefinition 
.newBuilder() - .setSchema(convertSchemaToTableFields(hcatSchema)); + .setSchema(Schema.of(fields)); if (partitioning.isDefined()) { tableDefinitionBuilder.setTimePartitioning(TimePartitioning.of(TimePartitioning.Type.DAY)); diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java new file mode 100644 index 000000000..2c45e4292 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java @@ -0,0 +1,17 @@ +package org.schedoscope.export.bigquery.outputschema; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; + +public class LogicalPartitioningScheme { + + List partitionColumns = new LinkedList(); + + public LogicalPartitioningScheme() {} + + public LogicalPartitioningScheme(String... partitionColumns) { + this.partitionColumns = Arrays.asList(partitionColumns); + } + +} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java index 869606d18..591f88f8b 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java @@ -56,6 +56,7 @@ public void setUp() throws HCatException { ); flatBigQuerySchema = Schema.of( + Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setDescription("HCatInputFormat filter used to export the present record.").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("aString", Field.Type.string()).setDescription("a string field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("aLong", Field.Type.integer()).setDescription("a long field").setMode(Field.Mode.NULLABLE).build(), @@ -77,6 +78,7 @@ public void setUp() throws HCatException { ); bigQuerySchemaWithPrimitiveList = Schema.of( + Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setDescription("HCatInputFormat filter used to export the present record.").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("listOfInts", Field.Type.integer()).setDescription("a list of ints field").setMode(Field.Mode.REPEATED).build() ); @@ -113,6 +115,7 @@ public void setUp() throws HCatException { ); bigQuerySchemaWithRecord = Schema.of( + Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setDescription("HCatInputFormat filter used to export the present record.").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("aStruct", Field.Type.record( @@ -157,6 +160,7 @@ public void setUp() throws HCatException { ); bigQuerySchemaWithListOfRecord = Schema.of( + Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setDescription("HCatInputFormat filter used to export the present record.").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), 
Field.newBuilder("listOfStructs", Field.Type.record( @@ -184,6 +188,7 @@ public void setUp() throws HCatException { ); bigQuerySchemaWithListOfList = Schema.of( + Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setDescription("HCatInputFormat filter used to export the present record.").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("listOfList", Field.Type.string()).setDescription("a list of lists field").setMode(Field.Mode.REPEATED).build() ); @@ -205,6 +210,7 @@ public void setUp() throws HCatException { ); bigQuerySchemaWithMap = Schema.of( + Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setDescription("HCatInputFormat filter used to export the present record.").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("aMap", Field.Type.string()).setDescription("a map field").setMode(Field.Mode.NULLABLE).build() ); @@ -233,6 +239,7 @@ public void setUp() throws HCatException { ); bigQuerySchemaWithListOfMaps = Schema.of( + Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setDescription("HCatInputFormat filter used to export the present record.").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("listOfMap", Field.Type.string()).setDescription("a list of maps field").setMode(Field.Mode.REPEATED).build() ); From 2e408d108ce49e52613eabbcc3ba26855da21f45 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 17 Nov 2017 08:59:13 +0100 Subject: [PATCH 08/34] Changed partitioning descriptor --- .../bigquery/outputschema/BigQuerySchema.java | 6 +-- .../LogicalPartitioningScheme.java | 17 ------ .../outputschema/PartitioningScheme.java | 52 +++++++++++++++++++ .../TemporalPartitioningScheme.java | 37 ------------- 4 files changed, 55 insertions(+), 57 deletions(-) delete mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java delete mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java index c272b526b..71d71de65 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java @@ -153,7 +153,7 @@ private Schema convertSchemaToTableFields(HCatSchema hcatSchema) { return Schema.of(biqQueryFields); } - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, TemporalPartitioningScheme partitioning) throws IOException { + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); @@ -167,7 +167,7 @@ public TableInfo convertSchemaToTableInfo(String database, String table, HCatSch .newBuilder() 
.setSchema(Schema.of(fields)); - if (partitioning.isDefined()) { + if (partitioning.isTemporallyPartitioned()) { tableDefinitionBuilder.setTimePartitioning(TimePartitioning.of(TimePartitioning.Type.DAY)); } @@ -179,7 +179,7 @@ public TableInfo convertSchemaToTableInfo(String database, String table, HCatSch } public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { - return convertSchemaToTableInfo(database, table, hcatSchema, new TemporalPartitioningScheme()); + return convertSchemaToTableInfo(database, table, hcatSchema, new PartitioningScheme()); } } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java deleted file mode 100644 index 2c45e4292..000000000 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/LogicalPartitioningScheme.java +++ /dev/null @@ -1,17 +0,0 @@ -package org.schedoscope.export.bigquery.outputschema; - -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; - -public class LogicalPartitioningScheme { - - List partitionColumns = new LinkedList(); - - public LogicalPartitioningScheme() {} - - public LogicalPartitioningScheme(String... partitionColumns) { - this.partitionColumns = Arrays.asList(partitionColumns); - } - -} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java new file mode 100644 index 000000000..77e7c2195 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java @@ -0,0 +1,52 @@ +package org.schedoscope.export.bigquery.outputschema; + +import java.util.Arrays; +import java.util.LinkedList; +import java.util.List; +import java.util.Optional; + +import static java.util.Optional.empty; + +public class PartitioningScheme { + + public enum Granularity { + DAILY, MONTHLY + } + + private Optional<String> temporalPartitionColumn = empty(); + + private Optional<Granularity> granularity = empty(); + + private List<String> logicalPartitionColumns = new LinkedList<>(); + + + public Optional<String> getTemporalPartitionColumn() { + return temporalPartitionColumn; + } + + public Optional<Granularity> getGranularity() { + return granularity; + } + + public List<String> getLogicalPartitionColumns() { + return logicalPartitionColumns; + } + + public boolean isTemporallyPartitioned() { + return getTemporalPartitionColumn().isPresent() && getGranularity().isPresent(); + } + + public boolean isLogicallyPartitioned() { + return !logicalPartitionColumns.isEmpty(); + } + + public PartitioningScheme(String temporalPartitionColumn, Granularity granularity, String...
logicalPartitionColumns) { + this.granularity = Optional.of(granularity); + this.temporalPartitionColumn = Optional.of(temporalPartitionColumn); + this.logicalPartitionColumns.addAll(Arrays.asList(logicalPartitionColumns)); + } + + public PartitioningScheme() { + } + +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java deleted file mode 100644 index 0babc5909..000000000 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/TemporalPartitioningScheme.java +++ /dev/null @@ -1,37 +0,0 @@ -package org.schedoscope.export.bigquery.outputschema; - -import java.util.Optional; - -import static java.util.Optional.empty; - -public class TemporalPartitioningScheme { - - public enum Granularity { - DAILY, MONTHLY - } - - public Optional getTemporalColumn() { - return temporalColumn; - } - - public Optional getGranularity() { - return granularity; - } - - private Optional temporalColumn = empty(); - - private Optional granularity = empty(); - - public boolean isDefined() { - return getTemporalColumn().isPresent() && getGranularity().isPresent(); - } - - public TemporalPartitioningScheme(String temporalColumn, Granularity granularity) { - this.granularity = Optional.of(granularity); - this.temporalColumn = Optional.of(temporalColumn); - } - - public TemporalPartitioningScheme() { - } - -} From cb72d95f8875972c02ca8b6f8e5f1899ff8f2f71 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 24 Nov 2017 12:36:47 +0100 Subject: [PATCH 09/34] Added consideration of table postfixes --- .../bigquery/outputschema/BigQuerySchema.java | 12 ++++- .../outputschema/BigQuerySchemaTest.java | 51 +++++++++++++++++-- 2 files changed, 58 insertions(+), 5 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java index 71d71de65..b3f804737 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java @@ -153,11 +153,11 @@ private Schema convertSchemaToTableFields(HCatSchema hcatSchema) { return Schema.of(biqQueryFields); } - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); - TableId tableId = TableId.of(database, table); + TableId tableId = TableId.of(database, table + (postfix == null || postfix.isEmpty() ? 
"" : "_" + postfix)); List fields = new LinkedList<>(); fields.add(usedFilterField); @@ -178,6 +178,14 @@ public TableInfo convertSchemaToTableInfo(String database, String table, HCatSch return tableInfo; } + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { + return convertSchemaToTableInfo(database, table, hcatSchema, partitioning, ""); + } + + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { + return convertSchemaToTableInfo(database, table, hcatSchema, new PartitioningScheme(), postfix); + } + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { return convertSchemaToTableInfo(database, table, hcatSchema, new PartitioningScheme()); } diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java index 591f88f8b..27f72f8ea 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java @@ -1,8 +1,6 @@ package org.schedoscope.export.bigquery.outputschema; -import com.google.cloud.bigquery.Field; -import com.google.cloud.bigquery.Schema; -import com.google.cloud.bigquery.TableInfo; +import com.google.cloud.bigquery.*; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; @@ -15,6 +13,7 @@ import java.util.Arrays; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; public class BigQuerySchemaTest extends BigQueryBaseTest { @@ -256,6 +255,52 @@ public void testFlatTableConversion() throws IOException { createTable(converted); } + @Test + public void testTableConversionWithPostfix() throws IOException { + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, "test"); + + assertTrue(converted.getTableId().getTable().endsWith("_test")); + } + + @Test + public void testTableConversionWithPartitioning() throws IOException, NoSuchFieldException, IllegalAccessException { + PartitioningScheme partitioning = new PartitioningScheme("aString", PartitioningScheme.Granularity.MONTHLY); + + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("flat_table", converted.getTableId().getTable()); + + StandardTableDefinition bigQueryTableDefinition = converted.getDefinition(); + + java.lang.reflect.Field field = StandardTableDefinition.class.getDeclaredField("timePartitioning"); + field.setAccessible(true); + TimePartitioning timePartitioning = (TimePartitioning) field.get(bigQueryTableDefinition); + + assertEquals(TimePartitioning.Type.DAY, timePartitioning.getType()); + + } + + @Test + public void testTableConversionWithPartitioningAndPostfix() throws IOException, NoSuchFieldException, IllegalAccessException { + PartitioningScheme partitioning = new PartitioningScheme("aString", 
PartitioningScheme.Granularity.MONTHLY); + + TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning, "test"); + + assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); + assertEquals("flat_table_test", converted.getTableId().getTable()); + + StandardTableDefinition bigQueryTableDefinition = converted.getDefinition(); + + java.lang.reflect.Field field = StandardTableDefinition.class.getDeclaredField("timePartitioning"); + field.setAccessible(true); + TimePartitioning timePartitioning = (TimePartitioning) field.get(bigQueryTableDefinition); + + assertEquals(TimePartitioning.Type.DAY, timePartitioning.getType()); + + } + + @Test public void testTableWithPrimitiveListConversion() throws IOException { TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList); From a46c9e8fd3e338666fc25aa209f1bbdebfda7ebb Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Wed, 29 Nov 2017 11:53:08 +0100 Subject: [PATCH 10/34] Generalizing HCat Record Traversal --- .../export/bigquery/BigQueryUtils.java | 116 +++++++++++ .../outputformat/BigQueryOutputFormat.java | 99 +++++++++ .../bigquery/outputschema/BigQuerySchema.java | 193 ------------------ .../outputschema/HCatSchemaConvertor.java | 103 ++++++++++ .../HCatSchemaToBigQuerySchemaConverter.java | 163 +++++++++++++++ .../export/bigquery/BigQueryBaseTest.java | 26 +-- ...tSchemaToBigQuerySchemaConverterTest.java} | 24 +-- 7 files changed, 506 insertions(+), 218 deletions(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java delete mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java rename schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/{BigQuerySchemaTest.java => HCatSchemaToBigQuerySchemaConverterTest.java} (90%) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java new file mode 100644 index 000000000..2736bb1be --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java @@ -0,0 +1,116 @@ +package org.schedoscope.export.bigquery; + +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.bigquery.*; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.Charset; + +public class BigQueryUtils { + + public BigQuery bigQueryService() { + return BigQueryOptions.getDefaultInstance().getService(); + } + + + public BigQuery bigQueryService(String gcpKey) throws IOException { + if (gcpKey == null) + return bigQueryService(); + + GoogleCredentials credentials = GoogleCredentials + .fromStream( + new ByteArrayInputStream(Charset.forName("UTF-8").encode(gcpKey).array()) + ); + + return BigQueryOptions.newBuilder().setCredentials(credentials).build().getService(); + } + + public boolean 
existsDataset(BigQuery bigQueryService, String project, String dataset) { + return bigQueryService.getDataset(project == null ? DatasetId.of(dataset) : DatasetId.of(project, dataset)) != null; + } + + public boolean existsDataset(BigQuery bigQueryService, String dataset) { + return existsDataset(bigQueryService, null, dataset); + } + + public boolean existsDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + return existsDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); + } + + public void createDataset(BigQuery bigQueryService, String project, String dataset) { + if (!existsDataset(bigQueryService, project, dataset)) { + bigQueryService.create((project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).build()); + } + } + + public void createDataset(BigQuery bigQueryService, String dataset) { + createDataset(bigQueryService, null, dataset); + } + + public void createDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + createDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); + } + + public void dropDataset(BigQuery bigQueryService, String project, String dataset) { + if (existsDataset(bigQueryService, project, dataset)) { + bigQueryService.delete( + (project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).build().getDatasetId(), + BigQuery.DatasetDeleteOption.deleteContents() + ); + } + } + + public void dropDataset(BigQuery bigQueryService, String dataset) { + dropDataset(bigQueryService, null, dataset); + } + + public void dropDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + dropDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); + } + + public boolean existsTable(BigQuery bigQueryService, String project, String dataset, String table) { + return bigQueryService.getTable(project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)) != null; + } + + public boolean existsTable(BigQuery bigQueryService, String dataset, String table) { + return existsTable(bigQueryService, null, dataset, table); + } + + public boolean existsTable(BigQuery bigQueryService, TableInfo tableInfo) { + return existsTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable()); + } + + public void createTable(BigQuery bigQueryService, String project, String dataset, String table, TableDefinition tableDefinition) { + createDataset(bigQueryService, project, dataset); + + if (!existsTable(bigQueryService, project, dataset, table)) { + bigQueryService.create( + TableInfo.of( + project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table), + tableDefinition + ) + ); + } + } + + public void createTable(BigQuery bigQueryService, String dataset, String table, TableDefinition tableDefinition) { + createTable(bigQueryService, null, dataset, table, tableDefinition); + } + + public void createTable(BigQuery bigQueryService, TableInfo tableInfo) { + createTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable(), tableInfo.getDefinition()); + } + + public void dropTable(BigQuery bigQueryService, String project, String dataset, String table) { + bigQueryService.delete(project == null ?
TableId.of(dataset, table) : TableId.of(project, dataset, table)); + } + + public void dropTable(BigQuery bigQueryService, String dataset, String table) { + dropTable(bigQueryService, null, dataset, table); + } + + public void dropTable(BigQuery bigQueryService, TableInfo tableInfo) { + dropTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable()); + } +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java new file mode 100644 index 000000000..15495c9b3 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -0,0 +1,99 @@ +package org.schedoscope.export.bigquery.outputformat; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.bigquery.TableDefinition; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.*; +import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.schedoscope.export.bigquery.BigQueryUtils; +import org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter; +import org.schedoscope.export.bigquery.outputschema.PartitioningScheme; + +import java.io.IOException; + +public class BigQueryOutputFormat extends OutputFormat { + + private static Configuration configuration; + private static String project; + private static String database; + private static String table; + private static String usedHCatFilter; + private static HCatSchema hcatSchema; + private static String gcpKey; + private static String tableNamePostfix; + private static HCatSchemaToBigQuerySchemaConverter HCatSchemaToBigQuerySchemaConverter = new HCatSchemaToBigQuerySchemaConverter(); + private static BigQueryUtils execute; + private static BigQuery bigQueryService; + + + public static void setOutput(Configuration conf, String project, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix, BigQueryUtils bigQueryUtils) throws IOException { + configuration = conf; + BigQueryOutputFormat.project = project; + BigQueryOutputFormat.database = database; + BigQueryOutputFormat.table = table; + BigQueryOutputFormat.usedHCatFilter = usedHCatFilter; + BigQueryOutputFormat.hcatSchema = hcatSchema; + BigQueryOutputFormat.gcpKey = gcpKey; + execute = bigQueryUtils; + bigQueryService = execute.bigQueryService(gcpKey); + } + + public static void setOutput(Configuration conf, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix, BigQueryUtils bigQueryUtils) throws IOException { + setOutput(conf, null, gcpKey, database, table, hcatSchema, usedHCatFilter, tableNamePostfix, bigQueryUtils); + } + + public static void setOutput(Configuration conf, String project, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix) throws IOException { + setOutput(conf, project, gcpKey, database, table, hcatSchema, usedHCatFilter, tableNamePostfix, new BigQueryUtils()); + } + + public static void setOutput(Configuration conf, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix) throws IOException { + setOutput(conf,
null, gcpKey, database, table, hcatSchema, usedHCatFilter, tableNamePostfix); + } + + + public class BiqQueryRecordWriter extends RecordWriter { + + @Override + public void write(K key, V value) throws IOException, InterruptedException { + + } + + @Override + public void close(TaskAttemptContext context) throws IOException, InterruptedException { + + } + } + + + + @Override + public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException { + + TableDefinition outputSchema = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition(hcatSchema, new PartitioningScheme()); + + String tmpOutputTable = table + + (tableNamePostfix != null ? "_" + tableNamePostfix : "") + + "_" + context.getTaskAttemptID().getTaskID().getId(); + + + execute.dropTable(bigQueryService, project, database, tmpOutputTable); + execute.createTable(bigQueryService, project, database, tmpOutputTable, outputSchema); + + return null; + + } + + + @Override + public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException { + // do nothing + } + + @Override + public OutputCommitter getOutputCommitter(TaskAttemptContext context) throws IOException, InterruptedException { + return new FileOutputCommitter(FileOutputFormat.getOutputPath(context), context); + } + +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java deleted file mode 100644 index b3f804737..000000000 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchema.java +++ /dev/null @@ -1,193 +0,0 @@ -package org.schedoscope.export.bigquery.outputschema; - -import com.google.cloud.bigquery.*; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hive.hcatalog.common.HCatException; -import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; -import org.apache.hive.hcatalog.data.schema.HCatSchema; - -import java.io.IOException; -import java.util.LinkedList; -import java.util.List; - -public class BigQuerySchema { - - private static final Log LOG = LogFactory.getLog(BigQuerySchema.class); - - static private PrimitiveTypeInfo stringTypeInfo; - - private PrimitiveTypeInfo stringTypeInfo() { - if (stringTypeInfo == null) { - stringTypeInfo = new PrimitiveTypeInfo(); - stringTypeInfo.setTypeName("string"); - } - - return stringTypeInfo; - } - - private Field usedFilterField = Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); - - private Field.Type convertPrimitiveTypeInfoToFieldType(PrimitiveTypeInfo typeInfo) { - Field.Type bigQueryType; - - switch (typeInfo.getTypeName()) { - case "string": - bigQueryType = Field.Type.string(); - break; - case "int": - bigQueryType = Field.Type.integer(); - break; - case "bigint": - bigQueryType = Field.Type.integer(); - break; - case "tinyint": - bigQueryType = Field.Type.integer(); - break; - case "boolean": - bigQueryType = Field.Type.bool(); - break; - case "float": - bigQueryType = Field.Type.floatingPoint(); - break; - case "double": - bigQueryType = Field.Type.floatingPoint(); - break; - default: - bigQueryType = Field.Type.string(); - } - - return bigQueryType; - } - - private Field 
convertPrimitiveSchemaToField(HCatFieldSchema fieldSchema) { - - return Field - .newBuilder(fieldSchema.getName(), convertPrimitiveTypeInfoToFieldType(fieldSchema.getTypeInfo())) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.NULLABLE) - .build(); - - } - - private Field convertStructSchemaField(HCatFieldSchema fieldSchema) { - - HCatSchema structSchema = null; - - try { - structSchema = fieldSchema.getStructSubSchema(); - } catch (HCatException e) { - // not going to happen - } - - Schema recordSchema = convertSchemaToTableFields(structSchema); - - return Field - .newBuilder(fieldSchema.getName(), Field.Type.record(recordSchema.getFields())) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.NULLABLE) - .build(); - - } - - private Field convertArraySchemaField(HCatFieldSchema fieldSchema) { - - HCatFieldSchema elementSchema = null; - - try { - elementSchema = fieldSchema.getArrayElementSchema().get(0); - } catch (HCatException e) { - // not going to happen - } - - Field.Type arrayFieldType = null; - - if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) - arrayFieldType = convertPrimitiveTypeInfoToFieldType(elementSchema.getTypeInfo()); - else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) - arrayFieldType = Field.Type.string(); - else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) - arrayFieldType = Field.Type.string(); - else - try { - arrayFieldType = Field.Type.record(convertSchemaToTableFields(elementSchema.getStructSubSchema()).getFields()); - } catch (HCatException e) { - // not going to happen - } - - - return Field - .newBuilder(fieldSchema.getName(), arrayFieldType) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.REPEATED) - .build(); - - } - - private Field convertFieldSchemaToField(HCatFieldSchema fieldSchema) { - - if (HCatFieldSchema.Category.ARRAY == fieldSchema.getCategory()) - return convertArraySchemaField(fieldSchema); - else if (HCatFieldSchema.Category.STRUCT == fieldSchema.getCategory()) - return convertStructSchemaField(fieldSchema); - else if (HCatFieldSchema.Category.MAP == fieldSchema.getCategory()) - try { - return convertPrimitiveSchemaToField(new HCatFieldSchema(fieldSchema.getName(), stringTypeInfo(), fieldSchema.getComment())); - } catch (HCatException e) { - // not going to happen - return null; - } - else - return convertPrimitiveSchemaToField(fieldSchema); - - } - - private Schema convertSchemaToTableFields(HCatSchema hcatSchema) { - LinkedList biqQueryFields = new LinkedList<>(); - - for (HCatFieldSchema field : hcatSchema.getFields()) { - biqQueryFields.add(convertFieldSchemaToField(field)); - } - - return Schema.of(biqQueryFields); - } - - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { - - LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); - - TableId tableId = TableId.of(database, table + (postfix == null || postfix.isEmpty() ? 
"" : "_" + postfix)); - - List fields = new LinkedList<>(); - fields.add(usedFilterField); - fields.addAll(convertSchemaToTableFields(hcatSchema).getFields()); - - StandardTableDefinition.Builder tableDefinitionBuilder = StandardTableDefinition - .newBuilder() - .setSchema(Schema.of(fields)); - - if (partitioning.isTemporallyPartitioned()) { - tableDefinitionBuilder.setTimePartitioning(TimePartitioning.of(TimePartitioning.Type.DAY)); - } - - TableInfo tableInfo = TableInfo.of(tableId, tableDefinitionBuilder.build()); - - LOG.info("Converted BigQuery schema: " + tableInfo); - - return tableInfo; - } - - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { - return convertSchemaToTableInfo(database, table, hcatSchema, partitioning, ""); - } - - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { - return convertSchemaToTableInfo(database, table, hcatSchema, new PartitioningScheme(), postfix); - } - - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { - return convertSchemaToTableInfo(database, table, hcatSchema, new PartitioningScheme()); - } - -} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java new file mode 100644 index 000000000..e9cd24fe6 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java @@ -0,0 +1,103 @@ +package org.schedoscope.export.bigquery.outputschema; + + +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.common.HCatException; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.util.LinkedList; +import java.util.List; + +public abstract class HCatSchemaConvertor { + + static private PrimitiveTypeInfo stringTypeInfo; + + private PrimitiveTypeInfo stringTypeInfo() { + if (stringTypeInfo == null) { + stringTypeInfo = new PrimitiveTypeInfo(); + stringTypeInfo.setTypeName("string"); + } + + return stringTypeInfo; + } + + protected abstract T createPrimitiveSchemaField(PrimitiveTypeInfo typeInfo); + + protected abstract F createPrimitiveArrayField(HCatFieldSchema fieldSchema, PrimitiveTypeInfo elementSchema); + + protected abstract F createStructArrayField(HCatFieldSchema fieldSchema, HCatSchema subSchema); + + protected abstract F createStructSchemaField(HCatFieldSchema fieldSchema, S recordSchema); + + protected abstract S createSchema(List fields); + + protected abstract F convertPrimitiveSchemaField(HCatFieldSchema fieldSchema); + + public F convertStructSchemaField(HCatFieldSchema fieldSchema) { + + HCatSchema structSchema = null; + + try { + structSchema = fieldSchema.getStructSubSchema(); + } catch (HCatException e) { + // not going to happen + } + + return createStructSchemaField(fieldSchema, convertSchemaFields(structSchema)); + + } + + public F convertArraySchemaField(HCatFieldSchema fieldSchema) { + + HCatFieldSchema elementSchema = null; + + try { + elementSchema = fieldSchema.getArrayElementSchema().get(0); + } catch (HCatException e) { + // not going to happen + } + + if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) + return 
createPrimitiveArrayField(fieldSchema, elementSchema.getTypeInfo()); + else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) + return createPrimitiveArrayField(fieldSchema, stringTypeInfo()); + else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) + return createPrimitiveArrayField(fieldSchema, stringTypeInfo()); + else + try { + return createStructArrayField(fieldSchema, elementSchema.getStructSubSchema()); + } catch (HCatException e) { + return null; // not going to happen + } + + } + + public F convertSchemaField(HCatFieldSchema fieldSchema) { + + if (HCatFieldSchema.Category.ARRAY == fieldSchema.getCategory()) + return convertArraySchemaField(fieldSchema); + else if (HCatFieldSchema.Category.STRUCT == fieldSchema.getCategory()) + return convertStructSchemaField(fieldSchema); + else if (HCatFieldSchema.Category.MAP == fieldSchema.getCategory()) + try { + return convertPrimitiveSchemaField(new HCatFieldSchema(fieldSchema.getName(), stringTypeInfo(), fieldSchema.getComment())); + } catch (HCatException e) { + // not going to happen + return null; + } + else + return convertPrimitiveSchemaField(fieldSchema); + + } + + public S convertSchemaFields(HCatSchema hcatSchema) { + LinkedList convertedFields = new LinkedList<>(); + + for (HCatFieldSchema field : hcatSchema.getFields()) { + convertedFields.add(convertSchemaField(field)); + } + + return createSchema(convertedFields); + } +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java new file mode 100644 index 000000000..62d8e2791 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -0,0 +1,163 @@ +package org.schedoscope.export.bigquery.outputschema; + +import com.google.cloud.bigquery.*; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.io.IOException; +import java.util.LinkedList; +import java.util.List; + +public class HCatSchemaToBigQuerySchemaConverter extends HCatSchemaConvertor { + + private static final Log LOG = LogFactory.getLog(HCatSchemaToBigQuerySchemaConverter.class); + + private Field usedFilterField = Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); + + + @Override + public Field.Type createPrimitiveSchemaField(PrimitiveTypeInfo typeInfo) { + Field.Type bigQueryType; + + switch (typeInfo.getTypeName()) { + case "string": + bigQueryType = Field.Type.string(); + break; + case "int": + bigQueryType = Field.Type.integer(); + break; + case "bigint": + bigQueryType = Field.Type.integer(); + break; + case "tinyint": + bigQueryType = Field.Type.integer(); + break; + case "boolean": + bigQueryType = Field.Type.bool(); + break; + case "float": + bigQueryType = Field.Type.floatingPoint(); + break; + case "double": + bigQueryType = Field.Type.floatingPoint(); + break; + default: + bigQueryType = Field.Type.string(); + } + + return bigQueryType; + } + + @Override + protected Field convertPrimitiveSchemaField(HCatFieldSchema fieldSchema) { + + return Field + 
.newBuilder(fieldSchema.getName(), createPrimitiveSchemaField(fieldSchema.getTypeInfo())) + .setDescription(fieldSchema.getComment()) + .setMode(Field.Mode.NULLABLE) + .build(); + + } + + @Override + protected Field createStructSchemaField(HCatFieldSchema fieldSchema, Schema recordSchema) { + return Field + .newBuilder(fieldSchema.getName(), Field.Type.record(recordSchema.getFields())) + .setDescription(fieldSchema.getComment()) + .setMode(Field.Mode.NULLABLE) + .build(); + } + + + @Override + protected Field createPrimitiveArrayField(HCatFieldSchema fieldSchema, PrimitiveTypeInfo elementSchema) { + return Field + .newBuilder(fieldSchema.getName(), createPrimitiveSchemaField(elementSchema)) + .setDescription(fieldSchema.getComment()) + .setMode(Field.Mode.REPEATED) + .build(); + } + + @Override + protected Field createStructArrayField(HCatFieldSchema fieldSchema, HCatSchema subSchema) { + return Field + .newBuilder(fieldSchema.getName(), Field.Type.record(convertSchemaFields(subSchema).getFields())) + .setDescription(fieldSchema.getComment()) + .setMode(Field.Mode.REPEATED) + .build(); + } + + + @Override + protected Schema createSchema(List fields) { + return Schema.of(fields); + } + + public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSchema, PartitioningScheme partitioning) { + LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); + + List fields = new LinkedList<>(); + fields.add(usedFilterField); + fields.addAll(convertSchemaFields(hcatSchema).getFields()); + + StandardTableDefinition.Builder tableDefinitionBuilder = StandardTableDefinition + .newBuilder() + .setSchema(Schema.of(fields)); + + if (partitioning.isTemporallyPartitioned()) { + tableDefinitionBuilder.setTimePartitioning(TimePartitioning.of(TimePartitioning.Type.DAY)); + } + + TableDefinition tableDefinition = tableDefinitionBuilder.build(); + + LOG.info("Converted BigQuery table definition: " + tableDefinition); + + return tableDefinition; + } + + public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { + + LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); + + TableId tableId = project == null ? TableId.of(database, table + (postfix == null || postfix.isEmpty() ? "" : "_" + postfix)) : TableId.of(project, database, table + (postfix == null || postfix.isEmpty() ? 
"" : "_" + postfix)); + + TableInfo tableInfo = TableInfo.of(tableId, convertSchemaToTableDefinition(hcatSchema, partitioning)); + + LOG.info("Converted BigQuery schema: " + tableInfo); + + return tableInfo; + } + + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { + return convertSchemaToTableInfo(null, database, table, hcatSchema, partitioning, postfix); + } + + + public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { + return convertSchemaToTableInfo(project, database, table, hcatSchema, partitioning, ""); + } + + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { + return convertSchemaToTableInfo(null, database, table, hcatSchema, partitioning); + } + + public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { + return convertSchemaToTableInfo(project, database, table, hcatSchema, new PartitioningScheme(), postfix); + } + + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { + return convertSchemaToTableInfo(null, database, table, hcatSchema, postfix); + } + + public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema) throws IOException { + return convertSchemaToTableInfo(project, database, table, hcatSchema, new PartitioningScheme()); + } + + public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { + return convertSchemaToTableInfo(null, database, table, hcatSchema); + } + +} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java index a3daa4a1c..194a3dcdf 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -6,20 +6,21 @@ public abstract class BigQueryBaseTest { - private static boolean CALL_BIG_QUERY = false; + final private static boolean CALL_BIG_QUERY = false; - private static boolean CLEAN_UP_BIG_QUERY = true; + final private static boolean CLEAN_UP_BIG_QUERY = true; + + private static BigQueryUtils execute = new BigQueryUtils(); private static BigQuery bigQuery; + public void createTable(TableInfo tableInfo) { if (CALL_BIG_QUERY) { - if (bigQuery.getTable(tableInfo.getTableId()) != null) - bigQuery.delete(tableInfo.getTableId()); - - bigQuery.create(tableInfo); + execute.dropTable(bigQuery, tableInfo); + execute.createTable(bigQuery, tableInfo); try { Thread.currentThread().sleep(500); @@ -34,13 +35,12 @@ public static void createBigQueryDataSet() { if (!CALL_BIG_QUERY) return; - bigQuery = BigQueryOptions.getDefaultInstance().getService(); + bigQuery = execute.bigQueryService(); - DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); - bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); + if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) + execute.dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); - DatasetInfo 
datasetInfo = DatasetInfo.newBuilder("schedoscope_export_big_query_schema_test").build(); - bigQuery.create(datasetInfo); + execute.createDataset(bigQuery, "schedoscope_export_big_query_schema_test"); } @AfterClass @@ -48,8 +48,8 @@ public static void dropBigQueryDataSets() { if (!CALL_BIG_QUERY || !CLEAN_UP_BIG_QUERY) return; - DatasetId datasetId = DatasetId.of("schedoscope_export_big_query_schema_test"); - bigQuery.delete(datasetId, BigQuery.DatasetDeleteOption.deleteContents()); + if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) + execute.dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); } diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverterTest.java similarity index 90% rename from schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java rename to schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverterTest.java index 27f72f8ea..be4ab4b43 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQuerySchemaTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverterTest.java @@ -16,9 +16,9 @@ import static org.junit.Assert.assertTrue; -public class BigQuerySchemaTest extends BigQueryBaseTest { +public class HCatSchemaToBigQuerySchemaConverterTest extends BigQueryBaseTest { - private BigQuerySchema bigQuerySchema = new BigQuerySchema(); + private HCatSchemaToBigQuerySchemaConverter HCatSchemaToBigQuerySchemaConverter = new HCatSchemaToBigQuerySchemaConverter(); private HCatSchema flatHcatSchema, hcatSchemaWithPrimitiveList, hcatSchemaWithStruct, hcatSchemaWithListOfStruct, hcatSchemaWithListOfList, hcatSchemaWithMap, hcatSchemaWithListOfMaps; @@ -246,7 +246,7 @@ public void setUp() throws HCatException { @Test public void testFlatTableConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table", converted.getTableId().getTable()); @@ -257,7 +257,7 @@ public void testFlatTableConversion() throws IOException { @Test public void testTableConversionWithPostfix() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, "test"); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, "test"); assertTrue(converted.getTableId().getTable().endsWith("_test")); } @@ -266,7 +266,7 @@ public void testTableConversionWithPostfix() throws IOException { public void testTableConversionWithPartitioning() throws IOException, NoSuchFieldException, IllegalAccessException { PartitioningScheme partitioning = new PartitioningScheme("aString", PartitioningScheme.Granularity.MONTHLY); - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", 
"flat_table", flatHcatSchema, partitioning); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table", converted.getTableId().getTable()); @@ -285,7 +285,7 @@ public void testTableConversionWithPartitioning() throws IOException, NoSuchFiel public void testTableConversionWithPartitioningAndPostfix() throws IOException, NoSuchFieldException, IllegalAccessException { PartitioningScheme partitioning = new PartitioningScheme("aString", PartitioningScheme.Granularity.MONTHLY); - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning, "test"); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning, "test"); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table_test", converted.getTableId().getTable()); @@ -303,7 +303,7 @@ public void testTableConversionWithPartitioningAndPostfix() throws IOException, @Test public void testTableWithPrimitiveListConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_primitive_list", converted.getTableId().getTable()); @@ -314,7 +314,7 @@ public void testTableWithPrimitiveListConversion() throws IOException { @Test public void testTableWithStructConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_struct", hcatSchemaWithStruct); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_struct", hcatSchemaWithStruct); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_struct", converted.getTableId().getTable()); @@ -325,7 +325,7 @@ public void testTableWithStructConversion() throws IOException { @Test public void testTableWithListStructConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_struct", hcatSchemaWithListOfStruct); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_struct", hcatSchemaWithListOfStruct); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_struct", converted.getTableId().getTable()); @@ -336,7 +336,7 @@ public void testTableWithListStructConversion() throws IOException { @Test public void testTableWithListOfListsConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", 
"table_with_list_of_lists", hcatSchemaWithListOfList); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_lists", hcatSchemaWithListOfList); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_of_lists", converted.getTableId().getTable()); @@ -348,7 +348,7 @@ public void testTableWithListOfListsConversion() throws IOException { @Test public void testTableWithMapConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_map", hcatSchemaWithMap); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_map", hcatSchemaWithMap); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_map", converted.getTableId().getTable()); @@ -359,7 +359,7 @@ public void testTableWithMapConversion() throws IOException { @Test public void testTableWithListOfMapConversion() throws IOException { - TableInfo converted = bigQuerySchema.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_map", hcatSchemaWithListOfMaps); + TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_map", hcatSchemaWithListOfMaps); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_of_map", converted.getTableId().getTable()); From 317ac1dfaf921522e50a72c26053199e2c9380df Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 1 Dec 2017 09:37:26 +0100 Subject: [PATCH 11/34] 1st refactoring of HCatSchema traversal, not yet generic enough --- .../outputschema/HCatSchemaConvertor.java | 29 +++++---- .../HCatSchemaToBigQuerySchemaConverter.java | 65 +++++++++---------- 2 files changed, 47 insertions(+), 47 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java index e9cd24fe6..b111a13c3 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java @@ -22,17 +22,18 @@ private PrimitiveTypeInfo stringTypeInfo() { return stringTypeInfo; } - protected abstract T createPrimitiveSchemaField(PrimitiveTypeInfo typeInfo); + protected abstract T constructPrimitiveType(PrimitiveTypeInfo typeInfo); - protected abstract F createPrimitiveArrayField(HCatFieldSchema fieldSchema, PrimitiveTypeInfo elementSchema); + protected abstract F constructPrimitiveArrayField(HCatFieldSchema fieldSchema, T elementType); - protected abstract F createStructArrayField(HCatFieldSchema fieldSchema, HCatSchema subSchema); + protected abstract F constructPrimitiveField(HCatFieldSchema fieldSchema, T fieldType); - protected abstract F createStructSchemaField(HCatFieldSchema fieldSchema, S recordSchema); + protected abstract F constructStructArrayField(HCatFieldSchema fieldSchema, HCatSchema subSchema); - protected abstract S createSchema(List fields); + protected abstract F constructStructField(HCatFieldSchema fieldSchema, S 
recordSchema); + + protected abstract S constructSchema(List fields); - protected abstract F convertPrimitiveSchemaField(HCatFieldSchema fieldSchema); public F convertStructSchemaField(HCatFieldSchema fieldSchema) { @@ -44,7 +45,7 @@ public F convertStructSchemaField(HCatFieldSchema fieldSchema) { // not going to happen } - return createStructSchemaField(fieldSchema, convertSchemaFields(structSchema)); + return constructStructField(fieldSchema, convertSchemaFields(structSchema)); } @@ -59,14 +60,14 @@ public F convertArraySchemaField(HCatFieldSchema fieldSchema) { } if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) - return createPrimitiveArrayField(fieldSchema, elementSchema.getTypeInfo()); + return constructPrimitiveArrayField(fieldSchema, constructPrimitiveType(elementSchema.getTypeInfo())); else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) - return createPrimitiveArrayField(fieldSchema, stringTypeInfo()); + return constructPrimitiveArrayField(fieldSchema, constructPrimitiveType(stringTypeInfo())); else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) - return createPrimitiveArrayField(fieldSchema, stringTypeInfo()); + return constructPrimitiveArrayField(fieldSchema, constructPrimitiveType(stringTypeInfo())); else try { - return createStructArrayField(fieldSchema, elementSchema.getStructSubSchema()); + return constructStructArrayField(fieldSchema, elementSchema.getStructSubSchema()); } catch (HCatException e) { return null; // not going to happen } @@ -81,13 +82,13 @@ else if (HCatFieldSchema.Category.STRUCT == fieldSchema.getCategory()) return convertStructSchemaField(fieldSchema); else if (HCatFieldSchema.Category.MAP == fieldSchema.getCategory()) try { - return convertPrimitiveSchemaField(new HCatFieldSchema(fieldSchema.getName(), stringTypeInfo(), fieldSchema.getComment())); + return constructPrimitiveField(new HCatFieldSchema(fieldSchema.getName(), stringTypeInfo(), fieldSchema.getComment()), constructPrimitiveType(stringTypeInfo())); } catch (HCatException e) { // not going to happen return null; } else - return convertPrimitiveSchemaField(fieldSchema); + return constructPrimitiveField(fieldSchema, constructPrimitiveType(fieldSchema.getTypeInfo())); } @@ -98,6 +99,6 @@ public S convertSchemaFields(HCatSchema hcatSchema) { convertedFields.add(convertSchemaField(field)); } - return createSchema(convertedFields); + return constructSchema(convertedFields); } } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index 62d8e2791..29cbb3b8f 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -18,8 +18,34 @@ public class HCatSchemaToBigQuerySchemaConverter extends HCatSchemaConvertor fields) { + return Schema.of(fields); + } + + @Override + public Field.Type constructPrimitiveType(PrimitiveTypeInfo typeInfo) { Field.Type bigQueryType; switch (typeInfo.getTypeName()) { @@ -52,50 +78,23 @@ public Field.Type createPrimitiveSchemaField(PrimitiveTypeInfo typeInfo) { } @Override - protected Field convertPrimitiveSchemaField(HCatFieldSchema fieldSchema) { - + protected Field constructPrimitiveArrayField(HCatFieldSchema fieldSchema, 
Field.Type elementType) { return Field - .newBuilder(fieldSchema.getName(), createPrimitiveSchemaField(fieldSchema.getTypeInfo())) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.NULLABLE) - .build(); - - } - - @Override - protected Field createStructSchemaField(HCatFieldSchema fieldSchema, Schema recordSchema) { - return Field - .newBuilder(fieldSchema.getName(), Field.Type.record(recordSchema.getFields())) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.NULLABLE) - .build(); - } - - - @Override - protected Field createPrimitiveArrayField(HCatFieldSchema fieldSchema, PrimitiveTypeInfo elementSchema) { - return Field - .newBuilder(fieldSchema.getName(), createPrimitiveSchemaField(elementSchema)) + .newBuilder(fieldSchema.getName(), elementType) .setDescription(fieldSchema.getComment()) .setMode(Field.Mode.REPEATED) .build(); } @Override - protected Field createStructArrayField(HCatFieldSchema fieldSchema, HCatSchema subSchema) { + protected Field constructPrimitiveField(HCatFieldSchema fieldSchema, Field.Type fieldType) { return Field - .newBuilder(fieldSchema.getName(), Field.Type.record(convertSchemaFields(subSchema).getFields())) + .newBuilder(fieldSchema.getName(), fieldType) .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.REPEATED) + .setMode(Field.Mode.NULLABLE) .build(); } - - @Override - protected Schema createSchema(List fields) { - return Schema.of(fields); - } - public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSchema, PartitioningScheme partitioning) { LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); From 09fd092859bca6f72a5f0eef6105c50391d339e2 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Sun, 3 Dec 2017 21:00:40 +0100 Subject: [PATCH 12/34] Made transformation more functional, and support transformation along HCatSchema on the record level as well. --- .../outputschema/HCatSchemaConvertor.java | 104 -------- .../HCatSchemaToBigQuerySchemaConverter.java | 239 ++++++++++++------ .../outputschema/HCatSchemaTransformer.java | 162 ++++++++++++ ... 
HCatSchemaToBigQueryTransformerTest.java} | 27 +- 4 files changed, 330 insertions(+), 202 deletions(-) delete mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java rename schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/{HCatSchemaToBigQuerySchemaConverterTest.java => HCatSchemaToBigQueryTransformerTest.java} (90%) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java deleted file mode 100644 index b111a13c3..000000000 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaConvertor.java +++ /dev/null @@ -1,104 +0,0 @@ -package org.schedoscope.export.bigquery.outputschema; - - -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hive.hcatalog.common.HCatException; -import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; -import org.apache.hive.hcatalog.data.schema.HCatSchema; - -import java.util.LinkedList; -import java.util.List; - -public abstract class HCatSchemaConvertor { - - static private PrimitiveTypeInfo stringTypeInfo; - - private PrimitiveTypeInfo stringTypeInfo() { - if (stringTypeInfo == null) { - stringTypeInfo = new PrimitiveTypeInfo(); - stringTypeInfo.setTypeName("string"); - } - - return stringTypeInfo; - } - - protected abstract T constructPrimitiveType(PrimitiveTypeInfo typeInfo); - - protected abstract F constructPrimitiveArrayField(HCatFieldSchema fieldSchema, T elementType); - - protected abstract F constructPrimitiveField(HCatFieldSchema fieldSchema, T fieldType); - - protected abstract F constructStructArrayField(HCatFieldSchema fieldSchema, HCatSchema subSchema); - - protected abstract F constructStructField(HCatFieldSchema fieldSchema, S recordSchema); - - protected abstract S constructSchema(List fields); - - - public F convertStructSchemaField(HCatFieldSchema fieldSchema) { - - HCatSchema structSchema = null; - - try { - structSchema = fieldSchema.getStructSubSchema(); - } catch (HCatException e) { - // not going to happen - } - - return constructStructField(fieldSchema, convertSchemaFields(structSchema)); - - } - - public F convertArraySchemaField(HCatFieldSchema fieldSchema) { - - HCatFieldSchema elementSchema = null; - - try { - elementSchema = fieldSchema.getArrayElementSchema().get(0); - } catch (HCatException e) { - // not going to happen - } - - if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) - return constructPrimitiveArrayField(fieldSchema, constructPrimitiveType(elementSchema.getTypeInfo())); - else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) - return constructPrimitiveArrayField(fieldSchema, constructPrimitiveType(stringTypeInfo())); - else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) - return constructPrimitiveArrayField(fieldSchema, constructPrimitiveType(stringTypeInfo())); - else - try { - return constructStructArrayField(fieldSchema, elementSchema.getStructSubSchema()); - } catch (HCatException e) { - return null; // not going to happen - } - - } - - public F convertSchemaField(HCatFieldSchema fieldSchema) { - - if (HCatFieldSchema.Category.ARRAY == fieldSchema.getCategory()) - return convertArraySchemaField(fieldSchema); - else if 
(HCatFieldSchema.Category.STRUCT == fieldSchema.getCategory()) - return convertStructSchemaField(fieldSchema); - else if (HCatFieldSchema.Category.MAP == fieldSchema.getCategory()) - try { - return constructPrimitiveField(new HCatFieldSchema(fieldSchema.getName(), stringTypeInfo(), fieldSchema.getComment()), constructPrimitiveType(stringTypeInfo())); - } catch (HCatException e) { - // not going to happen - return null; - } - else - return constructPrimitiveField(fieldSchema, constructPrimitiveType(fieldSchema.getTypeInfo())); - - } - - public S convertSchemaFields(HCatSchema hcatSchema) { - LinkedList convertedFields = new LinkedList<>(); - - for (HCatFieldSchema field : hcatSchema.getFields()) { - convertedFields.add(convertSchemaField(field)); - } - - return constructSchema(convertedFields); - } -} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index 29cbb3b8f..631877be3 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -4,103 +4,178 @@ import org.apache.commons.logging.Log; import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; import java.io.IOException; +import java.util.Arrays; import java.util.LinkedList; import java.util.List; +import java.util.function.Function; -public class HCatSchemaToBigQuerySchemaConverter extends HCatSchemaConvertor { +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.*; - private static final Log LOG = LogFactory.getLog(HCatSchemaToBigQuerySchemaConverter.class); +public class HCatSchemaToBigQuerySchemaConverter { - private Field usedFilterField = Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); + static private final Log LOG = LogFactory.getLog(HCatSchemaToBigQuerySchemaConverter.class); + static private final PrimitiveTypeInfo stringTypeInfo = new PrimitiveTypeInfo(); - - @Override - protected Field constructStructField(HCatFieldSchema fieldSchema, Schema recordSchema) { - return Field - .newBuilder(fieldSchema.getName(), Field.Type.record(recordSchema.getFields())) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.NULLABLE) - .build(); + static { + stringTypeInfo.setTypeName("string"); } + static private final Field usedFilterField = Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); - @Override - protected Field constructStructArrayField(HCatFieldSchema fieldSchema, HCatSchema subSchema) { - return Field - .newBuilder(fieldSchema.getName(), Field.Type.record(convertSchemaFields(subSchema).getFields())) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.REPEATED) - .build(); - } + static private class Constructor implements HCatSchemaTransformer.Constructor { - @Override - protected Schema constructSchema(List fields) { 
- return Schema.of(fields); - } + @Override + public Function accessPrimitiveField(HCatFieldSchema field) { + return s -> field; + } - @Override - public Field.Type constructPrimitiveType(PrimitiveTypeInfo typeInfo) { - Field.Type bigQueryType; - - switch (typeInfo.getTypeName()) { - case "string": - bigQueryType = Field.Type.string(); - break; - case "int": - bigQueryType = Field.Type.integer(); - break; - case "bigint": - bigQueryType = Field.Type.integer(); - break; - case "tinyint": - bigQueryType = Field.Type.integer(); - break; - case "boolean": - bigQueryType = Field.Type.bool(); - break; - case "float": - bigQueryType = Field.Type.floatingPoint(); - break; - case "double": - bigQueryType = Field.Type.floatingPoint(); - break; - default: - bigQueryType = Field.Type.string(); - } - - return bigQueryType; - } + @Override + public Function accessMapField(HCatFieldSchema field) { + return s -> field; + } - @Override - protected Field constructPrimitiveArrayField(HCatFieldSchema fieldSchema, Field.Type elementType) { - return Field - .newBuilder(fieldSchema.getName(), elementType) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.REPEATED) - .build(); - } + @Override + public Function accessStructField(HCatFieldSchema field) { + return s -> { + try { + return field.getStructSubSchema(); + } catch (HCatException e) { + // not going to happen + return null; + } + }; + } + + @Override + public Function> accessPrimitiveArrayField(HCatFieldSchema field) { + return x -> Arrays.asList(field); + } + + @Override + public Function> accessArrayArrayField(HCatFieldSchema field) { + return x -> Arrays.asList(field); + } + + @Override + public Function> accessMapArrayField(HCatFieldSchema field) { + return x -> Arrays.asList(field); + } + + @Override + public Function> accessStructArrayField(HCatFieldSchema field) { + return x -> { + try { + return Arrays.asList(field.getArrayElementSchema().get(0).getStructSubSchema()); + } catch (HCatException e) { + // not going to happen + return null; + } + }; + } + + @Override + public Function, Schema> constructSchema() { + return Schema::of; + } + + + private Field.Type translatePrimitiveType(PrimitiveTypeInfo primitiveTypeInfo) { + switch (primitiveTypeInfo.getTypeName()) { + case "int": + case "bigint": + case "tinyint": + return Field.Type.integer(); + + case "boolean": + return Field.Type.bool(); + case "float": + case "double": + return Field.Type.floatingPoint(); + default: + return Field.Type.string(); + } + } + + @Override + public Function constructPrimitiveField(HCatFieldSchema field) { + return x -> Field + .newBuilder(field.getName(), translatePrimitiveType(field.getTypeInfo())) + .setDescription(field.getComment()) + .setMode(Field.Mode.NULLABLE) + .build(); + } + + @Override + public Function constructMapField(HCatFieldSchema field) { + return x -> Field + .newBuilder(field.getName(), Field.Type.string()) + .setDescription(field.getComment()) + .setMode(Field.Mode.NULLABLE) + .build(); + } + + @Override + public Function, Field> constructPrimitiveArrayField(HCatFieldSchema schema, PrimitiveTypeInfo field) { + return x -> Field + .newBuilder(schema.getName(), translatePrimitiveType(field)) + .setDescription(schema.getComment()) + .setMode(Field.Mode.REPEATED) + .build(); + } + + @Override + public Function, Field> constructMapArrayField(HCatFieldSchema schema) { + return x -> Field + .newBuilder(schema.getName(), translatePrimitiveType(stringTypeInfo)) + .setDescription(schema.getComment()) + .setMode(Field.Mode.REPEATED) + 
.build(); + } + + @Override + public Function, Field> constructArrayArrayField(HCatFieldSchema field) { + return x -> Field + .newBuilder(field.getName(), translatePrimitiveType(stringTypeInfo)) + .setDescription(field.getComment()) + .setMode(Field.Mode.REPEATED) + .build(); + } + + @Override + public Function, Field> constructStructArrayField(HCatSchema schema, HCatFieldSchema field) { + return s -> Field + .newBuilder(field.getName(), Field.Type.record(s.get(0).getFields())) + .setDescription(field.getComment()) + .setMode(Field.Mode.REPEATED) + .build(); + } + + @Override + public Function constructStructField(HCatSchema schema, HCatFieldSchema field) { + return s -> Field + .newBuilder(field.getName(), Field.Type.record(s.getFields())) + .setDescription(field.getComment()) + .setMode(Field.Mode.NULLABLE) + .build(); + } - @Override - protected Field constructPrimitiveField(HCatFieldSchema fieldSchema, Field.Type fieldType) { - return Field - .newBuilder(fieldSchema.getName(), fieldType) - .setDescription(fieldSchema.getComment()) - .setMode(Field.Mode.NULLABLE) - .build(); } - public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSchema, PartitioningScheme partitioning) { + private final static Constructor c = new Constructor(); + + static public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSchema, PartitioningScheme partitioning) { LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); List fields = new LinkedList<>(); fields.add(usedFilterField); - fields.addAll(convertSchemaFields(hcatSchema).getFields()); + fields.addAll(transformSchema(c, hcatSchema).apply(hcatSchema).getFields()); StandardTableDefinition.Builder tableDefinitionBuilder = StandardTableDefinition .newBuilder() @@ -117,45 +192,41 @@ public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSchema, Par return tableDefinition; } - public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { - - LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); + static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { TableId tableId = project == null ? TableId.of(database, table + (postfix == null || postfix.isEmpty() ? "" : "_" + postfix)) : TableId.of(project, database, table + (postfix == null || postfix.isEmpty() ? 
"" : "_" + postfix)); TableInfo tableInfo = TableInfo.of(tableId, convertSchemaToTableDefinition(hcatSchema, partitioning)); - LOG.info("Converted BigQuery schema: " + tableInfo); - return tableInfo; } - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { + static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { return convertSchemaToTableInfo(null, database, table, hcatSchema, partitioning, postfix); } - public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { + static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { return convertSchemaToTableInfo(project, database, table, hcatSchema, partitioning, ""); } - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { + static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { return convertSchemaToTableInfo(null, database, table, hcatSchema, partitioning); } - public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { + static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { return convertSchemaToTableInfo(project, database, table, hcatSchema, new PartitioningScheme(), postfix); } - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { + static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { return convertSchemaToTableInfo(null, database, table, hcatSchema, postfix); } - public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema) throws IOException { + static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema) throws IOException { return convertSchemaToTableInfo(project, database, table, hcatSchema, new PartitioningScheme()); } - public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { + static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { return convertSchemaToTableInfo(null, database, table, hcatSchema); } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java new file mode 100644 index 000000000..d91542a2f --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java @@ -0,0 +1,162 @@ +package org.schedoscope.export.bigquery.outputschema; + + +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.common.HCatException; +import 
org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.util.List; +import java.util.function.Function; +import java.util.stream.Collectors; + +public class HCatSchemaTransformer { + + + public interface Constructor { + + Function accessPrimitiveField(HCatFieldSchema field); + + Function accessMapField(HCatFieldSchema field); + + Function accessStructField(HCatFieldSchema field); + + Function> accessPrimitiveArrayField(HCatFieldSchema field); + + Function> accessArrayArrayField(HCatFieldSchema field); + + Function> accessMapArrayField(HCatFieldSchema field); + + Function> accessStructArrayField(HCatFieldSchema field); + + Function, ST> constructSchema(); + + Function constructPrimitiveField(HCatFieldSchema field); + + Function constructMapField(HCatFieldSchema field); + + Function constructStructField(HCatSchema schema, HCatFieldSchema field); + + Function, FT> constructPrimitiveArrayField(HCatFieldSchema schema, PrimitiveTypeInfo field); + + Function, FT> constructMapArrayField(HCatFieldSchema schema); + + Function, FT> constructArrayArrayField(HCatFieldSchema field); + + Function, FT> constructStructArrayField(HCatSchema schema, HCatFieldSchema field); + } + + + static public Function transformSchema(Constructor c, HCatSchema schema) { + + return s -> + c.constructSchema().apply( + schema + .getFields() + .stream() + .map(field -> transformField(c, field).apply(s)) + .collect(Collectors.toList()) + ); + + } + + static public Function transformField(Constructor c, HCatFieldSchema field) { + + if (HCatFieldSchema.Category.ARRAY == field.getCategory()) + + return transformArrayField(c, field); + + else if (HCatFieldSchema.Category.STRUCT == field.getCategory()) + + return transformStructField(c, field); + + else if (HCatFieldSchema.Category.MAP == field.getCategory()) + + return transformMapField(c, field); + + else + + return transformPrimitiveField(c, field); + + } + + static public Function transformPrimitiveField(Constructor c, HCatFieldSchema field) { + + return s -> c.constructPrimitiveField(field).apply( + c.accessPrimitiveField(field).apply(s) + ); + + } + + static public Function transformArrayField(Constructor c, HCatFieldSchema field) { + + try { + + HCatFieldSchema elementSchema = field.getArrayElementSchema().get(0); + PrimitiveTypeInfo elementType = elementSchema.getTypeInfo(); + + if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) + + return s -> c.constructPrimitiveArrayField(field, elementType).apply( + c.accessPrimitiveArrayField(field).apply(s) + ); + + else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) + + return s -> c.constructMapArrayField(field).apply( + c.accessMapArrayField(field).apply(s) + ); + + else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) + + return s -> c.constructArrayArrayField(field).apply( + c.accessArrayArrayField(field).apply(s) + ); + + else { + + HCatSchema structSchema = elementSchema.getStructSubSchema(); + + return s -> c.constructStructArrayField(structSchema, field).apply( + c.accessStructArrayField(field).apply(s) + .stream() + .map(saf -> transformSchema(c, structSchema).apply(saf)) + .collect(Collectors.toList()) + ); + + } + + } catch (HCatException e) { + // not going to happen + + return null; + } + + } + + static public Function transformMapField(Constructor c, HCatFieldSchema field) { + + return s -> c.constructMapField(field).apply( + c.accessMapField(field).apply(s) + ); + + } + + static public Function 
transformStructField(Constructor c, HCatFieldSchema field) { + + try { + + HCatSchema structSchema = field.getStructSubSchema(); + + return s -> c.constructStructField(structSchema, field).apply( + transformSchema(c, structSchema).apply( + c.accessStructField(field).apply(s) + ) + ); + + } catch (HCatException e) { + // not going to happen + return null; + } + } +} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverterTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java similarity index 90% rename from schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverterTest.java rename to schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java index be4ab4b43..27ce0f5dc 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverterTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java @@ -14,11 +14,10 @@ import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo; -public class HCatSchemaToBigQuerySchemaConverterTest extends BigQueryBaseTest { - - private HCatSchemaToBigQuerySchemaConverter HCatSchemaToBigQuerySchemaConverter = new HCatSchemaToBigQuerySchemaConverter(); +public class HCatSchemaToBigQueryTransformerTest extends BigQueryBaseTest { private HCatSchema flatHcatSchema, hcatSchemaWithPrimitiveList, hcatSchemaWithStruct, hcatSchemaWithListOfStruct, hcatSchemaWithListOfList, hcatSchemaWithMap, hcatSchemaWithListOfMaps; @@ -246,7 +245,7 @@ public void setUp() throws HCatException { @Test public void testFlatTableConversion() throws IOException { - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table", converted.getTableId().getTable()); @@ -257,7 +256,7 @@ public void testFlatTableConversion() throws IOException { @Test public void testTableConversionWithPostfix() throws IOException { - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, "test"); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, "test"); assertTrue(converted.getTableId().getTable().endsWith("_test")); } @@ -266,7 +265,7 @@ public void testTableConversionWithPostfix() throws IOException { public void testTableConversionWithPartitioning() throws IOException, NoSuchFieldException, IllegalAccessException { PartitioningScheme partitioning = new PartitioningScheme("aString", PartitioningScheme.Granularity.MONTHLY); - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning); + TableInfo converted = 
convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table", converted.getTableId().getTable()); @@ -282,10 +281,10 @@ public void testTableConversionWithPartitioning() throws IOException, NoSuchFiel } @Test - public void testTableConversionWithPartitioningAndPostfix() throws IOException, NoSuchFieldException, IllegalAccessException { + public void testTableConversionWithPartitioningAndPHCatSchemaToBigQuerySchemaConverterostfix() throws IOException, NoSuchFieldException, IllegalAccessException { PartitioningScheme partitioning = new PartitioningScheme("aString", PartitioningScheme.Granularity.MONTHLY); - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning, "test"); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning, "test"); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table_test", converted.getTableId().getTable()); @@ -303,7 +302,7 @@ public void testTableConversionWithPartitioningAndPostfix() throws IOException, @Test public void testTableWithPrimitiveListConversion() throws IOException { - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_primitive_list", converted.getTableId().getTable()); @@ -314,7 +313,7 @@ public void testTableWithPrimitiveListConversion() throws IOException { @Test public void testTableWithStructConversion() throws IOException { - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_struct", hcatSchemaWithStruct); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_struct", hcatSchemaWithStruct); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_struct", converted.getTableId().getTable()); @@ -325,7 +324,7 @@ public void testTableWithStructConversion() throws IOException { @Test public void testTableWithListStructConversion() throws IOException { - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_struct", hcatSchemaWithListOfStruct); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_struct", hcatSchemaWithListOfStruct); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_struct", converted.getTableId().getTable()); @@ -336,7 +335,7 @@ public void testTableWithListStructConversion() throws IOException { @Test public void testTableWithListOfListsConversion() throws IOException { - TableInfo converted = 
HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_lists", hcatSchemaWithListOfList); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_lists", hcatSchemaWithListOfList); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_of_lists", converted.getTableId().getTable()); @@ -348,7 +347,7 @@ public void testTableWithListOfListsConversion() throws IOException { @Test public void testTableWithMapConversion() throws IOException { - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_map", hcatSchemaWithMap); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_map", hcatSchemaWithMap); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_map", converted.getTableId().getTable()); @@ -359,7 +358,7 @@ public void testTableWithMapConversion() throws IOException { @Test public void testTableWithListOfMapConversion() throws IOException { - TableInfo converted = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_map", hcatSchemaWithListOfMaps); + TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_map", hcatSchemaWithListOfMaps); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_of_map", converted.getTableId().getTable()); From a3d72082acc6c32f23f1a15283d55ff51791e1bf Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Mon, 4 Dec 2017 11:04:11 +0100 Subject: [PATCH 13/34] First Implementation of Record Mapping --- .../HCatRecordToBigQueryMapConvertor.java | 200 ++++++++++++++++++ .../HCatSchemaToBigQuerySchemaConverter.java | 28 +-- .../outputschema/HCatSchemaTransformer.java | 52 ++--- 3 files changed, 240 insertions(+), 40 deletions(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java new file mode 100644 index 000000000..92bde65e5 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -0,0 +1,200 @@ +package org.schedoscope.export.bigquery.outputschema; + +import com.fasterxml.jackson.core.JsonProcessingException; +import com.fasterxml.jackson.databind.ObjectMapper; +import org.apache.commons.lang3.tuple.ImmutablePair; +import org.apache.commons.lang3.tuple.Pair; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.common.HCatException; +import org.apache.hive.hcatalog.data.DefaultHCatRecord; +import org.apache.hive.hcatalog.data.HCatRecord; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import 
java.util.function.Function; +import java.util.stream.Collectors; + +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.transformSchema; + +public class HCatRecordToBigQueryMapConvertor { + + static private final Log LOG = LogFactory.getLog(HCatRecordToBigQueryMapConvertor.class); + + static private final ObjectMapper jsonConvertor = new ObjectMapper(); + + private static class Constructor implements HCatSchemaTransformer.Constructor, Map> { + + @Override + public Function accessPrimitiveField(HCatSchema schema, HCatFieldSchema field) { + return r -> { + try { + return r.get(field.getName(), schema); + } catch (HCatException e) { + // not going to happen + return null; + } + }; + } + + @Override + public Function accessMapField(HCatSchema schema, HCatFieldSchema field) { + return r -> { + try { + return r.getMap(field.getName(), schema); + } catch (HCatException e) { + // not going to happen + return null; + } + }; + } + + @Override + public Function accessStructField(HCatSchema schema, HCatFieldSchema field) { + return r -> { + try { + return new DefaultHCatRecord((List) r.getStruct(field.getName(), schema)); + } catch (HCatException e) { + // not going to happen + return null; + } + }; + } + + @Override + public Function> accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field) { + return r -> { + try { + return (List) r.getList(field.getName(), schema); + } catch (HCatException e) { + // not going to happen + return null; + } + }; + } + + @Override + public Function> accessArrayArrayField(HCatSchema schema, HCatFieldSchema field) { + return accessPrimitiveArrayField(schema, field); + } + + @Override + public Function> accessMapArrayField(HCatSchema schema, HCatFieldSchema field) { + return accessPrimitiveArrayField(schema, field); + } + + @Override + public Function> accessStructArrayField(HCatSchema schema, HCatFieldSchema field) { + return r -> accessPrimitiveArrayField(schema, field) + .apply(r) + .stream() + .map(s -> new DefaultHCatRecord((List) s)) + .collect(Collectors.toList()); + } + + @Override + public Function>, Map> constructSchema() { + return ps -> { + + Map m = new HashMap<>(); + + for (Pair p : ps) + m.put(p.getKey(), p.getValue()); + + return m; + + }; + } + + @Override + public Function> constructPrimitiveField(HCatFieldSchema field) { + return o -> new ImmutablePair<>(field.getName(), o); + } + + @Override + public Function> constructMapField(HCatFieldSchema field) { + return o -> { + try { + return new ImmutablePair<>(field.getName(), jsonConvertor.writeValueAsString(o)); + } catch (JsonProcessingException e) { + // should not happen + return null; + } + }; + } + + @Override + public Function, Pair> constructStructField(HCatSchema schema, HCatFieldSchema field) { + return o -> new ImmutablePair<>(field.getName(), o); + } + + @Override + public Function, Pair> constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType) { + return os -> new ImmutablePair<>(field.getName(), os); + } + + @Override + public Function, Pair> constructMapArrayField(HCatFieldSchema field) { + return ms -> new ImmutablePair<>(field.getName(), + ms.stream() + .map(m -> { + try { + return jsonConvertor.writeValueAsString(m); + } catch (JsonProcessingException e) { + // should not happen + return null; + } + }) + .collect(Collectors.toList()) + ); + } + + @Override + public Function, Pair> constructArrayArrayField(HCatFieldSchema field) { + return as -> new ImmutablePair<>(field.getName(), + as.stream() + .map(a -> { + try { + 
return jsonConvertor.writeValueAsString(a); + } catch (JsonProcessingException e) { + // should not happen + return null; + } + }) + .collect(Collectors.toList()) + ); + } + + @Override + public Function>, Pair> constructStructArrayField(HCatSchema schema, HCatFieldSchema field) { + return ss -> new ImmutablePair<>(field.getName(), ss); + } + } + + + private static final Constructor c = new Constructor(); + + static public Map convertHCatRecordToBigQueryMap(HCatSchema schema, HCatRecord record) throws JsonProcessingException { + + try { + LOG.info("Incoming HCat record: " + record.toString() + " of Schema: " + schema.toString()); + + Map bigQueryMap = transformSchema(c, schema).apply(record); + + LOG.info("Outgoing BigQuery map: " + jsonConvertor.writeValueAsString(bigQueryMap)); + + return bigQueryMap; + + } catch (JsonProcessingException e) { + // should not happen + LOG.error("Error converting HCatRecord", e); + + throw e; + } + + } +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index 631877be3..475d344c0 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -14,7 +14,7 @@ import java.util.List; import java.util.function.Function; -import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.*; +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.transformSchema; public class HCatSchemaToBigQuerySchemaConverter { @@ -32,17 +32,17 @@ static private class Constructor implements HCatSchemaTransformer.Constructor accessPrimitiveField(HCatFieldSchema field) { + public Function accessPrimitiveField(HCatSchema schema, HCatFieldSchema field) { return s -> field; } @Override - public Function accessMapField(HCatFieldSchema field) { + public Function accessMapField(HCatSchema schema, HCatFieldSchema field) { return s -> field; } @Override - public Function accessStructField(HCatFieldSchema field) { + public Function accessStructField(HCatSchema schema, HCatFieldSchema field) { return s -> { try { return field.getStructSubSchema(); @@ -54,22 +54,22 @@ public Function accessStructField(HCatFieldSchema field) } @Override - public Function> accessPrimitiveArrayField(HCatFieldSchema field) { + public Function> accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field) { return x -> Arrays.asList(field); } @Override - public Function> accessArrayArrayField(HCatFieldSchema field) { + public Function> accessArrayArrayField(HCatSchema schema, HCatFieldSchema field) { return x -> Arrays.asList(field); } @Override - public Function> accessMapArrayField(HCatFieldSchema field) { + public Function> accessMapArrayField(HCatSchema schema, HCatFieldSchema field) { return x -> Arrays.asList(field); } @Override - public Function> accessStructArrayField(HCatFieldSchema field) { + public Function> accessStructArrayField(HCatSchema schema, HCatFieldSchema field) { return x -> { try { return Arrays.asList(field.getArrayElementSchema().get(0).getStructSubSchema()); @@ -122,19 +122,19 @@ public Function constructMapField(HCatFieldSchema field) } @Override - public Function, Field> constructPrimitiveArrayField(HCatFieldSchema schema, PrimitiveTypeInfo field) 
{ + public Function, Field> constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType) { return x -> Field - .newBuilder(schema.getName(), translatePrimitiveType(field)) - .setDescription(schema.getComment()) + .newBuilder(field.getName(), translatePrimitiveType(elementType)) + .setDescription(field.getComment()) .setMode(Field.Mode.REPEATED) .build(); } @Override - public Function, Field> constructMapArrayField(HCatFieldSchema schema) { + public Function, Field> constructMapArrayField(HCatFieldSchema field) { return x -> Field - .newBuilder(schema.getName(), translatePrimitiveType(stringTypeInfo)) - .setDescription(schema.getComment()) + .newBuilder(field.getName(), translatePrimitiveType(stringTypeInfo)) + .setDescription(field.getComment()) .setMode(Field.Mode.REPEATED) .build(); } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java index d91542a2f..9e7020acb 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java @@ -15,19 +15,19 @@ public class HCatSchemaTransformer { public interface Constructor { - Function accessPrimitiveField(HCatFieldSchema field); + Function accessPrimitiveField(HCatSchema schema, HCatFieldSchema field); - Function accessMapField(HCatFieldSchema field); + Function accessMapField(HCatSchema schema, HCatFieldSchema field); - Function accessStructField(HCatFieldSchema field); + Function accessStructField(HCatSchema schema, HCatFieldSchema field); - Function> accessPrimitiveArrayField(HCatFieldSchema field); + Function> accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field); - Function> accessArrayArrayField(HCatFieldSchema field); + Function> accessArrayArrayField(HCatSchema schema, HCatFieldSchema field); - Function> accessMapArrayField(HCatFieldSchema field); + Function> accessMapArrayField(HCatSchema schema, HCatFieldSchema field); - Function> accessStructArrayField(HCatFieldSchema field); + Function> accessStructArrayField(HCatSchema schema, HCatFieldSchema field); Function, ST> constructSchema(); @@ -37,9 +37,9 @@ public interface Constructor { Function constructStructField(HCatSchema schema, HCatFieldSchema field); - Function, FT> constructPrimitiveArrayField(HCatFieldSchema schema, PrimitiveTypeInfo field); + Function, FT> constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType); - Function, FT> constructMapArrayField(HCatFieldSchema schema); + Function, FT> constructMapArrayField(HCatFieldSchema field); Function, FT> constructArrayArrayField(HCatFieldSchema field); @@ -54,41 +54,41 @@ static public Function transformSchema(Constructor transformField(c, field).apply(s)) + .map(field -> transformField(c, schema, field).apply(s)) .collect(Collectors.toList()) ); } - static public Function transformField(Constructor c, HCatFieldSchema field) { + static public Function transformField(Constructor c, HCatSchema schema, HCatFieldSchema field) { if (HCatFieldSchema.Category.ARRAY == field.getCategory()) - return transformArrayField(c, field); + return transformArrayField(c, schema, field); else if (HCatFieldSchema.Category.STRUCT == field.getCategory()) - return transformStructField(c, field); + return transformStructField(c, schema, field); else if 
(HCatFieldSchema.Category.MAP == field.getCategory()) - return transformMapField(c, field); + return transformMapField(c, schema, field); else - return transformPrimitiveField(c, field); + return transformPrimitiveField(c, schema, field); } - static public Function transformPrimitiveField(Constructor c, HCatFieldSchema field) { + static public Function transformPrimitiveField(Constructor c, HCatSchema schema, HCatFieldSchema field) { return s -> c.constructPrimitiveField(field).apply( - c.accessPrimitiveField(field).apply(s) + c.accessPrimitiveField(schema, field).apply(s) ); } - static public Function transformArrayField(Constructor c, HCatFieldSchema field) { + static public Function transformArrayField(Constructor c, HCatSchema schema, HCatFieldSchema field) { try { @@ -98,19 +98,19 @@ static public Function transformArrayField(Constructor c.constructPrimitiveArrayField(field, elementType).apply( - c.accessPrimitiveArrayField(field).apply(s) + c.accessPrimitiveArrayField(schema, field).apply(s) ); else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) return s -> c.constructMapArrayField(field).apply( - c.accessMapArrayField(field).apply(s) + c.accessMapArrayField(schema, field).apply(s) ); else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) return s -> c.constructArrayArrayField(field).apply( - c.accessArrayArrayField(field).apply(s) + c.accessArrayArrayField(schema, field).apply(s) ); else { @@ -118,7 +118,7 @@ else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) HCatSchema structSchema = elementSchema.getStructSubSchema(); return s -> c.constructStructArrayField(structSchema, field).apply( - c.accessStructArrayField(field).apply(s) + c.accessStructArrayField(schema, field).apply(s) .stream() .map(saf -> transformSchema(c, structSchema).apply(saf)) .collect(Collectors.toList()) @@ -134,15 +134,15 @@ else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) } - static public Function transformMapField(Constructor c, HCatFieldSchema field) { + static public Function transformMapField(Constructor c, HCatSchema schema, HCatFieldSchema field) { return s -> c.constructMapField(field).apply( - c.accessMapField(field).apply(s) + c.accessMapField(schema, field).apply(s) ); } - static public Function transformStructField(Constructor c, HCatFieldSchema field) { + static public Function transformStructField(Constructor c, HCatSchema schema, HCatFieldSchema field) { try { @@ -150,7 +150,7 @@ static public Function transformStructField(Constructor c.constructStructField(structSchema, field).apply( transformSchema(c, structSchema).apply( - c.accessStructField(field).apply(s) + c.accessStructField(schema, field).apply(s) ) ); From 46a9a3c5a93afe67eb78abb4323c3d253b065c04 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Wed, 6 Dec 2017 09:18:35 +0100 Subject: [PATCH 14/34] Reshaped Constructor Interface to not use functions but present hooks as normal interface methods --- .../HCatRecordToBigQueryMapConvertor.java | 136 ++++++++---------- .../HCatSchemaToBigQuerySchemaConverter.java | 130 ++++++++--------- .../outputschema/HCatSchemaTransformer.java | 62 ++++---- 3 files changed, 153 insertions(+), 175 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java index 92bde65e5..f5cdadf45 100644 --- 
a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -16,7 +16,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.function.Function; import java.util.stream.Collectors; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.transformSchema; @@ -30,117 +29,102 @@ public class HCatRecordToBigQueryMapConvertor { private static class Constructor implements HCatSchemaTransformer.Constructor, Map> { @Override - public Function accessPrimitiveField(HCatSchema schema, HCatFieldSchema field) { - return r -> { - try { - return r.get(field.getName(), schema); - } catch (HCatException e) { - // not going to happen - return null; - } - }; + public Object accessPrimitiveField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { + try { + return hCatRecord.get(field.getName(), schema); + } catch (HCatException e) { + // not going to happen + return null; + } } @Override - public Function accessMapField(HCatSchema schema, HCatFieldSchema field) { - return r -> { - try { - return r.getMap(field.getName(), schema); - } catch (HCatException e) { - // not going to happen - return null; - } - }; + public Object accessMapField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { + try { + return hCatRecord.getMap(field.getName(), schema); + } catch (HCatException e) { + // not going to happen + return null; + } } @Override - public Function accessStructField(HCatSchema schema, HCatFieldSchema field) { - return r -> { - try { - return new DefaultHCatRecord((List) r.getStruct(field.getName(), schema)); - } catch (HCatException e) { - // not going to happen - return null; - } - }; + public HCatRecord accessStructField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { + try { + return new DefaultHCatRecord((List) hCatRecord.getStruct(field.getName(), schema)); + } catch (HCatException e) { + // not going to happen + return null; + } } @Override - public Function> accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field) { - return r -> { - try { - return (List) r.getList(field.getName(), schema); - } catch (HCatException e) { - // not going to happen - return null; - } - }; + public List accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { + try { + return (List) hCatRecord.getList(field.getName(), schema); + } catch (HCatException e) { + // not going to happen + return null; + } } @Override - public Function> accessArrayArrayField(HCatSchema schema, HCatFieldSchema field) { - return accessPrimitiveArrayField(schema, field); + public List accessArrayArrayField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { + return accessPrimitiveArrayField(schema, field, hCatRecord); } @Override - public Function> accessMapArrayField(HCatSchema schema, HCatFieldSchema field) { - return accessPrimitiveArrayField(schema, field); + public List accessMapArrayField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { + return accessPrimitiveArrayField(schema, field, hCatRecord); } @Override - public Function> accessStructArrayField(HCatSchema schema, HCatFieldSchema field) { - return r -> accessPrimitiveArrayField(schema, field) - .apply(r) + public List accessStructArrayField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { + return 
accessPrimitiveArrayField(schema, field, hCatRecord) .stream() .map(s -> new DefaultHCatRecord((List) s)) .collect(Collectors.toList()); } @Override - public Function>, Map> constructSchema() { - return ps -> { + public Map constructSchema(List> pairs) { + Map m = new HashMap<>(); - Map m = new HashMap<>(); + for (Pair p : pairs) + m.put(p.getKey(), p.getValue()); - for (Pair p : ps) - m.put(p.getKey(), p.getValue()); - - return m; - - }; + return m; } @Override - public Function> constructPrimitiveField(HCatFieldSchema field) { - return o -> new ImmutablePair<>(field.getName(), o); + public Pair constructPrimitiveField(HCatFieldSchema field, Object o) { + return new ImmutablePair<>(field.getName(), o); } @Override - public Function> constructMapField(HCatFieldSchema field) { - return o -> { - try { - return new ImmutablePair<>(field.getName(), jsonConvertor.writeValueAsString(o)); - } catch (JsonProcessingException e) { - // should not happen - return null; - } - }; + public Pair constructMapField(HCatFieldSchema field, Object o) { + try { + return new ImmutablePair<>(field.getName(), jsonConvertor.writeValueAsString(o)); + } catch (JsonProcessingException e) { + // should not happen + return null; + } } @Override - public Function, Pair> constructStructField(HCatSchema schema, HCatFieldSchema field) { - return o -> new ImmutablePair<>(field.getName(), o); + public Pair constructStructField(HCatSchema schema, HCatFieldSchema field, Map stringObjectMap) { + return new ImmutablePair<>(field.getName(), stringObjectMap); } @Override - public Function, Pair> constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType) { - return os -> new ImmutablePair<>(field.getName(), os); + public Pair constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType, List objects) { + return new ImmutablePair<>(field.getName(), objects); } @Override - public Function, Pair> constructMapArrayField(HCatFieldSchema field) { - return ms -> new ImmutablePair<>(field.getName(), - ms.stream() + public Pair constructMapArrayField(HCatFieldSchema field, List objects) { + return new ImmutablePair<>(field.getName(), + objects.stream() .map(m -> { try { return jsonConvertor.writeValueAsString(m); @@ -154,9 +138,9 @@ public Function, Pair> constructMapArrayField(HCatF } @Override - public Function, Pair> constructArrayArrayField(HCatFieldSchema field) { - return as -> new ImmutablePair<>(field.getName(), - as.stream() + public Pair constructArrayArrayField(HCatFieldSchema field, List objects) { + return new ImmutablePair<>(field.getName(), + objects.stream() .map(a -> { try { return jsonConvertor.writeValueAsString(a); @@ -170,8 +154,8 @@ public Function, Pair> constructArrayArrayField(HCa } @Override - public Function>, Pair> constructStructArrayField(HCatSchema schema, HCatFieldSchema field) { - return ss -> new ImmutablePair<>(field.getName(), ss); + public Pair constructStructArrayField(HCatSchema schema, HCatFieldSchema field, List> maps) { + return new ImmutablePair<>(field.getName(), maps); } } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index 475d344c0..cc56afec7 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ 
b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -12,7 +12,6 @@ import java.util.Arrays; import java.util.LinkedList; import java.util.List; -import java.util.function.Function; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.transformSchema; @@ -30,82 +29,76 @@ public class HCatSchemaToBigQuerySchemaConverter { static private class Constructor implements HCatSchemaTransformer.Constructor { + private Field.Type translatePrimitiveType(PrimitiveTypeInfo primitiveTypeInfo) { + switch (primitiveTypeInfo.getTypeName()) { + case "int": + case "bigint": + case "tinyint": + return Field.Type.integer(); - @Override - public Function accessPrimitiveField(HCatSchema schema, HCatFieldSchema field) { - return s -> field; + case "boolean": + return Field.Type.bool(); + case "float": + case "double": + return Field.Type.floatingPoint(); + default: + return Field.Type.string(); + } } @Override - public Function accessMapField(HCatSchema schema, HCatFieldSchema field) { - return s -> field; + public HCatFieldSchema accessPrimitiveField(HCatSchema schema, HCatFieldSchema field, HCatSchema hCatSchema) { + return field; } @Override - public Function accessStructField(HCatSchema schema, HCatFieldSchema field) { - return s -> { - try { - return field.getStructSubSchema(); - } catch (HCatException e) { - // not going to happen - return null; - } - }; + public HCatFieldSchema accessMapField(HCatSchema schema, HCatFieldSchema field, HCatSchema hCatSchema) { + return field; } @Override - public Function> accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field) { - return x -> Arrays.asList(field); + public HCatSchema accessStructField(HCatSchema schema, HCatFieldSchema field, HCatSchema hCatSchema) { + try { + return field.getStructSubSchema(); + } catch (HCatException e) { + // not going to happen + return null; + } } @Override - public Function> accessArrayArrayField(HCatSchema schema, HCatFieldSchema field) { - return x -> Arrays.asList(field); + public List accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field, HCatSchema hCatSchema) { + return Arrays.asList(field); } @Override - public Function> accessMapArrayField(HCatSchema schema, HCatFieldSchema field) { - return x -> Arrays.asList(field); + public List accessArrayArrayField(HCatSchema schema, HCatFieldSchema field, HCatSchema hCatSchema) { + return Arrays.asList(field); } @Override - public Function> accessStructArrayField(HCatSchema schema, HCatFieldSchema field) { - return x -> { - try { - return Arrays.asList(field.getArrayElementSchema().get(0).getStructSubSchema()); - } catch (HCatException e) { - // not going to happen - return null; - } - }; + public List accessMapArrayField(HCatSchema schema, HCatFieldSchema field, HCatSchema hCatSchema) { + return Arrays.asList(field); } @Override - public Function, Schema> constructSchema() { - return Schema::of; + public List accessStructArrayField(HCatSchema schema, HCatFieldSchema field, HCatSchema hCatSchema) { + try { + return Arrays.asList(field.getArrayElementSchema().get(0).getStructSubSchema()); + } catch (HCatException e) { + // not going to happen + return null; + } } - - private Field.Type translatePrimitiveType(PrimitiveTypeInfo primitiveTypeInfo) { - switch (primitiveTypeInfo.getTypeName()) { - case "int": - case "bigint": - case "tinyint": - return Field.Type.integer(); - - case "boolean": - return Field.Type.bool(); - case "float": - case "double": - return 
Field.Type.floatingPoint(); - default: - return Field.Type.string(); - } + @Override + public Schema constructSchema(List fields) { + return Schema.of(fields); } @Override - public Function constructPrimitiveField(HCatFieldSchema field) { - return x -> Field + public Field constructPrimitiveField(HCatFieldSchema field, HCatFieldSchema fieldSchema) { + return Field .newBuilder(field.getName(), translatePrimitiveType(field.getTypeInfo())) .setDescription(field.getComment()) .setMode(Field.Mode.NULLABLE) @@ -113,8 +106,8 @@ public Function constructPrimitiveField(HCatFieldSchema } @Override - public Function constructMapField(HCatFieldSchema field) { - return x -> Field + public Field constructMapField(HCatFieldSchema field, HCatFieldSchema fieldSchema) { + return Field .newBuilder(field.getName(), Field.Type.string()) .setDescription(field.getComment()) .setMode(Field.Mode.NULLABLE) @@ -122,26 +115,26 @@ public Function constructMapField(HCatFieldSchema field) } @Override - public Function, Field> constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType) { - return x -> Field - .newBuilder(field.getName(), translatePrimitiveType(elementType)) + public Field constructStructField(HCatSchema schema, HCatFieldSchema field, Schema structSchema) { + return Field + .newBuilder(field.getName(), Field.Type.record(structSchema.getFields())) .setDescription(field.getComment()) - .setMode(Field.Mode.REPEATED) + .setMode(Field.Mode.NULLABLE) .build(); } @Override - public Function, Field> constructMapArrayField(HCatFieldSchema field) { - return x -> Field - .newBuilder(field.getName(), translatePrimitiveType(stringTypeInfo)) + public Field constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType, List hCatFieldSchemas) { + return Field + .newBuilder(field.getName(), translatePrimitiveType(elementType)) .setDescription(field.getComment()) .setMode(Field.Mode.REPEATED) .build(); } @Override - public Function, Field> constructArrayArrayField(HCatFieldSchema field) { - return x -> Field + public Field constructMapArrayField(HCatFieldSchema field, List hCatFieldSchemas) { + return Field .newBuilder(field.getName(), translatePrimitiveType(stringTypeInfo)) .setDescription(field.getComment()) .setMode(Field.Mode.REPEATED) @@ -149,23 +142,22 @@ public Function, Field> constructArrayArrayField(HCatField } @Override - public Function, Field> constructStructArrayField(HCatSchema schema, HCatFieldSchema field) { - return s -> Field - .newBuilder(field.getName(), Field.Type.record(s.get(0).getFields())) + public Field constructArrayArrayField(HCatFieldSchema field, List hCatFieldSchemas) { + return Field + .newBuilder(field.getName(), translatePrimitiveType(stringTypeInfo)) .setDescription(field.getComment()) .setMode(Field.Mode.REPEATED) .build(); } @Override - public Function constructStructField(HCatSchema schema, HCatFieldSchema field) { - return s -> Field - .newBuilder(field.getName(), Field.Type.record(s.getFields())) + public Field constructStructArrayField(HCatSchema schema, HCatFieldSchema field, List schemas) { + return Field + .newBuilder(field.getName(), Field.Type.record(schemas.get(0).getFields())) .setDescription(field.getComment()) - .setMode(Field.Mode.NULLABLE) + .setMode(Field.Mode.REPEATED) .build(); } - } private final static Constructor c = new Constructor(); diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java 
b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java index 9e7020acb..eebeccd97 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java @@ -15,42 +15,42 @@ public class HCatSchemaTransformer { public interface Constructor { - Function accessPrimitiveField(HCatSchema schema, HCatFieldSchema field); + F accessPrimitiveField(HCatSchema schema, HCatFieldSchema field, S s); - Function accessMapField(HCatSchema schema, HCatFieldSchema field); + F accessMapField(HCatSchema schema, HCatFieldSchema field, S s); - Function accessStructField(HCatSchema schema, HCatFieldSchema field); + S accessStructField(HCatSchema schema, HCatFieldSchema field, S s); - Function> accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field); + List accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field, S s); - Function> accessArrayArrayField(HCatSchema schema, HCatFieldSchema field); + List accessArrayArrayField(HCatSchema schema, HCatFieldSchema field, S s); - Function> accessMapArrayField(HCatSchema schema, HCatFieldSchema field); + List accessMapArrayField(HCatSchema schema, HCatFieldSchema field, S s); - Function> accessStructArrayField(HCatSchema schema, HCatFieldSchema field); + List accessStructArrayField(HCatSchema schema, HCatFieldSchema field, S s); - Function, ST> constructSchema(); + ST constructSchema(List fts); - Function constructPrimitiveField(HCatFieldSchema field); + FT constructPrimitiveField(HCatFieldSchema field, F f); - Function constructMapField(HCatFieldSchema field); + FT constructMapField(HCatFieldSchema field, F f); - Function constructStructField(HCatSchema schema, HCatFieldSchema field); + FT constructStructField(HCatSchema schema, HCatFieldSchema field, ST st); - Function, FT> constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType); + FT constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType, List fs); - Function, FT> constructMapArrayField(HCatFieldSchema field); + FT constructMapArrayField(HCatFieldSchema field, List fs); - Function, FT> constructArrayArrayField(HCatFieldSchema field); + FT constructArrayArrayField(HCatFieldSchema field, List fs); - Function, FT> constructStructArrayField(HCatSchema schema, HCatFieldSchema field); + FT constructStructArrayField(HCatSchema schema, HCatFieldSchema field, List sts); } static public Function transformSchema(Constructor c, HCatSchema schema) { return s -> - c.constructSchema().apply( + c.constructSchema( schema .getFields() .stream() @@ -82,8 +82,8 @@ else if (HCatFieldSchema.Category.MAP == field.getCategory()) static public Function transformPrimitiveField(Constructor c, HCatSchema schema, HCatFieldSchema field) { - return s -> c.constructPrimitiveField(field).apply( - c.accessPrimitiveField(schema, field).apply(s) + return s -> c.constructPrimitiveField(field, + c.accessPrimitiveField(schema, field, s) ); } @@ -97,28 +97,29 @@ static public Function transformArrayField(Constructor c.constructPrimitiveArrayField(field, elementType).apply( - c.accessPrimitiveArrayField(schema, field).apply(s) + return s -> c.constructPrimitiveArrayField( + field, elementType, + c.accessPrimitiveArrayField(schema, field, s) ); else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) - return s -> c.constructMapArrayField(field).apply( - 
c.accessMapArrayField(schema, field).apply(s) + return s -> c.constructMapArrayField(field, + c.accessMapArrayField(schema, field, s) ); else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) - return s -> c.constructArrayArrayField(field).apply( - c.accessArrayArrayField(schema, field).apply(s) + return s -> c.constructArrayArrayField(field, + c.accessArrayArrayField(schema, field, s) ); else { HCatSchema structSchema = elementSchema.getStructSubSchema(); - return s -> c.constructStructArrayField(structSchema, field).apply( - c.accessStructArrayField(schema, field).apply(s) + return s -> c.constructStructArrayField(structSchema, field, + c.accessStructArrayField(schema, field, s) .stream() .map(saf -> transformSchema(c, structSchema).apply(saf)) .collect(Collectors.toList()) @@ -136,8 +137,8 @@ else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) static public Function transformMapField(Constructor c, HCatSchema schema, HCatFieldSchema field) { - return s -> c.constructMapField(field).apply( - c.accessMapField(schema, field).apply(s) + return s -> c.constructMapField(field, + c.accessMapField(schema, field, s) ); } @@ -148,9 +149,10 @@ static public Function transformStructField(Constructor c.constructStructField(structSchema, field).apply( + return s -> c.constructStructField( + structSchema, field, transformSchema(c, structSchema).apply( - c.accessStructField(schema, field).apply(s) + c.accessStructField(schema, field, s) ) ); From 97eb11756e7f772e688786c490f49da72d0f884b Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 8 Dec 2017 16:41:20 +0100 Subject: [PATCH 15/34] Implemented record conversion and tested insertion of converted records into tables --- .../export/bigquery/BigQueryUtils.java | 20 ++ .../HCatRecordToBigQueryMapConvertor.java | 6 +- .../HCatSchemaToBigQuerySchemaConverter.java | 6 +- .../export/bigquery/BigQueryBaseTest.java | 47 +++- .../HCatSchemaToBigQueryTransformerTest.java | 248 ++++++++++++++++++ 5 files changed, 314 insertions(+), 13 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java index 2736bb1be..e814dd759 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java @@ -6,6 +6,9 @@ import java.io.ByteArrayInputStream; import java.io.IOException; import java.nio.charset.Charset; +import java.util.Arrays; +import java.util.Map; +import java.util.stream.Collectors; public class BigQueryUtils { @@ -113,4 +116,21 @@ public void dropTable(BigQuery bigQueryService, String dataset, String table) { public void dropTable(BigQuery bigQueryService, TableInfo tableInfo) { dropTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable()); } + + public void insertIntoTable(BigQuery bigQueryService, TableId table, Map... 
rowsToInsert) { + + InsertAllRequest insertAllRequest = InsertAllRequest.newBuilder(table) + .setRows( + Arrays.stream(rowsToInsert) + .map(InsertAllRequest.RowToInsert::of) + .collect(Collectors.toList()) + ) + .build(); + + InsertAllResponse result = bigQueryService.insertAll(insertAllRequest); + + if (result.hasErrors()) { + throw new BigQueryException(999, "Could not insert some records into BigQuery table: " + result.getInsertErrors().toString()); + } + } } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java index f5cdadf45..4d3a4c1ee 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -26,7 +26,7 @@ public class HCatRecordToBigQueryMapConvertor { static private final ObjectMapper jsonConvertor = new ObjectMapper(); - private static class Constructor implements HCatSchemaTransformer.Constructor, Map> { + private static final HCatSchemaTransformer.Constructor, Map> c = new HCatSchemaTransformer.Constructor, Map>() { @Override public Object accessPrimitiveField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { @@ -157,10 +157,8 @@ public Pair constructArrayArrayField(HCatFieldSchema field, List public Pair constructStructArrayField(HCatSchema schema, HCatFieldSchema field, List> maps) { return new ImmutablePair<>(field.getName(), maps); } - } - + }; - private static final Constructor c = new Constructor(); static public Map convertHCatRecordToBigQueryMap(HCatSchema schema, HCatRecord record) throws JsonProcessingException { diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index cc56afec7..3c0916672 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -27,7 +27,7 @@ public class HCatSchemaToBigQuerySchemaConverter { static private final Field usedFilterField = Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); - static private class Constructor implements HCatSchemaTransformer.Constructor { + static private final HCatSchemaTransformer.Constructor c = new HCatSchemaTransformer.Constructor() { private Field.Type translatePrimitiveType(PrimitiveTypeInfo primitiveTypeInfo) { switch (primitiveTypeInfo.getTypeName()) { @@ -158,9 +158,7 @@ public Field constructStructArrayField(HCatSchema schema, HCatFieldSchema field, .setMode(Field.Mode.REPEATED) .build(); } - } - - private final static Constructor c = new Constructor(); + }; static public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSchema, PartitioningScheme partitioning) { LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java 
b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java index 194a3dcdf..ed4d8914c 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -4,6 +4,8 @@ import org.junit.AfterClass; import org.junit.BeforeClass; +import java.util.Map; + public abstract class BigQueryBaseTest { final private static boolean CALL_BIG_QUERY = false; @@ -19,17 +21,45 @@ public void createTable(TableInfo tableInfo) { if (CALL_BIG_QUERY) { - execute.dropTable(bigQuery, tableInfo); - execute.createTable(bigQuery, tableInfo); - try { - Thread.currentThread().sleep(500); - } catch (InterruptedException e) { + + execute.dropTable(bigQuery, tableInfo); + execute.createTable(bigQuery, tableInfo); + + } catch (Throwable t) { + t.printStackTrace(); + + try { + Thread.currentThread().sleep(500); + } catch (InterruptedException e) { + } + + createTable(tableInfo); } } } + public void insertIntoTable(String dataset, String table, Schema schema, Map... data) { + if (CALL_BIG_QUERY) { + TableId tableId = TableId.of(dataset, table); + TableInfo tableInfo = TableInfo.of(tableId, StandardTableDefinition.newBuilder().setSchema(schema).build()); + createTable(tableInfo); + + try { + execute.insertIntoTable(bigQuery, tableId, data); + } catch (Throwable t) { + t.printStackTrace(); + try { + Thread.currentThread().sleep(500); + } catch (InterruptedException e) { + } + + insertIntoTable(dataset, table, schema, data); + } + } + } + @BeforeClass public static void createBigQueryDataSet() { if (!CALL_BIG_QUERY) @@ -40,7 +70,11 @@ public static void createBigQueryDataSet() { if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) execute.dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) + execute.dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); + execute.createDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + execute.createDataset(bigQuery, "schedoscope_export_big_query_record_test"); } @AfterClass @@ -50,6 +84,9 @@ public static void dropBigQueryDataSets() { if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) execute.dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + + if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) + execute.dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); } diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java index 27ce0f5dc..03199e70b 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java @@ -1,8 +1,10 @@ package org.schedoscope.export.bigquery.outputschema; +import com.fasterxml.jackson.core.JsonProcessingException; import com.google.cloud.bigquery.*; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hive.hcatalog.common.HCatException; +import org.apache.hive.hcatalog.data.DefaultHCatRecord; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import 
org.apache.hive.hcatalog.data.schema.HCatSchema; import org.junit.Before; @@ -11,9 +13,12 @@ import java.io.IOException; import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo; @@ -23,6 +28,10 @@ public class HCatSchemaToBigQueryTransformerTest extends BigQueryBaseTest { private Schema flatBigQuerySchema, bigQuerySchemaWithPrimitiveList, bigQuerySchemaWithRecord, bigQuerySchemaWithListOfRecord, bigQuerySchemaWithListOfList, bigQuerySchemaWithMap, bigQuerySchemaWithListOfMaps; + private DefaultHCatRecord flatHcatRecord, hcatRecordWithPrimitiveList, hCatRecordWithStruct, hcatRecordWithListOfStruct, hcatRecordWithListOfList, hcatRecordWithMap, hcatRecordWithListOfMap; + + private Map flatBigQueryRecord, bigQueryRecordWithPrimitiveList, bigQueryRecordWithStruct, bigQueryRecordWithListOfStruct, bigQueryRecordWithListOfList, bigQueryRecordWithMap, bigQueryRecordWithListOfMap; + @Before public void setUp() throws HCatException { @@ -241,6 +250,181 @@ public void setUp() throws HCatException { Field.newBuilder("anInt", Field.Type.integer()).setDescription("an int field").setMode(Field.Mode.NULLABLE).build(), Field.newBuilder("listOfMap", Field.Type.string()).setDescription("a list of maps field").setMode(Field.Mode.REPEATED).build() ); + + + flatHcatRecord = new DefaultHCatRecord(7) {{ + set("aString", flatHcatSchema, "someString"); + set("anInt", flatHcatSchema, 1); + set("aLong", flatHcatSchema, 2L); + set("aByte", flatHcatSchema, (byte) 3); + set("aBoolean", flatHcatSchema, true); + set("aDouble", flatHcatSchema, 3.4d); + set("aFloat", flatHcatSchema, 3.5f); + + }}; + + flatBigQueryRecord = new HashMap() {{ + put("aString", "someString"); + put("anInt", 1); + put("aLong", 2L); + put("aByte", (byte) 3); + put("aBoolean", true); + put("aDouble", 3.4d); + put("aFloat", 3.5f); + }}; + + hcatRecordWithPrimitiveList = new DefaultHCatRecord(2) {{ + set("anInt", hcatSchemaWithPrimitiveList, 2); + set("listOfInts", hcatSchemaWithPrimitiveList, Arrays.asList(1, 2, 3)); + }}; + + bigQueryRecordWithPrimitiveList = new HashMap() {{ + put("anInt", 2); + put("listOfInts", Arrays.asList(1, 2, 3)); + }}; + + hCatRecordWithStruct = new DefaultHCatRecord(2) {{ + set("anInt", hcatSchemaWithStruct, 2); + set("aStruct", hcatSchemaWithStruct, Arrays.asList( + "someString", + 1, + 2L, + (byte) 3, + true, + 3.14d, + 3.14f, + Arrays.asList( + "someMoreString" + ) + )); + }}; + + bigQueryRecordWithStruct = new HashMap() {{ + put("anInt", 2); + put("aStruct", new HashMap() {{ + put("aString", "someString"); + put("anInt", 1); + put("aLong", 2L); + put("aByte", (byte) 3); + put("aBoolean", true); + put("aDouble", 3.14d); + put("aFloat", 3.14f); + put("aNestedStruct", new HashMap() {{ + put("aString", "someMoreString"); + }}); + }} + ); + }}; + + hcatRecordWithListOfStruct = new DefaultHCatRecord(2) {{ + set("anInt", hcatSchemaWithListOfStruct, 2); + set("listOfStructs", hcatSchemaWithListOfStruct, Arrays.asList( + Arrays.asList("someString"), + Arrays.asList("someMoreString"), + Arrays.asList("someMoreAndMoreString"), + Arrays.asList("evenSomeMoreString") + )); + }}; + + bigQueryRecordWithListOfStruct = new HashMap() {{ + put("anInt", 2); + put("listOfStructs", Arrays.asList( 
+ new HashMap() {{ + put("aString", "someString"); + }}, + new HashMap() {{ + put("aString", "someMoreString"); + }}, + new HashMap() {{ + put("aString", "someMoreAndMoreString"); + }}, + new HashMap() {{ + put("aString", "evenSomeMoreString"); + }} + )); + }}; + + hcatRecordWithListOfList = new DefaultHCatRecord(2) {{ + set("anInt", hcatSchemaWithListOfList, 2); + set("listOfList", hcatSchemaWithListOfList, Arrays.asList( + Arrays.asList(1, 2, 3, 4), + Arrays.asList(5, 6, 7, 8), + Arrays.asList(9, 10, 11, 12) + )); + }}; + + bigQueryRecordWithListOfList = new HashMap() {{ + put("anInt", 2); + put("listOfList", Arrays.asList( + "[1,2,3,4]", + "[5,6,7,8]", + "[9,10,11,12]" + )); + }}; + + hcatRecordWithMap = new DefaultHCatRecord(2) {{ + set("anInt", hcatSchemaWithMap, 2); + set("aMap", hcatSchemaWithMap, new HashMap() {{ + put("a", 1); + put("b", 2); + put("c", 3); + }}); + }}; + + bigQueryRecordWithMap = new HashMap() {{ + put("anInt", 2); + put("aMap", "{" + + "\"a\":1," + + "\"b\":2," + + "\"c\":3" + + "}"); + }}; + + hcatRecordWithListOfMap = new DefaultHCatRecord(2) {{ + set("anInt", hcatSchemaWithListOfMaps, 2); + set("listOfMap", hcatSchemaWithListOfMaps, Arrays.asList( + new HashMap() {{ + put("a", 1); + put("b", 2); + put("c", 3); + }}, + new HashMap() {{ + put("d", 4); + put("e", 5); + put("f", 6); + }}, + new HashMap() {{ + put("g", 7); + put("h", 8); + put("i", 9); + }}) + ); + }}; + + bigQueryRecordWithListOfMap = new HashMap() {{ + put("anInt", 2); + put("listOfMap", Arrays.asList( + "{" + + "\"a\":1," + + "\"b\":2," + + "\"c\":3" + + "}", + "{" + + "\"d\":4," + + "\"e\":5," + + "\"f\":6" + + "}", + "{" + + "\"g\":7," + + "\"h\":8," + + "\"i\":9" + + "}" + + ) + ); + }}; + + } @Test @@ -366,4 +550,68 @@ public void testTableWithListOfMapConversion() throws IOException { createTable(converted); } + + @Test + public void testFlatHCatRecordConversion() throws IOException { + Map converted = convertHCatRecordToBigQueryMap(flatHcatSchema, flatHcatRecord); + + assertEquals(flatBigQueryRecord, converted); + + insertIntoTable("schedoscope_export_big_query_record_test", "flat_table", flatBigQuerySchema, converted); + } + + @Test + public void testHCatRecordWithListConversion() throws IOException { + Map converted = convertHCatRecordToBigQueryMap(hcatSchemaWithPrimitiveList, hcatRecordWithPrimitiveList); + + assertEquals(bigQueryRecordWithPrimitiveList, converted); + + insertIntoTable("schedoscope_export_big_query_record_test", "table_with_primitive_list", bigQuerySchemaWithPrimitiveList, converted); + } + + @Test + public void testHCatRecordWithStructConversion() throws JsonProcessingException { + Map converted = convertHCatRecordToBigQueryMap(hcatSchemaWithStruct, hCatRecordWithStruct); + + assertEquals(bigQueryRecordWithStruct, converted); + + insertIntoTable("schedoscope_export_big_query_record_test", "table_with_struct", bigQuerySchemaWithRecord, converted); + } + + @Test + public void testHCatRecordWithListOfStructConversion() throws JsonProcessingException { + Map converted = convertHCatRecordToBigQueryMap(hcatSchemaWithListOfStruct, hcatRecordWithListOfStruct); + + assertEquals(bigQueryRecordWithListOfStruct, converted); + + insertIntoTable("schedoscope_export_big_query_record_test", "table_with_list_struct", bigQuerySchemaWithListOfRecord, converted); + } + + @Test + public void testHCatRecordWithListOfListConversion() throws JsonProcessingException { + Map converted = convertHCatRecordToBigQueryMap(hcatSchemaWithListOfList, hcatRecordWithListOfList); + + 
assertEquals(bigQueryRecordWithListOfList, converted); + + insertIntoTable("schedoscope_export_big_query_record_test", "table_with_list_of_lists", bigQuerySchemaWithListOfList, converted); + } + + @Test + public void testHCatRecordWithMapConversion() throws JsonProcessingException { + Map converted = convertHCatRecordToBigQueryMap(hcatSchemaWithMap, hcatRecordWithMap); + + assertEquals(bigQueryRecordWithMap, converted); + + insertIntoTable("schedoscope_export_big_query_record_test", "table_with_map", bigQuerySchemaWithMap, converted); + } + + @Test + public void testHCatRecordWithListOfMapConversion() throws JsonProcessingException { + Map converted = convertHCatRecordToBigQueryMap(hcatSchemaWithListOfMaps, hcatRecordWithListOfMap); + + assertEquals(bigQueryRecordWithListOfMap, converted); + + insertIntoTable("schedoscope_export_big_query_record_test", "table_with_list_of_map", bigQuerySchemaWithListOfMaps, converted); + + } } From 7d3f4bb6dd3c791737257afa51f8bafaa58abfd3 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Mon, 11 Dec 2017 11:14:46 +0100 Subject: [PATCH 16/34] Added retry function to bigqueryutils --- .../export/bigquery/BigQueryUtils.java | 81 ++++++++++++++----- .../export/bigquery/BigQueryBaseTest.java | 61 +++++--------- 2 files changed, 80 insertions(+), 62 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java index e814dd759..a321600c1 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java @@ -8,16 +8,20 @@ import java.nio.charset.Charset; import java.util.Arrays; import java.util.Map; +import java.util.Random; +import java.util.function.Supplier; import java.util.stream.Collectors; public class BigQueryUtils { - public BigQuery bigQueryService() { + final static private Random rnd = new Random(); + + public static BigQuery bigQueryService() { return BigQueryOptions.getDefaultInstance().getService(); } - public BigQuery bigQueryService(String gcpKey) throws IOException { + public static BigQuery bigQueryService(String gcpKey) throws IOException { if (gcpKey == null) return bigQueryService(); @@ -29,33 +33,67 @@ public BigQuery bigQueryService(String gcpKey) throws IOException { return BigQueryOptions.newBuilder().setCredentials(credentials).build().getService(); } - public boolean existsDataset(BigQuery bigQueryService, String project, String dataset) { + public static T retry(int numberOfRetries, Supplier action) { + try { + return action.get(); + } catch (Throwable t) { + if (numberOfRetries > 0) { + + try { + Thread.currentThread().sleep(rnd.nextInt(2000)); + } catch (InterruptedException e) { + } + + return retry(numberOfRetries - 1, action); + } else + throw t; + } + } + + public static void retry(int numberOfRetries, Runnable action) { + try { + action.run(); + } catch (Throwable t) { + if (numberOfRetries > 0) { + + try { + Thread.currentThread().sleep(rnd.nextInt(2000)); + } catch (InterruptedException e) { + } + + retry(numberOfRetries - 1, action); + } else + throw t; + } + } + + public static boolean existsDataset(BigQuery bigQueryService, String project, String dataset) { return bigQueryService.getDataset(project == null ? 
DatasetId.of(dataset) : DatasetId.of(project, dataset)) != null; } - public boolean existsDataset(BigQuery bigQueryService, String dataset) { + public static boolean existsDataset(BigQuery bigQueryService, String dataset) { return existsDataset(bigQueryService, null, dataset); } - public boolean existsDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + public static boolean existsDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { return existsDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); } - public void createDataset(BigQuery bigQueryService, String project, String dataset) { + public static void createDataset(BigQuery bigQueryService, String project, String dataset) { if (!existsDataset(bigQueryService, project, dataset)) { bigQueryService.create((project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).build()); } } - public void createDataset(BigQuery bigQueryService, String dataset) { + public static void createDataset(BigQuery bigQueryService, String dataset) { createDataset(bigQueryService, null, dataset); } - public void createDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + public static void createDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { createDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); } - public void dropDataset(BigQuery bigQueryService, String project, String dataset) { + public static void dropDataset(BigQuery bigQueryService, String project, String dataset) { if (existsDataset(bigQueryService, project, dataset)) { bigQueryService.delete( (project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).build().getDatasetId(), @@ -64,27 +102,27 @@ public void dropDataset(BigQuery bigQueryService, String project, String dataset } } - public void dropDataset(BigQuery bigQueryService, String dataset) { + public static void dropDataset(BigQuery bigQueryService, String dataset) { dropDataset(bigQueryService, null, dataset); } - public void dropDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + public static void dropDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { dropDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); } - public boolean existsTable(BigQuery bigQueryService, String project, String dataset, String table) { + public static boolean existsTable(BigQuery bigQueryService, String project, String dataset, String table) { return bigQueryService.getTable(project == null ? 
TableId.of(dataset, table) : TableId.of(project, dataset, table)) != null; } - public boolean existsTable(BigQuery bigQueryService, String dataset, String table) { + public static boolean existsTable(BigQuery bigQueryService, String dataset, String table) { return existsTable(bigQueryService, null, table); } - public boolean existsTable(BigQuery bigQueryService, TableInfo tableInfo) { + public static boolean existsTable(BigQuery bigQueryService, TableInfo tableInfo) { return existsTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable()); } - public void createTable(BigQuery bigQueryService, String project, String dataset, String table, TableDefinition tableDefinition) { + public static void createTable(BigQuery bigQueryService, String project, String dataset, String table, TableDefinition tableDefinition) { createDataset(bigQueryService, project, dataset); if (!existsTable(bigQueryService, project, dataset, table)) { @@ -97,27 +135,27 @@ public void createTable(BigQuery bigQueryService, String project, String dataset } } - public void createTable(BigQuery bigQueryService, String dataset, String table, TableDefinition tableDefinition) { + public static void createTable(BigQuery bigQueryService, String dataset, String table, TableDefinition tableDefinition) { createTable(bigQueryService, null, dataset, table, tableDefinition); } - public void createTable(BigQuery bigQueryService, TableInfo tableInfo) { + public static void createTable(BigQuery bigQueryService, TableInfo tableInfo) { createTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable(), tableInfo.getDefinition()); } - public void dropTable(BigQuery bigQueryService, String project, String dataset, String table) { + public static void dropTable(BigQuery bigQueryService, String project, String dataset, String table) { bigQueryService.delete(project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)); } - public void dropTable(BigQuery bigQueryService, String dataset, String table) { + public static void dropTable(BigQuery bigQueryService, String dataset, String table) { dropTable(bigQueryService, null, table); } - public void dropTable(BigQuery bigQueryService, TableInfo tableInfo) { + public static void dropTable(BigQuery bigQueryService, TableInfo tableInfo) { dropTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable()); } - public void insertIntoTable(BigQuery bigQueryService, TableId table, Map... rowsToInsert) { + public static void insertIntoTable(BigQuery bigQueryService, TableId table, Map... rowsToInsert) { InsertAllRequest insertAllRequest = InsertAllRequest.newBuilder(table) .setRows( @@ -133,4 +171,5 @@ public void insertIntoTable(BigQuery bigQueryService, TableId table, Map { + dropTable(bigQuery, tableInfo); + BigQueryUtils.createTable(bigQuery, tableInfo); + }); - } catch (Throwable t) { - t.printStackTrace(); - try { - Thread.currentThread().sleep(500); - } catch (InterruptedException e) { - } - - createTable(tableInfo); - } - - } } public void insertIntoTable(String dataset, String table, Schema schema, Map... 
data) { if (CALL_BIG_QUERY) { TableId tableId = TableId.of(dataset, table); TableInfo tableInfo = TableInfo.of(tableId, StandardTableDefinition.newBuilder().setSchema(schema).build()); + createTable(tableInfo); + retry(3, () -> BigQueryUtils.insertIntoTable(bigQuery, tableId, data)); - try { - execute.insertIntoTable(bigQuery, tableId, data); - } catch (Throwable t) { - t.printStackTrace(); - try { - Thread.currentThread().sleep(500); - } catch (InterruptedException e) { - } - - insertIntoTable(dataset, table, schema, data); - } } } @@ -65,16 +44,16 @@ public static void createBigQueryDataSet() { if (!CALL_BIG_QUERY) return; - bigQuery = execute.bigQueryService(); + bigQuery = bigQueryService(); - if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) - execute.dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + if (existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) + dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); - if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) - execute.dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); + if (existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) + dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); - execute.createDataset(bigQuery, "schedoscope_export_big_query_schema_test"); - execute.createDataset(bigQuery, "schedoscope_export_big_query_record_test"); + createDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + createDataset(bigQuery, "schedoscope_export_big_query_record_test"); } @AfterClass @@ -82,11 +61,11 @@ public static void dropBigQueryDataSets() { if (!CALL_BIG_QUERY || !CLEAN_UP_BIG_QUERY) return; - if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) - execute.dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + if (existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) + dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); - if (execute.existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) - execute.dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); + if (existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) + dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); } From 724026d62c3de1f43d349baa6e3e98df5aee3ad4 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Tue, 12 Dec 2017 10:41:06 +0100 Subject: [PATCH 17/34] Implemented OutputFormat and RecordWriter for BigQuery export job --- .../outputformat/BigQueryOutputFormat.java | 71 +++++++++++------- .../HCatSchemaToBigQuerySchemaConverter.java | 4 +- .../{bigquery => utils}/BigQueryUtils.java | 73 +++++++++---------- .../HCatSchemaTransformer.java | 2 +- .../export/bigquery/BigQueryBaseTest.java | 5 +- 5 files changed, 87 insertions(+), 68 deletions(-) rename schedoscope-export/src/main/java/org/schedoscope/export/{bigquery => utils}/BigQueryUtils.java (65%) rename schedoscope-export/src/main/java/org/schedoscope/export/{bigquery/outputschema => utils}/HCatSchemaTransformer.java (99%) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index 15495c9b3..a88b7ec21 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ 
b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -2,18 +2,22 @@ import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.TableDefinition; +import com.google.cloud.bigquery.TableId; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hive.hcatalog.data.schema.HCatSchema; -import org.schedoscope.export.bigquery.BigQueryUtils; -import org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter; import org.schedoscope.export.bigquery.outputschema.PartitioningScheme; import java.io.IOException; +import java.util.Arrays; +import java.util.Map; -public class BigQueryOutputFormat extends OutputFormat { +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition; +import static org.schedoscope.export.utils.BigQueryUtils.*; + +public class BigQueryOutputFormat> extends OutputFormat { private static Configuration configuration; private static String project; @@ -23,12 +27,11 @@ public class BigQueryOutputFormat extends OutputFormat { private static HCatSchema hcatSchema; private static String gcpKey; private static String tableNamePostfix; - private static HCatSchemaToBigQuerySchemaConverter HCatSchemaToBigQuerySchemaConverter = new HCatSchemaToBigQuerySchemaConverter(); - private static BigQueryUtils execute; + private static int commitSize; private static BigQuery bigQueryService; - public static void setOutput(Configuration conf, String project, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix, BigQueryUtils bigQueryUtils) throws IOException { + public static void setOutput(Configuration conf, String project, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix, int commitSize) throws IOException { configuration = conf; BigQueryOutputFormat.project = project; BigQueryOutputFormat.database = database; @@ -36,52 +39,66 @@ public static void setOutput(Configuration conf, String project, String gcpKey, BigQueryOutputFormat.usedHCatFilter = usedHCatFilter; BigQueryOutputFormat.hcatSchema = hcatSchema; BigQueryOutputFormat.gcpKey = gcpKey; - execute = bigQueryUtils; - bigQueryService = execute.bigQueryService(gcpKey); + BigQueryOutputFormat.commitSize = commitSize; + bigQueryService = bigQueryService(gcpKey); } - public static void setOutput(Configuration conf, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix, BigQueryUtils bigQueryUtils) { - setOutput(conf, gcpKey, database, table, hcatSchema, usedHCatFilter, tableNamePostfix, bigQueryUtils); - } + public class BiqQueryRecordWriter extends RecordWriter { - public static void setOutput(Configuration conf, String project, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix) throws IOException { - setOutput(conf, project, gcpKey, database, table, hcatSchema, usedHCatFilter, tableNamePostfix, new BigQueryUtils()); - } + private TableId tableId; + private int commitSize; + private Map[] batch; + private int elementsInBatch = 0; + private BigQuery bigQueryService; - public static void setOutput(Configuration conf, String gcpKey, String database, String table, 
HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix) throws IOException { - setOutput(conf, null, gcpKey, database, table, hcatSchema, usedHCatFilter, tableNamePostfix); - } + @Override + public void write(K key, V value) { + batch[elementsInBatch] = value; + elementsInBatch++; + if (elementsInBatch == commitSize) { + retry(3, () -> insertIntoTable(bigQueryService, tableId, batch)); - public class BiqQueryRecordWriter extends RecordWriter { - - @Override - public void write(K key, V value) throws IOException, InterruptedException { + elementsInBatch = 0; + } } @Override - public void close(TaskAttemptContext context) throws IOException, InterruptedException { + public void close(TaskAttemptContext context) { + + if (elementsInBatch > 0) { + retry(3, () -> insertIntoTable(bigQueryService, tableId, Arrays.copyOf(batch, elementsInBatch))); + } } - } + public BiqQueryRecordWriter(BigQuery bigQueryService, TableId tableId, int commitSize) { + this.bigQueryService = bigQueryService; + this.tableId = tableId; + this.commitSize = commitSize; + this.batch = new Map[commitSize]; + } + } @Override public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException { - TableDefinition outputSchema = HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition(hcatSchema, new PartitioningScheme()); + TableDefinition outputSchema = convertSchemaToTableDefinition(hcatSchema, new PartitioningScheme()); String tmpOutputTable = table + (tableNamePostfix != null ? "_" + tableNamePostfix : "") + "_" + context.getTaskAttemptID().getTaskID().getId(); + TableId tmpTableId = project == null ? TableId.of(database, tmpOutputTable) : TableId.of(project, database, tmpOutputTable); - execute.dropTable(bigQueryService, project, database, tmpOutputTable); - execute.createTable(bigQueryService, project, database, tmpOutputTable, outputSchema); + retry(3, () -> { + dropTable(bigQueryService, tmpTableId); + createTable(bigQueryService, tmpTableId, outputSchema); + }); - return null; + return new BiqQueryRecordWriter(bigQueryService, tmpTableId, commitSize); } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index 3c0916672..c6595e81b 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -7,13 +7,15 @@ import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.schedoscope.export.utils.HCatSchemaTransformer; import java.io.IOException; import java.util.Arrays; import java.util.LinkedList; import java.util.List; -import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.transformSchema; +import static org.schedoscope.export.utils.HCatSchemaTransformer.transformSchema; + public class HCatSchemaToBigQuerySchemaConverter { diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java similarity index 65% rename from schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java rename 
to schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java index a321600c1..ea1b2be0c 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java @@ -1,4 +1,4 @@ -package org.schedoscope.export.bigquery; +package org.schedoscope.export.utils; import com.google.auth.oauth2.GoogleCredentials; import com.google.cloud.bigquery.*; @@ -16,12 +16,12 @@ public class BigQueryUtils { final static private Random rnd = new Random(); - public static BigQuery bigQueryService() { + static public BigQuery bigQueryService() { return BigQueryOptions.getDefaultInstance().getService(); } - public static BigQuery bigQueryService(String gcpKey) throws IOException { + static public BigQuery bigQueryService(String gcpKey) throws IOException { if (gcpKey == null) return bigQueryService(); @@ -33,7 +33,7 @@ public static BigQuery bigQueryService(String gcpKey) throws IOException { return BigQueryOptions.newBuilder().setCredentials(credentials).build().getService(); } - public static T retry(int numberOfRetries, Supplier action) { + static public T retry(int numberOfRetries, Supplier action) { try { return action.get(); } catch (Throwable t) { @@ -50,7 +50,7 @@ public static T retry(int numberOfRetries, Supplier action) { } } - public static void retry(int numberOfRetries, Runnable action) { + static public void retry(int numberOfRetries, Runnable action) { try { action.run(); } catch (Throwable t) { @@ -67,33 +67,33 @@ public static void retry(int numberOfRetries, Runnable action) { } } - public static boolean existsDataset(BigQuery bigQueryService, String project, String dataset) { + static public boolean existsDataset(BigQuery bigQueryService, String project, String dataset) { return bigQueryService.getDataset(project == null ? DatasetId.of(dataset) : DatasetId.of(project, dataset)) != null; } - public static boolean existsDataset(BigQuery bigQueryService, String dataset) { + static public boolean existsDataset(BigQuery bigQueryService, String dataset) { return existsDataset(bigQueryService, null, dataset); } - public static boolean existsDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + static public boolean existsDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { return existsDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); } - public static void createDataset(BigQuery bigQueryService, String project, String dataset) { + static public void createDataset(BigQuery bigQueryService, String project, String dataset) { if (!existsDataset(bigQueryService, project, dataset)) { bigQueryService.create((project == null ? 
DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).build()); } } - public static void createDataset(BigQuery bigQueryService, String dataset) { + static public void createDataset(BigQuery bigQueryService, String dataset) { createDataset(bigQueryService, null, dataset); } - public static void createDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + static public void createDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { createDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); } - public static void dropDataset(BigQuery bigQueryService, String project, String dataset) { + static public void dropDataset(BigQuery bigQueryService, String project, String dataset) { if (existsDataset(bigQueryService, project, dataset)) { bigQueryService.delete( (project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).build().getDatasetId(), @@ -102,60 +102,59 @@ public static void dropDataset(BigQuery bigQueryService, String project, String } } - public static void dropDataset(BigQuery bigQueryService, String dataset) { + static public void dropDataset(BigQuery bigQueryService, String dataset) { dropDataset(bigQueryService, null, dataset); } - public static void dropDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { + static public void dropDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { dropDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); } - public static boolean existsTable(BigQuery bigQueryService, String project, String dataset, String table) { - return bigQueryService.getTable(project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)) != null; + static public boolean existsTable(BigQuery bigQueryService, TableId tableId) { + return bigQueryService.getTable(tableId) != null; } - public static boolean existsTable(BigQuery bigQueryService, String dataset, String table) { - return existsTable(bigQueryService, null, table); + static public boolean existsTable(BigQuery bigQueryService, String project, String dataset, String table) { + return existsTable(bigQueryService, project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)); } - public static boolean existsTable(BigQuery bigQueryService, TableInfo tableInfo) { - return existsTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable()); + static public boolean existsTable(BigQuery bigQueryService, String dataset, String table) { + return existsTable(bigQueryService, null, dataset, table); } - public static void createTable(BigQuery bigQueryService, String project, String dataset, String table, TableDefinition tableDefinition) { - createDataset(bigQueryService, project, dataset); + static public void createTable(BigQuery bigQueryService, TableId tableId, TableDefinition tableDefinition) { + createDataset(bigQueryService, tableId.getProject(), tableId.getDataset()); - if (!existsTable(bigQueryService, project, dataset, table)) { - bigQueryService.create( - TableInfo.of( - project == null ? 
TableId.of(dataset, table) : TableId.of(project, dataset, table), - tableDefinition - ) - ); - } + if (!existsTable(bigQueryService, tableId)) + bigQueryService.create(TableInfo.of(tableId, tableDefinition)); + + } + + static public void createTable(BigQuery bigQueryService, String project, String dataset, String table, TableDefinition tableDefinition) { + createTable(bigQueryService, project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table), tableDefinition); } - public static void createTable(BigQuery bigQueryService, String dataset, String table, TableDefinition tableDefinition) { + static public void createTable(BigQuery bigQueryService, String dataset, String table, TableDefinition tableDefinition) { createTable(bigQueryService, null, dataset, table, tableDefinition); } - public static void createTable(BigQuery bigQueryService, TableInfo tableInfo) { + static public void createTable(BigQuery bigQueryService, TableInfo tableInfo) { createTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable(), tableInfo.getDefinition()); } - public static void dropTable(BigQuery bigQueryService, String project, String dataset, String table) { + static public void dropTable(BigQuery bigQueryService, String project, String dataset, String table) { bigQueryService.delete(project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)); } - public static void dropTable(BigQuery bigQueryService, String dataset, String table) { + static public void dropTable(BigQuery bigQueryService, String dataset, String table) { dropTable(bigQueryService, null, table); } - public static void dropTable(BigQuery bigQueryService, TableInfo tableInfo) { - dropTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable()); + static public void dropTable(BigQuery bigQueryService, TableId tableId) { + dropTable(bigQueryService, tableId.getProject(), tableId.getDataset(), tableId.getTable()); } - public static void insertIntoTable(BigQuery bigQueryService, TableId table, Map... rowsToInsert) { + static public void insertIntoTable(BigQuery bigQueryService, TableId table, Map... 
rowsToInsert) { InsertAllRequest insertAllRequest = InsertAllRequest.newBuilder(table) .setRows( diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaTransformer.java similarity index 99% rename from schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java rename to schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaTransformer.java index eebeccd97..38b5e0d95 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaTransformer.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaTransformer.java @@ -1,4 +1,4 @@ -package org.schedoscope.export.bigquery.outputschema; +package org.schedoscope.export.utils; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java index 40b6e40d0..afe0de840 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -3,10 +3,11 @@ import com.google.cloud.bigquery.*; import org.junit.AfterClass; import org.junit.BeforeClass; +import org.schedoscope.export.utils.BigQueryUtils; import java.util.Map; -import static org.schedoscope.export.bigquery.BigQueryUtils.*; +import static org.schedoscope.export.utils.BigQueryUtils.*; public abstract class BigQueryBaseTest { @@ -21,7 +22,7 @@ public void createTable(TableInfo tableInfo) { if (CALL_BIG_QUERY) retry(3, () -> { - dropTable(bigQuery, tableInfo); + dropTable(bigQuery, tableInfo.getTableId()); BigQueryUtils.createTable(bigQuery, tableInfo); }); From 2159f04ae05fa50c2706ac94915781fff23693c7 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Tue, 12 Dec 2017 16:33:54 +0100 Subject: [PATCH 18/34] Minor refactorings --- .../HCatRecordToBigQueryMapConvertor.java | 4 +++- .../schedoscope/export/utils/BigQueryUtils.java | 16 +++------------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java index 4d3a4c1ee..30b2178df 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -12,13 +12,15 @@ import org.apache.hive.hcatalog.data.HCatRecord; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.schedoscope.export.utils.HCatSchemaTransformer; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import static org.schedoscope.export.bigquery.outputschema.HCatSchemaTransformer.transformSchema; +import static org.schedoscope.export.utils.HCatSchemaTransformer.transformSchema; + public class HCatRecordToBigQueryMapConvertor { diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java 
b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java index ea1b2be0c..d99423cfd 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java @@ -51,20 +51,10 @@ static public T retry(int numberOfRetries, Supplier action) { } static public void retry(int numberOfRetries, Runnable action) { - try { + retry(numberOfRetries, () -> { action.run(); - } catch (Throwable t) { - if (numberOfRetries > 0) { - - try { - Thread.currentThread().sleep(rnd.nextInt(2000)); - } catch (InterruptedException e) { - } - - retry(numberOfRetries - 1, action); - } else - throw t; - } + return null; + }); } static public boolean existsDataset(BigQuery bigQueryService, String project, String dataset) { From 038f03c477301a3aaba008afebbeb719d10fe8fc Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Wed, 13 Dec 2017 11:23:35 +0100 Subject: [PATCH 19/34] Corrected passing of configuration parameters to BigQueryOutputFormat --- .../outputformat/BigQueryOutputFormat.java | 169 ++++++++++++++---- .../HCatSchemaToBigQuerySchemaConverter.java | 4 +- 2 files changed, 136 insertions(+), 37 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index a88b7ec21..e950e96e7 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -7,59 +7,149 @@ import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hive.hcatalog.data.HCatRecord; import org.apache.hive.hcatalog.data.schema.HCatSchema; import org.schedoscope.export.bigquery.outputschema.PartitioningScheme; -import java.io.IOException; +import java.io.*; import java.util.Arrays; +import java.util.Base64; import java.util.Map; +import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.USED_FILTER_FIELD_NAME; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition; import static org.schedoscope.export.utils.BigQueryUtils.*; -public class BigQueryOutputFormat> extends OutputFormat { +public class BigQueryOutputFormat extends OutputFormat { - private static Configuration configuration; - private static String project; - private static String database; - private static String table; - private static String usedHCatFilter; - private static HCatSchema hcatSchema; - private static String gcpKey; - private static String tableNamePostfix; - private static int commitSize; - private static BigQuery bigQueryService; + public static final String BIGQUERY_PROJECT = "bigquery.project"; + public static final String BIGQUERY_DATASET = "bigquery.dataset"; + public static final String BIGQUERY_TABLE = "bigquery.table"; + public static final String BIGQUERY_TABLE_NAME_POSTFIX = "bigquery.tableNamePostfix"; + public static final String BIGQUERY_TABLE_PARTITION_DATE = "bigquery.tablePartitionDate"; + public static final String 
BIGQUERY_USED_HCAT_FILTER = "bigquery.usedHCatFilter"; + public static final String BIGQUERY_HCAT_SCHEMA = "bigquery.hcatSchema"; + public static final String BIGQUERY_COMMIT_SIZE = "bigquery.commitSize"; + public static final String BIGQUERY_NO_OF_PARTITIONS = "bigquery.noOfPartitions"; + public static final String BIGQUERY_GCP_KEY = "bigquery.gcpKey"; - public static void setOutput(Configuration conf, String project, String gcpKey, String database, String table, HCatSchema hcatSchema, String usedHCatFilter, String tableNamePostfix, int commitSize) throws IOException { - configuration = conf; - BigQueryOutputFormat.project = project; - BigQueryOutputFormat.database = database; - BigQueryOutputFormat.table = table; - BigQueryOutputFormat.usedHCatFilter = usedHCatFilter; - BigQueryOutputFormat.hcatSchema = hcatSchema; - BigQueryOutputFormat.gcpKey = gcpKey; - BigQueryOutputFormat.commitSize = commitSize; - bigQueryService = bigQueryService(gcpKey); + + private static String serializeHCatSchema(HCatSchema schema) throws IOException { + + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + ObjectOutputStream serializer = new ObjectOutputStream(bytes); + serializer.writeObject(schema); + serializer.close(); + + return Base64.getEncoder().encodeToString(bytes.toByteArray()); + + } + + private static HCatSchema deserializeHCatSchema(String serializedSchema) throws IOException, ClassNotFoundException { + byte[] bytes = Base64.getDecoder().decode(serializedSchema); + ObjectInputStream deserializer = new ObjectInputStream(new ByteArrayInputStream(bytes)); + + HCatSchema schema = (HCatSchema) deserializer.readObject(); + + deserializer.close(); + + return schema; + + } + + public static String getBigQueryProject(Configuration conf) { + return conf.get(BIGQUERY_PROJECT); + } + + public static String getBigQueryGcpKey(Configuration conf) { + return conf.get(BIGQUERY_GCP_KEY); } - public class BiqQueryRecordWriter extends RecordWriter { + public static String getBigQueryDataset(Configuration conf) { + return conf.get(BIGQUERY_DATASET); + } + + public static String getBigQueryTable(Configuration conf) { + return conf.get(BIGQUERY_TABLE); + } + + public static String getBigQueryTableNamePostfix(Configuration conf) { + return conf.get(BIGQUERY_TABLE_NAME_POSTFIX); + } + + public static String getBigQueryUsedHcatFilter(Configuration conf) { + return conf.get(BIGQUERY_USED_HCAT_FILTER); + } + + public static String getBigqueryTablePartitionDate(Configuration conf) { + return conf.get(BIGQUERY_TABLE_PARTITION_DATE); + } + + public static int getBigQueryCommitSize(Configuration conf) { + return Integer.parseInt(conf.get(BIGQUERY_COMMIT_SIZE)); + } + + public static int getBigQueryNoOfPartitions(Configuration conf) { + return Integer.parseInt(conf.get(BIGQUERY_NO_OF_PARTITIONS)); + } + + public static HCatSchema getBigQueryHCatSchema(Configuration conf) throws IOException { + try { + return deserializeHCatSchema(conf.get(BIGQUERY_HCAT_SCHEMA)); + } catch (ClassNotFoundException e) { + throw new IOException("Error while deserializing HCatSchema", e); + } + } + + public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tableNamePostfix, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, int commitSize, int noOfPartitions) throws IOException { + + currentConf.set(BIGQUERY_PROJECT, project); + currentConf.set(BIGQUERY_GCP_KEY, gcpKey); + currentConf.set(BIGQUERY_DATASET, database); + 
currentConf.set(BIGQUERY_TABLE, table); + currentConf.set(BIGQUERY_TABLE_NAME_POSTFIX, tableNamePostfix); + currentConf.set(BIGQUERY_TABLE_PARTITION_DATE, tablePartitionDate); + currentConf.set(BIGQUERY_USED_HCAT_FILTER, usedHCatFilter); + currentConf.set(BIGQUERY_COMMIT_SIZE, String.valueOf(commitSize)); + currentConf.set(BIGQUERY_NO_OF_PARTITIONS, String.valueOf(noOfPartitions)); + currentConf.set(BIGQUERY_HCAT_SCHEMA, serializeHCatSchema(hcatSchema)); + + return currentConf; + + } + + + public class BiqQueryHCatRecordWriter extends RecordWriter { private TableId tableId; private int commitSize; private Map[] batch; private int elementsInBatch = 0; + private HCatSchema hcatSchema; private BigQuery bigQueryService; + private String usedHCatFilter; @Override - public void write(K key, V value) { - batch[elementsInBatch] = value; - elementsInBatch++; + public void write(K key, V value) throws IOException { + + try { - if (elementsInBatch == commitSize) { - retry(3, () -> insertIntoTable(bigQueryService, tableId, batch)); + Map bigQueryMap = convertHCatRecordToBigQueryMap(hcatSchema, value); + if (usedHCatFilter != null) + bigQueryMap.put(USED_FILTER_FIELD_NAME, usedHCatFilter); - elementsInBatch = 0; + batch[elementsInBatch] = bigQueryMap; + elementsInBatch++; + + if (elementsInBatch == commitSize) { + retry(3, () -> insertIntoTable(bigQueryService, tableId, batch)); + elementsInBatch = 0; + } + + } catch (Throwable t) { + throw new IOException("Exception encountered while writing HCatRecord to BigQuery", t); } } @@ -73,32 +163,39 @@ public void close(TaskAttemptContext context) { } - public BiqQueryRecordWriter(BigQuery bigQueryService, TableId tableId, int commitSize) { + public BiqQueryHCatRecordWriter(BigQuery bigQueryService, TableId tableId, HCatSchema hcatSchema, String usedHCatFilter, int commitSize) { this.bigQueryService = bigQueryService; this.tableId = tableId; this.commitSize = commitSize; this.batch = new Map[commitSize]; + this.hcatSchema = hcatSchema; + this.usedHCatFilter = usedHCatFilter; } + } @Override - public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException, InterruptedException { + public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException { - TableDefinition outputSchema = convertSchemaToTableDefinition(hcatSchema, new PartitioningScheme()); + Configuration conf = context.getConfiguration(); - String tmpOutputTable = table - + (tableNamePostfix != null ? "_" + tableNamePostfix : "") + TableDefinition outputSchema = convertSchemaToTableDefinition(getBigQueryHCatSchema(conf), new PartitioningScheme()); + + String tmpOutputTable = getBigQueryTable(conf) + + (getBigQueryTableNamePostfix(conf) != null ? "_" + getBigQueryTableNamePostfix(conf) : "") + "_" + context.getTaskAttemptID().getTaskID().getId(); - TableId tmpTableId = project == null ? TableId.of(database, tmpOutputTable) : TableId.of(project, database, tmpOutputTable); + TableId tmpTableId = getBigQueryProject(conf) == null ? 
TableId.of(getBigQueryDataset(conf), tmpOutputTable) : TableId.of(getBigQueryProject(conf), getBigQueryDataset(conf), tmpOutputTable); + + BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); retry(3, () -> { dropTable(bigQueryService, tmpTableId); createTable(bigQueryService, tmpTableId, outputSchema); }); - return new BiqQueryRecordWriter(bigQueryService, tmpTableId, commitSize); + return new BiqQueryHCatRecordWriter(bigQueryService, tmpTableId, getBigQueryHCatSchema(conf), getBigQueryUsedHcatFilter(conf), getBigQueryCommitSize(conf)); } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index c6595e81b..84499b22a 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -27,7 +27,9 @@ public class HCatSchemaToBigQuerySchemaConverter { stringTypeInfo.setTypeName("string"); } - static private final Field usedFilterField = Field.newBuilder("_USED_HCAT_FILTER", Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); + static public final String USED_FILTER_FIELD_NAME = "_USED_HCAT_FILTER"; + + static private final Field usedFilterField = Field.newBuilder(USED_FILTER_FIELD_NAME, Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); static private final HCatSchemaTransformer.Constructor c = new HCatSchemaTransformer.Constructor() { From 3b3b7799ac4be24c46e34fe23b15cda24bdcd4f1 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Wed, 13 Dec 2017 11:27:38 +0100 Subject: [PATCH 20/34] Corrected passing of configuration parameters to BigQueryOutputFormat --- .../org/schedoscope/export/bigquery/BigQueryBaseTest.java | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java index afe0de840..e868b4ef4 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -53,8 +53,12 @@ public static void createBigQueryDataSet() { if (existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); + if (existsDataset(bigQuery, "schedoscope_export_big_query_output_test")) + dropDataset(bigQuery, "schedoscope_export_big_query_output_test"); + createDataset(bigQuery, "schedoscope_export_big_query_schema_test"); createDataset(bigQuery, "schedoscope_export_big_query_record_test"); + createDataset(bigQuery, "schedoscope_export_big_query_output_test"); } @AfterClass @@ -67,6 +71,9 @@ public static void dropBigQueryDataSets() { if (existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); + + if (existsDataset(bigQuery, "schedoscope_export_big_query_output_test")) + dropDataset(bigQuery, "schedoscope_export_big_query_output_test"); } From 1bdf46992f91ec1bcf75bca1336bef7da0b7828c Mon Sep 17 
00:00:00 2001 From: Utz Westermann Date: Wed, 13 Dec 2017 12:19:54 +0100 Subject: [PATCH 21/34] Changing streaming approach to avoid temporal tables --- .../outputformat/BigQueryOutputFormat.java | 62 +++++++++++-------- .../HCatSchemaToBigQuerySchemaConverter.java | 6 +- .../outputschema/PartitioningScheme.java | 53 +--------------- 3 files changed, 42 insertions(+), 79 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index e950e96e7..5471dab4a 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -19,6 +19,8 @@ import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.USED_FILTER_FIELD_NAME; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition; +import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.DAILY; +import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.NONE; import static org.schedoscope.export.utils.BigQueryUtils.*; public class BigQueryOutputFormat extends OutputFormat { @@ -27,12 +29,11 @@ public class BigQueryOutputFormat extends OutputFormat< public static final String BIGQUERY_PROJECT = "bigquery.project"; public static final String BIGQUERY_DATASET = "bigquery.dataset"; public static final String BIGQUERY_TABLE = "bigquery.table"; - public static final String BIGQUERY_TABLE_NAME_POSTFIX = "bigquery.tableNamePostfix"; public static final String BIGQUERY_TABLE_PARTITION_DATE = "bigquery.tablePartitionDate"; public static final String BIGQUERY_USED_HCAT_FILTER = "bigquery.usedHCatFilter"; public static final String BIGQUERY_HCAT_SCHEMA = "bigquery.hcatSchema"; public static final String BIGQUERY_COMMIT_SIZE = "bigquery.commitSize"; - public static final String BIGQUERY_NO_OF_PARTITIONS = "bigquery.noOfPartitions"; + public static final String BIGQUERY_NO_OF_WORKERS = "bigquery.noOfPartitions"; public static final String BIGQUERY_GCP_KEY = "bigquery.gcpKey"; @@ -75,15 +76,11 @@ public static String getBigQueryTable(Configuration conf) { return conf.get(BIGQUERY_TABLE); } - public static String getBigQueryTableNamePostfix(Configuration conf) { - return conf.get(BIGQUERY_TABLE_NAME_POSTFIX); - } - public static String getBigQueryUsedHcatFilter(Configuration conf) { return conf.get(BIGQUERY_USED_HCAT_FILTER); } - public static String getBigqueryTablePartitionDate(Configuration conf) { + public static String getBigQueryTablePartitionDate(Configuration conf) { return conf.get(BIGQUERY_TABLE_PARTITION_DATE); } @@ -91,8 +88,8 @@ public static int getBigQueryCommitSize(Configuration conf) { return Integer.parseInt(conf.get(BIGQUERY_COMMIT_SIZE)); } - public static int getBigQueryNoOfPartitions(Configuration conf) { - return Integer.parseInt(conf.get(BIGQUERY_NO_OF_PARTITIONS)); + public static int getBigQueryNoOfWorkers(Configuration conf) { + return Integer.parseInt(conf.get(BIGQUERY_NO_OF_WORKERS)); } public static HCatSchema getBigQueryHCatSchema(Configuration conf) throws IOException { @@ -103,23 +100,49 @@ public static HCatSchema 
getBigQueryHCatSchema(Configuration conf) throws IOExce } } - public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tableNamePostfix, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, int commitSize, int noOfPartitions) throws IOException { + public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, int commitSize, int noOfPartitions) throws IOException { currentConf.set(BIGQUERY_PROJECT, project); currentConf.set(BIGQUERY_GCP_KEY, gcpKey); currentConf.set(BIGQUERY_DATASET, database); currentConf.set(BIGQUERY_TABLE, table); - currentConf.set(BIGQUERY_TABLE_NAME_POSTFIX, tableNamePostfix); currentConf.set(BIGQUERY_TABLE_PARTITION_DATE, tablePartitionDate); currentConf.set(BIGQUERY_USED_HCAT_FILTER, usedHCatFilter); currentConf.set(BIGQUERY_COMMIT_SIZE, String.valueOf(commitSize)); - currentConf.set(BIGQUERY_NO_OF_PARTITIONS, String.valueOf(noOfPartitions)); + currentConf.set(BIGQUERY_NO_OF_WORKERS, String.valueOf(noOfPartitions)); currentConf.set(BIGQUERY_HCAT_SCHEMA, serializeHCatSchema(hcatSchema)); return currentConf; } + public static TableId getBigQueryTableId(Configuration conf, boolean includingPartition) { + String bigQueryTableName = getBigQueryTable(conf) + (includingPartition && getBigQueryTablePartitionDate(conf) != null ? "$" + getBigQueryTablePartitionDate(conf) : ""); + + return getBigQueryProject(conf) == null ? TableId.of(getBigQueryDataset(conf), bigQueryTableName) : TableId.of(getBigQueryProject(conf), getBigQueryDataset(conf), bigQueryTableName); + } + + public static TableId getBigQueryTableId(Configuration conf) { + return getBigQueryTableId(conf, false); + } + + public static void prepareBigQueryTable(Configuration conf) throws IOException { + + PartitioningScheme partitioning = getBigQueryTablePartitionDate(conf) != null ? DAILY : NONE; + + TableDefinition outputSchema = convertSchemaToTableDefinition(getBigQueryHCatSchema(conf), partitioning); + + BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); + + retry(3, () -> { + createTable(bigQueryService, getBigQueryTableId(conf), outputSchema); + }); + + } + + public static void rollback(Configuration conf) throws IOException { + + } public class BiqQueryHCatRecordWriter extends RecordWriter { @@ -180,22 +203,9 @@ public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOE Configuration conf = context.getConfiguration(); - TableDefinition outputSchema = convertSchemaToTableDefinition(getBigQueryHCatSchema(conf), new PartitioningScheme()); - - String tmpOutputTable = getBigQueryTable(conf) - + (getBigQueryTableNamePostfix(conf) != null ? "_" + getBigQueryTableNamePostfix(conf) : "") - + "_" + context.getTaskAttemptID().getTaskID().getId(); - - TableId tmpTableId = getBigQueryProject(conf) == null ? 
TableId.of(getBigQueryDataset(conf), tmpOutputTable) : TableId.of(getBigQueryProject(conf), getBigQueryDataset(conf), tmpOutputTable); - BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); - retry(3, () -> { - dropTable(bigQueryService, tmpTableId); - createTable(bigQueryService, tmpTableId, outputSchema); - }); - - return new BiqQueryHCatRecordWriter(bigQueryService, tmpTableId, getBigQueryHCatSchema(conf), getBigQueryUsedHcatFilter(conf), getBigQueryCommitSize(conf)); + return new BiqQueryHCatRecordWriter(bigQueryService, getBigQueryTableId(conf, true), getBigQueryHCatSchema(conf), getBigQueryUsedHcatFilter(conf), getBigQueryCommitSize(conf)); } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index 84499b22a..d793e76dd 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -175,7 +175,7 @@ static public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSche .newBuilder() .setSchema(Schema.of(fields)); - if (partitioning.isTemporallyPartitioned()) { + if (partitioning != PartitioningScheme.NONE) { tableDefinitionBuilder.setTimePartitioning(TimePartitioning.of(TimePartitioning.Type.DAY)); } @@ -209,7 +209,7 @@ static public TableInfo convertSchemaToTableInfo(String database, String table, } static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { - return convertSchemaToTableInfo(project, database, table, hcatSchema, new PartitioningScheme(), postfix); + return convertSchemaToTableInfo(project, database, table, hcatSchema, PartitioningScheme.NONE, postfix); } static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { @@ -217,7 +217,7 @@ static public TableInfo convertSchemaToTableInfo(String database, String table, } static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema) throws IOException { - return convertSchemaToTableInfo(project, database, table, hcatSchema, new PartitioningScheme()); + return convertSchemaToTableInfo(project, database, table, hcatSchema, PartitioningScheme.NONE); } static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java index 77e7c2195..23a6c4643 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java @@ -1,52 +1,5 @@ package org.schedoscope.export.bigquery.outputschema; -import java.util.Arrays; -import java.util.LinkedList; -import java.util.List; -import java.util.Optional; - -import static java.util.Optional.empty; - -public class PartitioningScheme { - - public enum Granularity { - DAILY, MONTHLY - } - - private Optional temporalPartitionColumn = 
empty(); - - private Optional granularity = empty(); - - private List logicalPartitionColumns = new LinkedList<>(); - - - public Optional getTemporalPartitionColumn() { - return temporalPartitionColumn; - } - - public Optional getGranularity() { - return granularity; - } - - public List getLogicalPartitionColumns() { - return logicalPartitionColumns; - } - - public boolean isTemporallyPartitioned() { - return getTemporalPartitionColumn().isPresent() && getGranularity().isPresent(); - } - - public boolean isLogicallyPartitioned() { - return !logicalPartitionColumns.isEmpty(); - } - - public PartitioningScheme(String temporalPartitionColumn, Granularity granularity, String... logicalPartitionColumns) { - this.granularity = Optional.of(granularity); - this.temporalPartitionColumn = Optional.of(temporalPartitionColumn); - this.logicalPartitionColumns.addAll(Arrays.asList(logicalPartitionColumns)); - } - - public PartitioningScheme() { - } - -} +public enum PartitioningScheme { + NONE, DAILY, MONTHLY +} \ No newline at end of file From d311eb0ed442501483baab0cc57e32484cbf2339 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Thu, 14 Dec 2017 10:53:19 +0100 Subject: [PATCH 22/34] Implemented Job Rollback in case of Error --- .../export/bigquery/outputformat/BigQueryOutputFormat.java | 7 +++++++ .../org/schedoscope/export/bigquery/BigQueryBaseTest.java | 6 +++--- .../outputschema/HCatSchemaToBigQueryTransformerTest.java | 5 +++-- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index 5471dab4a..2e90b8976 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -142,6 +142,13 @@ public static void prepareBigQueryTable(Configuration conf) throws IOException { public static void rollback(Configuration conf) throws IOException { + BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); + TableId tableId = getBigQueryTableId(conf, true); + + retry(3, () -> { + dropTable(bigQueryService, tableId); + }); + } public class BiqQueryHCatRecordWriter extends RecordWriter { diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java index e868b4ef4..8768796e6 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -11,11 +11,11 @@ public abstract class BigQueryBaseTest { - final private static boolean CALL_BIG_QUERY = false; + final private static boolean CALL_BIG_QUERY = true; - final private static boolean CLEAN_UP_BIG_QUERY = true; + final private static boolean CLEAN_UP_BIG_QUERY = false; - private static BigQuery bigQuery; + protected static BigQuery bigQuery; public void createTable(TableInfo tableInfo) { diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java index 03199e70b..a48f23c8f 100644 --- 
a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java @@ -447,7 +447,7 @@ public void testTableConversionWithPostfix() throws IOException { @Test public void testTableConversionWithPartitioning() throws IOException, NoSuchFieldException, IllegalAccessException { - PartitioningScheme partitioning = new PartitioningScheme("aString", PartitioningScheme.Granularity.MONTHLY); + PartitioningScheme partitioning = PartitioningScheme.MONTHLY; TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning); @@ -466,7 +466,7 @@ public void testTableConversionWithPartitioning() throws IOException, NoSuchFiel @Test public void testTableConversionWithPartitioningAndPHCatSchemaToBigQuerySchemaConverterostfix() throws IOException, NoSuchFieldException, IllegalAccessException { - PartitioningScheme partitioning = new PartitioningScheme("aString", PartitioningScheme.Granularity.MONTHLY); + PartitioningScheme partitioning = PartitioningScheme.MONTHLY; TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning, "test"); @@ -613,5 +613,6 @@ public void testHCatRecordWithListOfMapConversion() throws JsonProcessingExcepti insertIntoTable("schedoscope_export_big_query_record_test", "table_with_list_of_map", bigQuerySchemaWithListOfMaps, converted); + System.out.println("cdsddsds"); } } From 8291993e63e50da4cf8e7ee700d4130538955556 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 15 Dec 2017 09:21:03 +0100 Subject: [PATCH 23/34] Implemented OutputFormat Tests, failed though at streaming into older partitions. Refactoring to use Cloud Storage required. 
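Streaming inserts into older day partitions failed, so the export is being reworked to stage each task's output as newline-delimited JSON in Cloud Storage and then load the staged files into the target partition of the BigQuery table, addressed via the "table$YYYYMMDD" partition decorator. The sketch below outlines that flow with the google-cloud-java client used in this module; the bucket, blob name, dataset, and partition date are placeholders for illustration only and are not the Schedoscope configuration introduced in the following commits.

import com.google.cloud.bigquery.*;
import com.google.cloud.storage.*;

import java.nio.charset.StandardCharsets;

public class PartitionLoadSketch {

    public static void main(String[] args) throws Exception {

        // Hypothetical names, for illustration only.
        String bucket = "my-export-bucket";
        String blobName = "schedoscope_export/flat_table_partitioned/attempt_0000";
        String ndJson = "{\"aString\":\"someString1\",\"anInt\":1}\n"
                + "{\"aString\":\"someString2\",\"anInt\":2}\n";

        // 1. Stage the records as a newline-delimited JSON blob in Cloud Storage.
        Storage storage = StorageOptions.getDefaultInstance().getService();
        storage.create(
                BlobInfo.newBuilder(BlobId.of(bucket, blobName))
                        .setContentType("application/json")
                        .build(),
                ndJson.getBytes(StandardCharsets.UTF_8));

        // 2. Load the staged blob into the target day partition via the "$YYYYMMDD"
        //    decorator. Load jobs accept historical partition dates; the partitioned
        //    table itself is assumed to exist already (cf. prepareBigQueryTable).
        BigQuery bigQuery = BigQueryOptions.getDefaultInstance().getService();
        TableId partition = TableId.of("schedoscope_export_big_query_output_test",
                "flat_table_partitioned$20171001");

        Job loadJob = bigQuery.getTable(partition)
                .load(FormatOptions.json(), "gs://" + bucket + "/" + blobName)
                .waitFor();

        if (loadJob.getStatus().getError() != null)
            throw new RuntimeException("Load failed: " + loadJob.getStatus().getError());

        // 3. Remove the staging blob once the load job has succeeded.
        storage.delete(BlobId.of(bucket, blobName));
    }
}
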
--- schedoscope-export/pom.xml | 5 + .../BigQueryOutputConfiguration.java | 127 +++++++++++++ .../outputformat/BigQueryOutputFormat.java | 104 +--------- .../BigQueryOutputFormatTest.java | 178 ++++++++++++++++++ 4 files changed, 312 insertions(+), 102 deletions(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java create mode 100644 schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java diff --git a/schedoscope-export/pom.xml b/schedoscope-export/pom.xml index e42b942fd..4c895737a 100644 --- a/schedoscope-export/pom.xml +++ b/schedoscope-export/pom.xml @@ -234,6 +234,11 @@ google-cloud-bigquery 0.20.0-beta + + com.google.cloud + google-cloud-storage + 1.14.0 + com.101tec zkclient diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java new file mode 100644 index 000000000..f791aaf06 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java @@ -0,0 +1,127 @@ +package org.schedoscope.export.bigquery.outputformat; + +import com.google.cloud.bigquery.TableId; +import org.apache.hadoop.conf.Configuration; +import org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.io.*; +import java.util.Base64; + +public class BigQueryOutputConfiguration { + + public static final String BIGQUERY_PROJECT = "bigquery.project"; + public static final String BIGQUERY_DATASET = "bigquery.dataset"; + public static final String BIGQUERY_TABLE = "bigquery.table"; + public static final String BIGQUERY_TABLE_PARTITION_DATE = "bigquery.tablePartitionDate"; + public static final String BIGQUERY_USED_HCAT_FILTER = "bigquery.usedHCatFilter"; + public static final String BIGQUERY_HCAT_SCHEMA = "bigquery.hcatSchema"; + public static final String BIGQUERY_COMMIT_SIZE = "bigquery.commitSize"; + public static final String BIGQUERY_NO_OF_WORKERS = "bigquery.noOfPartitions"; + public static final String BIGQUERY_GCP_KEY = "bigquery.gcpKey"; + public static final String BIGQUERY_EXPORT_STORAGE_BUCKET = "bigquery.exportStorageBucket"; + + + private static String serializeHCatSchema(HCatSchema schema) throws IOException { + + ByteArrayOutputStream bytes = new ByteArrayOutputStream(); + ObjectOutputStream serializer = new ObjectOutputStream(bytes); + serializer.writeObject(schema); + serializer.close(); + + return Base64.getEncoder().encodeToString(bytes.toByteArray()); + + } + + private static HCatSchema deserializeHCatSchema(String serializedSchema) throws IOException, ClassNotFoundException { + byte[] bytes = Base64.getDecoder().decode(serializedSchema); + ObjectInputStream deserializer = new ObjectInputStream(new ByteArrayInputStream(bytes)); + + HCatSchema schema = (HCatSchema) deserializer.readObject(); + + deserializer.close(); + + return schema; + + } + + public static String getBigQueryProject(Configuration conf) { + return conf.get(BIGQUERY_PROJECT); + } + + public static String getBigQueryGcpKey(Configuration conf) { + return conf.get(BIGQUERY_GCP_KEY); + } + + public static String getBigQueryDataset(Configuration conf) { + return conf.get(BIGQUERY_DATASET); + } + + public static String getBigQueryTable(Configuration conf) { + return conf.get(BIGQUERY_TABLE); + } + + public static String getBigQueryUsedHcatFilter(Configuration conf) { 
+ return conf.get(BIGQUERY_USED_HCAT_FILTER); + } + + public static String getBigQueryTablePartitionDate(Configuration conf) { + return conf.get(BIGQUERY_TABLE_PARTITION_DATE); + } + + public static int getBigQueryCommitSize(Configuration conf) { + return Integer.parseInt(conf.get(BIGQUERY_COMMIT_SIZE)); + } + + public static int getBigQueryNoOfWorkers(Configuration conf) { + return Integer.parseInt(conf.get(BIGQUERY_NO_OF_WORKERS)); + } + + public static String getBigQueryExportStorageBucket(Configuration conf) { + return conf.get(BIGQUERY_EXPORT_STORAGE_BUCKET); + } + + public static HCatSchema getBigQueryHCatSchema(Configuration conf) throws IOException { + try { + return deserializeHCatSchema(conf.get(BIGQUERY_HCAT_SCHEMA)); + } catch (ClassNotFoundException e) { + throw new IOException("Error while deserializing HCatSchema", e); + } + } + + public static TableId getBigQueryTableId(Configuration conf, boolean includingPartition) { + String bigQueryTableName = getBigQueryTable(conf) + (includingPartition && getBigQueryTablePartitionDate(conf) != null ? "$" + getBigQueryTablePartitionDate(conf) : ""); + + return getBigQueryProject(conf) == null ? TableId.of(getBigQueryDataset(conf), bigQueryTableName) : TableId.of(getBigQueryProject(conf), getBigQueryDataset(conf), bigQueryTableName); + } + + public static TableId getBigQueryTableId(Configuration conf) { + return getBigQueryTableId(conf, false); + } + + + public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, int commitSize, int noOfPartitions) throws IOException { + + if (project != null) + currentConf.set(BIGQUERY_PROJECT, project); + + if (gcpKey != null) + currentConf.set(BIGQUERY_GCP_KEY, gcpKey); + + currentConf.set(BIGQUERY_DATASET, database); + currentConf.set(BIGQUERY_TABLE, table); + + if (tablePartitionDate != null) + currentConf.set(BIGQUERY_TABLE_PARTITION_DATE, tablePartitionDate); + + if (usedHCatFilter != null) + currentConf.set(BIGQUERY_USED_HCAT_FILTER, usedHCatFilter); + + currentConf.set(BIGQUERY_COMMIT_SIZE, String.valueOf(commitSize)); + currentConf.set(BIGQUERY_NO_OF_WORKERS, String.valueOf(noOfPartitions)); + currentConf.set(BIGQUERY_HCAT_SCHEMA, serializeHCatSchema(hcatSchema)); + + return currentConf; + + } + +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index 2e90b8976..9c4956f48 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -11,11 +11,11 @@ import org.apache.hive.hcatalog.data.schema.HCatSchema; import org.schedoscope.export.bigquery.outputschema.PartitioningScheme; -import java.io.*; +import java.io.IOException; import java.util.Arrays; -import java.util.Base64; import java.util.Map; +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.*; import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.USED_FILTER_FIELD_NAME; import static 
org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition; @@ -26,106 +26,6 @@ public class BigQueryOutputFormat extends OutputFormat { - public static final String BIGQUERY_PROJECT = "bigquery.project"; - public static final String BIGQUERY_DATASET = "bigquery.dataset"; - public static final String BIGQUERY_TABLE = "bigquery.table"; - public static final String BIGQUERY_TABLE_PARTITION_DATE = "bigquery.tablePartitionDate"; - public static final String BIGQUERY_USED_HCAT_FILTER = "bigquery.usedHCatFilter"; - public static final String BIGQUERY_HCAT_SCHEMA = "bigquery.hcatSchema"; - public static final String BIGQUERY_COMMIT_SIZE = "bigquery.commitSize"; - public static final String BIGQUERY_NO_OF_WORKERS = "bigquery.noOfPartitions"; - public static final String BIGQUERY_GCP_KEY = "bigquery.gcpKey"; - - - private static String serializeHCatSchema(HCatSchema schema) throws IOException { - - ByteArrayOutputStream bytes = new ByteArrayOutputStream(); - ObjectOutputStream serializer = new ObjectOutputStream(bytes); - serializer.writeObject(schema); - serializer.close(); - - return Base64.getEncoder().encodeToString(bytes.toByteArray()); - - } - - private static HCatSchema deserializeHCatSchema(String serializedSchema) throws IOException, ClassNotFoundException { - byte[] bytes = Base64.getDecoder().decode(serializedSchema); - ObjectInputStream deserializer = new ObjectInputStream(new ByteArrayInputStream(bytes)); - - HCatSchema schema = (HCatSchema) deserializer.readObject(); - - deserializer.close(); - - return schema; - - } - - public static String getBigQueryProject(Configuration conf) { - return conf.get(BIGQUERY_PROJECT); - } - - public static String getBigQueryGcpKey(Configuration conf) { - return conf.get(BIGQUERY_GCP_KEY); - } - - public static String getBigQueryDataset(Configuration conf) { - return conf.get(BIGQUERY_DATASET); - } - - public static String getBigQueryTable(Configuration conf) { - return conf.get(BIGQUERY_TABLE); - } - - public static String getBigQueryUsedHcatFilter(Configuration conf) { - return conf.get(BIGQUERY_USED_HCAT_FILTER); - } - - public static String getBigQueryTablePartitionDate(Configuration conf) { - return conf.get(BIGQUERY_TABLE_PARTITION_DATE); - } - - public static int getBigQueryCommitSize(Configuration conf) { - return Integer.parseInt(conf.get(BIGQUERY_COMMIT_SIZE)); - } - - public static int getBigQueryNoOfWorkers(Configuration conf) { - return Integer.parseInt(conf.get(BIGQUERY_NO_OF_WORKERS)); - } - - public static HCatSchema getBigQueryHCatSchema(Configuration conf) throws IOException { - try { - return deserializeHCatSchema(conf.get(BIGQUERY_HCAT_SCHEMA)); - } catch (ClassNotFoundException e) { - throw new IOException("Error while deserializing HCatSchema", e); - } - } - - public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, int commitSize, int noOfPartitions) throws IOException { - - currentConf.set(BIGQUERY_PROJECT, project); - currentConf.set(BIGQUERY_GCP_KEY, gcpKey); - currentConf.set(BIGQUERY_DATASET, database); - currentConf.set(BIGQUERY_TABLE, table); - currentConf.set(BIGQUERY_TABLE_PARTITION_DATE, tablePartitionDate); - currentConf.set(BIGQUERY_USED_HCAT_FILTER, usedHCatFilter); - currentConf.set(BIGQUERY_COMMIT_SIZE, String.valueOf(commitSize)); - currentConf.set(BIGQUERY_NO_OF_WORKERS, String.valueOf(noOfPartitions)); - 
currentConf.set(BIGQUERY_HCAT_SCHEMA, serializeHCatSchema(hcatSchema)); - - return currentConf; - - } - - public static TableId getBigQueryTableId(Configuration conf, boolean includingPartition) { - String bigQueryTableName = getBigQueryTable(conf) + (includingPartition && getBigQueryTablePartitionDate(conf) != null ? "$" + getBigQueryTablePartitionDate(conf) : ""); - - return getBigQueryProject(conf) == null ? TableId.of(getBigQueryDataset(conf), bigQueryTableName) : TableId.of(getBigQueryProject(conf), getBigQueryDataset(conf), bigQueryTableName); - } - - public static TableId getBigQueryTableId(Configuration conf) { - return getBigQueryTableId(conf, false); - } - public static void prepareBigQueryTable(Configuration conf) throws IOException { PartitioningScheme partitioning = getBigQueryTablePartitionDate(conf) != null ? DAILY : NONE; diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java new file mode 100644 index 000000000..65795913c --- /dev/null +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java @@ -0,0 +1,178 @@ +package org.schedoscope.export.bigquery.outputschema; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hadoop.mapreduce.TaskAttemptID; +import org.apache.hadoop.mapreduce.task.TaskAttemptContextImpl; +import org.apache.hive.hcatalog.data.DefaultHCatRecord; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.junit.Before; +import org.junit.Test; +import org.schedoscope.export.bigquery.BigQueryBaseTest; +import org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat; + +import java.io.IOException; +import java.util.Arrays; + +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.configureBigQueryOutput; +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.prepareBigQueryTable; + +public class BigQueryOutputFormatTest extends BigQueryBaseTest { + + private HCatSchema flatHcatSchema; + + private Configuration unpartitionedExport, partitionedExport; + + private TaskAttemptContext unpartitionedContext, partitionedContext; + + private DefaultHCatRecord[] inputData; + + private RecordWriter recordWriterPartitioned; + private RecordWriter recordWriterUnpartitioned; + + @Before + public void setUp() throws IOException { + + PrimitiveTypeInfo hcatStringType = new PrimitiveTypeInfo(); + hcatStringType.setTypeName("string"); + PrimitiveTypeInfo hcatIntType = new PrimitiveTypeInfo(); + hcatIntType.setTypeName("int"); + PrimitiveTypeInfo hcatLongType = new PrimitiveTypeInfo(); + hcatLongType.setTypeName("bigint"); + PrimitiveTypeInfo hcatByteType = new PrimitiveTypeInfo(); + hcatByteType.setTypeName("tinyint"); + PrimitiveTypeInfo hcatBooleanType = new PrimitiveTypeInfo(); + hcatBooleanType.setTypeName("boolean"); + PrimitiveTypeInfo hcatDoubleType = new PrimitiveTypeInfo(); + hcatDoubleType.setTypeName("double"); + PrimitiveTypeInfo hcatFloatType = new PrimitiveTypeInfo(); + hcatFloatType.setTypeName("float"); + + flatHcatSchema = new HCatSchema( + Arrays.asList( + new HCatFieldSchema("aString", hcatStringType, "a string 
field"), + new HCatFieldSchema("anInt", hcatIntType, "an int field"), + new HCatFieldSchema("aLong", hcatLongType, "a long field"), + new HCatFieldSchema("aByte", hcatByteType, "a byte field"), + new HCatFieldSchema("aBoolean", hcatBooleanType, "a boolean field"), + new HCatFieldSchema("aDouble", hcatDoubleType, "a double field"), + new HCatFieldSchema("aFloat", hcatFloatType, "a float field") + ) + ); + + + unpartitionedExport = configureBigQueryOutput( + new Configuration(), + null, + null, + "schedoscope_export_big_query_output_test", + "flat_table", + null, + "aString=y", + flatHcatSchema, + 2, + 1 + ); + + unpartitionedContext = new TaskAttemptContextImpl(unpartitionedExport, new TaskAttemptID()); + + recordWriterUnpartitioned = new BigQueryOutputFormat().getRecordWriter(unpartitionedContext); + + + partitionedExport = configureBigQueryOutput( + new Configuration(), + null, + null, + "schedoscope_export_big_query_output_test", + "flat_table_partitioned", + "20171001", + "aString=y", + flatHcatSchema, + 2, + 1 + ); + + partitionedContext = new TaskAttemptContextImpl(partitionedExport, new TaskAttemptID()); + + recordWriterPartitioned = new BigQueryOutputFormat().getRecordWriter(partitionedContext); + + inputData = new DefaultHCatRecord[]{ + new DefaultHCatRecord(7) {{ + set("aString", flatHcatSchema, "someString1"); + set("anInt", flatHcatSchema, 1); + set("aLong", flatHcatSchema, 1L); + set("aByte", flatHcatSchema, (byte) 1); + set("aBoolean", flatHcatSchema, true); + set("aDouble", flatHcatSchema, 1.4d); + set("aFloat", flatHcatSchema, 1.5f); + + }}, + new DefaultHCatRecord(7) {{ + set("aString", flatHcatSchema, "someString2"); + set("anInt", flatHcatSchema, 2); + set("aLong", flatHcatSchema, 2L); + set("aByte", flatHcatSchema, (byte) 2); + set("aBoolean", flatHcatSchema, true); + set("aDouble", flatHcatSchema, 2.4d); + set("aFloat", flatHcatSchema, 2.5f); + + }}, + new DefaultHCatRecord(7) {{ + set("aString", flatHcatSchema, "someString3"); + set("anInt", flatHcatSchema, 3); + set("aLong", flatHcatSchema, 3L); + set("aByte", flatHcatSchema, (byte) 3); + set("aBoolean", flatHcatSchema, true); + set("aDouble", flatHcatSchema, 3.4d); + set("aFloat", flatHcatSchema, 3.5f); + + }}, + new DefaultHCatRecord(7) {{ + set("aString", flatHcatSchema, "someString4"); + set("anInt", flatHcatSchema, 4); + set("aLong", flatHcatSchema, 4L); + set("aByte", flatHcatSchema, (byte) 4); + set("aBoolean", flatHcatSchema, true); + set("aDouble", flatHcatSchema, 4.4d); + set("aFloat", flatHcatSchema, 4.5f); + + }}, + new DefaultHCatRecord(7) {{ + set("aString", flatHcatSchema, "someString5"); + set("anInt", flatHcatSchema, 5); + set("aLong", flatHcatSchema, 5L); + set("aByte", flatHcatSchema, (byte) 5); + set("aBoolean", flatHcatSchema, true); + set("aDouble", flatHcatSchema, 5.4d); + set("aFloat", flatHcatSchema, 5.5f); + + }} + }; + } + + @Test + public void testUnpartitionedExport() throws IOException, InterruptedException { + prepareBigQueryTable(unpartitionedExport); + + for (DefaultHCatRecord r : inputData) + recordWriterUnpartitioned.write(null, r); + + recordWriterUnpartitioned.close(unpartitionedContext); + + } + + @Test + public void testPartitionedExport() throws IOException, InterruptedException { + prepareBigQueryTable(partitionedExport); + + for (DefaultHCatRecord r : inputData) + recordWriterPartitioned.write(null, r); + + recordWriterPartitioned.close(unpartitionedContext); + + } + +} From ca6ab70dc32eb73b0bb014c23fd7dcca7ba80b9f Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 15 
Dec 2017 17:15:50 +0100 Subject: [PATCH 24/34] Temporary commit --- .../BigQueryOutputConfiguration.java | 20 ++++--- .../outputformat/BigQueryOutputFormat.java | 55 +++++++------------ 2 files changed, 34 insertions(+), 41 deletions(-) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java index f791aaf06..32b0002c8 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java @@ -15,7 +15,6 @@ public class BigQueryOutputConfiguration { public static final String BIGQUERY_TABLE_PARTITION_DATE = "bigquery.tablePartitionDate"; public static final String BIGQUERY_USED_HCAT_FILTER = "bigquery.usedHCatFilter"; public static final String BIGQUERY_HCAT_SCHEMA = "bigquery.hcatSchema"; - public static final String BIGQUERY_COMMIT_SIZE = "bigquery.commitSize"; public static final String BIGQUERY_NO_OF_WORKERS = "bigquery.noOfPartitions"; public static final String BIGQUERY_GCP_KEY = "bigquery.gcpKey"; public static final String BIGQUERY_EXPORT_STORAGE_BUCKET = "bigquery.exportStorageBucket"; @@ -68,10 +67,6 @@ public static String getBigQueryTablePartitionDate(Configuration conf) { return conf.get(BIGQUERY_TABLE_PARTITION_DATE); } - public static int getBigQueryCommitSize(Configuration conf) { - return Integer.parseInt(conf.get(BIGQUERY_COMMIT_SIZE)); - } - public static int getBigQueryNoOfWorkers(Configuration conf) { return Integer.parseInt(conf.get(BIGQUERY_NO_OF_WORKERS)); } @@ -98,8 +93,18 @@ public static TableId getBigQueryTableId(Configuration conf) { return getBigQueryTableId(conf, false); } + public static String getBigQueryFullTableName(Configuration conf, boolean includingPartition) { + TableId tableId = getBigQueryTableId(conf, includingPartition); - public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, int commitSize, int noOfPartitions) throws IOException { + return (tableId.getProject() != null ? tableId.getProject() + "." : "") + + tableId.getDataset() + "." 
+ tableId.getTable(); + } + + public static String getBigQueryFullTableName(Configuration conf) { + return getBigQueryFullTableName(conf, false); + } + + public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, String exportStorageBucket, int commitSize, int noOfPartitions) throws IOException { if (project != null) currentConf.set(BIGQUERY_PROJECT, project); @@ -116,7 +121,8 @@ public static Configuration configureBigQueryOutput(Configuration currentConf, S if (usedHCatFilter != null) currentConf.set(BIGQUERY_USED_HCAT_FILTER, usedHCatFilter); - currentConf.set(BIGQUERY_COMMIT_SIZE, String.valueOf(commitSize)); + currentConf.set(BIGQUERY_EXPORT_STORAGE_BUCKET, exportStorageBucket); + currentConf.set(BIGQUERY_NO_OF_WORKERS, String.valueOf(noOfPartitions)); currentConf.set(BIGQUERY_HCAT_SCHEMA, serializeHCatSchema(hcatSchema)); diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index 9c4956f48..b55d83adb 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -3,6 +3,7 @@ import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.TableDefinition; import com.google.cloud.bigquery.TableId; +import com.google.cloud.storage.BlobId; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; @@ -12,12 +13,9 @@ import org.schedoscope.export.bigquery.outputschema.PartitioningScheme; import java.io.IOException; -import java.util.Arrays; -import java.util.Map; +import java.nio.channels.WritableByteChannel; import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.*; -import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; -import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.USED_FILTER_FIELD_NAME; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition; import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.DAILY; import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.NONE; @@ -40,6 +38,10 @@ public static void prepareBigQueryTable(Configuration conf) throws IOException { } + public static void commit(Configuration conf) throws IOException { + + } + public static void rollback(Configuration conf) throws IOException { BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); @@ -53,51 +55,38 @@ public static void rollback(Configuration conf) throws IOException { public class BiqQueryHCatRecordWriter extends RecordWriter { - private TableId tableId; - private int commitSize; - private Map[] batch; - private int elementsInBatch = 0; private HCatSchema hcatSchema; private BigQuery bigQueryService; private String usedHCatFilter; + private String bucket; + private String blobName; + + private BlobId blobId; + private WritableByteChannel channel; + @Override public void write(K key, V value) throws IOException { - try { + if (blobId == null && 
channel == null) { - Map bigQueryMap = convertHCatRecordToBigQueryMap(hcatSchema, value); - if (usedHCatFilter != null) - bigQueryMap.put(USED_FILTER_FIELD_NAME, usedHCatFilter); + } - batch[elementsInBatch] = bigQueryMap; - elementsInBatch++; - if (elementsInBatch == commitSize) { - retry(3, () -> insertIntoTable(bigQueryService, tableId, batch)); - elementsInBatch = 0; - } - } catch (Throwable t) { - throw new IOException("Exception encountered while writing HCatRecord to BigQuery", t); - } } @Override - public void close(TaskAttemptContext context) { - - if (elementsInBatch > 0) { - retry(3, () -> insertIntoTable(bigQueryService, tableId, Arrays.copyOf(batch, elementsInBatch))); - } - + public void close(TaskAttemptContext context) throws IOException { + if (channel != null) + channel.close(); } - public BiqQueryHCatRecordWriter(BigQuery bigQueryService, TableId tableId, HCatSchema hcatSchema, String usedHCatFilter, int commitSize) { + public BiqQueryHCatRecordWriter(BigQuery bigQueryService, String bucket, String blobName, HCatSchema hcatSchema, String usedHCatFilter) { this.bigQueryService = bigQueryService; - this.tableId = tableId; - this.commitSize = commitSize; - this.batch = new Map[commitSize]; + this.bucket = bucket; + this.blobName = blobName; this.hcatSchema = hcatSchema; this.usedHCatFilter = usedHCatFilter; } @@ -107,13 +96,11 @@ public BiqQueryHCatRecordWriter(BigQuery bigQueryService, TableId tableId, HCatS @Override public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException { - Configuration conf = context.getConfiguration(); BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); - return new BiqQueryHCatRecordWriter(bigQueryService, getBigQueryTableId(conf, true), getBigQueryHCatSchema(conf), getBigQueryUsedHcatFilter(conf), getBigQueryCommitSize(conf)); - + return new BiqQueryHCatRecordWriter(bigQueryService, getBigQueryExportStorageBucket(conf), getBigQueryFullTableName(conf, true) + "/" + context.getTaskAttemptID().toString(), getBigQueryHCatSchema(conf), getBigQueryUsedHcatFilter(conf)); } From f25e11a11cbe2bf1017791d9305396c3ee09b489 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Mon, 18 Dec 2017 19:27:23 +0100 Subject: [PATCH 25/34] Finally managed to get BigQuery output format running and loading data into historical partitioned tables --- schedoscope-export/pom.xml | 25 +++--- .../BigQueryOutputConfiguration.java | 36 +++++++- .../outputformat/BigQueryOutputFormat.java | 84 +++++++++++++++--- .../HCatRecordToBigQueryMapConvertor.java | 20 +---- .../export/utils/BigQueryUtils.java | 63 +++++--------- .../export/utils/CloudStorageUtils.java | 85 +++++++++++++++++++ .../export/bigquery/BigQueryBaseTest.java | 47 ++++++---- .../BigQueryOutputFormatTest.java | 31 +++++-- 8 files changed, 277 insertions(+), 114 deletions(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java diff --git a/schedoscope-export/pom.xml b/schedoscope-export/pom.xml index 4c895737a..a1b2aee32 100644 --- a/schedoscope-export/pom.xml +++ b/schedoscope-export/pom.xml @@ -23,6 +23,16 @@ guava 19.0 + + com.google.cloud + google-cloud-storage + 1.2.0 + + + com.google.cloud + google-cloud-bigquery + 0.20.0-beta + com.twitter parquet-hadoop-bundle @@ -224,21 +234,6 @@ - - com.google.api-client - google-api-client - 1.23.0 - - - com.google.cloud - google-cloud-bigquery - 0.20.0-beta - - - com.google.cloud - google-cloud-storage - 1.14.0 - com.101tec zkclient diff --git 
a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java index 32b0002c8..cddc7c442 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java @@ -13,12 +13,14 @@ public class BigQueryOutputConfiguration { public static final String BIGQUERY_DATASET = "bigquery.dataset"; public static final String BIGQUERY_TABLE = "bigquery.table"; public static final String BIGQUERY_TABLE_PARTITION_DATE = "bigquery.tablePartitionDate"; + public static final String BIGQUERY_DATASET_LOCATION = "bigquery.datasetLocation"; public static final String BIGQUERY_USED_HCAT_FILTER = "bigquery.usedHCatFilter"; public static final String BIGQUERY_HCAT_SCHEMA = "bigquery.hcatSchema"; public static final String BIGQUERY_NO_OF_WORKERS = "bigquery.noOfPartitions"; public static final String BIGQUERY_GCP_KEY = "bigquery.gcpKey"; public static final String BIGQUERY_EXPORT_STORAGE_BUCKET = "bigquery.exportStorageBucket"; - + public static final String BIGQUERY_EXPORT_STORAGE_FOLDER_PREFIX = "bigquery.exportStorageFolderPrefix"; + public static final String BIGQUERY_EXPORT_STORAGE_REGION = "bigquery.exportStorageRegion"; private static String serializeHCatSchema(HCatSchema schema) throws IOException { @@ -55,6 +57,10 @@ public static String getBigQueryDataset(Configuration conf) { return conf.get(BIGQUERY_DATASET); } + public static String getBigQueryDatasetLocation(Configuration conf) { + return conf.get(BIGQUERY_DATASET_LOCATION); + } + public static String getBigQueryTable(Configuration conf) { return conf.get(BIGQUERY_TABLE); } @@ -75,6 +81,18 @@ public static String getBigQueryExportStorageBucket(Configuration conf) { return conf.get(BIGQUERY_EXPORT_STORAGE_BUCKET); } + public static String getBigqueryExportStorageFolderPrefix(Configuration conf) { + return conf.get(BIGQUERY_EXPORT_STORAGE_FOLDER_PREFIX); + } + + public static String getBigqueryExportStorageRegion(Configuration conf) { + return conf.get(BIGQUERY_EXPORT_STORAGE_REGION); + } + + public static String getBigQueryExportStorageFolder(Configuration conf) { + return !getBigqueryExportStorageFolderPrefix(conf).isEmpty() ? 
getBigqueryExportStorageFolderPrefix(conf) + "/" + getBigQueryFullTableName(conf, true) : getBigQueryFullTableName(conf, true); + } + public static HCatSchema getBigQueryHCatSchema(Configuration conf) throws IOException { try { return deserializeHCatSchema(conf.get(BIGQUERY_HCAT_SCHEMA)); @@ -104,7 +122,7 @@ public static String getBigQueryFullTableName(Configuration conf) { return getBigQueryFullTableName(conf, false); } - public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, String exportStorageBucket, int commitSize, int noOfPartitions) throws IOException { + public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String dataLocation, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, String exportStorageBucket, String exportStorageFolderPrefix, String exportStorageRegion, int noOfPartitions) throws IOException { if (project != null) currentConf.set(BIGQUERY_PROJECT, project); @@ -115,6 +133,9 @@ public static Configuration configureBigQueryOutput(Configuration currentConf, S currentConf.set(BIGQUERY_DATASET, database); currentConf.set(BIGQUERY_TABLE, table); + if (dataLocation != null) + currentConf.set(BIGQUERY_DATASET_LOCATION, dataLocation); + if (tablePartitionDate != null) currentConf.set(BIGQUERY_TABLE_PARTITION_DATE, tablePartitionDate); @@ -123,6 +144,17 @@ public static Configuration configureBigQueryOutput(Configuration currentConf, S currentConf.set(BIGQUERY_EXPORT_STORAGE_BUCKET, exportStorageBucket); + if (exportStorageFolderPrefix != null) + currentConf.set(BIGQUERY_EXPORT_STORAGE_FOLDER_PREFIX, exportStorageFolderPrefix); + else + currentConf.set(BIGQUERY_EXPORT_STORAGE_FOLDER_PREFIX, ""); + + if (exportStorageRegion != null) + currentConf.set(BIGQUERY_EXPORT_STORAGE_REGION, exportStorageRegion); + else + currentConf.set(BIGQUERY_EXPORT_STORAGE_REGION, "europe-west3"); + + currentConf.set(BIGQUERY_NO_OF_WORKERS, String.valueOf(noOfPartitions)); currentConf.set(BIGQUERY_HCAT_SCHEMA, serializeHCatSchema(hcatSchema)); diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index b55d83adb..55a9b7049 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -1,9 +1,10 @@ package org.schedoscope.export.bigquery.outputformat; +import com.fasterxml.jackson.databind.ObjectMapper; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.TableDefinition; import com.google.cloud.bigquery.TableId; -import com.google.cloud.storage.BlobId; +import com.google.cloud.storage.Storage; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.*; import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; @@ -13,13 +14,20 @@ import org.schedoscope.export.bigquery.outputschema.PartitioningScheme; import java.io.IOException; +import java.nio.ByteBuffer; import java.nio.channels.WritableByteChannel; +import java.util.List; +import java.util.Map; +import java.util.concurrent.TimeoutException; import static 
org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.*; +import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.USED_FILTER_FIELD_NAME; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition; import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.DAILY; import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.NONE; import static org.schedoscope.export.utils.BigQueryUtils.*; +import static org.schedoscope.export.utils.CloudStorageUtils.*; public class BigQueryOutputFormat extends OutputFormat { @@ -38,43 +46,85 @@ public static void prepareBigQueryTable(Configuration conf) throws IOException { } - public static void commit(Configuration conf) throws IOException { + public static void commit(Configuration conf) throws IOException, TimeoutException, InterruptedException { + + BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); + Storage storageService = storageService(getBigQueryGcpKey(conf)); + + List blobsToLoad = listBlobs(storageService, getBigQueryExportStorageBucket(conf), getBigQueryExportStorageFolder(conf)); + TableId tableId = getBigQueryTableId(conf, true); + + retry(3, () -> loadTable(bigQueryService, tableId, blobsToLoad)); + + try { + rollbackStorage(conf); + } catch (Throwable t) { + t.printStackTrace(); + } } - public static void rollback(Configuration conf) throws IOException { + public static void rollback(Configuration conf) { + + try { + rollbackBigQuery(conf); + } catch (Throwable t) { + t.printStackTrace(); + } + + try { + rollbackStorage(conf); + } catch (Throwable t) { + t.printStackTrace(); + } + } + + private static void rollbackBigQuery(Configuration conf) throws IOException { BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); TableId tableId = getBigQueryTableId(conf, true); retry(3, () -> { dropTable(bigQueryService, tableId); }); + } + + private static void rollbackStorage(Configuration conf) throws IOException { + Storage storageService = storageService(getBigQueryGcpKey(conf)); + String bucket = getBigQueryExportStorageBucket(conf); + String blobPrefix = getBigQueryExportStorageFolder(conf); + retry(3, () -> { + deleteBlob(storageService, bucket, blobPrefix); + }); } public class BiqQueryHCatRecordWriter extends RecordWriter { private HCatSchema hcatSchema; - private BigQuery bigQueryService; + private Storage storageService; + private String usedHCatFilter; private String bucket; private String blobName; + private String region; - private BlobId blobId; private WritableByteChannel channel; + private ObjectMapper jsonFactory = new ObjectMapper(); @Override public void write(K key, V value) throws IOException { - - if (blobId == null && channel == null) { - + if (channel == null) { + channel = createBlobIfNotExists(storageService, bucket, blobName, region).writer(); } + Map recordMap = convertHCatRecordToBigQueryMap(hcatSchema, value); + recordMap.put(USED_FILTER_FIELD_NAME, this.usedHCatFilter); + String output = jsonFactory.writeValueAsString(recordMap) + "\n"; - + channel.write(ByteBuffer.wrap(output.getBytes("UTF-8"))); } @Override @@ -83,10 +133,11 @@ public void close(TaskAttemptContext context) throws IOException { channel.close(); } - public BiqQueryHCatRecordWriter(BigQuery bigQueryService, String bucket, String blobName, 
HCatSchema hcatSchema, String usedHCatFilter) { - this.bigQueryService = bigQueryService; + public BiqQueryHCatRecordWriter(Storage storageService, String bucket, String blobName, String region, HCatSchema hcatSchema, String usedHCatFilter) { + this.storageService = storageService; this.bucket = bucket; this.blobName = blobName; + this.region = region; this.hcatSchema = hcatSchema; this.usedHCatFilter = usedHCatFilter; } @@ -98,12 +149,17 @@ public BiqQueryHCatRecordWriter(BigQuery bigQueryService, String bucket, String public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException { Configuration conf = context.getConfiguration(); - BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); + Storage storageService = storageService(getBigQueryGcpKey(conf)); - return new BiqQueryHCatRecordWriter(bigQueryService, getBigQueryExportStorageBucket(conf), getBigQueryFullTableName(conf, true) + "/" + context.getTaskAttemptID().toString(), getBigQueryHCatSchema(conf), getBigQueryUsedHcatFilter(conf)); + return new BiqQueryHCatRecordWriter( + storageService, + getBigQueryExportStorageBucket(conf), + getBigQueryExportStorageFolder(conf) + "/" + context.getTaskAttemptID().toString(), + getBigqueryExportStorageRegion(conf), + getBigQueryHCatSchema(conf), + getBigQueryUsedHcatFilter(conf)); } - @Override public void checkOutputSpecs(JobContext context) throws IOException, InterruptedException { // do nothing diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java index 30b2178df..38a1480a8 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -162,23 +162,7 @@ public Pair constructStructArrayField(HCatSchema schema, HCatFie }; - static public Map convertHCatRecordToBigQueryMap(HCatSchema schema, HCatRecord record) throws JsonProcessingException { - - try { - LOG.info("Incoming HCat record: " + record.toString() + " of Schema: " + schema.toString()); - - Map bigQueryMap = transformSchema(c, schema).apply(record); - - LOG.info("Outgoing BigQuery map: " + jsonConvertor.writeValueAsString(bigQueryMap)); - - return bigQueryMap; - - } catch (JsonProcessingException e) { - // should not happen - LOG.error("Error converting HCatRecord", e); - - throw e; - } - + static public Map convertHCatRecordToBigQueryMap(HCatSchema schema, HCatRecord record) { + return transformSchema(c, schema).apply(record); } } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java index d99423cfd..56a188990 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java @@ -7,8 +7,10 @@ import java.io.IOException; import java.nio.charset.Charset; import java.util.Arrays; +import java.util.List; import java.util.Map; import java.util.Random; +import java.util.concurrent.TimeoutException; import java.util.function.Supplier; import java.util.stream.Collectors; @@ -61,27 +63,12 @@ static public boolean existsDataset(BigQuery bigQueryService, String project, St return 
bigQueryService.getDataset(project == null ? DatasetId.of(dataset) : DatasetId.of(project, dataset)) != null; } - static public boolean existsDataset(BigQuery bigQueryService, String dataset) { - return existsDataset(bigQueryService, null, dataset); - } - - static public boolean existsDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { - return existsDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); - } - - static public void createDataset(BigQuery bigQueryService, String project, String dataset) { + static public void createDataset(BigQuery bigQueryService, String project, String dataset, String dataLocation) { if (!existsDataset(bigQueryService, project, dataset)) { - bigQueryService.create((project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).build()); + bigQueryService.create((project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).setLocation(dataLocation != null ? dataLocation : "EU").build()); } } - static public void createDataset(BigQuery bigQueryService, String dataset) { - createDataset(bigQueryService, null, dataset); - } - - static public void createDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { - createDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); - } static public void dropDataset(BigQuery bigQueryService, String project, String dataset) { if (existsDataset(bigQueryService, project, dataset)) { @@ -92,28 +79,12 @@ static public void dropDataset(BigQuery bigQueryService, String project, String } } - static public void dropDataset(BigQuery bigQueryService, String dataset) { - dropDataset(bigQueryService, null, dataset); - } - - static public void dropDataset(BigQuery bigQueryService, DatasetInfo datasetInfo) { - dropDataset(bigQueryService, datasetInfo.getDatasetId().getProject(), datasetInfo.getDatasetId().getDataset()); - } - static public boolean existsTable(BigQuery bigQueryService, TableId tableId) { return bigQueryService.getTable(tableId) != null; } - static public boolean existsTable(BigQuery bigQueryService, String project, String dataset, String table) { - return existsTable(bigQueryService, project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)); - } - - static public boolean existsTable(BigQuery bigQueryService, String dataset, String table) { - return existsTable(bigQueryService, null, dataset, table); - } - static public void createTable(BigQuery bigQueryService, TableId tableId, TableDefinition tableDefinition) { - createDataset(bigQueryService, tableId.getProject(), tableId.getDataset()); + createDataset(bigQueryService, tableId.getProject(), tableId.getDataset(), null); if (!existsTable(bigQueryService, tableId)) bigQueryService.create(TableInfo.of(tableId, tableDefinition)); @@ -124,9 +95,6 @@ static public void createTable(BigQuery bigQueryService, String project, String createTable(bigQueryService, project == null ? 
TableId.of(dataset, table) : TableId.of(project, dataset, table), tableDefinition); } - static public void createTable(BigQuery bigQueryService, String dataset, String table, TableDefinition tableDefinition) { - createTable(bigQueryService, null, dataset, table, tableDefinition); - } static public void createTable(BigQuery bigQueryService, TableInfo tableInfo) { createTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable(), tableInfo.getDefinition()); @@ -136,14 +104,27 @@ static public void dropTable(BigQuery bigQueryService, String project, String da bigQueryService.delete(project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)); } - static public void dropTable(BigQuery bigQueryService, String dataset, String table) { - dropTable(bigQueryService, null, table); - } - static public void dropTable(BigQuery bigQueryService, TableId tableId) { dropTable(bigQueryService, tableId.getProject(), tableId.getDataset(), tableId.getTable()); } + static public void loadTable(BigQuery bigQueryService, TableId table, List cloudStoragePathsToData) { + Table t = bigQueryService.getTable(table); + Job loadJob = t.load(FormatOptions.json(), cloudStoragePathsToData); + + try { + loadJob = loadJob.waitFor(); + } catch (InterruptedException e) { + e.printStackTrace(); + } catch (TimeoutException e) { + e.printStackTrace(); + } + + if (loadJob.getStatus().getError() != null) { + throw new BigQueryException(999, "Could not insert some records into BigQuery table: " + loadJob.getStatus().getError()); + } + } + static public void insertIntoTable(BigQuery bigQueryService, TableId table, Map... rowsToInsert) { InsertAllRequest insertAllRequest = InsertAllRequest.newBuilder(table) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java new file mode 100644 index 000000000..3f1b57ddd --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java @@ -0,0 +1,85 @@ +package org.schedoscope.export.utils; + +import com.google.api.gax.paging.Page; +import com.google.auth.oauth2.GoogleCredentials; +import com.google.cloud.storage.*; + +import java.io.ByteArrayInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.util.LinkedList; +import java.util.List; +import java.util.Random; + +public class CloudStorageUtils { + + final static private Random rnd = new Random(); + + static public Storage storageService() { + return StorageOptions.getDefaultInstance().getService(); + } + + + static public Storage storageService(String gcpKey) throws IOException { + if (gcpKey == null) + return storageService(); + + GoogleCredentials credentials = GoogleCredentials + .fromStream( + new ByteArrayInputStream(Charset.forName("UTF-8").encode(gcpKey).array()) + ); + + return StorageOptions.newBuilder().setCredentials(credentials).build().getService(); + } + + static public boolean existsBucket(Storage storageService, String bucket) { + return storageService.get(bucket) != null; + } + + static public Bucket createBucket(Storage storageService, String bucket, String region) { + if (!existsBucket(storageService, bucket)) + return storageService.create(BucketInfo.newBuilder(bucket).setLocation(region != null ? 
region : "europe-west3").build()); + else + return storageService.get(bucket); + } + + static public void deleteBucket(Storage storageService, String bucket) { + deleteBlob(storageService, bucket, ""); + storageService.delete(bucket); + } + + static public void deleteBlob(Storage storageService, String bucket, String blobNameOrPrefix) { + if (!existsBucket(storageService, bucket)) + return; + + Page blobsToDelete = storageService.list(bucket, Storage.BlobListOption.prefix(blobNameOrPrefix)); + + for (Blob blob : blobsToDelete.iterateAll()) { + storageService.delete(blob.getBlobId()); + } + } + + static public List listBlobs(Storage storageService, String bucket, String blobNameOrPrefix) { + List result = new LinkedList<>(); + + Page blobs = storageService.list(bucket, Storage.BlobListOption.prefix(blobNameOrPrefix)); + + for (Blob blob : blobs.iterateAll()) { + result.add("gs://" + blob.getBucket() + "/" + blob.getName()); + } + + return result; + } + + static public Blob createBlobIfNotExists(Storage storageService, BlobId blob, String region) { + if (!existsBucket(storageService, blob.getBucket())) + createBucket(storageService, blob.getBucket(), region); + + return storageService.create(BlobInfo.newBuilder(blob).setContentType("application/json").build()); + + } + + static public Blob createBlobIfNotExists(Storage storageService, String bucket, String blobName, String region) { + return createBlobIfNotExists(storageService, BlobId.of(bucket, blobName), region); + } +} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java index 8768796e6..e62037f21 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -1,6 +1,7 @@ package org.schedoscope.export.bigquery; import com.google.cloud.bigquery.*; +import com.google.cloud.storage.Storage; import org.junit.AfterClass; import org.junit.BeforeClass; import org.schedoscope.export.utils.BigQueryUtils; @@ -8,14 +9,16 @@ import java.util.Map; import static org.schedoscope.export.utils.BigQueryUtils.*; +import static org.schedoscope.export.utils.CloudStorageUtils.*; public abstract class BigQueryBaseTest { - final private static boolean CALL_BIG_QUERY = true; + final protected static boolean CALL_BIG_QUERY = false; - final private static boolean CLEAN_UP_BIG_QUERY = false; + final protected static boolean CLEAN_UP_BIG_QUERY = true; protected static BigQuery bigQuery; + protected static Storage storage; public void createTable(TableInfo tableInfo) { @@ -46,19 +49,26 @@ public static void createBigQueryDataSet() { return; bigQuery = bigQueryService(); + storage = storageService(); - if (existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) - dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + if (existsDataset(bigQuery, null, "schedoscope_export_big_query_schema_test")) + dropDataset(bigQuery, null, "schedoscope_export_big_query_schema_test"); - if (existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) - dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); + if (existsDataset(bigQuery, null, "schedoscope_export_big_query_record_test")) + dropDataset(bigQuery, null, "schedoscope_export_big_query_record_test"); - if (existsDataset(bigQuery, "schedoscope_export_big_query_output_test")) - dropDataset(bigQuery, 
"schedoscope_export_big_query_output_test"); + if (existsDataset(bigQuery, null, "schedoscope_export_big_query_output_test")) + dropDataset(bigQuery, null, "schedoscope_export_big_query_output_test"); + + if (existsBucket(storage, "schedoscope_export_big_query_output_test")) + deleteBucket(storage, "schedoscope_export_big_query_output_test"); + + createDataset(bigQuery, null, "schedoscope_export_big_query_schema_test", "EU"); + createDataset(bigQuery, null, "schedoscope_export_big_query_record_test", "EU"); + createDataset(bigQuery, null, "schedoscope_export_big_query_output_test", "EU"); + + createBucket(storage, "schedoscope_export_big_query_output_test", "europe-west3"); - createDataset(bigQuery, "schedoscope_export_big_query_schema_test"); - createDataset(bigQuery, "schedoscope_export_big_query_record_test"); - createDataset(bigQuery, "schedoscope_export_big_query_output_test"); } @AfterClass @@ -66,14 +76,17 @@ public static void dropBigQueryDataSets() { if (!CALL_BIG_QUERY || !CLEAN_UP_BIG_QUERY) return; - if (existsDataset(bigQuery, "schedoscope_export_big_query_schema_test")) - dropDataset(bigQuery, "schedoscope_export_big_query_schema_test"); + if (existsDataset(bigQuery, null, "schedoscope_export_big_query_schema_test")) + dropDataset(bigQuery, null, "schedoscope_export_big_query_schema_test"); + + if (existsDataset(bigQuery, null, "schedoscope_export_big_query_record_test")) + dropDataset(bigQuery, null, "schedoscope_export_big_query_record_test"); - if (existsDataset(bigQuery, "schedoscope_export_big_query_record_test")) - dropDataset(bigQuery, "schedoscope_export_big_query_record_test"); + if (existsDataset(bigQuery, null, "schedoscope_export_big_query_output_test")) + dropDataset(bigQuery, null, "schedoscope_export_big_query_output_test"); - if (existsDataset(bigQuery, "schedoscope_export_big_query_output_test")) - dropDataset(bigQuery, "schedoscope_export_big_query_output_test"); + if (existsBucket(storage, "schedoscope_export_big_query_output_test")) + deleteBucket(storage, "schedoscope_export_big_query_output_test"); } diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java index 65795913c..844b4dc16 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java @@ -16,8 +16,10 @@ import java.io.IOException; import java.util.Arrays; +import java.util.concurrent.TimeoutException; -import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.configureBigQueryOutput; +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.configureBigQueryOutput; +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.commit; import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.prepareBigQueryTable; public class BigQueryOutputFormatTest extends BigQueryBaseTest { @@ -70,10 +72,13 @@ public void setUp() throws IOException { null, "schedoscope_export_big_query_output_test", "flat_table", + "EU", null, "aString=y", flatHcatSchema, - 2, + "schedoscope_export_big_query_output_test", + null, + null, 1 ); @@ -87,14 +92,18 @@ public void setUp() throws IOException { null, null, "schedoscope_export_big_query_output_test", - "flat_table_partitioned", + 
"flat_table_part", + "EU", "20171001", "aString=y", flatHcatSchema, - 2, + "schedoscope_export_big_query_output_test", + null, + null, 1 ); + partitionedContext = new TaskAttemptContextImpl(partitionedExport, new TaskAttemptID()); recordWriterPartitioned = new BigQueryOutputFormat().getRecordWriter(partitionedContext); @@ -154,7 +163,10 @@ public void setUp() throws IOException { } @Test - public void testUnpartitionedExport() throws IOException, InterruptedException { + public void testUnpartitionedExport() throws IOException, InterruptedException, TimeoutException { + if (!CALL_BIG_QUERY) + return; + prepareBigQueryTable(unpartitionedExport); for (DefaultHCatRecord r : inputData) @@ -162,17 +174,22 @@ public void testUnpartitionedExport() throws IOException, InterruptedException { recordWriterUnpartitioned.close(unpartitionedContext); + commit(unpartitionedExport); } @Test - public void testPartitionedExport() throws IOException, InterruptedException { + public void testPartitionedExport() throws IOException, InterruptedException, TimeoutException { + if (!CALL_BIG_QUERY) + return; + prepareBigQueryTable(partitionedExport); for (DefaultHCatRecord r : inputData) recordWriterPartitioned.write(null, r); - recordWriterPartitioned.close(unpartitionedContext); + recordWriterPartitioned.close(partitionedContext); + commit(partitionedExport); } } From 6ce87b5db2e8af16a17cc75921708abcd3f17d72 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Tue, 19 Dec 2017 08:58:33 +0100 Subject: [PATCH 26/34] Fixed logging properties in tests --- schedoscope-export/pom.xml | 20 +++++++++---------- .../src/test/resources/log4j.properties | 2 +- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/schedoscope-export/pom.xml b/schedoscope-export/pom.xml index a1b2aee32..34b0a5d41 100644 --- a/schedoscope-export/pom.xml +++ b/schedoscope-export/pom.xml @@ -23,16 +23,6 @@ guava 19.0 - - com.google.cloud - google-cloud-storage - 1.2.0 - - - com.google.cloud - google-cloud-bigquery - 0.20.0-beta - com.twitter parquet-hadoop-bundle @@ -249,6 +239,16 @@ avro-serde 0.0.1 + + com.google.cloud + google-cloud-storage + 1.2.0 + + + com.google.cloud + google-cloud-bigquery + 0.20.0-beta + org.slf4j jul-to-slf4j diff --git a/schedoscope-export/src/test/resources/log4j.properties b/schedoscope-export/src/test/resources/log4j.properties index 827c48b90..7fc2fec78 100644 --- a/schedoscope-export/src/test/resources/log4j.properties +++ b/schedoscope-export/src/test/resources/log4j.properties @@ -1,6 +1,6 @@ #log4j.rootLogger=OFF # Root logger option -log4j.rootLogger=INFO, stdout +log4j.rootLogger=ERROR, stdout # Direct log messages to stdout log4j.appender.stdout=org.apache.log4j.ConsoleAppender From d6a09fe58e3d4230a5cf1581fc086748a5735382 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Tue, 19 Dec 2017 17:37:39 +0100 Subject: [PATCH 27/34] Added java doc, support for proxies, and table postfixes --- .../BigQueryOutputConfiguration.java | 212 +++++++++++++++++- .../outputformat/BigQueryOutputFormat.java | 96 ++++---- .../BiqQueryHCatRecordWriter.java | 78 +++++++ .../HCatRecordToBigQueryMapConvertor.java | 11 + .../HCatSchemaToBigQuerySchemaConverter.java | 45 ++-- .../export/utils/BigQueryUtils.java | 114 +++++++++- .../export/utils/CloudStorageUtils.java | 76 ++++++- .../BigQueryOutputFormatTest.java | 16 +- .../HCatSchemaToBigQueryTransformerTest.java | 27 --- 9 files changed, 555 insertions(+), 120 deletions(-) create mode 100644 
schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java index cddc7c442..e7d1fd9fd 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java @@ -7,12 +7,17 @@ import java.io.*; import java.util.Base64; +/** + * Maintains the Hadoop configuration keys and values required for BigQueryOutputFormat to work. + * The configurations are distributed to the various mappers using Hadoop's Configuration object mechanism. + */ public class BigQueryOutputConfiguration { public static final String BIGQUERY_PROJECT = "bigquery.project"; - public static final String BIGQUERY_DATASET = "bigquery.dataset"; + public static final String BIGQUERY_DATABASE = "bigquery.dataset"; public static final String BIGQUERY_TABLE = "bigquery.table"; public static final String BIGQUERY_TABLE_PARTITION_DATE = "bigquery.tablePartitionDate"; + public static final String BIGQUERY_TABLE_POSTFIX = "bigquery.tablePartitionPostfix"; public static final String BIGQUERY_DATASET_LOCATION = "bigquery.datasetLocation"; public static final String BIGQUERY_USED_HCAT_FILTER = "bigquery.usedHCatFilter"; public static final String BIGQUERY_HCAT_SCHEMA = "bigquery.hcatSchema"; @@ -21,6 +26,8 @@ public class BigQueryOutputConfiguration { public static final String BIGQUERY_EXPORT_STORAGE_BUCKET = "bigquery.exportStorageBucket"; public static final String BIGQUERY_EXPORT_STORAGE_FOLDER_PREFIX = "bigquery.exportStorageFolderPrefix"; public static final String BIGQUERY_EXPORT_STORAGE_REGION = "bigquery.exportStorageRegion"; + public static final String BIGQUERY_PROXY_HOST = "bigquery.proxyHost"; + public static final String BIGQUERY_PROXY_PORT = "bigquery.proxyPort"; private static String serializeHCatSchema(HCatSchema schema) throws IOException { @@ -45,54 +52,153 @@ private static HCatSchema deserializeHCatSchema(String serializedSchema) throws } + /** + * Return the name of the GCP project to use for BigQuery access from the Hadoop configuration. If not set, + * this will default to the default GCP project of the current user. + * + * @param conf the Hadoop configuration object + * @return the GCP project ID + */ public static String getBigQueryProject(Configuration conf) { return conf.get(BIGQUERY_PROJECT); } + /** + * Return the GCP key to use for BigQuery access from the Hadoop configuration + * + * @param conf the Hadoop configuration object + * @return the GCP key + */ public static String getBigQueryGcpKey(Configuration conf) { return conf.get(BIGQUERY_GCP_KEY); } - public static String getBigQueryDataset(Configuration conf) { - return conf.get(BIGQUERY_DATASET); + /** + * Return the name of database of the Hive table to export from the Hadoop configuration. This will become the dataset + * in BigQuery. + * + * @param conf the Hadoop configuration object + * @return the dataset + */ + public static String getBigQueryDatabase(Configuration conf) { + return conf.get(BIGQUERY_DATABASE); } + /** + * Return the storage location of the BigQuery dataset from the Hadoop configuration. + * If not set, this will default to EU. 
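To see how these getters are typically consumed, a hypothetical driver-side fragment reading the settings back out of a configured job could look as follows (the job variable and the value comments are illustrative, not part of the patch):

    // Sketch: reading BigQuery export settings back from a Hadoop Configuration
    // prepared by configureBigQueryOutputFormat(...); "job" is an org.apache.hadoop.mapreduce.Job.
    Configuration conf = job.getConfiguration();
    String project  = BigQueryOutputConfiguration.getBigQueryProject(conf);          // null -> default GCP project
    String dataset  = BigQueryOutputConfiguration.getBigQueryDatabase(conf);         // Hive database = BigQuery dataset
    String location = BigQueryOutputConfiguration.getBigQueryDatasetLocation(conf);  // "EU" unless configured otherwise
    String table    = BigQueryOutputConfiguration.getBigQueryTable(conf);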
+ * + * @param conf the Hadoop configuration object + * @return the dataset + */ public static String getBigQueryDatasetLocation(Configuration conf) { return conf.get(BIGQUERY_DATASET_LOCATION); } + /** + * Return the name of the Hive table to export from the Hadoop configuration. The BigQuery table name will be identical, + * unless augmented with a postfix. + * + * @param conf the Hadoop configuration object + * @return the table name + */ public static String getBigQueryTable(Configuration conf) { return conf.get(BIGQUERY_TABLE); } + /** + * Return the postfix to append to the BigQuery table name to export a given Hive table to from the Hadoop configuration. + * + * @param conf the Hadoop configuration object + * @return the table name postfix + */ + public static String getBigqueryTablePostfix(Configuration conf) { + return conf.get(BIGQUERY_TABLE_POSTFIX); + } + + /** + * Return the HCAT filter expression to apply to the Hive table being exported to BigQuery for partition selection + * from the Hadoop configuration. + * + * @param conf the Hadoop configuration object + * @return the filter expression + */ public static String getBigQueryUsedHcatFilter(Configuration conf) { return conf.get(BIGQUERY_USED_HCAT_FILTER); } + /** + * Return the partition date (YYYYMMDD) of the BigQuery table to which to export the Hive table to from the Hadoop + * configuration. If not set, the table will not be partitioned. + * + * @param conf the Hadoop configuration object + * @return the partition date. + */ public static String getBigQueryTablePartitionDate(Configuration conf) { return conf.get(BIGQUERY_TABLE_PARTITION_DATE); } + /** + * Return the number of parallel workers (mappers) exporting the Hive table data to BigQuery from the Hadoop + * configuration. + * + * @param conf the Hadoop configuration object + * @return the number of workers. + */ public static int getBigQueryNoOfWorkers(Configuration conf) { return Integer.parseInt(conf.get(BIGQUERY_NO_OF_WORKERS)); } + /** + * Return the name of the Google Cloud Storage bucket used for temporal storage of exported Hive table data from the + * Hadoop configuration. + * + * @param conf the Hadoop configuration object + * @return the GCP storage bucket. + */ public static String getBigQueryExportStorageBucket(Configuration conf) { return conf.get(BIGQUERY_EXPORT_STORAGE_BUCKET); } + /** + * Return the folder prefix to prepend to the temporal storage files in the GCP storage bucket from the Hadoop + * configuration. Defaults to "" + * + * @param conf the Hadoop configuration object + * @return the prefix. + */ public static String getBigqueryExportStorageFolderPrefix(Configuration conf) { return conf.get(BIGQUERY_EXPORT_STORAGE_FOLDER_PREFIX); } + /** + * Return the region in which temporal Hive table export data should be kept in GCP Cloud storage. Defaults + * to "europe-west3" + * + * @param conf the Hadoop configuration object + * @return the storage region + */ public static String getBigqueryExportStorageRegion(Configuration conf) { return conf.get(BIGQUERY_EXPORT_STORAGE_REGION); } + /** + * Return the folder within the GCP storage bucket temporal data exported from Hive should be exported to. + * + * @param conf the Hadoop configuration object + * @return the folder + */ public static String getBigQueryExportStorageFolder(Configuration conf) { return !getBigqueryExportStorageFolderPrefix(conf).isEmpty() ? 
getBigqueryExportStorageFolderPrefix(conf) + "/" + getBigQueryFullTableName(conf, true) : getBigQueryFullTableName(conf, true); } + /** + * Return the HCat schema of the Hive table being exported to BigQuery + * + * @param conf the Hadoop configuration object + * @return the HCat schema + * @throws IOException + */ public static HCatSchema getBigQueryHCatSchema(Configuration conf) throws IOException { try { return deserializeHCatSchema(conf.get(BIGQUERY_HCAT_SCHEMA)); @@ -101,16 +207,38 @@ public static HCatSchema getBigQueryHCatSchema(Configuration conf) throws IOExce } } + /** + * Return a table ID address the BigQuery export table. + * + * @param conf the Hadoop configuration + * @param includingPartition should the table name include the partition selector in case of a partitioned target. + * @return the table ID + */ public static TableId getBigQueryTableId(Configuration conf, boolean includingPartition) { - String bigQueryTableName = getBigQueryTable(conf) + (includingPartition && getBigQueryTablePartitionDate(conf) != null ? "$" + getBigQueryTablePartitionDate(conf) : ""); + String bigQueryTableName = getBigQueryTable(conf) + + (getBigqueryTablePostfix(conf) != null ? "_" + getBigqueryTablePostfix(conf) : "") + + (includingPartition && getBigQueryTablePartitionDate(conf) != null ? "$" + getBigQueryTablePartitionDate(conf) : ""); - return getBigQueryProject(conf) == null ? TableId.of(getBigQueryDataset(conf), bigQueryTableName) : TableId.of(getBigQueryProject(conf), getBigQueryDataset(conf), bigQueryTableName); + return getBigQueryProject(conf) == null ? TableId.of(getBigQueryDatabase(conf), bigQueryTableName) : TableId.of(getBigQueryProject(conf), getBigQueryDatabase(conf), bigQueryTableName); } + /** + * Return a table ID address the BigQuery export table with no partition selector. + * + * @param conf the Hadoop configuration + * @return the table ID + */ public static TableId getBigQueryTableId(Configuration conf) { return getBigQueryTableId(conf, false); } + /** + * Return the fully qualified BigQuery table name serving as the Hive table export target. + * + * @param conf the Hadoop configuration + * @param includingPartition should the table name include the partition selector in case of a partitioned target. + * @return the table name + */ public static String getBigQueryFullTableName(Configuration conf, boolean includingPartition) { TableId tableId = getBigQueryTableId(conf, includingPartition); @@ -118,11 +246,69 @@ public static String getBigQueryFullTableName(Configuration conf, boolean includ + tableId.getDataset() + "." + tableId.getTable(); } + /** + * Return the fully qualified BigQuery table name serving as the Hive table export target without partition designation. + * + * @param conf the Hadoop configuration + * @return the table name + */ public static String getBigQueryFullTableName(Configuration conf) { return getBigQueryFullTableName(conf, false); } - public static Configuration configureBigQueryOutput(Configuration currentConf, String project, String gcpKey, String database, String table, String dataLocation, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, String exportStorageBucket, String exportStorageFolderPrefix, String exportStorageRegion, int noOfPartitions) throws IOException { + /** + * Return the configured host running the HTTPS proxy to route proxy requests through. 
+ * + * @param conf the Hadoop configuration + * @return the HTTPS proxy host + */ + public static String getBigQueryProxyHost(Configuration conf) { + return conf.get(BIGQUERY_PROXY_HOST); + } + + /** + * Return the configured port of the HTTPS proxy to route proxy requests through. + * + * @param conf the Hadoop configuration + * @return the HTTPS proxy port + */ + public static String getBigQueryProxyPort(Configuration conf) { + return conf.get(BIGQUERY_PROXY_PORT); + } + + + /** + * Augment a given Hadoop configuration with additional parameters required for BigQuery Hive table export. + * + * @param currentConf the Hadoop configuration to augment + * @param project the name of the GCP project to use for BigQuery access from the Hadoop + * configuration. If null, this will default to the default GCP project of the + * current user. + * @param gcpKey the GCP key to use for GCP access. If null, the default key according to the GCP + * authentication mechanism will be used. + * @param database the name of the database of the Hive table being exported. + * @param table the name of the Hive table being exported. + * @param tablePostfix the postfix to append to the table name with _. Useful to model non-temporal + * Hive partition values. + * @param dataLocation the storage location where the resulting BigQuery table should be stored. Default to EU + * @param tablePartitionDate if set, the BigQuery table will be partitioned by day. The Hive table data will + * be exported into the given partition. If null, the resulting table is not partitioned. + * @param usedHCatFilter if set with a HCat table filter, only Hive data matching that filter will be + * exported. This is used to export only a partition of a given table. Pass null if + * you do not want filtering. + * @param hcatSchema the HCat schema of the Hive table that is being exported. + * @param exportStorageBucket the GCP Cloud Storage bucket to use for storing temporal data during export. + * @param exportStorageFolderPrefix a path prefix to append to the storage folder for temporal data in the GCP + * storage bucket. "" if null is given. + * @param exportStorageRegion the region where to store the GCP bucket. If null, the default region is chosen + * to be "europe-west3" + * @param noOfPartitions the parallelism with which to perform the export. Defaults to 1 in case you pass null. + * @param proxyHost the host running the HTTPS proxy to route traffic through. Set to null if no proxy is to be used. + * @param proxyPort the port of the HTTPS proxy to route traffic through. Set to null if no proxy is to be used. 
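For orientation, a configuration call in the style of the accompanying tests might look roughly like this (database, table, and bucket names as well as the hcatSchema variable are illustrative):

    // Sketch: configuring an unpartitioned export staged through a GCS bucket.
    Configuration conf = configureBigQueryOutputFormat(
            new Configuration(),
            null,                      // project: use the default GCP project
            null,                      // gcpKey: use default credentials
            "my_hive_db",              // database -> BigQuery dataset
            "my_hive_table",           // table
            null,                      // tablePostfix
            "EU",                      // dataLocation
            null,                      // tablePartitionDate: unpartitioned
            null,                      // usedHCatFilter: export everything
            hcatSchema,                // HCat schema of the exported table
            "my_staging_bucket",       // exportStorageBucket
            null,                      // exportStorageFolderPrefix
            null,                      // exportStorageRegion: defaults to europe-west3
            2,                         // noOfPartitions
            null, null);               // proxyHost, proxyPort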
+ * @return + * @throws IOException + */ + public static Configuration configureBigQueryOutputFormat(Configuration currentConf, String project, String gcpKey, String database, String table, String tablePostfix, String dataLocation, String tablePartitionDate, String usedHCatFilter, HCatSchema hcatSchema, String exportStorageBucket, String exportStorageFolderPrefix, String exportStorageRegion, Integer noOfPartitions, String proxyHost, String proxyPort) throws IOException { if (project != null) currentConf.set(BIGQUERY_PROJECT, project); @@ -130,11 +316,16 @@ public static Configuration configureBigQueryOutput(Configuration currentConf, S if (gcpKey != null) currentConf.set(BIGQUERY_GCP_KEY, gcpKey); - currentConf.set(BIGQUERY_DATASET, database); + currentConf.set(BIGQUERY_DATABASE, database); currentConf.set(BIGQUERY_TABLE, table); + if (tablePostfix != null) + currentConf.set(BIGQUERY_TABLE_POSTFIX, tablePostfix); + if (dataLocation != null) currentConf.set(BIGQUERY_DATASET_LOCATION, dataLocation); + else + currentConf.set(BIGQUERY_DATASET_LOCATION, "EU"); if (tablePartitionDate != null) currentConf.set(BIGQUERY_TABLE_PARTITION_DATE, tablePartitionDate); @@ -155,9 +346,14 @@ public static Configuration configureBigQueryOutput(Configuration currentConf, S currentConf.set(BIGQUERY_EXPORT_STORAGE_REGION, "europe-west3"); - currentConf.set(BIGQUERY_NO_OF_WORKERS, String.valueOf(noOfPartitions)); + currentConf.set(BIGQUERY_NO_OF_WORKERS, String.valueOf(noOfPartitions != null && noOfPartitions > 0 ? noOfPartitions : 1)); currentConf.set(BIGQUERY_HCAT_SCHEMA, serializeHCatSchema(hcatSchema)); + if (proxyHost != null && proxyPort != null) { + currentConf.set(BIGQUERY_PROXY_HOST, proxyHost); + currentConf.set(BIGQUERY_PROXY_PORT, proxyPort); + } + return currentConf; } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index 55a9b7049..a946ffd0e 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -1,6 +1,5 @@ package org.schedoscope.export.bigquery.outputformat; -import com.fasterxml.jackson.databind.ObjectMapper; import com.google.cloud.bigquery.BigQuery; import com.google.cloud.bigquery.TableDefinition; import com.google.cloud.bigquery.TableId; @@ -10,30 +9,47 @@ import org.apache.hadoop.mapreduce.lib.output.FileOutputCommitter; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; import org.apache.hive.hcatalog.data.HCatRecord; -import org.apache.hive.hcatalog.data.schema.HCatSchema; import org.schedoscope.export.bigquery.outputschema.PartitioningScheme; import java.io.IOException; -import java.nio.ByteBuffer; -import java.nio.channels.WritableByteChannel; import java.util.List; -import java.util.Map; import java.util.concurrent.TimeoutException; import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.*; -import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; -import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.USED_FILTER_FIELD_NAME; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableDefinition; import static 
org.schedoscope.export.bigquery.outputschema.PartitioningScheme.DAILY; import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.NONE; import static org.schedoscope.export.utils.BigQueryUtils.*; import static org.schedoscope.export.utils.CloudStorageUtils.*; +/** + * Hadoop output format to write HCat records to GCP Cloud Storage and then forward them to BigQuery. + * + * @param we do not care about this type parameter. + * @param a subtype of HCatRecord. + */ public class BigQueryOutputFormat extends OutputFormat { + private static void setProxies(Configuration conf) { + if (getBigQueryProxyHost(conf) != null && getBigQueryProxyPort(conf) != null) { + System.setProperty("https.proxyHost", getBigQueryProxyHost(conf)); + System.setProperty("https.proxyPort", getBigQueryProxyPort(conf)); + } + } + + /** + * Given a Hadoop configuration with the BigQuery output format configuration values, create an equivalent BigQuery + * table. The HCatSchema passed in the configuration is considered, as well as a potentially given partition date, + * to decide about daily partitioning of the table (or not). + * + * @param conf the BigQuery augmented Hadoop configuration (see {@link BigQueryOutputConfiguration}) + * @throws IOException in case the table could not be created. + */ public static void prepareBigQueryTable(Configuration conf) throws IOException { + setProxies(conf); + PartitioningScheme partitioning = getBigQueryTablePartitionDate(conf) != null ? DAILY : NONE; TableDefinition outputSchema = convertSchemaToTableDefinition(getBigQueryHCatSchema(conf), partitioning); @@ -46,7 +62,17 @@ public static void prepareBigQueryTable(Configuration conf) throws IOException { } + /** + * After the output format has written a Hive table's data to GCP cloud storage, commit the export by loading + * the data into the prepared BigQuery table and then delete the data in the storage bucket afterwards. + * + * @param conf the BigQuery augmented Hadoop configuration (see {@link BigQueryOutputConfiguration}) + * @throws IOException + * @throws TimeoutException + * @throws InterruptedException + */ public static void commit(Configuration conf) throws IOException, TimeoutException, InterruptedException { + setProxies(conf); BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); Storage storageService = storageService(getBigQueryGcpKey(conf)); @@ -64,7 +90,14 @@ public static void commit(Configuration conf) throws IOException, TimeoutExcepti } + /** + * Call this method in case of a problem. It drops the BigQuery table or table partition and deletes all data + * on cloud storage. 
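Taken together with prepareBigQueryTable and commit, the intended driver-side flow is roughly the following (a sketch; the job handling and exception plumbing are simplified and not part of the patch):

    // Sketch: lifecycle of a BigQuery export around a Hadoop job.
    prepareBigQueryTable(conf);                 // create target table (daily-partitioned if a partition date is configured)
    try {
        if (job.waitForCompletion(true))        // record writers stage JSON blobs in GCS
            commit(conf);                       // load staged blobs into BigQuery, then clean up GCS
        else
            rollback(conf);                     // drop table/partition and delete staged blobs
    } catch (Exception e) {
        rollback(conf);
        throw e;
    }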
+ * + * @param conf the BigQuery augmented Hadoop configuration (see {@link BigQueryOutputConfiguration}) + */ public static void rollback(Configuration conf) { + setProxies(conf); try { rollbackBigQuery(conf); @@ -99,59 +132,16 @@ private static void rollbackStorage(Configuration conf) throws IOException { }); } - public class BiqQueryHCatRecordWriter extends RecordWriter { - - private HCatSchema hcatSchema; - private Storage storageService; - - private String usedHCatFilter; - private String bucket; - private String blobName; - private String region; - - private WritableByteChannel channel; - - private ObjectMapper jsonFactory = new ObjectMapper(); - - @Override - public void write(K key, V value) throws IOException { - if (channel == null) { - channel = createBlobIfNotExists(storageService, bucket, blobName, region).writer(); - } - - Map recordMap = convertHCatRecordToBigQueryMap(hcatSchema, value); - recordMap.put(USED_FILTER_FIELD_NAME, this.usedHCatFilter); - - String output = jsonFactory.writeValueAsString(recordMap) + "\n"; - - channel.write(ByteBuffer.wrap(output.getBytes("UTF-8"))); - } - - @Override - public void close(TaskAttemptContext context) throws IOException { - if (channel != null) - channel.close(); - } - - public BiqQueryHCatRecordWriter(Storage storageService, String bucket, String blobName, String region, HCatSchema hcatSchema, String usedHCatFilter) { - this.storageService = storageService; - this.bucket = bucket; - this.blobName = blobName; - this.region = region; - this.hcatSchema = hcatSchema; - this.usedHCatFilter = usedHCatFilter; - } - - } - @Override public RecordWriter getRecordWriter(TaskAttemptContext context) throws IOException { Configuration conf = context.getConfiguration(); + setProxies(conf); + Storage storageService = storageService(getBigQueryGcpKey(conf)); - return new BiqQueryHCatRecordWriter( + return new BiqQueryHCatRecordWriter<>( storageService, getBigQueryExportStorageBucket(conf), getBigQueryExportStorageFolder(conf) + "/" + context.getTaskAttemptID().toString(), diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java new file mode 100644 index 000000000..e27df27fe --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java @@ -0,0 +1,78 @@ +package org.schedoscope.export.bigquery.outputformat; + +import com.fasterxml.jackson.databind.ObjectMapper; +import com.google.cloud.storage.Storage; +import org.apache.hadoop.mapreduce.RecordWriter; +import org.apache.hadoop.mapreduce.TaskAttemptContext; +import org.apache.hive.hcatalog.data.HCatRecord; +import org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.io.IOException; +import java.nio.ByteBuffer; +import java.nio.channels.WritableByteChannel; +import java.util.Map; + +import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; +import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.USED_FILTER_FIELD_NAME; +import static org.schedoscope.export.utils.CloudStorageUtils.createBlobIfNotExists; + +/** + * A writer for the BigQuery output format that transforms HCatRecords to JSON, and stores them to a cloud storage bucket. 
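Each record thus ends up as one line of newline-delimited JSON in the staging blob, tagged with the HCat filter that was used; this is the format the json load option in loadTable expects. Schematically (a simplified excerpt of the write path; the filter value is illustrative):

    // Sketch: per-record staging as one JSON line (inside a method declared to throw IOException).
    Map<String, Object> recordMap = convertHCatRecordToBigQueryMap(hcatSchema, record);
    recordMap.put(USED_FILTER_FIELD_NAME, usedHCatFilter);                 // e.g. "aString=y"
    String line = new ObjectMapper().writeValueAsString(recordMap) + "\n"; // one JSON object per line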
+ * + * @param ignored + * @param a subtype of HCatRecord + */ +public class BiqQueryHCatRecordWriter extends RecordWriter { + + private HCatSchema hcatSchema; + private Storage storageService; + + private String usedHCatFilter; + private String bucket; + private String blobName; + private String region; + + private WritableByteChannel channel; + + private ObjectMapper jsonFactory = new ObjectMapper(); + + @Override + public void write(K key, V value) throws IOException { + if (channel == null) { + channel = createBlobIfNotExists(storageService, bucket, blobName, region).writer(); + } + + Map recordMap = convertHCatRecordToBigQueryMap(hcatSchema, value); + recordMap.put(USED_FILTER_FIELD_NAME, this.usedHCatFilter); + + String output = jsonFactory.writeValueAsString(recordMap) + "\n"; + + channel.write(ByteBuffer.wrap(output.getBytes("UTF-8"))); + } + + @Override + public void close(TaskAttemptContext context) throws IOException { + if (channel != null) + channel.close(); + } + + /** + * Constructor for the record writer. + * + * @param storageService reference to Google Cloud Storage web service + * @param bucket the bucket to write data to. The bucket gets created if it does not exist + * @param blobName the name of the blob to write data to + * @param region the storage region where the bucket is created if created. + * @param hcatSchema the HCat schema to which the records conform. + * @param usedHCatFilter the HCat filter expression that was used to read the HCat records passing through the writer. + */ + public BiqQueryHCatRecordWriter(Storage storageService, String bucket, String blobName, String region, HCatSchema hcatSchema, String usedHCatFilter) { + this.storageService = storageService; + this.bucket = bucket; + this.blobName = blobName; + this.region = region; + this.hcatSchema = hcatSchema; + this.usedHCatFilter = usedHCatFilter; + } + +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java index 38a1480a8..cd0e5b45d 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -22,6 +22,9 @@ import static org.schedoscope.export.utils.HCatSchemaTransformer.transformSchema; +/** + * Convert HCat records to maps for use with BigQuery APIs + */ public class HCatRecordToBigQueryMapConvertor { static private final Log LOG = LogFactory.getLog(HCatRecordToBigQueryMapConvertor.class); @@ -162,6 +165,14 @@ public Pair constructStructArrayField(HCatSchema schema, HCatFie }; + /** + * Given an HCat schema, convert a record to a map representation for use with the BigQuery API. + * + * @param schema the HCatSchema to which records conform + * @param record the record to transform. + * @return a nested map representing the record such that it can be converted to the JSON format expected by + * the BigQuery API. 
+ */ static public Map convertHCatRecordToBigQueryMap(HCatSchema schema, HCatRecord record) { return transformSchema(c, schema).apply(record); } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index d793e76dd..dfde98519 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -16,7 +16,9 @@ import static org.schedoscope.export.utils.HCatSchemaTransformer.transformSchema; - +/** + * Convertor for transforming HCat schemas to BigQuery schemas. + */ public class HCatSchemaToBigQuerySchemaConverter { static private final Log LOG = LogFactory.getLog(HCatSchemaToBigQuerySchemaConverter.class); @@ -164,6 +166,13 @@ public Field constructStructArrayField(HCatSchema schema, HCatFieldSchema field, } }; + /** + * Convert a given HCat schema to a BigQuery table definition. + * + * @param hcatSchema the HCat schema to convert + * @param partitioning should the table be partitioned? If so, with what granularity. + * @return the BigQuery table definition for a table equivalent to the HCat schema. + */ static public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSchema, PartitioningScheme partitioning) { LOG.info("Incoming HCat table schema: " + hcatSchema.getSchemaAsTypeString()); @@ -186,36 +195,30 @@ static public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSche return tableDefinition; } - static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { - - TableId tableId = project == null ? TableId.of(database, table + (postfix == null || postfix.isEmpty() ? "" : "_" + postfix)) : TableId.of(project, database, table + (postfix == null || postfix.isEmpty() ? "" : "_" + postfix)); + /** + * Convert a given HCat schema to a BigQuery table information. + * + * @param project the ID of the GCP project where to create the dataset for the BigQuery table. If null, this is the configured default project. + * @param dataset the dataset to create the table in. The dataset will be created if it does not exist yet. + * @param table the name of the resulting BigQuery table. + * @param hcatSchema the HCat schema to convert + * @param partitioning should the table be partitioned? If so, with what granularity. + * @return the BigQuery table info for a table equivalent to the HCat schema. + * @throws IOException + */ + static public TableInfo convertSchemaToTableInfo(String project, String dataset, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { + + TableId tableId = project == null ? 
TableId.of(dataset, table) : TableId.of(project, dataset, table); TableInfo tableInfo = TableInfo.of(tableId, convertSchemaToTableDefinition(hcatSchema, partitioning)); return tableInfo; } - static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning, String postfix) throws IOException { - return convertSchemaToTableInfo(null, database, table, hcatSchema, partitioning, postfix); - } - - - static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { - return convertSchemaToTableInfo(project, database, table, hcatSchema, partitioning, ""); - } - static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { return convertSchemaToTableInfo(null, database, table, hcatSchema, partitioning); } - static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { - return convertSchemaToTableInfo(project, database, table, hcatSchema, PartitioningScheme.NONE, postfix); - } - - static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, String postfix) throws IOException { - return convertSchemaToTableInfo(null, database, table, hcatSchema, postfix); - } - static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema) throws IOException { return convertSchemaToTableInfo(project, database, table, hcatSchema, PartitioningScheme.NONE); } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java index 56a188990..fec4e4d23 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java @@ -14,15 +14,30 @@ import java.util.function.Supplier; import java.util.stream.Collectors; +/** + * Helpers for dealing with BigQuery. + */ public class BigQueryUtils { final static private Random rnd = new Random(); + + /** + * Retrieve an instance of the BigQuery web service, authenticated using the default GCP authentication mechanism. + * + * @return the service instance. + */ static public BigQuery bigQueryService() { return BigQueryOptions.getDefaultInstance().getService(); } - + /** + * Retrieve an instance of the BigQuery web service, authenticated using the given GCP key. + * + * @param gcpKey the JSON formatted GCP key. + * @return the service instance. + * @throws IOException + */ static public BigQuery bigQueryService(String gcpKey) throws IOException { if (gcpKey == null) return bigQueryService(); @@ -35,6 +50,14 @@ static public BigQuery bigQueryService(String gcpKey) throws IOException { return BigQueryOptions.newBuilder().setCredentials(credentials).build().getService(); } + /** + * Helper to retry a lambda expression for a given number of times, until no exception is thrown. + * + * @param numberOfRetries the number of retries. + * @param action the lambda + * @param the return type + * @return the result of the lambda. 
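Usage mirrors what the commit and rollback paths above do with the load and drop calls, for example:

    // Sketch: wrap flaky remote calls, retrying up to three times.
    retry(3, () -> loadTable(bigQueryService, tableId, blobsToLoad));   // as used in commit
    retry(3, () -> { dropTable(bigQueryService, tableId); });           // Runnable variant, as used in rollback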
+ */ static public T retry(int numberOfRetries, Supplier action) { try { return action.get(); @@ -52,6 +75,12 @@ static public T retry(int numberOfRetries, Supplier action) { } } + /** + * Helper to retry a lambda returning void for a given number of times, until no exception is thrown. + * + * @param numberOfRetries the number of retries. + * @param action the lambda + */ static public void retry(int numberOfRetries, Runnable action) { retry(numberOfRetries, () -> { action.run(); @@ -59,17 +88,38 @@ static public void retry(int numberOfRetries, Runnable action) { }); } + /** + * Check whether a given dataset already exists. + * + * @param bigQueryService the BigQuery web service instance to use for the check. + * @param project the project owning the dataset or null, if the default project should be used. + * @param dataset the name of the dataset. + * @return true iff the dataset already exists. + */ static public boolean existsDataset(BigQuery bigQueryService, String project, String dataset) { return bigQueryService.getDataset(project == null ? DatasetId.of(dataset) : DatasetId.of(project, dataset)) != null; } + /** + * Create a dataset, if it does not exist. + * + * @param bigQueryService the BigQuery web service instance to use + * @param project the project to create the dataset in or null, if the default project should be used. + * @param dataset the name of the dataset to create. + */ static public void createDataset(BigQuery bigQueryService, String project, String dataset, String dataLocation) { if (!existsDataset(bigQueryService, project, dataset)) { bigQueryService.create((project == null ? DatasetInfo.newBuilder(dataset) : DatasetInfo.newBuilder(project, dataset)).setLocation(dataLocation != null ? dataLocation : "EU").build()); } } - + /** + * Drop a dataset + * + * @param bigQueryService the BigQuery web service instance to use + * @param project the project to owning the dataset to drop or null, if the default project should be used. + * @param dataset the name of the dataset to drop. + */ static public void dropDataset(BigQuery bigQueryService, String project, String dataset) { if (existsDataset(bigQueryService, project, dataset)) { bigQueryService.delete( @@ -79,10 +129,24 @@ static public void dropDataset(BigQuery bigQueryService, String project, String } } + /** + * Check whether a given table already exists. + * + * @param bigQueryService the BigQuery web service instance to use for the check. + * @param tableId the ID of the table to check. + * @return true iff the table already exists. + */ static public boolean existsTable(BigQuery bigQueryService, TableId tableId) { return bigQueryService.getTable(tableId) != null; } + /** + * Create a table, if it does not exist. If the dataset for the table does not exist, create that as well. + * + * @param bigQueryService the BigQuery web service instance to use + * @param tableId the ID of the table to create. + * @param tableDefinition the schema of the table to create. + */ static public void createTable(BigQuery bigQueryService, TableId tableId, TableDefinition tableDefinition) { createDataset(bigQueryService, tableId.getProject(), tableId.getDataset(), null); @@ -91,23 +155,61 @@ static public void createTable(BigQuery bigQueryService, TableId tableId, TableD } + /** + * Create a table, if it does not exist. If the dataset for the table does not exist, create that as well. 
+ * + * @param bigQueryService the BigQuery web service instance to use + * @param project the project to create the table in or null, if the default project should be used. + * @param dataset the name of the dataset to create the table in. + * @param table the name of the table to create. + * @param tableDefinition the schema of the table to create. + */ static public void createTable(BigQuery bigQueryService, String project, String dataset, String table, TableDefinition tableDefinition) { createTable(bigQueryService, project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table), tableDefinition); } + /** + * Create a table, if it does not exist. If the dataset for the table does not exist, create that as well. + * + * @param bigQueryService the BigQuery web service instance to use + * @param tableInfo the complete table info of the table to create. + */ static public void createTable(BigQuery bigQueryService, TableInfo tableInfo) { createTable(bigQueryService, tableInfo.getTableId().getProject(), tableInfo.getTableId().getDataset(), tableInfo.getTableId().getTable(), tableInfo.getDefinition()); } + + /** + * Drop a table. + * + * @param bigQueryService the BigQuery web service instance to use + * @param project the project to drop the table from or null, if the default project should be used. + * @param dataset the name of the dataset to drop the table from. + * @param table the name of the table to drop. + */ static public void dropTable(BigQuery bigQueryService, String project, String dataset, String table) { bigQueryService.delete(project == null ? TableId.of(dataset, table) : TableId.of(project, dataset, table)); } + /** + * Drop a table. + * + * @param bigQueryService the BigQuery web service instance to use + * @param tableId the ID of the table to drop. + */ static public void dropTable(BigQuery bigQueryService, TableId tableId) { dropTable(bigQueryService, tableId.getProject(), tableId.getDataset(), tableId.getTable()); } + /** + * Load a table (partition) from a list of GCP Cloud Storage blobs. To load a partition, use a partition selector + * in the table ID. + * + * @param bigQueryService the BigQuery web service instance to use + * @param table the ID of the table to load + * @param cloudStoragePathsToData the list of gs:// URLs to the blobs to load into the table. + */ static public void loadTable(BigQuery bigQueryService, TableId table, List cloudStoragePathsToData) { Table t = bigQueryService.getTable(table); Job loadJob = t.load(FormatOptions.json(), cloudStoragePathsToData); @@ -125,6 +227,14 @@ static public void loadTable(BigQuery bigQueryService, TableId table, List... rowsToInsert) { InsertAllRequest insertAllRequest = InsertAllRequest.newBuilder(table) diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java index 3f1b57ddd..976b09f18 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java @@ -9,17 +9,32 @@ import java.nio.charset.Charset; import java.util.LinkedList; import java.util.List; -import java.util.Random; public class CloudStorageUtils { - final static private Random rnd = new Random(); - + /** + * Return an instance of the Google Cloud Storage web service authenticated using the GCP standard authentication + * mechanism. 
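Putting loadTable together with the storage helpers below, loading staged blobs into a daily partition might look like this (bucket, folder, and table names are illustrative; the $YYYYMMDD suffix selects the partition, as assembled by getBigQueryTableId above):

    // Sketch: load everything staged under a folder into the 2017-10-01 partition of a table.
    Storage storage = storageService();                                   // default credentials
    List<String> staged = listBlobs(storage, "my_staging_bucket", "my_db.my_table");
    loadTable(bigQueryService(), TableId.of("my_db", "my_table$20171001"), staged);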
+ * + * @return the instance + */ static public Storage storageService() { return StorageOptions.getDefaultInstance().getService(); } - + /** + * Return an instance of the Google Cloud Storage web service. + * + * @return the instance + */ + + /** + * Return an instance of the Google Cloud Storage web service authenticated using the given key. + * + * @param gcpKey the JSON formatted GCP key. + * @return the instance + * @throws IOException if a problem occurs parsing the key. + */ static public Storage storageService(String gcpKey) throws IOException { if (gcpKey == null) return storageService(); @@ -32,10 +47,25 @@ static public Storage storageService(String gcpKey) throws IOException { return StorageOptions.newBuilder().setCredentials(credentials).build().getService(); } + /** + * Check whether a bucket exists. + * + * @param storageService the storage service instance to use + * @param bucket the name of the bucket to check. + * @return true iff the bucket exists. + */ static public boolean existsBucket(Storage storageService, String bucket) { return storageService.get(bucket) != null; } + /** + * Create a bucket if it does not exist already. + * + * @param storageService the storage service instance to use + * @param bucket the name of the bucket to create + * @param region the region to create the bucket in, europe-west3 if null + * @return the bucket created or the bucket that already existed + */ static public Bucket createBucket(Storage storageService, String bucket, String region) { if (!existsBucket(storageService, bucket)) return storageService.create(BucketInfo.newBuilder(bucket).setLocation(region != null ? region : "europe-west3").build()); @@ -43,11 +73,24 @@ static public Bucket createBucket(Storage storageService, String bucket, String return storageService.get(bucket); } + /** + * Delete a bucket including all the blobs within. + * + * @param storageService the storage service instance to use + * @param bucket the name of the bucket to delete + */ static public void deleteBucket(Storage storageService, String bucket) { deleteBlob(storageService, bucket, ""); storageService.delete(bucket); } + /** + * Delete blobs in a bucket + * + * @param storageService the storage service instance to use + * @param bucket the name of the bucket in which to delete blobs + * @param blobNameOrPrefix the blob name prefix of the blobs to delete + */ static public void deleteBlob(Storage storageService, String bucket, String blobNameOrPrefix) { if (!existsBucket(storageService, bucket)) return; @@ -59,6 +102,14 @@ static public void deleteBlob(Storage storageService, String bucket, String blob } } + /** + * List blobs matching a blob name prefix. + * + * @param storageService the storage service instance to use + * @param bucket the name of the bucket in which to list blobs + * @param blobNameOrPrefix the blob name prefix of the blobs to delete + * @return the list of matching blob names. + */ static public List listBlobs(Storage storageService, String bucket, String blobNameOrPrefix) { List result = new LinkedList<>(); @@ -71,6 +122,14 @@ static public List listBlobs(Storage storageService, String bucket, Stri return result; } + /** + * Create a blob in a bucket if it does not exist. If the bucket does not exist, it will be created. + * + * @param storageService the storage service instance to use + * @param blob the ID of the blob to create + * @param region the region where to create the bucket if it does not exist + * @return the blob created or the blob that already existed. 
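The test rigging exercises these helpers roughly as follows (bucket and blob names are illustrative):

    // Sketch: bucket and blob lifecycle around an export test.
    Storage storage = storageService();
    createBucket(storage, "schedoscope_export_test", "europe-west3");
    createBlobIfNotExists(storage, "schedoscope_export_test", "my_db.my_table/attempt_0001_m_000000_0", "europe-west3");
    List<String> staged = listBlobs(storage, "schedoscope_export_test", "my_db.my_table");
    deleteBucket(storage, "schedoscope_export_test");   // removes the bucket and all blobs in it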
+ */ static public Blob createBlobIfNotExists(Storage storageService, BlobId blob, String region) { if (!existsBucket(storageService, blob.getBucket())) createBucket(storageService, blob.getBucket(), region); @@ -79,6 +138,15 @@ static public Blob createBlobIfNotExists(Storage storageService, BlobId blob, St } + /** + * Create a blob in a bucket if it does not exist. If the bucket does not exist, it will be created. + * + * @param storageService the storage service instance to use + * @param bucket the name of the bucket to create the blob in. + * @param blobName the name of the blob to create + * @param region the region where to create the bucket if it does not exist + * @return the blob created or the blob that already existed. + */ static public Blob createBlobIfNotExists(Storage storageService, String bucket, String blobName, String region) { return createBlobIfNotExists(storageService, BlobId.of(bucket, blobName), region); } diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java index 844b4dc16..89a126e37 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java @@ -18,7 +18,7 @@ import java.util.Arrays; import java.util.concurrent.TimeoutException; -import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.configureBigQueryOutput; +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.configureBigQueryOutputFormat; import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.commit; import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.prepareBigQueryTable; @@ -66,12 +66,13 @@ public void setUp() throws IOException { ); - unpartitionedExport = configureBigQueryOutput( + unpartitionedExport = configureBigQueryOutputFormat( new Configuration(), null, null, "schedoscope_export_big_query_output_test", "flat_table", + null, "EU", null, "aString=y", @@ -79,7 +80,9 @@ public void setUp() throws IOException { "schedoscope_export_big_query_output_test", null, null, - 1 + 1, + null, + null ); unpartitionedContext = new TaskAttemptContextImpl(unpartitionedExport, new TaskAttemptID()); @@ -87,12 +90,13 @@ public void setUp() throws IOException { recordWriterUnpartitioned = new BigQueryOutputFormat().getRecordWriter(unpartitionedContext); - partitionedExport = configureBigQueryOutput( + partitionedExport = configureBigQueryOutputFormat( new Configuration(), null, null, "schedoscope_export_big_query_output_test", "flat_table_part", + null, "EU", "20171001", "aString=y", @@ -100,7 +104,9 @@ public void setUp() throws IOException { "schedoscope_export_big_query_output_test", null, null, - 1 + 1, + null, + null ); diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java index a48f23c8f..7f9372ca0 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java @@ 
-17,7 +17,6 @@ import java.util.Map; import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo; @@ -438,13 +437,6 @@ public void testFlatTableConversion() throws IOException { createTable(converted); } - @Test - public void testTableConversionWithPostfix() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, "test"); - - assertTrue(converted.getTableId().getTable().endsWith("_test")); - } - @Test public void testTableConversionWithPartitioning() throws IOException, NoSuchFieldException, IllegalAccessException { PartitioningScheme partitioning = PartitioningScheme.MONTHLY; @@ -464,25 +456,6 @@ public void testTableConversionWithPartitioning() throws IOException, NoSuchFiel } - @Test - public void testTableConversionWithPartitioningAndPHCatSchemaToBigQuerySchemaConverterostfix() throws IOException, NoSuchFieldException, IllegalAccessException { - PartitioningScheme partitioning = PartitioningScheme.MONTHLY; - - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning, "test"); - - assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); - assertEquals("flat_table_test", converted.getTableId().getTable()); - - StandardTableDefinition bigQueryTableDefinition = converted.getDefinition(); - - java.lang.reflect.Field field = StandardTableDefinition.class.getDeclaredField("timePartitioning"); - field.setAccessible(true); - TimePartitioning timePartitioning = (TimePartitioning) field.get(bigQueryTableDefinition); - - assertEquals(TimePartitioning.Type.DAY, timePartitioning.getType()); - - } - @Test public void testTableWithPrimitiveListConversion() throws IOException { From 0b024dad3cb5b8bd0391c83c41136801d20e830c Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Wed, 20 Dec 2017 10:13:27 +0100 Subject: [PATCH 28/34] Additional Javadoc --- .../outputformat/BigQueryOutputFormat.java | 2 + .../HCatRecordToBigQueryMapConvertor.java | 12 +- .../HCatSchemaToBigQuerySchemaConverter.java | 21 +- .../HCatSchemaToBigQueryTransformer.java | 350 ++++++++++++++++++ .../export/utils/HCatSchemaTransformer.java | 164 -------- .../HCatSchemaToBigQueryTransformerTest.java | 20 +- 6 files changed, 371 insertions(+), 198 deletions(-) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java delete mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaTransformer.java diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index a946ffd0e..7ab4b15c7 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -72,6 +72,7 @@ public static void prepareBigQueryTable(Configuration conf) throws IOException { * @throws InterruptedException */ public static void commit(Configuration conf) throws IOException, TimeoutException, 
InterruptedException { + setProxies(conf); BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); @@ -97,6 +98,7 @@ public static void commit(Configuration conf) throws IOException, TimeoutExcepti * @param conf the BigQuery augmented Hadoop configuration (see {@link BigQueryOutputConfiguration}) */ public static void rollback(Configuration conf) { + setProxies(conf); try { diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java index cd0e5b45d..f4e622e4f 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -4,22 +4,20 @@ import com.fasterxml.jackson.databind.ObjectMapper; import org.apache.commons.lang3.tuple.ImmutablePair; import org.apache.commons.lang3.tuple.Pair; -import org.apache.commons.logging.Log; -import org.apache.commons.logging.LogFactory; import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.DefaultHCatRecord; import org.apache.hive.hcatalog.data.HCatRecord; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; -import org.schedoscope.export.utils.HCatSchemaTransformer; +import org.schedoscope.export.utils.HCatSchemaToBigQueryTransformer; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.stream.Collectors; -import static org.schedoscope.export.utils.HCatSchemaTransformer.transformSchema; +import static org.schedoscope.export.utils.HCatSchemaToBigQueryTransformer.transformSchema; /** @@ -27,11 +25,9 @@ */ public class HCatRecordToBigQueryMapConvertor { - static private final Log LOG = LogFactory.getLog(HCatRecordToBigQueryMapConvertor.class); - static private final ObjectMapper jsonConvertor = new ObjectMapper(); - private static final HCatSchemaTransformer.Constructor, Map> c = new HCatSchemaTransformer.Constructor, Map>() { + private static final HCatSchemaToBigQueryTransformer.Constructor, Map> c = new HCatSchemaToBigQueryTransformer.Constructor, Map>() { @Override public Object accessPrimitiveField(HCatSchema schema, HCatFieldSchema field, HCatRecord hCatRecord) { @@ -174,6 +170,6 @@ public Pair constructStructArrayField(HCatSchema schema, HCatFie * the BigQuery API. 
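* <p>A rough usage sketch (variable names are illustrative; the schema and record would typically come from HCatInputFormat):
* <pre>{@code
* Map<String, Object> bigQueryRow = convertHCatRecordToBigQueryMap(hcatSchema, hcatRecord);
* }</pre>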
*/ static public Map convertHCatRecordToBigQueryMap(HCatSchema schema, HCatRecord record) { - return transformSchema(c, schema).apply(record); + return transformSchema(c, schema, record); } } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index dfde98519..d5106bad6 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -7,14 +7,14 @@ import org.apache.hive.hcatalog.common.HCatException; import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; import org.apache.hive.hcatalog.data.schema.HCatSchema; -import org.schedoscope.export.utils.HCatSchemaTransformer; +import org.schedoscope.export.utils.HCatSchemaToBigQueryTransformer; import java.io.IOException; import java.util.Arrays; import java.util.LinkedList; import java.util.List; -import static org.schedoscope.export.utils.HCatSchemaTransformer.transformSchema; +import static org.schedoscope.export.utils.HCatSchemaToBigQueryTransformer.transformSchema; /** * Convertor for transforming HCat schemas to BigQuery schemas. @@ -33,7 +33,7 @@ public class HCatSchemaToBigQuerySchemaConverter { static private final Field usedFilterField = Field.newBuilder(USED_FILTER_FIELD_NAME, Field.Type.string()).setMode(Field.Mode.NULLABLE).setDescription("HCatInputFormat filter used to export the present record.").build(); - static private final HCatSchemaTransformer.Constructor c = new HCatSchemaTransformer.Constructor() { + static private final HCatSchemaToBigQueryTransformer.Constructor c = new HCatSchemaToBigQueryTransformer.Constructor() { private Field.Type translatePrimitiveType(PrimitiveTypeInfo primitiveTypeInfo) { switch (primitiveTypeInfo.getTypeName()) { @@ -178,7 +178,7 @@ static public TableDefinition convertSchemaToTableDefinition(HCatSchema hcatSche List fields = new LinkedList<>(); fields.add(usedFilterField); - fields.addAll(transformSchema(c, hcatSchema).apply(hcatSchema).getFields()); + fields.addAll(transformSchema(c, hcatSchema, hcatSchema).getFields()); StandardTableDefinition.Builder tableDefinitionBuilder = StandardTableDefinition .newBuilder() @@ -214,17 +214,4 @@ static public TableInfo convertSchemaToTableInfo(String project, String dataset, return tableInfo; } - - static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema, PartitioningScheme partitioning) throws IOException { - return convertSchemaToTableInfo(null, database, table, hcatSchema, partitioning); - } - - static public TableInfo convertSchemaToTableInfo(String project, String database, String table, HCatSchema hcatSchema) throws IOException { - return convertSchemaToTableInfo(project, database, table, hcatSchema, PartitioningScheme.NONE); - } - - static public TableInfo convertSchemaToTableInfo(String database, String table, HCatSchema hcatSchema) throws IOException { - return convertSchemaToTableInfo(null, database, table, hcatSchema); - } - } diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java new file mode 100644 index 000000000..a221d30df --- /dev/null +++ 
b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java @@ -0,0 +1,350 @@ +package org.schedoscope.export.utils; + + +import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; +import org.apache.hive.hcatalog.common.HCatException; +import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; +import org.apache.hive.hcatalog.data.schema.HCatSchema; + +import java.util.List; +import java.util.stream.Collectors; + +/** + * Library functions for traversing an HCatSchema and transforming a data structure to something else based on the schema + * according to the capabilities of BigQuery. As BigQuery is less expressive, the transformer functions already perform + * the correct downgrades of data structures for you. You just need to provide an implementation of the {@link Constructor} + * interface with appropriate callbacks. + */ +public class HCatSchemaToBigQueryTransformer { + + + /** + * Interface of callback functions that need to be implemented for a transformation. + * + * @param Type of the schema / record equivalent of the data structure before transformation + * @param Type of the field equivalent of the data structure before transformation + * @param Type of the schema / record equivalent of the transformation result + * @param Type of the field equivalent of the transformation result + */ + public interface Constructor { + + /** + * Return a primitive field equivalent of the schema / record equivalent before transformation. + * + * @param schema the HCat schema + * @param field the field + * @param s the schema / record equivalent under transformation + * @return the field equivalent + */ + F accessPrimitiveField(HCatSchema schema, HCatFieldSchema field, S s); + + /** + * Return a map field equivalent of the schema / record equivalent before transformation. + * + * @param schema the HCat schema + * @param field the field + * @param s the schema / record equivalent under transformation + * @return the field equivalent + */ + F accessMapField(HCatSchema schema, HCatFieldSchema field, S s); + + /** + * Return a struct field equivalent of the schema / record equivalent before transformation. + * + * @param schema the HCat schema + * @param field the field + * @param s the schema / record equivalent under transformation + * @return the field equivalent + */ + S accessStructField(HCatSchema schema, HCatFieldSchema field, S s); + + /** + * Return a primitive array field equivalent of the schema / record equivalent before transformation. + * + * @param schema the HCat schema + * @param field the field + * @param s the schema / record equivalent under transformation + * @return the field equivalent + */ + List accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field, S s); + + /** + * Return an array of array field equivalent of the schema / record equivalent before transformation. + * + * @param schema the HCat schema + * @param field the field + * @param s the schema / record equivalent under transformation + * @return the field equivalent + */ + List accessArrayArrayField(HCatSchema schema, HCatFieldSchema field, S s); + + /** + * Return an array of map equivalent of the schema / record equivalent before transformation. 
+ * + * @param schema the HCat schema + * @param field the field + * @param s the schema / record equivalent under transformation + * @return the field equivalent + */ + List accessMapArrayField(HCatSchema schema, HCatFieldSchema field, S s); + + /** + * Return an array of struct field equivalent of the schema / record equivalent before transformation. + * + * @param schema the HCat schema + * @param field the field + * @param s the schema / record equivalent under transformation + * @return the field equivalent + */ + List accessStructArrayField(HCatSchema schema, HCatFieldSchema field, S s); + + /** + * Construct a schema / record equivalent transform from a list of field equivalent transforms + * + * @param fts the field transforms + * @return the schema / record equivalent transform + */ + ST constructSchema(List fts); + + /** + * Construct a transform from a primitive field equivalent + * + * @param field the field + * @param f the field equivalent + * @return the field equivalent transform + */ + FT constructPrimitiveField(HCatFieldSchema field, F f); + + /** + * Construct a transform from a map field equivalent + * + * @param field the field + * @param f the field equivalent + * @return the field equivalent transform + */ + FT constructMapField(HCatFieldSchema field, F f); + + /** + * Construct a transform from a struct field equivalent + * + * @param schema the record schema + * @param field the field + * @param st the record transform representing the struct equivalent + * @return the field equivalent transform + */ + FT constructStructField(HCatSchema schema, HCatFieldSchema field, ST st); + + /** + * Construct a transform from a primitive array field equivalent + * + * @param field the field + * @param elementType the type of the array elements + * @param fs the field equivalents + * @return the field equivalent transform + */ + FT constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType, List fs); + + /** + * Construct a transform from an array of map field equivalent + * + * @param field the field + * @param fs the map field equivalents + * @return the field equivalent transform + */ + FT constructMapArrayField(HCatFieldSchema field, List fs); + + /** + * Construct a transform from an array of array field equivalent + * + * @param field the field + * @param fs the array field equivalents + * @return the field equivalent transform + */ + FT constructArrayArrayField(HCatFieldSchema field, List fs); + + /** + * Construct a transform from an array of struct field equivalent + * + * @param field the field + * @param sts the struct field equivalents + * @return the field equivalent transform + */ + FT constructStructArrayField(HCatSchema schema, HCatFieldSchema field, List sts); + } + + + /** + * Transform a schema / record equivalent for a given use case captured by a set of constructors. 
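+ * <p>For example, the HCat record convertor in this patch applies it to an {@code HCatRecord} together with a
+ * {@code Constructor} implementation that emits a BigQuery row map (a sketch; variable names are illustrative):
+ * <pre>{@code
+ * Map<String, Object> bigQueryRow = transformSchema(recordConstructor, hcatSchema, hcatRecord);
+ * }</pre>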
+ * + * @param c the constructors + * @param schema the HCat schema to traverse + * @param s the schema / record equivalent + * @param Type of the schema / record equivalent of the data structure before transformation + * @param Type of the field equivalent of the data structure before transformation + * @param Type of the schema / record equivalent of the transformation result + * @param Type of the field equivalent of the transformation result + * @return the transformation result + */ + static public ST transformSchema(Constructor c, HCatSchema schema, S s) { + + return + c.constructSchema( + schema + .getFields() + .stream() + .map(field -> transformField(c, schema, field, s)) + .collect(Collectors.toList()) + ); + + } + + /** + * Transform a field equivalent for a given use case captured by a set of constructors. + * + * @param c the constructors + * @param schema the HCat schema to traverse + * @param field the field to transform + * @param s the schema / record equivalent + * @param Type of the schema / record equivalent of the data structure before transformation + * @param Type of the field equivalent of the data structure before transformation + * @param Type of the schema / record equivalent of the transformation result + * @param Type of the field equivalent of the transformation result + * @return the transformation result + */ + static public FT transformField(Constructor c, HCatSchema schema, HCatFieldSchema field, S s) { + + if (HCatFieldSchema.Category.ARRAY == field.getCategory()) + + return transformArrayField(c, schema, field, s); + + else if (HCatFieldSchema.Category.STRUCT == field.getCategory()) + + return transformStructField(c, schema, field, s); + + else if (HCatFieldSchema.Category.MAP == field.getCategory()) + + return transformMapField(c, schema, field, s); + + else + + return transformPrimitiveField(c, schema, field, s); + + } + + /** + * Transform a primitive field equivalent for a given use case captured by a set of constructors. + * + * @param c the constructors + * @param schema the HCat schema to traverse + * @param field the field to transform + * @param s the schema / record equivalent + * @param Type of the schema / record equivalent of the data structure before transformation + * @param Type of the field equivalent of the data structure before transformation + * @param Type of the schema / record equivalent of the transformation result + * @param Type of the field equivalent of the transformation result + * @return the transformation result + */ + static public FT transformPrimitiveField(Constructor c, HCatSchema schema, HCatFieldSchema field, S s) { + + return c.constructPrimitiveField(field, c.accessPrimitiveField(schema, field, s)); + + } + + /** + * Transform an array field equivalent for a given use case captured by a set of constructors. 
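+ * <p>Depending on the category of the array's element schema, the work is delegated to the primitive-array, map-array,
+ * array-array, or struct-array constructor callback; struct elements are each transformed recursively via
+ * {@code transformSchema}.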
+ * + * @param c the constructors + * @param schema the HCat schema to traverse + * @param field the field to transform + * @param s the schema / record equivalent + * @param Type of the schema / record equivalent of the data structure before transformation + * @param Type of the field equivalent of the data structure before transformation + * @param Type of the schema / record equivalent of the transformation result + * @param Type of the field equivalent of the transformation result + * @return the transformation result + */ + static public FT transformArrayField(Constructor c, HCatSchema schema, HCatFieldSchema field, S s) { + + try { + + HCatFieldSchema elementSchema = field.getArrayElementSchema().get(0); + PrimitiveTypeInfo elementType = elementSchema.getTypeInfo(); + + if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) + + return c.constructPrimitiveArrayField(field, elementType, c.accessPrimitiveArrayField(schema, field, s)); + + else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) + + return c.constructMapArrayField(field, c.accessMapArrayField(schema, field, s)); + + else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) + + return c.constructArrayArrayField(field, c.accessArrayArrayField(schema, field, s)); + + else { + + HCatSchema structSchema = elementSchema.getStructSubSchema(); + + return c.constructStructArrayField(structSchema, field, + c.accessStructArrayField(schema, field, s) + .stream() + .map(saf -> transformSchema(c, structSchema, saf)) + .collect(Collectors.toList())); + + } + + } catch (HCatException e) { + // not going to happen + + return null; + } + + } + + /** + * Transform a map field equivalent for a given use case captured by a set of constructors. + * + * @param c the constructors + * @param schema the HCat schema to traverse + * @param field the field to transform + * @param s the schema / record equivalent + * @param Type of the schema / record equivalent of the data structure before transformation + * @param Type of the field equivalent of the data structure before transformation + * @param Type of the schema / record equivalent of the transformation result + * @param Type of the field equivalent of the transformation result + * @return the transformation result + */ + static public FT transformMapField(Constructor c, HCatSchema schema, HCatFieldSchema field, S s) { + + return c.constructMapField(field, c.accessMapField(schema, field, s)); + + } + + /** + * Transform a struct field equivalent for a given use case captured by a set of constructors. 
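+ * <p>Struct fields are handled by recursively applying {@code transformSchema} to the struct's sub-schema and
+ * wrapping the result via {@code constructStructField}.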
+ * + * @param c the constructors + * @param schema the HCat schema to traverse + * @param field the field to transform + * @param s the schema / record equivalent + * @param Type of the schema / record equivalent of the data structure before transformation + * @param Type of the field equivalent of the data structure before transformation + * @param Type of the schema / record equivalent of the transformation result + * @param Type of the field equivalent of the transformation result + * @return the transformation result + */ + static public FT transformStructField(Constructor c, HCatSchema schema, HCatFieldSchema field, S s) { + + try { + + HCatSchema structSchema = field.getStructSubSchema(); + + return c.constructStructField(structSchema, field, transformSchema(c, structSchema, c.accessStructField(schema, field, s))); + + } catch (HCatException e) { + // not going to happen + return null; + } + } +} diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaTransformer.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaTransformer.java deleted file mode 100644 index 38b5e0d95..000000000 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaTransformer.java +++ /dev/null @@ -1,164 +0,0 @@ -package org.schedoscope.export.utils; - - -import org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo; -import org.apache.hive.hcatalog.common.HCatException; -import org.apache.hive.hcatalog.data.schema.HCatFieldSchema; -import org.apache.hive.hcatalog.data.schema.HCatSchema; - -import java.util.List; -import java.util.function.Function; -import java.util.stream.Collectors; - -public class HCatSchemaTransformer { - - - public interface Constructor { - - F accessPrimitiveField(HCatSchema schema, HCatFieldSchema field, S s); - - F accessMapField(HCatSchema schema, HCatFieldSchema field, S s); - - S accessStructField(HCatSchema schema, HCatFieldSchema field, S s); - - List accessPrimitiveArrayField(HCatSchema schema, HCatFieldSchema field, S s); - - List accessArrayArrayField(HCatSchema schema, HCatFieldSchema field, S s); - - List accessMapArrayField(HCatSchema schema, HCatFieldSchema field, S s); - - List accessStructArrayField(HCatSchema schema, HCatFieldSchema field, S s); - - ST constructSchema(List fts); - - FT constructPrimitiveField(HCatFieldSchema field, F f); - - FT constructMapField(HCatFieldSchema field, F f); - - FT constructStructField(HCatSchema schema, HCatFieldSchema field, ST st); - - FT constructPrimitiveArrayField(HCatFieldSchema field, PrimitiveTypeInfo elementType, List fs); - - FT constructMapArrayField(HCatFieldSchema field, List fs); - - FT constructArrayArrayField(HCatFieldSchema field, List fs); - - FT constructStructArrayField(HCatSchema schema, HCatFieldSchema field, List sts); - } - - - static public Function transformSchema(Constructor c, HCatSchema schema) { - - return s -> - c.constructSchema( - schema - .getFields() - .stream() - .map(field -> transformField(c, schema, field).apply(s)) - .collect(Collectors.toList()) - ); - - } - - static public Function transformField(Constructor c, HCatSchema schema, HCatFieldSchema field) { - - if (HCatFieldSchema.Category.ARRAY == field.getCategory()) - - return transformArrayField(c, schema, field); - - else if (HCatFieldSchema.Category.STRUCT == field.getCategory()) - - return transformStructField(c, schema, field); - - else if (HCatFieldSchema.Category.MAP == field.getCategory()) - - return transformMapField(c, schema, field); - - else - - return 
transformPrimitiveField(c, schema, field); - - } - - static public Function transformPrimitiveField(Constructor c, HCatSchema schema, HCatFieldSchema field) { - - return s -> c.constructPrimitiveField(field, - c.accessPrimitiveField(schema, field, s) - ); - - } - - static public Function transformArrayField(Constructor c, HCatSchema schema, HCatFieldSchema field) { - - try { - - HCatFieldSchema elementSchema = field.getArrayElementSchema().get(0); - PrimitiveTypeInfo elementType = elementSchema.getTypeInfo(); - - if (HCatFieldSchema.Category.PRIMITIVE == elementSchema.getCategory()) - - return s -> c.constructPrimitiveArrayField( - field, elementType, - c.accessPrimitiveArrayField(schema, field, s) - ); - - else if (HCatFieldSchema.Category.MAP == elementSchema.getCategory()) - - return s -> c.constructMapArrayField(field, - c.accessMapArrayField(schema, field, s) - ); - - else if (HCatFieldSchema.Category.ARRAY == elementSchema.getCategory()) - - return s -> c.constructArrayArrayField(field, - c.accessArrayArrayField(schema, field, s) - ); - - else { - - HCatSchema structSchema = elementSchema.getStructSubSchema(); - - return s -> c.constructStructArrayField(structSchema, field, - c.accessStructArrayField(schema, field, s) - .stream() - .map(saf -> transformSchema(c, structSchema).apply(saf)) - .collect(Collectors.toList()) - ); - - } - - } catch (HCatException e) { - // not going to happen - - return null; - } - - } - - static public Function transformMapField(Constructor c, HCatSchema schema, HCatFieldSchema field) { - - return s -> c.constructMapField(field, - c.accessMapField(schema, field, s) - ); - - } - - static public Function transformStructField(Constructor c, HCatSchema schema, HCatFieldSchema field) { - - try { - - HCatSchema structSchema = field.getStructSubSchema(); - - return s -> c.constructStructField( - structSchema, field, - transformSchema(c, structSchema).apply( - c.accessStructField(schema, field, s) - ) - ); - - } catch (HCatException e) { - // not going to happen - return null; - } - } -} diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java index 7f9372ca0..e4f327998 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java @@ -19,6 +19,8 @@ import static org.junit.Assert.assertEquals; import static org.schedoscope.export.bigquery.outputschema.HCatRecordToBigQueryMapConvertor.convertHCatRecordToBigQueryMap; import static org.schedoscope.export.bigquery.outputschema.HCatSchemaToBigQuerySchemaConverter.convertSchemaToTableInfo; +import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.MONTHLY; +import static org.schedoscope.export.bigquery.outputschema.PartitioningScheme.NONE; public class HCatSchemaToBigQueryTransformerTest extends BigQueryBaseTest { @@ -428,7 +430,7 @@ public void setUp() throws HCatException { @Test public void testFlatTableConversion() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, NONE); 
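+        // (above: null selects the default GCP project, NONE requests an unpartitioned table; the former convenience overloads were removed)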
assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table", converted.getTableId().getTable()); @@ -439,9 +441,9 @@ public void testFlatTableConversion() throws IOException { @Test public void testTableConversionWithPartitioning() throws IOException, NoSuchFieldException, IllegalAccessException { - PartitioningScheme partitioning = PartitioningScheme.MONTHLY; + PartitioningScheme partitioning = MONTHLY; - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "flat_table", flatHcatSchema, partitioning); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("flat_table", converted.getTableId().getTable()); @@ -459,7 +461,7 @@ public void testTableConversionWithPartitioning() throws IOException, NoSuchFiel @Test public void testTableWithPrimitiveListConversion() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "table_with_primitive_list", hcatSchemaWithPrimitiveList, NONE); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_primitive_list", converted.getTableId().getTable()); @@ -470,7 +472,7 @@ public void testTableWithPrimitiveListConversion() throws IOException { @Test public void testTableWithStructConversion() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_struct", hcatSchemaWithStruct); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "table_with_struct", hcatSchemaWithStruct, NONE); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_struct", converted.getTableId().getTable()); @@ -481,7 +483,7 @@ public void testTableWithStructConversion() throws IOException { @Test public void testTableWithListStructConversion() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_struct", hcatSchemaWithListOfStruct); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "table_with_list_struct", hcatSchemaWithListOfStruct, NONE); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_struct", converted.getTableId().getTable()); @@ -492,7 +494,7 @@ public void testTableWithListStructConversion() throws IOException { @Test public void testTableWithListOfListsConversion() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_lists", hcatSchemaWithListOfList); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "table_with_list_of_lists", hcatSchemaWithListOfList, NONE); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_of_lists", converted.getTableId().getTable()); @@ -504,7 +506,7 @@ public void testTableWithListOfListsConversion() throws 
IOException { @Test public void testTableWithMapConversion() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_map", hcatSchemaWithMap); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "table_with_map", hcatSchemaWithMap, NONE); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_map", converted.getTableId().getTable()); @@ -515,7 +517,7 @@ public void testTableWithMapConversion() throws IOException { @Test public void testTableWithListOfMapConversion() throws IOException { - TableInfo converted = convertSchemaToTableInfo("schedoscope_export_big_query_schema_test", "table_with_list_of_map", hcatSchemaWithListOfMaps); + TableInfo converted = convertSchemaToTableInfo(null, "schedoscope_export_big_query_schema_test", "table_with_list_of_map", hcatSchemaWithListOfMaps, NONE); assertEquals("schedoscope_export_big_query_schema_test", converted.getTableId().getDataset()); assertEquals("table_with_list_of_map", converted.getTableId().getTable()); From 98f0391481145cdd0cb518c9fd905c0ed751a358 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Wed, 20 Dec 2017 11:52:19 +0100 Subject: [PATCH 29/34] Created MR Job skeleton --- .../export/bigquery/BigQueryExportJob.java | 155 ++++++++++++++++++ 1 file changed, 155 insertions(+) create mode 100644 schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java new file mode 100644 index 000000000..e67889942 --- /dev/null +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java @@ -0,0 +1,155 @@ +package org.schedoscope.export.bigquery; + +import org.apache.commons.io.FileUtils; +import org.apache.commons.logging.Log; +import org.apache.commons.logging.LogFactory; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.hive.conf.HiveConf; +import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.util.ToolRunner; +import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.apache.thrift.TException; +import org.kohsuke.args4j.CmdLineException; +import org.kohsuke.args4j.CmdLineParser; +import org.kohsuke.args4j.Option; +import org.schedoscope.export.BaseExportJob; +import org.schedoscope.export.jdbc.JdbcExportJob; + +import java.io.File; +import java.io.IOException; +import java.util.concurrent.TimeoutException; + +import static org.apache.hive.hcatalog.common.HCatUtil.getTable; +import static org.apache.hive.hcatalog.common.HCatUtil.getTableSchemaWithPtnCols; +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputConfiguration.configureBigQueryOutputFormat; +import static org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat.*; + +public class BigQueryExportJob extends BaseExportJob { + + private static final Log LOG = LogFactory.getLog(BigQueryExportJob.class); + + @Option(name = "-P", usage = "the GCP project ID under which to create the resulting BigQuery dataset, e.g., project 4711. If not passed, the default GCP project will be used") + private String project; + + @Option(name = "-D", usage = "the BigQuery table partition date into which to insert the exported data, e.g., 20171001. 
If not passed, it is assumed that the resulting BigQuery table is not partitioned") + private String partitionDate; + + @Option(name = "-x", usage = "the postfix to append to the resulting BigQuery table name, e.g., EC0101. If not passed, no postfix will be appended") + private String tablePostfix; + + @Option(name = "-l", usage = "the location where to store the resulting BigQuery table, e.g., US. If not passed, EU will be used") + private String tableStorageLocation; + + + @Option(name = "-k", usage = "GCP key to use for authentication in JSON format. If not passed, the gcloud default user will be used") + private String gcpKey; + + @Option(name = "-K", usage = "file with the GCP key to use for authentication in JSON format. If not passed, the gcloud default user will be used") + private String gcpKeyFile; + + @Option(name = "-b", usage = "GCP storage bucket to use for temporal storage, e.g., my-storage-bucket-for-export.", required = true) + private String exportStorageBucket; + + @Option(name = "-f", usage = "GCP storage bucket folder prefix to prepend to temporal storage blobs, e.g., scratch") + private String exportStoragePrefix; + + @Option(name = "-r", usage = "GCP storage bucket region to use, e.g., europe-west1. Defaults to europe-west3") + private String exportStorageRegion; + + @Option(name = "-y", usage = "Proxy host to use for GCP access") + private String proxyHost; + + @Option(name = "-Y", usage = "Proxy port to use for GCP access") + private String proxyPort; + + + @Override + public int run(String[] args) throws CmdLineException, IOException, TException, ClassNotFoundException, InterruptedException, TimeoutException { + + CmdLineParser cmd = new CmdLineParser(this); + + try { + cmd.parseArgument(args); + } catch (CmdLineException e) { + System.err.println(e.getMessage()); + cmd.printUsage(System.err); + throw e; + } + + Configuration conf = prepareConfiguration(); + Job job = prepareJob(conf); + + boolean success = job.waitForCompletion(true); + + if (success) { + try { + prepareBigQueryTable(conf); + commit(conf); + } catch (Throwable t) { + rollback(conf); + } + } else + rollback(conf); + + return (success ? 0 : 1); + } + + private Configuration prepareConfiguration() throws IOException, TException { + Configuration conf = getConfiguration(); + conf = configureHiveMetaStore(conf); + conf = configureKerberos(conf); + conf = configureAnonFields(conf); + + if (gcpKeyFile != null) { + gcpKey = FileUtils.readFileToString(new File(gcpKeyFile)); + } + + HCatSchema hCatSchema; + + HiveMetaStoreClient metastore = new HiveMetaStoreClient(new HiveConf(conf, HiveConf.class)); + try { + hCatSchema = getTableSchemaWithPtnCols(getTable(metastore, inputDatabase, inputTable)); + } finally { + metastore.close(); + } + + return configureBigQueryOutputFormat( + conf, + project, + gcpKey, + inputDatabase, + inputTable, + tablePostfix, + tableStorageLocation, + partitionDate, + inputFilter, + hCatSchema, + exportStorageBucket, + exportStoragePrefix, + exportStorageRegion, + numReducer, + proxyHost, + proxyPort + ); + + } + + private Job prepareJob(Configuration conf) throws IOException, TException { + + Job job = Job.getInstance(conf, "BigQueryExport: " + inputDatabase + "." 
+ + inputTable); + + return job; + } + + public static void main(String[] args) { + try { + int exitCode = ToolRunner.run(new JdbcExportJob(), args); + System.exit(exitCode); + } catch (Exception e) { + LOG.error(e.getMessage()); + System.exit(1); + } + } +} From 079e03ff4e72b60349fc25987ca5cb3cafbfdee2 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Thu, 21 Dec 2017 14:58:21 +0100 Subject: [PATCH 30/34] Finished MapReduce job for BigQuery export and tested it --- .../export/bigquery/BigQueryExportJob.java | 43 ++++++++-- .../BigQueryOutputConfiguration.java | 2 +- .../outputformat/BigQueryOutputFormat.java | 4 + .../export/utils/BigQueryUtils.java | 4 +- .../schedoscope/export/HiveUnitBaseTest.java | 12 +-- .../export/bigquery/BigQueryExportTest.java | 80 +++++++++++++++++++ 6 files changed, 131 insertions(+), 14 deletions(-) create mode 100644 schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java index e67889942..76fa31d24 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java @@ -6,19 +6,22 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.hive.conf.HiveConf; import org.apache.hadoop.hive.metastore.HiveMetaStoreClient; +import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.Mapper; import org.apache.hadoop.util.ToolRunner; +import org.apache.hive.hcatalog.data.HCatRecord; import org.apache.hive.hcatalog.data.schema.HCatSchema; +import org.apache.hive.hcatalog.mapreduce.HCatInputFormat; import org.apache.thrift.TException; import org.kohsuke.args4j.CmdLineException; import org.kohsuke.args4j.CmdLineParser; import org.kohsuke.args4j.Option; import org.schedoscope.export.BaseExportJob; -import org.schedoscope.export.jdbc.JdbcExportJob; +import org.schedoscope.export.bigquery.outputformat.BigQueryOutputFormat; import java.io.File; import java.io.IOException; -import java.util.concurrent.TimeoutException; import static org.apache.hive.hcatalog.common.HCatUtil.getTable; import static org.apache.hive.hcatalog.common.HCatUtil.getTableSchemaWithPtnCols; @@ -64,8 +67,10 @@ public class BigQueryExportJob extends BaseExportJob { private String proxyPort; + private Configuration initialConfiguration; + @Override - public int run(String[] args) throws CmdLineException, IOException, TException, ClassNotFoundException, InterruptedException, TimeoutException { + public int run(String[] args) throws CmdLineException, IOException, TException, ClassNotFoundException, InterruptedException { CmdLineParser cmd = new CmdLineParser(this); @@ -77,7 +82,7 @@ public int run(String[] args) throws CmdLineException, IOException, TException, throw e; } - Configuration conf = prepareConfiguration(); + Configuration conf = prepareConfiguration(initialConfiguration); Job job = prepareJob(conf); boolean success = job.waitForCompletion(true); @@ -95,8 +100,7 @@ public int run(String[] args) throws CmdLineException, IOException, TException, return (success ? 
0 : 1); } - private Configuration prepareConfiguration() throws IOException, TException { - Configuration conf = getConfiguration(); + private Configuration prepareConfiguration(Configuration conf) throws IOException, TException { conf = configureHiveMetaStore(conf); conf = configureKerberos(conf); conf = configureAnonFields(conf); @@ -140,12 +144,37 @@ private Job prepareJob(Configuration conf) throws IOException, TException { Job job = Job.getInstance(conf, "BigQueryExport: " + inputDatabase + "." + inputTable); + job.setJarByClass(BigQueryExportJob.class); + job.setMapperClass(Mapper.class); + job.setNumReduceTasks(0); + + if (inputFilter == null || inputFilter.trim().equals("")) { + HCatInputFormat.setInput(job, inputDatabase, inputTable); + } else { + HCatInputFormat.setInput(job, inputDatabase, inputTable, + inputFilter); + } + + job.setInputFormatClass(HCatInputFormat.class); + job.setOutputFormatClass(BigQueryOutputFormat.class); + + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(HCatRecord.class); + return job; } + public BigQueryExportJob(Configuration initialConfiguration) { + this.initialConfiguration = initialConfiguration; + } + + public BigQueryExportJob() { + this(new Configuration()); + } + public static void main(String[] args) { try { - int exitCode = ToolRunner.run(new JdbcExportJob(), args); + int exitCode = ToolRunner.run(new BigQueryExportJob(), args); System.exit(exitCode); } catch (Exception e) { LOG.error(e.getMessage()); diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java index e7d1fd9fd..7ce301615 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java @@ -189,7 +189,7 @@ public static String getBigqueryExportStorageRegion(Configuration conf) { * @return the folder */ public static String getBigQueryExportStorageFolder(Configuration conf) { - return !getBigqueryExportStorageFolderPrefix(conf).isEmpty() ? getBigqueryExportStorageFolderPrefix(conf) + "/" + getBigQueryFullTableName(conf, true) : getBigQueryFullTableName(conf, true); + return (getBigqueryExportStorageFolderPrefix(conf) != null && !getBigqueryExportStorageFolderPrefix(conf).isEmpty()) ? 
getBigqueryExportStorageFolderPrefix(conf) + "/" + getBigQueryFullTableName(conf, true) : getBigQueryFullTableName(conf, true); } /** diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index 7ab4b15c7..d02b8129a 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -56,6 +56,10 @@ public static void prepareBigQueryTable(Configuration conf) throws IOException { BigQuery bigQueryService = bigQueryService(getBigQueryGcpKey(conf)); + retry(3, () -> { + dropTable(bigQueryService, getBigQueryTableId(conf, true)); + }); + retry(3, () -> { createTable(bigQueryService, getBigQueryTableId(conf), outputSchema); }); diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java index fec4e4d23..a324594a6 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java @@ -137,7 +137,9 @@ static public void dropDataset(BigQuery bigQueryService, String project, String * @return true iff the table already exists. */ static public boolean existsTable(BigQuery bigQueryService, TableId tableId) { - return bigQueryService.getTable(tableId) != null; + Table table = bigQueryService.getTable(tableId); + + return table != null; } /** diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/HiveUnitBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/HiveUnitBaseTest.java index 6e64eebdf..33ca1ad3d 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/HiveUnitBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/HiveUnitBaseTest.java @@ -52,11 +52,11 @@ public abstract class HiveUnitBaseTest { Logger.getLogger("global").setLevel(Level.FINEST); } - private static final String DEFAUlT_HIVE_DB = "default"; + private static final String DEFAULT_HIVE_DB = "default"; private static final String DEFAULT_DERBY_DB = "jdbc:derby:memory:TestingDB;create=true"; private static final String DATA_FILE_PATH = "DATA_FILE_PATH"; - private HiveTestSuite testSuite; + protected HiveTestSuite testSuite; // use those two instances to set up // the unit test, the conf is needed @@ -73,6 +73,7 @@ public void setUp() throws Exception { testSuite = new HiveTestSuite(); testSuite.createTestCluster(); conf = testSuite.getFS().getConf(); + } @After @@ -97,15 +98,16 @@ public void setUpHiveServer(String dataFile, String hiveScript, Schema schema = SchemaFactory.getSchema(conf); // set up column type mapping - HCatInputFormat.setInput(conf, DEFAUlT_HIVE_DB, tableName); + HCatInputFormat.setInput(conf, DEFAULT_HIVE_DB, tableName); hcatInputSchema = HCatInputFormat.getTableSchema(conf); conf.setStrings(Schema.JDBC_OUTPUT_COLUMN_TYPES, SchemaUtils .getColumnTypesFromHcatSchema(hcatInputSchema, schema, new HashSet(0))); + // set up hcatalog record reader ReadEntity.Builder builder = new ReadEntity.Builder(); - ReadEntity entity = builder.withDatabase(DEFAUlT_HIVE_DB) + ReadEntity entity = builder.withDatabase(DEFAULT_HIVE_DB) .withTable(tableName).build(); Map config = new HashMap(); @@ -122,7 +124,7 @@ public void 
setUpHiveServerNoData(String hiveScript, String tableName) // load data into hive table testSuite.executeScript(hiveScript); - HCatInputFormat.setInput(conf, DEFAUlT_HIVE_DB, tableName); + HCatInputFormat.setInput(conf, DEFAULT_HIVE_DB, tableName); hcatInputSchema = HCatInputFormat.getTableSchema(conf); } } diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java new file mode 100644 index 000000000..a5b1feeb0 --- /dev/null +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java @@ -0,0 +1,80 @@ +package org.schedoscope.export.bigquery; + +import com.google.cloud.bigquery.BigQuery; +import com.google.cloud.storage.Storage; +import org.apache.thrift.TException; +import org.junit.Test; +import org.kohsuke.args4j.CmdLineException; +import org.schedoscope.export.HiveUnitBaseTest; + +import java.io.IOException; +import java.util.concurrent.TimeoutException; + +import static org.schedoscope.export.utils.BigQueryUtils.*; +import static org.schedoscope.export.utils.CloudStorageUtils.*; + +public class BigQueryExportTest extends HiveUnitBaseTest { + + final private static boolean CALL_BIG_QUERY = false; + + final private static boolean CLEAN_UP_BIG_QUERY = true; + + private BigQuery bigQuery; + + private Storage storage; + + @Override + public void setUp() throws Exception { + if (!CALL_BIG_QUERY) + return; + + super.setUp(); + + bigQuery = bigQueryService(); + storage = storageService(); + + if (existsDataset(bigQuery, null, "default")) + dropDataset(bigQuery, null, "default"); + + createBucket(storage, "schedoscope_export_big_query_full_test", "europe-west3"); + + setUpHiveServer("src/test/resources/test_map_data.txt", + "src/test/resources/test_map.hql", "test_map"); + + } + + @Test + public void runBigQueryExportJob() throws CmdLineException, IOException, InterruptedException, TException, TimeoutException, ClassNotFoundException { + + if (!CALL_BIG_QUERY) + return; + + BigQueryExportJob job = new BigQueryExportJob(conf); + + job.run(new String[]{ + "-m", "", + "-d", "default", + "-t", "test_map", + "-b", "schedoscope_export_big_query_full_test", + "-D", "20150801" + }); + + + } + + @Override + public void tearDown() throws Exception { + if (!CALL_BIG_QUERY) + return; + + super.tearDown(); + + if (!CLEAN_UP_BIG_QUERY) + return; + + if (existsDataset(bigQuery, null, "default")) + dropDataset(bigQuery, null, "default"); + + deleteBucket(storage, "schedoscope_export_big_query_full_test"); + } +} From f386ac4e45129a5478a36e6fc7a6a5359af81e03 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Thu, 21 Dec 2017 17:12:09 +0100 Subject: [PATCH 31/34] Outlined Schedoscope export syntax for BigQueryk --- .../src/main/resources/reference.conf | 50 +++++++++++++++ .../org/schedoscope/conf/BaseSettings.scala | 37 ++++++++++- .../dsl/transformations/Export.scala | 61 +++++++++++++++++++ .../export/bigquery/BigQueryExportJob.java | 35 ++++++++--- 4 files changed, 174 insertions(+), 9 deletions(-) diff --git a/schedoscope-conf/src/main/resources/reference.conf b/schedoscope-conf/src/main/resources/reference.conf index 4a77af4f8..5d196f08f 100644 --- a/schedoscope-conf/src/main/resources/reference.conf +++ b/schedoscope-conf/src/main/resources/reference.conf @@ -329,6 +329,56 @@ salt = "vD75MqvaasIlCf7H" + # + # BigQuery exporter settings. 
+ # + + bigQuery { + + # + # GCP project ID under which the exported BigQuery dataset will be created + # + + projectId = "" + + # + # GCP key in JSON format to use for authentication + # + + gcpKey = "" + + # + # Number of reducers to use for parallel writing to BigQuery. + # + + numberOfReducers = 10 + + # + # GCP data storage location of exported data within BigQuery. + # + + dataLocation = "EU" + + # + # GCP Cloud Storage bucket for temporary storage to use for exporting to BigQuery. + # + + exportStorageBucket = "schedoscope_bigquery_export" + + # + # GCP Cloud Storage bucket folder prefix to apply to blobs when exporting to BigQuery + # + + exportStorageBucketFolderPrefix = "" + + # + # GCP Cloud Storage bucket region to use for exporting to BigQuery + # + + exportStorageBucketRegion = "europe-west3" + + } + # # JDBC exporter settings. # diff --git a/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala b/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala index f6f9003f0..9b6ac17d6 100644 --- a/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala +++ b/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala @@ -247,10 +247,45 @@ class BaseSettings(val config: Config) { lazy val redisExportBatchSize = config.getInt("schedoscope.export.redis.insertBatchSize") /** - * Number of reducers to use for Redis export. + * Number of reducers to use for Kafka export. */ lazy val kafkaExportNumReducers = config.getInt("schedoscope.export.kafka.numberOfReducers") + /** + * GCP project ID under which exported BigQuery dataset will be created. Defaults to the default project of the current user. + */ + lazy val bigQueryExportProjectId = config.getString("schedoscope.export.bigQuery.projectId") + + /** + * Number of reducers to use for BigQuery export. + */ + lazy val bigQueryExportNumReducers = config.getInt("schedoscope.export.bigQuery.numberOfReducers") + + /** + * GCP data storage location of exported data within BigQuery. Defaults to EU. + */ + lazy val bigQueryExportDataLocation = config.getString("schedoscope.export.bigQuery.dataLocation") + + /** + * GCP key in JSON format to use for authentication when exporting to BigQuery. If not set, the key of the current user is used. + */ + lazy val bigQueryExportGcpKey = config.getString("schedoscope.export.bigQuery.gcpKey") + + /** + * GCP Cloud Storage bucket to use for temporary storage while exporting to BigQuery. Defaults to "schedoscope_bigquery_export" + */ + lazy val bigQueryExportStorageBucket = config.getString("schedoscope.export.bigQuery.exportStorageBucket") + + /** + * Folder prefix to apply to blobs in the GCP Cloud Storage bucket while exporting to BigQuery. Defaults to "" + */ + lazy val bigQueryExportStorageBucketFolderPrefix = config.getString("schedoscope.export.bigQuery.exportStorageBucketFolderPrefix") + + /** + * GCP Cloud Storage bucket region to use for exporting to BigQuery. Defaults to europe-west3 + */ + lazy val bigQueryExportStorageBucketRegion = config.getString("schedoscope.export.bigQuery.exportStorageBucketRegion") + /** * Number of reducers to use for (S)Ftp export. 
*/ diff --git a/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala b/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala index c3455fe7a..6253740f0 100644 --- a/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala +++ b/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala @@ -148,6 +148,67 @@ object Export { } } + /** + * This function prepares a MapReduce job for exporting a given view to BigQuery. + * + * @param v the view to export. + * @param projectId GCP project ID under which exported BigQuery dataset will be created. If not set, + * this is the default GCP project of the current user. Can be globally configured by + * setting schedoscope.export.bigQuery.projectId + * @param gcpKey GCP key in JSON format to use for authentication when exporting to BigQuery. + * If not set, the local gcloud key of the user running Schedoscope is used. + * Can be globally configured by setting schedoscope.export.bigQuery.gcpKey + * @param gcpKeyFile An absolute path pointing to the GCP key in JSON format to use for authentication + * when exporting to BigQuery. If not set, the local gcloud key of the user running + * Schedoscope is used (or gcpKey). + * @param storageBucket GCP Cloud Storage bucket to use for temporary storage while exporting to BigQuery. + * Defaults to "schedoscope_bigquery_export". Can be globally configured by + * setting schedoscope.export.bigQuery.storageBucket + * @param storageBucketFolderPrefix Folder prefix to apply to blobs in the GCP Cloud Storage bucket while exporting + * to BigQuery. Defaults to "". Can be globally configured by + * setting schedoscope.export.bigQuery.storageBucketFolderPrefix + * @param storageBucketRegion GCP Cloud Storage bucket region to use for exporting to BigQuery. Defaults to + * europe-west3. + * Can be globally configured by setting schedoscope.export.bigQuery.storageBucketRegion + * @param dataLocation GCP data storage location of exported data within BigQuery. Defaults to EU. + * Can be globally configured by setting schedoscope.export.bigQuery.dataLocation + * @param numReducers Number of reducers to use for BigQuery export. Defines the parallelism. Defaults to 10. + * an be globally configured by setting schedoscope.export.bigQuery.numReducers + * @param isKerberized Is the cluster kerberized? + * @param kerberosPrincipal Kerberos principal to use. Can be globally configured by setting schedoscope.kerberos.principal + * @param metastoreUri URI of the metastore. Can be globally configured by setting schedoscope.metastore.metastoreUri + * @param exportSalt Salt to use for anonymization. 
schedoscope.export.salt + * @return the MapReduce transformation performing the export + */ + def BigQuery( + v: View, + projectId: String = if (Schedoscope.settings.bigQueryExportProjectId.isEmpty) null else Schedoscope.settings.bigQueryExportProjectId, + gcpKey: String = if (Schedoscope.settings.bigQueryExportGcpKey.isEmpty) null else Schedoscope.settings.bigQueryExportGcpKey, + gcpKeyFile: String = null, + storageBucket: String = Schedoscope.settings.bigQueryExportStorageBucket, + storageBucketFolderPrefix: String = Schedoscope.settings.bigQueryExportStorageBucketFolderPrefix, + storageBucketRegion: String = Schedoscope.settings.bigQueryExportStorageBucketRegion, + dataLocation: String = Schedoscope.settings.bigQueryExportDataLocation, + numReducers: Int = Schedoscope.settings.bigQueryExportNumReducers, + isKerberized: Boolean = !Schedoscope.settings.kerberosPrincipal.isEmpty(), + kerberosPrincipal: String = Schedoscope.settings.kerberosPrincipal, + metastoreUri: String = Schedoscope.settings.metastoreUri, + exportSalt: String = Schedoscope.settings.exportSalt + ) = { + + val t = MapreduceTransformation( + v, + (conf) => ??? + ) + + t.directoriesToDelete = List() + + t.configureWith( + Map( + + )) + } + /** * This function configures the Redis export job and returns a MapreduceTransformation. * diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java index 76fa31d24..275250810 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java @@ -72,6 +72,14 @@ public class BigQueryExportJob extends BaseExportJob { @Override public int run(String[] args) throws CmdLineException, IOException, TException, ClassNotFoundException, InterruptedException { + Job job = createJob(args); + + boolean success = executeJob(job); + + return (success ? 0 : 1); + } + + public Job createJob(String[] args) throws IOException, TException, CmdLineException { CmdLineParser cmd = new CmdLineParser(this); try { @@ -81,13 +89,22 @@ public int run(String[] args) throws CmdLineException, IOException, TException, cmd.printUsage(System.err); throw e; } + return prepareJobObject(prepareJobConfiguration()); + } - Configuration conf = prepareConfiguration(initialConfiguration); - Job job = prepareJob(conf); + public boolean executeJob(Job job) throws IOException, InterruptedException, ClassNotFoundException { boolean success = job.waitForCompletion(true); - if (success) { + finishJob(job, success); + + return success; + } + + public void finishJob(Job job, boolean wasSuccessful) { + Configuration conf = job.getConfiguration(); + + if (wasSuccessful) { try { prepareBigQueryTable(conf); commit(conf); @@ -96,11 +113,11 @@ public int run(String[] args) throws CmdLineException, IOException, TException, } } else rollback(conf); - - return (success ? 
0 : 1); } - private Configuration prepareConfiguration(Configuration conf) throws IOException, TException { + + private Configuration prepareJobConfiguration() throws IOException, TException { + Configuration conf = initialConfiguration; conf = configureHiveMetaStore(conf); conf = configureKerberos(conf); conf = configureAnonFields(conf); @@ -118,7 +135,7 @@ private Configuration prepareConfiguration(Configuration conf) throws IOExceptio metastore.close(); } - return configureBigQueryOutputFormat( + conf = configureBigQueryOutputFormat( conf, project, gcpKey, @@ -137,9 +154,10 @@ private Configuration prepareConfiguration(Configuration conf) throws IOExceptio proxyPort ); + return conf; } - private Job prepareJob(Configuration conf) throws IOException, TException { + private Job prepareJobObject(Configuration conf) throws IOException, TException { Job job = Job.getInstance(conf, "BigQueryExport: " + inputDatabase + "." + inputTable); @@ -164,6 +182,7 @@ private Job prepareJob(Configuration conf) throws IOException, TException { return job; } + public BigQueryExportJob(Configuration initialConfiguration) { this.initialConfiguration = initialConfiguration; } From c6699b37a81e76d26fe987c43f7236ac8913078b Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 22 Dec 2017 11:31:16 +0100 Subject: [PATCH 32/34] Implemented first version of Schedoscope export syntax for BigQuery export. Made export use an identity reducer to control parallelism --- .../src/main/resources/reference.conf | 13 + .../org/schedoscope/conf/BaseSettings.scala | 10 + .../dsl/transformations/Export.scala | 322 ++++++++++++++---- schedoscope-export/README.md | 55 +++ .../export/bigquery/BigQueryExportJob.java | 31 +- 5 files changed, 343 insertions(+), 88 deletions(-) diff --git a/schedoscope-conf/src/main/resources/reference.conf b/schedoscope-conf/src/main/resources/reference.conf index 5d196f08f..0a2762c2d 100644 --- a/schedoscope-conf/src/main/resources/reference.conf +++ b/schedoscope-conf/src/main/resources/reference.conf @@ -377,6 +377,19 @@ exportStorageBucketRegion = "europe-west3" + # + # Host of proxy to use for GCP API access + # + + proxyHost = "" + + # + # Port of proxy to use for GCP API access + # + + proxyPort = "" + + } # diff --git a/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala b/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala index 9b6ac17d6..fa4470924 100644 --- a/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala +++ b/schedoscope-conf/src/main/scala/org/schedoscope/conf/BaseSettings.scala @@ -286,6 +286,16 @@ class BaseSettings(val config: Config) { */ lazy val bigQueryExportStorageBucketRegion = config.getString("schedoscope.export.bigQuery.exportStorageBucketRegion") + /** + * Host of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. + */ + lazy val bigQueryExportProxyHost = config.getString("schedoscope.export.bigQuery.proxyHost") + + /** + * Port of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. + */ + lazy val bigQueryExportProxyPort = config.getString("schedoscope.export.bigQuery.proxyPort") + /** * Number of reducers to use for (S)Ftp export. 
*/ diff --git a/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala b/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala index 6253740f0..a56b2ead3 100644 --- a/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala +++ b/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala @@ -19,6 +19,7 @@ package org.schedoscope.dsl.transformations import org.apache.hadoop.mapreduce.Job import org.schedoscope.Schedoscope import org.schedoscope.dsl.{Field, View} +import org.schedoscope.export.bigquery.BigQueryExportJob import org.schedoscope.export.ftp.FtpExportJob import org.schedoscope.export.ftp.outputformat.FileOutputType import org.schedoscope.export.ftp.upload.FileCompressionCodec @@ -69,23 +70,28 @@ object Export { (conf) => { val filter = v.partitionParameters - .map { - (p => s"${p.n} = '${p.v.get}'") - } + .map { p => s"${p.n} = '${p.v.get}'" } .mkString(" and ") val distributionField = if (distributionKey != null) distributionKey.n else null - val anonFields = v.fields.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray - val anonParameters = v.partitionParameters.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray + val anonFields = v.fields + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray + + val anonParameters = v.partitionParameters + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray new JdbcExportJob().configure( conf.get("schedoscope.export.isKerberized").get.asInstanceOf[Boolean], @@ -124,7 +130,7 @@ object Export { } /** - * This function runs the post commit action and finalizes the database tables. + * This function runs the post commit action for a JDBC export and finalizes the database tables. * * @param job The MR job object * @param driver The schedoscope driver @@ -143,7 +149,7 @@ object Export { runState } catch { - case ex: RetryException => throw new RetryableDriverException(ex.getMessage, ex) + case ex: RetryException => throw RetryableDriverException(ex.getMessage, ex) case ex: UnrecoverableException => DriverRunFailed(driver, ex.getMessage, ex) } } @@ -151,10 +157,10 @@ object Export { /** * This function prepares a MapReduce job for exporting a given view to BigQuery. * - * @param v the view to export. - * @param projectId GCP project ID under which exported BigQuery dataset will be created. If not set, - * this is the default GCP project of the current user. Can be globally configured by - * setting schedoscope.export.bigQuery.projectId + * @param v the view to export. + * @param projectId GCP project ID under which exported BigQuery dataset will be created. If not set, + * this is the default GCP project of the current user. Can be globally configured by + * setting schedoscope.export.bigQuery.projectId * @param gcpKey GCP key in JSON format to use for authentication when exporting to BigQuery. * If not set, the local gcloud key of the user running Schedoscope is used. * Can be globally configured by setting schedoscope.export.bigQuery.gcpKey @@ -167,17 +173,19 @@ object Export { * @param storageBucketFolderPrefix Folder prefix to apply to blobs in the GCP Cloud Storage bucket while exporting * to BigQuery. Defaults to "". Can be globally configured by * setting schedoscope.export.bigQuery.storageBucketFolderPrefix - * @param storageBucketRegion GCP Cloud Storage bucket region to use for exporting to BigQuery. Defaults to - * europe-west3. 
- * Can be globally configured by setting schedoscope.export.bigQuery.storageBucketRegion - * @param dataLocation GCP data storage location of exported data within BigQuery. Defaults to EU. - * Can be globally configured by setting schedoscope.export.bigQuery.dataLocation - * @param numReducers Number of reducers to use for BigQuery export. Defines the parallelism. Defaults to 10. - * an be globally configured by setting schedoscope.export.bigQuery.numReducers - * @param isKerberized Is the cluster kerberized? - * @param kerberosPrincipal Kerberos principal to use. Can be globally configured by setting schedoscope.kerberos.principal - * @param metastoreUri URI of the metastore. Can be globally configured by setting schedoscope.metastore.metastoreUri - * @param exportSalt Salt to use for anonymization. schedoscope.export.salt + * @param storageBucketRegion GCP Cloud Storage bucket region to use for exporting to BigQuery. Defaults to + * europe-west3. + * Can be globally configured by setting schedoscope.export.bigQuery.storageBucketRegion + * @param dataLocation GCP data storage location of exported data within BigQuery. Defaults to EU. + * Can be globally configured by setting schedoscope.export.bigQuery.dataLocation + * @param numReducers Number of reducers to use for BigQuery export. Defines the parallelism. Defaults to 10. + * an be globally configured by setting schedoscope.export.bigQuery.numReducers + * @param proxyHost Host of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. + * @param proxyPort Port of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. + * @param isKerberized Is the cluster kerberized? + * @param kerberosPrincipal Kerberos principal to use. Can be globally configured by setting schedoscope.kerberos.principal + * @param metastoreUri URI of the metastore. Can be globally configured by setting schedoscope.metastore.metastoreUri + * @param exportSalt Salt to use for anonymization. schedoscope.export.salt * @return the MapReduce transformation performing the export */ def BigQuery( @@ -190,6 +198,8 @@ object Export { storageBucketRegion: String = Schedoscope.settings.bigQueryExportStorageBucketRegion, dataLocation: String = Schedoscope.settings.bigQueryExportDataLocation, numReducers: Int = Schedoscope.settings.bigQueryExportNumReducers, + proxyHost: String = if (Schedoscope.settings.bigQueryExportProxyHost.isEmpty) null else Schedoscope.settings.bigQueryExportProxyHost, + proxyPort: String = if (Schedoscope.settings.bigQueryExportProxyPort.isEmpty) null else Schedoscope.settings.bigQueryExportProxyPort, isKerberized: Boolean = !Schedoscope.settings.kerberosPrincipal.isEmpty(), kerberosPrincipal: String = Schedoscope.settings.kerberosPrincipal, metastoreUri: String = Schedoscope.settings.metastoreUri, @@ -198,15 +208,171 @@ object Export { val t = MapreduceTransformation( v, - (conf) => ??? 
+ (conf) => { + + val filter = v.partitionParameters + .map { p => s"${p.n} = '${p.v.get}'" } + .mkString(" and ") + + val anonFields = v.fields + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray + + val anonParameters = v.partitionParameters + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray + + val bigQueryPartitionDate: Option[String] = v.partitionParameters + .filter { p => Set("month_id", "date_id").contains(p.n) } + .map { + p => + if (p.n == "month_id") + p.v.get.asInstanceOf[String] + "01" + else + p.v.get.asInstanceOf[String] + } + .headOption + + val bigQueryPartitionSuffixes = v.partitionParameters + .filter { p => !Set("year", "month", "day", "date_id", "month_id").contains(p.n) } + .sortBy { + _.n + } + .map { + _.v.get + } + .mkString("_") + + + val baseJobParameters: Seq[String] = Seq() ++ + ( + if (conf("schedoscope.export.isKerberized").asInstanceOf[Boolean]) + Seq("-s", "true", "-p", conf("schedoscope.export.kerberosPrincipal").asInstanceOf[String]) + else + Nil + ) ++ + Seq("-m", conf("schedoscope.export.metastoreUri").asInstanceOf[String]) ++ + Seq("-d", v.dbName) ++ + Seq("-t", v.n) ++ + ( + if (!filter.isEmpty) + Seq("-i", filter) + else + Nil + ) ++ + Seq("-c", conf("schedoscope.export.numReducers").asInstanceOf[Integer].toString) ++ + ( + if (!(anonFields ++ anonParameters).isEmpty) + Seq("-A", (anonFields ++ anonParameters).mkString(" "), "-S", conf("schedoscope.export.exportSalt").asInstanceOf[String]) + else + Nil + ) + + val bigQueryJobParameters: Seq[String] = Seq() ++ + ( + if (conf.contains("schedoscope.export.projectId")) + Seq("-P", conf("schedoscope.export.projectId").asInstanceOf[String]) + else + Nil + ) ++ + ( + if (conf.contains("schedoscope.export.gcpKey")) + Seq("-k", conf("schedoscope.export.gcpKey").asInstanceOf[String]) + else + Nil + ) ++ + ( + if (conf.contains("schedoscope.export.gcpKeyFile")) + Seq("-K", conf("schedoscope.export.gcpKeyFile").asInstanceOf[String]) + else + Nil + ) ++ + ( + if (conf.contains("schedoscope.export.proxyHost")) + Seq("-y", conf("schedoscope.export.proxyHost").asInstanceOf[String]) + else + Nil + ) ++ + ( + if (conf.contains("schedoscope.export.proxyPort")) + Seq("-Y", conf("schedoscope.export.proxyPort").asInstanceOf[String]) + else + Nil + ) ++ + Seq("-l", conf("schedoscope.export.dataLocation").asInstanceOf[String]) ++ + Seq("-b", conf("schedoscope.export.storageBucket").asInstanceOf[String]) ++ + Seq("-f", conf("schedoscope.export.storageBucketFolderPrefix").asInstanceOf[String]) ++ + Seq("-r", conf("schedoscope.export.storageBucketRegion").asInstanceOf[String]) ++ + ( + if (bigQueryPartitionDate.isDefined) + Seq("-D", bigQueryPartitionDate.get) + else + Nil + ) ++ + ( + if (!bigQueryPartitionSuffixes.isEmpty) + Seq("-x", bigQueryPartitionSuffixes) + else + Nil + ) + + new BigQueryExportJob().createJob((baseJobParameters ++ bigQueryJobParameters).toArray[String]) + }, + bigQueryPostCommit ) t.directoriesToDelete = List() t.configureWith( Map( + "schedoscope.export.storageBucket" -> storageBucket, + "schedoscope.export.storageBucketFolderPrefix" -> storageBucketFolderPrefix, + "schedoscope.export.storageBucketRegion" -> storageBucketRegion, + "schedoscope.export.dataLocation" -> dataLocation, + "schedoscope.export.numReducers" -> numReducers, + "schedoscope.export.isKerberized" -> isKerberized, + "schedoscope.export.kerberosPrincipal" -> kerberosPrincipal, + "schedoscope.export.metastoreUri" -> metastoreUri, + "schedoscope.export.exportSalt" -> exportSalt + ) ++ (if (projectId 
!= null) Seq("schedoscope.export.projectId" -> projectId) else Nil) + ++ (if (gcpKey != null) Seq("schedoscope.export.gcpKey" -> gcpKey) else Nil) + ++ (if (gcpKeyFile != null) Seq("schedoscope.export.gcpKeyFile" -> gcpKeyFile) else Nil) + ++ (if (proxyHost != null) Seq("schedoscope.export.proxyHost" -> proxyHost) else Nil) + ++ (if (proxyPort != null) Seq("schedoscope.export.proxyPort" -> proxyPort) else Nil) + ) + } + + /** + * This function runs the post commit action for a BigQuery export and finalizes the database tables. + * + * @param job The MR job object + * @param driver The schedoscope driver + * @param runState The job's runstate + */ + def bigQueryPostCommit( + job: Job, + driver: Driver[MapreduceBaseTransformation], + runState: DriverRunState[MapreduceBaseTransformation]): DriverRunState[MapreduceBaseTransformation] = { + + try { + + BigQueryExportJob.finishJob(job, runState.isInstanceOf[DriverRunSucceeded[MapreduceBaseTransformation]]) + runState - )) + } catch { + case ex: RetryException => throw RetryableDriverException(ex.getMessage, ex) + case ex: UnrecoverableException => DriverRunFailed(driver, ex.getMessage, ex) + } } /** @@ -253,23 +419,28 @@ object Export { (conf) => { val filter = v.partitionParameters - .map { - (p => s"${p.n} = '${p.v.get}'") - } + .map { p => s"${p.n} = '${p.v.get}'" } .mkString(" and ") val valueFieldName = if (value != null) value.n else null - val anonFields = v.fields.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray - val anonParameters = v.partitionParameters.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray + val anonFields = v.fields + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray + + val anonParameters = v.partitionParameters + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray new RedisExportJob().configure( conf.get("schedoscope.export.isKerberized").get.asInstanceOf[Boolean], @@ -320,11 +491,10 @@ object Export { * @param kafkaHosts String list of Kafka hosts to communicate with * @param zookeeperHosts String list of zookeeper hosts * @param replicationFactor The replication factor, defaults to 1 - * @param numPartitions The number of partitions in the topic. Defaults to 3 * @param exportSalt an optional salt when anonymizing fields * @param producerType The type of producer to use, defaults to synchronous * @param cleanupPolicy Default cleanup policy is delete - * @param compressionCodes Default compression codec is gzip + * @param compressionCodec Default compression codec is gzip * @param encoding Defines, whether data is to be serialized as strings (one line JSONs) or Avro * @param numReducers number of reducers to use (i.e., the parallelism) * @param isKerberized Is the cluster kerberized? 
@@ -354,21 +524,26 @@ object Export { (conf) => { val filter = v.partitionParameters + .map { p => s"${p.n} = '${p.v.get}'" } + .mkString(" and ") + + val anonFields = v.fields + .filter { + _.isPrivacySensitive + } .map { - (p => s"${p.n} = '${p.v.get}'") + _.n } - .mkString(" and ") + .toArray - val anonFields = v.fields.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray - val anonParameters = v.partitionParameters.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray + val anonParameters = v.partitionParameters + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray new KafkaExportJob().configure( conf.get("schedoscope.export.isKerberized").get.asInstanceOf[Boolean], @@ -451,21 +626,26 @@ object Export { v, (conf) => { val filter = v.partitionParameters + .map { p => s"${p.n} = '${p.v.get}'" } + .mkString(" and ") + + val anonFields = v.fields + .filter { + _.isPrivacySensitive + } .map { - (p => s"${p.n} = '${p.v.get}'") + _.n } - .mkString(" and ") + .toArray - val anonFields = v.fields.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray - val anonParameters = v.partitionParameters.filter { - _.isPrivacySensitive - }.map { - _.n - }.toArray + val anonParameters = v.partitionParameters + .filter { + _.isPrivacySensitive + } + .map { + _.n + } + .toArray new FtpExportJob().configure( conf.get("schedoscope.export.isKerberized").get.asInstanceOf[Boolean], diff --git a/schedoscope-export/README.md b/schedoscope-export/README.md index 929f6779f..86f27d6c3 100644 --- a/schedoscope-export/README.md +++ b/schedoscope-export/README.md @@ -3,6 +3,7 @@ Schedoscope Export is a collection of Map/Reduce jobs to move data from Hive (via HCatalog) into various output sinks. Currently the following sinks are supported: * JDBC + * Google BigQuery * Redis * Kafka * (S)FTP @@ -57,6 +58,60 @@ After the classpath has been defined the JDBC export job can now be started: yarn jar schedoscope-export-*-SNAPSHOT-jar-with-dependencies.jar org.schedoscope.export.jdbc.JdbcExportJob -d default -t my_table -s -p 'hive/_HOST@PRINCIPAL.COM' -m 'thrift://metastore:9083' -c 10 -j 'jdbc:mysql://host/db' -k 1000 -u username -w mypassword +### BigQuery + +This Map/Reduce job moves data into Google BigQuery via CloudStorage. It transforms the Hive table schema to a BigQuery +table schema as best as it can. Problematic constructs like maps, arrays of arrays, etc. are mapped to JSON formatted +string columns. + +#### Configuration options + + * -s set to true if kerberos is enabled + + * -m specify the metastore URIs + + * -p the kerberos principal + + * -d input database + + * -t input table + + * -i input filter, e.g. month='08' and year='2015' + + * -c number of reducers, concurrency level + + * -A a list of fields to anonymize separated by space, e.g. 'id visitor_id' + + * -S an optional salt to for anonymizing fields + + * -P the GCP project ID under which to create the resulting BigQuery dataset, e.g., project-4711. If not passed, the user's default GCP project will be used + + * -k GCP key to use for authentication in JSON format. If not passed, the GCP default authentication protocol will be followed. + + * -K absolute path to the file with GCP key to use for authentication in JSON format. If not passed, the GCP default authentication protocol will be followed. + + * -D the BigQuery table partition date into which to insert the exported data, e.g., 20171001. 
If not passed, it is assumed that the resulting BigQuery table is not partitioned
+
+ * -x the postfix to append to the resulting BigQuery table name, e.g., EC0101. If not passed, no postfix will be appended
+
+ * -l the location in which to store the resulting BigQuery table, e.g., US. If not passed, EU will be used
+
+ * -b GCP storage bucket to use for temporary storage, e.g., my-storage-bucket-for-export. This one is required
+
+ * -f GCP storage bucket folder prefix to prepend to temporary storage blobs, e.g., scratch
+
+ * -r GCP storage bucket region to use, e.g., europe-west1. Defaults to europe-west3
+
+ * -y proxy host to use for GCP access
+
+ * -Y proxy port to use for GCP access
+
+ #### Run the BigQuery export
+
+ yarn jar schedoscope-export-*-SNAPSHOT-jar-with-dependencies.jar org.schedoscope.export.bigquery.BigQueryExportJob -d default -t my_table -b my-storage-bucket-for-export -s -p 'hive/_HOST@PRINCIPAL.COM' -m 'thrift://metastore:9083' -c 10
+ 
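+ For illustration, a fuller invocation that exercises the optional flags documented above could look like the following sketch; the GCP project ID, key file path, partition date, and table postfix are placeholders, not fixed values:
+
+ yarn jar schedoscope-export-*-SNAPSHOT-jar-with-dependencies.jar org.schedoscope.export.bigquery.BigQueryExportJob -d default -t my_table -P my-gcp-project -K /path/to/gcp-key.json -b my-storage-bucket-for-export -f scratch -r europe-west1 -l EU -D 20171001 -x EC0101 -s -p 'hive/_HOST@PRINCIPAL.COM' -m 'thrift://metastore:9083' -c 10
+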
+ ### Redis This Map/Reduce job moves data into Redis, it supports to modes: diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java index 275250810..7bb659a08 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java @@ -9,8 +9,9 @@ import org.apache.hadoop.io.LongWritable; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mapreduce.Reducer; import org.apache.hadoop.util.ToolRunner; -import org.apache.hive.hcatalog.data.HCatRecord; +import org.apache.hive.hcatalog.data.DefaultHCatRecord; import org.apache.hive.hcatalog.data.schema.HCatSchema; import org.apache.hive.hcatalog.mapreduce.HCatInputFormat; import org.apache.thrift.TException; @@ -74,7 +75,9 @@ public int run(String[] args) throws CmdLineException, IOException, TException, Job job = createJob(args); - boolean success = executeJob(job); + boolean success = job.waitForCompletion(true); + + finishJob(job, success); return (success ? 0 : 1); } @@ -92,16 +95,7 @@ public Job createJob(String[] args) throws IOException, TException, CmdLineExcep return prepareJobObject(prepareJobConfiguration()); } - public boolean executeJob(Job job) throws IOException, InterruptedException, ClassNotFoundException { - - boolean success = job.waitForCompletion(true); - - finishJob(job, success); - - return success; - } - - public void finishJob(Job job, boolean wasSuccessful) { + public static void finishJob(Job job, boolean wasSuccessful) { Configuration conf = job.getConfiguration(); if (wasSuccessful) { @@ -164,20 +158,23 @@ private Job prepareJobObject(Configuration conf) throws IOException, TException job.setJarByClass(BigQueryExportJob.class); job.setMapperClass(Mapper.class); - job.setNumReduceTasks(0); + job.setReducerClass(Reducer.class); + + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(DefaultHCatRecord.class); + job.setOutputKeyClass(LongWritable.class); + job.setOutputValueClass(DefaultHCatRecord.class); if (inputFilter == null || inputFilter.trim().equals("")) { HCatInputFormat.setInput(job, inputDatabase, inputTable); } else { - HCatInputFormat.setInput(job, inputDatabase, inputTable, - inputFilter); + HCatInputFormat.setInput(job, inputDatabase, inputTable, inputFilter); } job.setInputFormatClass(HCatInputFormat.class); job.setOutputFormatClass(BigQueryOutputFormat.class); - job.setMapOutputKeyClass(LongWritable.class); - job.setMapOutputValueClass(HCatRecord.class); + job.setNumReduceTasks(numReducer); return job; } From cb0c1ffe67818b21b7341a6c0268d463dd263801 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 22 Dec 2017 13:13:02 +0100 Subject: [PATCH 33/34] Implemented BigQuery export integration test with schedoscope export DSL --- schedoscope-core/pom.xml | 9 --- .../dsl/transformations/ExportTest.scala | 63 +++++++++++++++++-- .../src/test/scala/test/views/TestViews.scala | 22 +++++++ .../export/bigquery/BigQueryExportJob.java | 15 +++++ .../BigQueryOutputConfiguration.java | 15 +++++ .../outputformat/BigQueryOutputFormat.java | 15 +++++ .../BiqQueryHCatRecordWriter.java | 15 +++++ .../HCatRecordToBigQueryMapConvertor.java | 15 +++++ .../HCatSchemaToBigQuerySchemaConverter.java | 15 +++++ .../outputschema/PartitioningScheme.java | 15 +++++ 
.../redis/outputformat/RedisOutputFormat.java | 8 +-- .../export/utils/BigQueryUtils.java | 15 +++++ .../export/utils/CloudStorageUtils.java | 15 +++++ .../utils/HCatRecordJsonSerializer.java | 3 +- .../HCatSchemaToBigQueryTransformer.java | 15 +++++ .../schedoscope/export/utils/HCatUtils.java | 1 - .../export/bigquery/BigQueryBaseTest.java | 15 +++++ .../export/bigquery/BigQueryExportTest.java | 15 +++++ .../BigQueryOutputFormatTest.java | 15 +++++ .../HCatSchemaToBigQueryTransformerTest.java | 15 +++++ .../src/test/resources/log4j.properties | 1 - 21 files changed, 295 insertions(+), 22 deletions(-) diff --git a/schedoscope-core/pom.xml b/schedoscope-core/pom.xml index c89bf2312..e2988239f 100644 --- a/schedoscope-core/pom.xml +++ b/schedoscope-core/pom.xml @@ -51,11 +51,6 @@ schedoscope-conf ${schedoscope.version}
- - guava - com.google.guava - 11.0 - joda-time joda-time @@ -232,10 +227,6 @@ slf4j-api org.slf4j - - guava - com.google.guava - jsp-api javax.servlet.jsp diff --git a/schedoscope-core/src/test/scala/org/schedoscope/dsl/transformations/ExportTest.scala b/schedoscope-core/src/test/scala/org/schedoscope/dsl/transformations/ExportTest.scala index 877dea230..684d5ab16 100644 --- a/schedoscope-core/src/test/scala/org/schedoscope/dsl/transformations/ExportTest.scala +++ b/schedoscope-core/src/test/scala/org/schedoscope/dsl/transformations/ExportTest.scala @@ -19,23 +19,52 @@ import java.sql.DriverManager import java.util.Properties import _root_.test.views._ -import com.google.common.collect.ImmutableList import org.apache.commons.net.ftp.FTPClient import org.apache.curator.test.TestingServer import org.codehaus.jackson.map.ObjectMapper import org.codehaus.jackson.map.`type`.TypeFactory import org.rarefiedredis.redis.adapter.jedis.JedisAdapter -import org.scalatest.{FlatSpec, Matchers} +import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} import org.schedoscope.Schedoscope import org.schedoscope.dsl.Field.v import org.schedoscope.dsl.Parameter.p import org.schedoscope.export.testsupport.{EmbeddedFtpSftpServer, EmbeddedKafkaCluster, SimpleTestKafkaConsumer} +import org.schedoscope.export.utils.BigQueryUtils.{bigQueryService, dropDataset, existsDataset} +import org.schedoscope.export.utils.CloudStorageUtils.{createBucket, deleteBucket, storageService} import org.schedoscope.export.utils.RedisMRJedisFactory import org.schedoscope.test.{rows, test} import scala.collection.JavaConversions.iterableAsScalaIterable +import scala.collection.JavaConverters._ -class ExportTest extends FlatSpec with Matchers { +class ExportTest extends FlatSpec with Matchers with BeforeAndAfter { + + private val CALL_BIG_QUERY = false + private val CLEAN_UP_BIG_QUERY = true + + before { + if (CALL_BIG_QUERY) { + val bigQuery = bigQueryService + val storage = storageService + + if (existsDataset(bigQuery, null, "default")) + dropDataset(bigQuery, null, "default") + + createBucket(storage, "schedoscope_export_big_query_full_test", "europe-west3") + } + } + + after { + if (CALL_BIG_QUERY && CLEAN_UP_BIG_QUERY) { + val bigQuery = bigQueryService + val storage = storageService + + if (existsDataset(bigQuery, null, "default")) + dropDataset(bigQuery, null, "default") + + deleteBucket(storage, "schedoscope_export_big_query_full_test") + } + } Class.forName("org.apache.derby.jdbc.EmbeddedDriver") val dbConnection = DriverManager.getConnection("jdbc:derby:memory:TestingDB;create=true") @@ -43,6 +72,7 @@ class ExportTest extends FlatSpec with Matchers { val jedisAdapter = new JedisAdapter() RedisMRJedisFactory.setJedisMock(jedisAdapter) + val ec0101Clicks = new Click(p("EC0101"), p("2014"), p("01"), p("01")) with rows { set( v(id, "event01"), @@ -98,6 +128,29 @@ class ExportTest extends FlatSpec with Matchers { statement.close() } + it should "execute hive transformations and perform BigQuery export" in { + + if (CALL_BIG_QUERY) + new ClickOfEC0101WithBigQueryExport(p("2014"), p("01"), p("01")) with test { + basedOn(ec0101Clicks, ec0106Clicks) + + `then`() + + numRows shouldBe 3 + + row( + v(id) shouldBe "event01", + v(url) shouldBe "http://ec0101.com/url1") + row( + v(id) shouldBe "event02", + v(url) shouldBe "http://ec0101.com/url2") + row( + v(id) shouldBe "event03", + v(url) shouldBe "http://ec0101.com/url3") + + } + } + it should "execute hive transformations and perform Redis export" in { new 
ClickOfEC0101WithRedisExport(p("2014"), p("01"), p("01")) with test { @@ -131,8 +184,8 @@ class ExportTest extends FlatSpec with Matchers { zkServer.start() Thread.sleep(500) - val kafkaServer = new EmbeddedKafkaCluster(zkServer.getConnectString, new Properties(), ImmutableList.of(9092)) - kafkaServer.startup(); + val kafkaServer = new EmbeddedKafkaCluster(zkServer.getConnectString, new Properties(), List(new Integer(9092)).asJava) + kafkaServer.startup() val v = new ClickOfEC01WithKafkaExport(p("2014"), p("01"), p("01")) with test { basedOn(ec0101Clicks, ec0106Clicks) diff --git a/schedoscope-core/src/test/scala/test/views/TestViews.scala b/schedoscope-core/src/test/scala/test/views/TestViews.scala index c33ecb877..7be7ee2ee 100644 --- a/schedoscope-core/src/test/scala/test/views/TestViews.scala +++ b/schedoscope-core/src/test/scala/test/views/TestViews.scala @@ -327,6 +327,28 @@ case class ClickOfEC0101WithJdbcExport(year: Parameter[String], } +case class ClickOfEC0101WithBigQueryExport(year: Parameter[String], + month: Parameter[String], + day: Parameter[String]) extends View + with Id + with DailyParameterization { + + val url = fieldOf[String] + + val click = dependsOn(() => Click(p("EC0101"), year, month, day)) + + transformVia( + () => HiveTransformation( + insertInto(this, + s""" + SELECT ${click().id.n}, ${click().url.n} + FROM ${click().tableName} + WHERE ${click().shopCode.n} = '${click().shopCode.v.get}'"""))) + + exportTo(() => BigQuery(this, storageBucket = "schedoscope_export_big_query_full_test")) + +} + case class ClickOfEC0101WithRedisExport(year: Parameter[String], month: Parameter[String], day: Parameter[String]) extends View diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java index 7bb659a08..b0da091d1 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/BigQueryExportJob.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery; import org.apache.commons.io.FileUtils; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java index 7ce301615..4e00b5691 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputConfiguration.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputformat; import com.google.cloud.bigquery.TableId; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java index d02b8129a..9194d8d2b 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BigQueryOutputFormat.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputformat; import com.google.cloud.bigquery.BigQuery; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java index e27df27fe..f883e3c36 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputformat/BiqQueryHCatRecordWriter.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputformat; import com.fasterxml.jackson.databind.ObjectMapper; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java index f4e622e4f..bb3269abf 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatRecordToBigQueryMapConvertor.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputschema; import com.fasterxml.jackson.core.JsonProcessingException; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java index d5106bad6..32e38f349 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQuerySchemaConverter.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputschema; import com.google.cloud.bigquery.*; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java index 23a6c4643..1eca10003 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/bigquery/outputschema/PartitioningScheme.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputschema; public enum PartitioningScheme { diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/redis/outputformat/RedisOutputFormat.java b/schedoscope-export/src/main/java/org/schedoscope/export/redis/outputformat/RedisOutputFormat.java index eb95039f9..5d105ee5c 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/redis/outputformat/RedisOutputFormat.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/redis/outputformat/RedisOutputFormat.java @@ -61,11 +61,11 @@ public class RedisOutputFormat extends @Override public void checkOutputSpecs(JobContext context) throws IOException { - /* + /* * Jedis jedis = - * RedisMRJedisFactory.getJedisClient(context.getConfiguration()); - * LOG.info("set up redis: " + jedis.ping()); jedis.close(); - */ + * RedisMRJedisFactory.getJedisClient(context.getConfiguration()); + * LOG.info("set up redis: " + jedis.ping()); jedis.close(); + */ } @Override diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java index a324594a6..9f5dd6446 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/BigQueryUtils.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.utils; import com.google.auth.oauth2.GoogleCredentials; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java index 976b09f18..2dc594e2c 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/CloudStorageUtils.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.utils; import com.google.api.gax.paging.Page; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatRecordJsonSerializer.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatRecordJsonSerializer.java index 2d2b4cfa1..f7a0316dd 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatRecordJsonSerializer.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatRecordJsonSerializer.java @@ -1,5 +1,5 @@ /** - * Copyright 2016 Otto (GmbH & Co KG) + * Copyright 2015 Otto (GmbH & Co KG) *

* Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.schedoscope.export.utils; import com.fasterxml.jackson.core.JsonParser; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java index a221d30df..a2cedae90 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatSchemaToBigQueryTransformer.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.utils; diff --git a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatUtils.java b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatUtils.java index 294ceb83d..d60e387cd 100644 --- a/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatUtils.java +++ b/schedoscope-export/src/main/java/org/schedoscope/export/utils/HCatUtils.java @@ -13,7 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ - package org.schedoscope.export.utils; import org.apache.commons.codec.digest.DigestUtils; diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java index e62037f21..93079c997 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryBaseTest.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery; import com.google.cloud.bigquery.*; diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java index a5b1feeb0..1312b739a 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/BigQueryExportTest.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery; import com.google.cloud.bigquery.BigQuery; diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java index 89a126e37..411fb6375 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/BigQueryOutputFormatTest.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputschema; import org.apache.hadoop.conf.Configuration; diff --git a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java index e4f327998..179d3d806 100644 --- a/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java +++ b/schedoscope-export/src/test/java/org/schedoscope/export/bigquery/outputschema/HCatSchemaToBigQueryTransformerTest.java @@ -1,3 +1,18 @@ +/** + * Copyright 2015 Otto (GmbH & Co KG) + *

+ * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + *

+ * http://www.apache.org/licenses/LICENSE-2.0 + *

+ * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ package org.schedoscope.export.bigquery.outputschema; import com.fasterxml.jackson.core.JsonProcessingException; diff --git a/schedoscope-export/src/test/resources/log4j.properties b/schedoscope-export/src/test/resources/log4j.properties index 7fc2fec78..113efd926 100644 --- a/schedoscope-export/src/test/resources/log4j.properties +++ b/schedoscope-export/src/test/resources/log4j.properties @@ -1,7 +1,6 @@ #log4j.rootLogger=OFF # Root logger option log4j.rootLogger=ERROR, stdout - # Direct log messages to stdout log4j.appender.stdout=org.apache.log4j.ConsoleAppender log4j.appender.stdout.Target=System.out From 726a7b41dfa66d36673e10ea809505f6533bfcd1 Mon Sep 17 00:00:00 2001 From: Utz Westermann Date: Fri, 22 Dec 2017 13:34:38 +0100 Subject: [PATCH 34/34] Minor formatting --- .../dsl/transformations/Export.scala | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala b/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala index a56b2ead3..1b6f78003 100644 --- a/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala +++ b/schedoscope-core/src/main/scala/org/schedoscope/dsl/transformations/Export.scala @@ -157,10 +157,10 @@ object Export { /** * This function prepares a MapReduce job for exporting a given view to BigQuery. * - * @param v the view to export. - * @param projectId GCP project ID under which exported BigQuery dataset will be created. If not set, - * this is the default GCP project of the current user. Can be globally configured by - * setting schedoscope.export.bigQuery.projectId + * @param v the view to export. + * @param projectId GCP project ID under which exported BigQuery dataset will be created. If not set, + * this is the default GCP project of the current user. Can be globally configured by + * setting schedoscope.export.bigQuery.projectId * @param gcpKey GCP key in JSON format to use for authentication when exporting to BigQuery. * If not set, the local gcloud key of the user running Schedoscope is used. * Can be globally configured by setting schedoscope.export.bigQuery.gcpKey @@ -173,19 +173,19 @@ object Export { * @param storageBucketFolderPrefix Folder prefix to apply to blobs in the GCP Cloud Storage bucket while exporting * to BigQuery. Defaults to "". Can be globally configured by * setting schedoscope.export.bigQuery.storageBucketFolderPrefix - * @param storageBucketRegion GCP Cloud Storage bucket region to use for exporting to BigQuery. Defaults to - * europe-west3. - * Can be globally configured by setting schedoscope.export.bigQuery.storageBucketRegion - * @param dataLocation GCP data storage location of exported data within BigQuery. Defaults to EU. - * Can be globally configured by setting schedoscope.export.bigQuery.dataLocation - * @param numReducers Number of reducers to use for BigQuery export. Defines the parallelism. Defaults to 10. - * an be globally configured by setting schedoscope.export.bigQuery.numReducers - * @param proxyHost Host of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. 
- * @param proxyPort Port of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. - * @param isKerberized Is the cluster kerberized? - * @param kerberosPrincipal Kerberos principal to use. Can be globally configured by setting schedoscope.kerberos.principal - * @param metastoreUri URI of the metastore. Can be globally configured by setting schedoscope.metastore.metastoreUri - * @param exportSalt Salt to use for anonymization. schedoscope.export.salt + * @param storageBucketRegion GCP Cloud Storage bucket region to use for exporting to BigQuery. Defaults to + * europe-west3. + * Can be globally configured by setting schedoscope.export.bigQuery.storageBucketRegion + * @param dataLocation GCP data storage location of exported data within BigQuery. Defaults to EU. + * Can be globally configured by setting schedoscope.export.bigQuery.dataLocation + * @param numReducers Number of reducers to use for BigQuery export. Defines the parallelism. Defaults to 10. + * an be globally configured by setting schedoscope.export.bigQuery.numReducers + * @param proxyHost Host of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. + * @param proxyPort Port of proxy to use for GCP API access. Set to empty, i.e., no proxy to use. + * @param isKerberized Is the cluster kerberized? + * @param kerberosPrincipal Kerberos principal to use. Can be globally configured by setting schedoscope.kerberos.principal + * @param metastoreUri URI of the metastore. Can be globally configured by setting schedoscope.metastore.metastoreUri + * @param exportSalt Salt to use for anonymization. schedoscope.export.salt * @return the MapReduce transformation performing the export */ def BigQuery(