This repository has been archived by the owner on Jan 27, 2020. It is now read-only.

Big query support #185

Merged (34 commits, Dec 22, 2017)
Commits
fe8acd5  Merging master (Nov 1, 2017)
5873e3c  Initial rigging (Nov 3, 2017)
5e1d027  Initial schema conversion for flat tables (Nov 14, 2017)
f64b3a4  Implemented test checks (Nov 14, 2017)
bdaa1b3  arrays of primitive types (Nov 14, 2017)
e1cbe3a  Finished schema generation (Nov 16, 2017)
6cc4ca6  Added _USED_HCAT_FILTER column (Nov 16, 2017)
2e408d1  Changed partitioning descriptor (Nov 17, 2017)
cb72d95  Added consideration of table postfixes (Nov 24, 2017)
a46c9e8  Generalizing HCat Record Traversal (Nov 29, 2017)
317ac1d  1st refactoring of HCatSchema traversal, not yet generic enough (Dec 1, 2017)
09fd092  Made transformation more functional, and support transformation along… (Dec 3, 2017)
a3d7208  First Implementation of Record Mapping (Dec 4, 2017)
46a9a3c  Reshaped Constructor Interface to not use functions but present hooks… (Dec 6, 2017)
97eb117  Implemented record conversion and tested insertion of converted recor… (Dec 8, 2017)
7d3f4bb  Added retry function to bigqueryutils (Dec 11, 2017)
724026d  Implemented OutputFormat and RecordWriter for BigQuery export job (Dec 12, 2017)
2159f04  Minor refactorings (Dec 12, 2017)
038f03c  Corrected passing of configuration parameters to BigQueryOutputFormat (Dec 13, 2017)
3b3b779  Corrected passing of configuration parameters to BigQueryOutputFormat (Dec 13, 2017)
1bdf469  Changing streaming approach to avoid temporal tables (Dec 13, 2017)
d311eb0  Implemented Job Rollback in case of Error (Dec 14, 2017)
8291993  Implemented OutputFormat Tests, failed though at streaming into older… (Dec 15, 2017)
ca6ab70  Temporary commit (Dec 15, 2017)
f25e11a  Finally managed to get BigQuery output format running and loading dat… (Dec 18, 2017)
6ce87b5  Fixed logging properties in tests (Dec 19, 2017)
d6a09fe  Added java doc, support for proxies, and table postfixes (Dec 19, 2017)
0b024da  Additional Javadoc (Dec 20, 2017)
98f0391  Created MR Job skeleton (Dec 20, 2017)
079e03f  Finished MapReduce job for BigQuery export and tested it (Dec 21, 2017)
f386ac4  Outlined Schedoscope export syntax for BigQuery (Dec 21, 2017)
c6699b3  Implemented first version of Schedoscope export syntax for BigQuery e… (Dec 22, 2017)
cb0c1ff  Implemented BigQuery export integration test with schedoscope export DSL (Dec 22, 2017)
726a7b4  Minor formatting (Dec 22, 2017)
63 changes: 63 additions & 0 deletions schedoscope-conf/src/main/resources/reference.conf
@@ -329,6 +329,69 @@

salt = "vD75MqvaasIlCf7H"

#
# BigQuery exporter settings.
#

bigQuery {

#
# GCP project ID under which the exported BigQuery dataset will be created
#

projectId = ""

#
# GCP key in JSON format to use for authentication
#

gcpKey = ""

#
# Number of reducers to use for parallel writing to BigQuery.
#

numberOfReducers = 10

#
# GCP data storage location of exported data within BigQuery.
#

dataLocation = "EU"

#
# GCP Cloud Storage bucket to use for temporary storage when exporting to BigQuery.
#

exportStorageBucket = "schedoscope_bigquery_export"

#
# GCP Cloud Storage bucket folder prefix to apply to blobs when exporting to BigQuery
#

exportStorageBucketFolderPrefix = ""

#
# GCP Cloud Storage bucket region to use for exporting to BigQuery
#

exportStorageBucketRegion = "europe-west3"

#
# Host of proxy to use for GCP API access
#

proxyHost = ""

#
# Port of proxy to use for GCP API access
#

proxyPort = ""


}

#
# JDBC exporter settings.
#
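The new `bigQuery` block above nests under `schedoscope.export`, so each setting resolves to a dotted key such as `schedoscope.export.bigQuery.projectId`. A minimal sketch of reading the defaults directly with the Typesafe Config API, assuming this `reference.conf` is on the classpath (`BigQueryConfigCheck` is a hypothetical object name, not part of this PR):

```scala
import com.typesafe.config.ConfigFactory

object BigQueryConfigCheck extends App {
  // Loads reference.conf plus any application.conf overrides on the classpath
  val config = ConfigFactory.load()

  // The bigQuery block nests under schedoscope.export, giving dotted keys
  val projectId = config.getString("schedoscope.export.bigQuery.projectId")     // "" by default
  val reducers  = config.getInt("schedoscope.export.bigQuery.numberOfReducers") // 10 by default
  val location  = config.getString("schedoscope.export.bigQuery.dataLocation")  // "EU" by default

  println(s"BigQuery export defaults: project='$projectId', reducers=$reducers, location=$location")
}
```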
@@ -247,10 +247,55 @@ class BaseSettings(val config: Config) {
lazy val redisExportBatchSize = config.getInt("schedoscope.export.redis.insertBatchSize")

/**
- * Number of reducers to use for Redis export.
+ * Number of reducers to use for Kafka export.
*/
lazy val kafkaExportNumReducers = config.getInt("schedoscope.export.kafka.numberOfReducers")

/**
* GCP project ID under which the exported BigQuery dataset will be created. Defaults to the default project of the current user.
*/
lazy val bigQueryExportProjectId = config.getString("schedoscope.export.bigQuery.projectId")

/**
* Number of reducers to use for BigQuery export.
*/
lazy val bigQueryExportNumReducers = config.getInt("schedoscope.export.bigQuery.numberOfReducers")

/**
* GCP data storage location of exported data within BigQuery. Defaults to EU.
*/
lazy val bigQueryExportDataLocation = config.getString("schedoscope.export.bigQuery.dataLocation")

/**
* GCP key in JSON format to use for authentication when exporting to BigQuery. If not set, the key of the current user is used.
*/
lazy val bigQueryExportGcpKey = config.getString("schedoscope.export.bigQuery.gcpKey")

/**
* GCP Cloud Storage bucket to use for temporary storage while exporting to BigQuery. Defaults to "schedoscope_bigquery_export".
*/
lazy val bigQueryExportStorageBucket = config.getString("schedoscope.export.bigQuery.exportStorageBucket")

/**
* Folder prefix to apply to blobs in the GCP Cloud Storage bucket while exporting to BigQuery. Defaults to "".
*/
lazy val bigQueryExportStorageBucketFolderPrefix = config.getString("schedoscope.export.bigQuery.exportStorageBucketFolderPrefix")

/**
* GCP Cloud Storage bucket region to use for exporting to BigQuery. Defaults to "europe-west3".
*/
lazy val bigQueryExportStorageBucketRegion = config.getString("schedoscope.export.bigQuery.exportStorageBucketRegion")

/**
* Host of the proxy to use for GCP API access. If empty, no proxy is used.
*/
lazy val bigQueryExportProxyHost = config.getString("schedoscope.export.bigQuery.proxyHost")

/**
* Port of the proxy to use for GCP API access. If empty, no proxy is used.
*/
lazy val bigQueryExportProxyPort = config.getString("schedoscope.export.bigQuery.proxyPort")

/**
* Number of reducers to use for (S)Ftp export.
*/
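The new accessors are plain lazy vals over the underlying `Config`, so a deployment can override individual BigQuery defaults without touching `reference.conf`, relying on the standard Typesafe Config fallback chain. A hedged sketch (the `BaseSettings(val config: Config)` constructor is shown in the hunk header; its import path is omitted, and "my-gcp-project" is a placeholder value):

```scala
import com.typesafe.config.ConfigFactory

// Hypothetical overrides: set a project ID and reducer count programmatically;
// all other keys fall back to the reference.conf defaults above.
val overrides = ConfigFactory.parseString(
  """
    |schedoscope.export.bigQuery.projectId = "my-gcp-project"
    |schedoscope.export.bigQuery.numberOfReducers = 20
  """.stripMargin)

val settings = new BaseSettings(overrides.withFallback(ConfigFactory.load()))

settings.bigQueryExportProjectId    // "my-gcp-project"
settings.bigQueryExportNumReducers  // 20
settings.bigQueryExportDataLocation // "EU", from the reference.conf default
```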
9 changes: 0 additions & 9 deletions schedoscope-core/pom.xml
@@ -51,11 +51,6 @@
<artifactId>schedoscope-conf</artifactId>
<version>${schedoscope.version}</version>
</dependency>
- <dependency>
-   <artifactId>guava</artifactId>
-   <groupId>com.google.guava</groupId>
-   <version>11.0</version>
- </dependency>
<dependency>
<groupId>joda-time</groupId>
<artifactId>joda-time</artifactId>
@@ -232,10 +227,6 @@
<artifactId>slf4j-api</artifactId>
<groupId>org.slf4j</groupId>
</exclusion>
- <exclusion>
-   <artifactId>guava</artifactId>
-   <groupId>com.google.guava</groupId>
- </exclusion>
<exclusion>
<artifactId>jsp-api</artifactId>
<groupId>javax.servlet.jsp</groupId>