/
SpannerToBigQuery.java
101 lines (81 loc) · 4.29 KB
/
SpannerToBigQuery.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
/*
* Copyright (c) Mercari, Inc.
*
* This source code is licensed under the MIT license found in the
* LICENSE file in the root directory of this source tree.
*/
package com.mercari.solution.templates;
import com.google.cloud.spanner.Struct;
import com.mercari.solution.transforms.SpannerQueryIO;
import com.mercari.solution.util.converter.StructToTableRowConverter;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO;
import org.apache.beam.sdk.options.*;
import org.apache.beam.sdk.transforms.*;
import org.apache.beam.sdk.values.*;
import java.util.Map;
/**
* <p>The SpannerToBigQuery template inserts the results of a Cloud Spanner query into the specified BigQuery table.</p>
*
* Template parameters are as follows.
*
* <table summary="summary" border="1" cellpadding="3" cellspacing="0">
* <tr><th>Parameter</th><th>Type</th><th>Description</th></tr>
* <tr><td>projectId</td><td>String</td><td>Project ID for Spanner you will query</td></tr>
* <tr><td>instanceId</td><td>String</td><td>Spanner instanceID you will query.</td></tr>
* <tr><td>databaseId</td><td>String</td><td>Spanner databaseID you will query.</td></tr>
* <tr><td>query</td><td>String</td><td>SQL query to read records from Cloud Spanner</td></tr>
* <tr><td>output</td><td>String</td><td>Destination BigQuery table. format {dataset}.{table}</td></tr>
* <tr><td>timestampBound</td><td>String</td><td>(Optional) Timestamp bound (format: yyyy-MM-ddTHH:mm:ssZ). Default is strong.</td></tr>
* </table>
*/
public class SpannerToBigQuery {

    /** Not instantiable: the class only hosts the options interface and {@link #main(String[])}. */
    private SpannerToBigQuery() {}

    /**
     * Options accepted by this template.
     *
     * <p>Every option is declared as a {@link ValueProvider}, so {@code main} passes the providers
     * on to the transforms instead of reading their values while the pipeline is being built.
     */
    public interface SpannerToBigQueryPipelineOption extends PipelineOptions {

        @Description("Project id spanner instance belong to")
        ValueProvider<String> getProjectId();
        void setProjectId(ValueProvider<String> projectId);

        @Description("Spanner instance id you want to access")
        ValueProvider<String> getInstanceId();
        void setInstanceId(ValueProvider<String> instanceId);

        @Description("Spanner Database id you want to access")
        ValueProvider<String> getDatabaseId();
        void setDatabaseId(ValueProvider<String> databaseId);

        @Description("SQL query to extract records from spanner")
        ValueProvider<String> getQuery();
        void setQuery(ValueProvider<String> query);

        @Description("Destination BigQuery table. format {dataset}.{table}")
        ValueProvider<String> getOutput();
        void setOutput(ValueProvider<String> output);

        // Seconds are written 'ss'; the previous help text said 'SS', which in Java date
        // patterns denotes fraction-of-second.
        // NOTE(review): assumes the bound is parsed as an RFC 3339 timestamp - confirm in SpannerQueryIO.
        @Description("(Optional) Input timestamp bound as format 'yyyy-MM-ddTHH:mm:ssZ'")
        ValueProvider<String> getTimestampBound();
        void setTimestampBound(ValueProvider<String> timestampBound);
    }

    /**
     * Builds and runs the pipeline: query Cloud Spanner, derive a BigQuery schema from one
     * sampled result row, and append all rows to the destination table.
     *
     * @param args command-line arguments parsed into {@link SpannerToBigQueryPipelineOption}
     */
    public static void main(final String[] args) {
        final SpannerToBigQueryPipelineOption options = PipelineOptionsFactory
                .fromArgs(args)
                .as(SpannerToBigQueryPipelineOption.class);
        final Pipeline pipeline = Pipeline.create(options);

        // Single handle on the destination table, shared by the schema step and the write step.
        final ValueProvider<String> output = options.getOutput();

        final PCollection<Struct> structs = pipeline
                .apply("QuerySpanner", SpannerQueryIO.read(
                        options.getProjectId(),
                        options.getInstanceId(),
                        options.getDatabaseId(),
                        options.getQuery(),
                        options.getTimestampBound()));

        // The schema is derived from a single sampled Struct and published as a singleton side input.
        // output.get() sits inside the lambda, so it is evaluated when an element is processed,
        // not while the pipeline graph is being built.
        // NOTE(review): if the query returns no rows the sample is empty - confirm how the
        // singleton view and BigQueryIO behave in that case.
        final PCollectionView<Map<String, String>> schemaView = structs
                .apply("SampleStruct", Sample.any(1))
                .apply("AsMap", MapElements
                        .into(TypeDescriptors.maps(TypeDescriptors.strings(), TypeDescriptors.strings()))
                        .via(struct -> StructToTableRowConverter.convertSchema(output.get(), struct)))
                .apply("AsView", View.asSingleton());

        structs.apply("WriteBigQuery", BigQueryIO.<Struct>write()
                .to(output)
                .withFormatFunction(StructToTableRowConverter::convert)
                .withSchemaFromView(schemaView)
                .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                .withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_APPEND));

        pipeline.run();
    }
}