In [1]:
import os

from pyspark.sql import SparkSession

# Credentials are read from the environment so they do not have to live in the
# notebook. The fallbacks are the values that were previously hardcoded here,
# kept so the notebook still runs unchanged when the variables are unset.
minio_access_key = os.environ.get("MINIO_ACCESS_KEY", "minio")
minio_secret_key = os.environ.get("MINIO_SECRET_KEY", "minio-password")
clickhouse_user = os.environ.get("CLICKHOUSE_USER", "default")
clickhouse_password = os.environ.get("CLICKHOUSE_PASSWORD", "1234qwe")

# Build (or reuse) the session: the extra jars are pulled via spark.jars.packages,
# s3a is pointed at the MinIO endpoint, and a catalog named "clickhouse" is registered.
spark = (
    SparkSession.builder
    .appName("s3_to_clickhouse")
    .master("spark://spark-master:7077")
    .config(
        "spark.jars.packages",
        # One Maven coordinate per entry; Spark expects a comma-separated list.
        ",".join([
            # s3 packages
            "org.apache.hadoop:hadoop-aws:3.3.4",
            "com.amazonaws:aws-java-sdk-bundle:1.12.262",
            # clickhouse packages
            "com.clickhouse.spark:clickhouse-spark-runtime-3.4_2.12:0.8.0",
            "com.clickhouse:clickhouse-client:0.7.0",
            "com.clickhouse:clickhouse-http-client:0.7.0",
            "org.apache.httpcomponents.client5:httpclient5:5.2.1",
        ])
    )
    # s3 configs
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.access.key", minio_access_key)
    .config("spark.hadoop.fs.s3a.secret.key", minio_secret_key)
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    # clickhouse configs
    .config("spark.sql.catalog.clickhouse", "com.clickhouse.spark.ClickHouseCatalog")
    .config("spark.sql.catalog.clickhouse.host", "clickhouse-server")
    .config("spark.sql.catalog.clickhouse.protocol", "http")
    .config("spark.sql.catalog.clickhouse.http_port", "8123")
    .config("spark.sql.catalog.clickhouse.user", clickhouse_user)
    .config("spark.sql.catalog.clickhouse.password", clickhouse_password)
    .config("spark.sql.catalog.clickhouse.database", "default")
    .config("spark.clickhouse.write.format", "json")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
com.clickhouse.spark#clickhouse-spark-runtime-3.4_2.12 added as a dependency
com.clickhouse#clickhouse-client added as a dependency
com.clickhouse#clickhouse-http-client added as a dependency
org.apache.httpcomponents.client5#httpclient5 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-8b0d9423-4783-4623-96fe-b5dedcf227c2;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found com.clickhouse.spark#clickhouse-spark-runtime-3.4_2.12;0.8.0 in central
	found com.clickhouse#clickhouse-client;0.7.0 in central
	found com.clickhouse#clickhouse-data;0.7.0 in central
	found com.clickhouse#cli

In [2]:
# Load the parquet dataset from the bucket through the s3a connector, then
# print a sample of rows (untruncated) and the inferred schema.
df = spark.read.parquet("s3a://batch-bucket/parquet_table/")
df.show(truncate=False)
df.printSchema()

25/10/16 16:22:14 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
                                                                                

+---+---------------+------+
|id |name           |value |
+---+---------------+------+
|371|test_record_371|478.34|
|445|test_record_445|68.44 |
|211|test_record_211|823.56|
|149|test_record_149|312.37|
|86 |test_record_86 |232.7 |
|491|test_record_491|397.39|
|266|test_record_266|790.16|
|3  |test_record_3  |534.92|
|257|test_record_257|860.8 |
|103|test_record_103|684.82|
|110|test_record_110|847.1 |
|68 |test_record_68 |811.67|
|421|test_record_421|671.11|
|364|test_record_364|144.37|
|190|test_record_190|984.4 |
|97 |test_record_97 |120.8 |
|381|test_record_381|58.58 |
|393|test_record_393|221.61|
|438|test_record_438|737.89|
|214|test_record_214|778.84|
+---+---------------+------+
only showing top 20 rows

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- value: double (nullable = true)



In [3]:
# Create the target table in the ClickHouse catalog if it is not there yet.
# Column types mirror the source DataFrame schema (id: long, value: double) so
# the append does not narrow 64-bit ids or double-precision values.
# NOTE: IF NOT EXISTS leaves an already-existing table untouched; a table that
# was created earlier with narrower column types must be dropped or altered
# separately for this definition to take effect.
spark.sql("""
CREATE TABLE IF NOT EXISTS clickhouse.default.test_table (
    id BIGINT NOT NULL,
    name STRING,
    value DOUBLE
)
USING clickhouse
TBLPROPERTIES (
    'engine'='MergeTree()',
    'order_by'='id'
)
""")

DataFrame[]

In [4]:
# Append the rows of df to the ClickHouse table.
# NOTE(review): append() adds rows on every execution, so re-running this cell
# presumably inserts the same data again — confirm before re-running.
clickhouse_writer = df.writeTo("clickhouse.default.test_table")
clickhouse_writer.append()

                                                                                

In [5]:
# Read the table back through the ClickHouse catalog and print the first rows
# as a sanity check of the write.
readback_query = "select * from clickhouse.default.test_table"
df_new = spark.sql(readback_query)
df_new.show()

+---+--------------+------+
| id|          name| value|
+---+--------------+------+
|  1| test_record_1|787.86|
|  2| test_record_2|462.91|
|  3| test_record_3|534.92|
|  7| test_record_7|696.54|
|  8| test_record_8|826.02|
| 11|test_record_11|868.09|
| 12|test_record_12|564.66|
| 13|test_record_13|511.07|
| 14|test_record_14|581.58|
| 15|test_record_15|681.07|
| 18|test_record_18|230.19|
| 21|test_record_21|351.03|
| 23|test_record_23|636.17|
| 24|test_record_24|564.96|
| 25|test_record_25|121.46|
| 31|test_record_31|595.62|
| 32|test_record_32|785.98|
| 35|test_record_35|931.88|
| 36|test_record_36|328.08|
| 38|test_record_38|701.95|
+---+--------------+------+
only showing top 20 rows



In [6]:
# Stop the SparkSession now that the notebook's work is done.
spark.stop()