In [1]:
import os

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Build the Spark session used by the whole notebook: an Iceberg REST catalog
# backed by MinIO (S3-compatible storage) plus the ClickHouse JDBC driver.
#
# The object-store credentials are taken from the environment so that real
# secrets never have to be committed with the notebook. The fallbacks are the
# values of the local demo stack this notebook was written against.
MINIO_ACCESS_KEY = os.environ.get("MINIO_ACCESS_KEY", "minio")
MINIO_SECRET_KEY = os.environ.get("MINIO_SECRET_KEY", "minio-password")

spark = (
    SparkSession.builder
    .appName("streaming_iceberg_to_clickhouse")
    .master("spark://spark-master:7077")
    .config(
        "spark.jars.packages",
        ",".join([
            # S3A filesystem (hadoop-aws 3.3.4 pairs with the AWS SDK v1 bundle)
            "org.apache.hadoop:hadoop-aws:3.3.4",
            "com.amazonaws:aws-java-sdk-bundle:1.12.262",
            # Iceberg packages
            "org.apache.iceberg:iceberg-spark-runtime-3.5_2.12:1.10.0",
            "org.apache.iceberg:iceberg-aws-bundle:1.10.0",
            # ClickHouse packages
            "com.clickhouse:clickhouse-jdbc:0.7.1",
        ])
    )
    # Iceberg catalog settings (REST catalog, data files in MinIO via S3FileIO)
    .config("spark.sql.extensions", "org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "rest")
    .config("spark.sql.catalog.iceberg.uri", "http://iceberg:8181")
    .config("spark.sql.catalog.iceberg.io-impl", "org.apache.iceberg.aws.s3.S3FileIO")
    .config("spark.sql.catalog.iceberg.warehouse", "s3://iceberg/")
    .config("spark.sql.catalog.iceberg.s3.endpoint", "http://minio:9000")
    .config("spark.sql.catalog.iceberg.s3.path-style-access", "true")
    # S3A settings (used for the s3a:// streaming checkpoint location)
    .config("spark.hadoop.fs.s3a.access.key", MINIO_ACCESS_KEY)
    .config("spark.hadoop.fs.s3a.secret.key", MINIO_SECRET_KEY)
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000")
    .config("spark.hadoop.fs.s3a.path.style.access", "true")
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem")
    .config("spark.hadoop.fs.s3a.connection.ssl.enabled", "false")
    # Cap the cores so this session does not occupy every worker
    .config("spark.cores.max", "3")
    .getOrCreate()
)

:: loading settings :: url = jar:file:/opt/conda/lib/python3.8/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.hadoop#hadoop-aws added as a dependency
com.amazonaws#aws-java-sdk-bundle added as a dependency
org.apache.iceberg#iceberg-spark-runtime-3.5_2.12 added as a dependency
org.apache.iceberg#iceberg-aws-bundle added as a dependency
com.clickhouse#clickhouse-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-de1fbeba-1431-422e-b408-58a4c40eea7a;1.0
	confs: [default]
	found org.apache.hadoop#hadoop-aws;3.3.4 in central
	found com.amazonaws#aws-java-sdk-bundle;1.12.262 in central
	found org.wildfly.openssl#wildfly-openssl;1.0.7.Final in central
	found org.apache.iceberg#iceberg-spark-runtime-3.5_2.12;1.10.0 in central
	found org.apache.iceberg#iceberg-aws-bundle;1.10.0 in central
	found com.clickhouse#clickhouse-jdbc;0.7.1 in central
	found com.clickhouse#clickhouse-client;0.7.1 in central
	found com.clickhouse#clickhouse-data;0.7

## Подготовим Iceberg- и ClickHouse-таблицы

In [3]:
# Idempotent setup of the Iceberg source table: both statements use
# IF NOT EXISTS, so the cell can be re-run safely.
# To start from a clean table, run this first:
# spark.sql("DROP TABLE IF EXISTS iceberg.test_schema.streaming_test_table")

create_namespace_sql = "CREATE NAMESPACE IF NOT EXISTS iceberg.test_schema"

# Adjust the schema according to your needs.
create_table_sql = """
    CREATE TABLE IF NOT EXISTS iceberg.test_schema.streaming_test_table (
        id STRING,
        name STRING,
        hdttm TIMESTAMP
    )
    USING iceberg
    TBLPROPERTIES (
        'format-version' = '2',
        'write.parquet.compression-codec' = 'snappy'
    )
"""

spark.sql(create_namespace_sql)
spark.sql(create_table_sql)

DataFrame[]

Выполним DDL-запрос по адресу http://localhost:8123/play
```SQL
-- Target table for the stream. IF NOT EXISTS keeps the statement re-runnable,
-- matching the Iceberg DDL above.
-- NOTE: DateTime has one-second resolution; use DateTime64(6) instead if the
-- sub-second part of the source TIMESTAMP column must be preserved.
CREATE TABLE IF NOT EXISTS default.streaming_test_table
(
    id String,
    name String,
    hdttm DateTime
)
ENGINE = MergeTree()
ORDER BY id;
```

## Запустим main.py

Выполним `python3 main.py` в терминале

## Iceberg -> ClickHouse

In [7]:
# Streaming read of the Iceberg table. "stream-from-timestamp" = "0" is passed
# through to the Iceberg source unchanged.
iceberg_stream_reader = (
    spark.readStream
    .format("iceberg")
    .option("stream-from-timestamp", "0")
)
streaming_df = iceberg_stream_reader.load("iceberg.test_schema.streaming_test_table")

In [None]:
def write_to_clickhouse(batch_df, batch_id):
    """Append one streaming micro-batch to ClickHouse over JDBC.

    Meant to be used as the ``foreachBatch`` callback of a streaming query.

    The batch is persisted for the duration of the call because it is consumed
    by two actions (the row count and the JDBC write); without caching each
    action would re-evaluate the batch from the source.

    Args:
        batch_df: DataFrame with the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; only used in log messages.

    Note:
        The write is a plain append, so a micro-batch that Spark re-executes
        after a failure is inserted again. Deduplication, if needed, has to
        happen on the ClickHouse side.
    """
    batch_df.persist()
    try:
        # A single count replaces the former isEmpty() + count() pair.
        record_count = batch_df.count()
        if record_count == 0:
            print(f"Batch {batch_id} is empty, skipping")
            return

        print(f"\nProcessing batch {batch_id} with {record_count} records")

        (
            batch_df.write
            .format("jdbc")
            .option("driver", "com.clickhouse.jdbc.ClickHouseDriver")
            .option("url", "jdbc:clickhouse://clickhouse-server:8123/default")
            .option("dbtable", "default.streaming_test_table")
            # Credentials come from the environment; the fallbacks are the
            # values of the local demo stack.
            .option("user", os.environ.get("CLICKHOUSE_USER", "default"))
            .option("password", os.environ.get("CLICKHOUSE_PASSWORD", "1234qwe"))
            .option("batchsize", "10000")
            .option("socket_timeout", "300000")
            .option("numPartitions", "4")
            # NOTE(review): rewriteBatchedStatements is a MySQL Connector/J
            # property; presumably a no-op for ClickHouse - confirm and drop.
            .option("rewriteBatchedStatements", "true")
            .mode("append")
            .save()
        )

        print(f"Batch {batch_id} written successfully to ClickHouse")
    finally:
        # Release the cached batch even when the write fails.
        batch_df.unpersist()

# Start the streaming query: every 10 seconds the new rows are handed to
# write_to_clickhouse as a micro-batch; progress is tracked in the checkpoint.
query = (
    streaming_df.writeStream
    .foreachBatch(write_to_clickhouse)
    .outputMode("append")
    .option("checkpointLocation", "s3a://checkpoints/iceberg_to_clickhouse_checkpoints")
    .trigger(processingTime="10 seconds")
    .start()
)

try:
    # Blocks this cell until the query terminates or the kernel is interrupted.
    query.awaitTermination()
finally:
    # Interrupting the cell only stops the wait, not the query itself; stop it
    # explicitly so it does not keep running in the background.
    query.stop()

25/10/21 03:55:29 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
25/10/21 03:55:31 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                


Processing batch 1 with 41 records


                                                                                

Batch 1 written successfully to ClickHouse


25/10/21 03:55:50 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000 milliseconds, but spent 18695 milliseconds



Processing batch 2 with 8 records
Batch 2 written successfully to ClickHouse

Processing batch 3 with 3 records
Batch 3 written successfully to ClickHouse

Processing batch 4 with 2 records


                                                                                

Batch 4 written successfully to ClickHouse

Processing batch 5 with 5 records
Batch 5 written successfully to ClickHouse

Processing batch 6 with 4 records
Batch 6 written successfully to ClickHouse

Processing batch 7 with 5 records
Batch 7 written successfully to ClickHouse

Processing batch 8 with 3 records
Batch 8 written successfully to ClickHouse

Processing batch 9 with 4 records
Batch 9 written successfully to ClickHouse

Processing batch 10 with 5 records
Batch 10 written successfully to ClickHouse

Processing batch 11 with 4 records
Batch 11 written successfully to ClickHouse

Processing batch 12 with 6 records
Batch 12 written successfully to ClickHouse

Processing batch 13 with 6 records
Batch 13 written successfully to ClickHouse

Processing batch 14 with 3 records
Batch 14 written successfully to ClickHouse


In [None]:
# Shut down the Spark session created at the top of the notebook.
spark.stop()