In [2]:
from pyspark.sql import SparkSession

# Maven coordinates that Spark resolves when the session is created.
spark_packages = ",".join([
    "org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.1",
    "org.apache.kafka:kafka-clients:3.5.1",
    "org.apache.commons:commons-pool2:2.11.1",
    # clickhouse-jdbc 0.8.x and newer do not work with this setup
    "com.clickhouse:clickhouse-jdbc:0.7.2",
])

spark_conf = {
    "spark.jars.packages": spark_packages,
    "spark.sql.streaming.checkpointLocation": "/tmp/checkpoint",
    # Cap the cores so that this session does not take up all the workers
    "spark.cores.max": "3",
}

spark_builder = (
    SparkSession.builder
    .appName("kafka_to_clickhouse")
    .master("spark://spark-master:7077")
)
for conf_key, conf_value in spark_conf.items():
    spark_builder = spark_builder.config(conf_key, conf_value)

spark = spark_builder.getOrCreate()

:: loading settings :: url = jar:file:/opt/conda/lib/python3.11/site-packages/pyspark/jars/ivy-2.5.1.jar!/org/apache/ivy/core/settings/ivysettings.xml


Ivy Default Cache set to: /home/jovyan/.ivy2/cache
The jars for the packages stored in: /home/jovyan/.ivy2/jars
org.apache.spark#spark-sql-kafka-0-10_2.12 added as a dependency
org.apache.kafka#kafka-clients added as a dependency
org.apache.commons#commons-pool2 added as a dependency
com.clickhouse#clickhouse-jdbc added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-920f6be9-18fe-4560-8858-87bed8a74f0c;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.12;3.5.1 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.12;3.5.1 in central
	found org.apache.hadoop#hadoop-client-runtime;3.3.4 in central
	found org.apache.hadoop#hadoop-client-api;3.3.4 in central
	found org.xerial.snappy#snappy-java;1.1.10.3 in central
	found org.slf4j#slf4j-api;2.0.7 in central
	found commons-logging#commons-logging;1.1.3 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.apache.commons#commons-pool2;2.11.1 in centr

## Подготовка таблицы в ClickHouse

Выполним DDL-запрос в веб-интерфейсе ClickHouse по адресу http://localhost:8123/play:
```sql
-- Target table for the rows written by the Spark streaming job in this notebook.
CREATE TABLE IF NOT EXISTS default.streaming_test_table
(
    -- "id", "name" and "timestamp" are the string fields parsed from the JSON message value
    id String,
    name String,
    timestamp String,
    -- "timestamp" column of the Kafka source row
    kafka_timestamp DateTime64(3),
    -- added by the Spark job with current_timestamp()
    processed_at DateTime64(3)
)
ENGINE = MergeTree()
ORDER BY (processed_at, id)
-- partition key is the year-month (YYYYMM) of processed_at
PARTITION BY toYYYYMM(processed_at);
```


## Начнем запись в Kafka

Создадим новый топик `test_topic` в Kafka (kafka-ui доступен по адресу http://localhost:8090).

Выполним `python3 main.py` в терминале.

## Запускаем стриминг (микробатчинг)

In [5]:
from pyspark.sql.functions import from_json, col, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType

# Every field of the JSON payload is declared as a nullable string.
schema = StructType(
    [StructField(field_name, StringType(), True) for field_name in ("id", "name", "timestamp")]
)

# Streaming source: Kafka topic `test_topic`, with startingOffsets=earliest.
kafka_source_options = {
    "kafka.bootstrap.servers": "kafka:9092",
    "subscribe": "test_topic",
    "startingOffsets": "earliest",
}
kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()

# Parse the message value as JSON and keep the Kafka timestamp under its own name,
# so it does not clash with the payload's "timestamp" field once "data.*" is expanded.
payload_col = from_json(col("value").cast("string"), schema).alias("data")
kafka_ts_col = col("timestamp").alias("kafka_timestamp")
parsed_df = kafka_df.select(payload_col, kafka_ts_col).select("data.*", "kafka_timestamp")

# Add the processing-time column.
processed_df = parsed_df.withColumn("processed_at", current_timestamp())

In [None]:
def write_to_clickhouse(batch_df, batch_id):
    """Append one streaming micro-batch to ClickHouse over JDBC.

    Intended to be passed to ``writeStream.foreachBatch``.

    The batch is persisted before use so that counting the rows and writing
    them share one materialisation; without it every action (emptiness check,
    count, write) would recompute the micro-batch from the source. The cache is
    always released, even when the write raises.

    Args:
        batch_df: DataFrame holding the rows of the current micro-batch.
        batch_id: Identifier of the micro-batch; used only in log messages.

    Returns:
        None.

    Raises:
        Any exception raised by counting or writing the batch is propagated
        unchanged after the cache has been released.
    """
    import os  # local import keeps this cell runnable on its own

    # Credentials can be overridden through the environment. The fallbacks are
    # the values this notebook has always used and are meant for the local demo
    # stack only.
    clickhouse_user = os.environ.get("CLICKHOUSE_USER", "default")
    clickhouse_password = os.environ.get("CLICKHOUSE_PASSWORD", "1234qwe")

    batch_df.persist()
    try:
        # A single count replaces the separate isEmpty() + count() actions.
        record_count = batch_df.count()
        if record_count == 0:
            print(f"Batch {batch_id} is empty, skipping")
            return

        print(f"\nProcessing batch {batch_id} with {record_count} records")
        (
            batch_df.write.format("jdbc")
            .option("driver", "com.clickhouse.jdbc.ClickHouseDriver")
            .option("url", "jdbc:clickhouse://clickhouse-server:8123/default")
            .option("dbtable", "default.streaming_test_table")
            .option("user", clickhouse_user)
            .option("password", clickhouse_password)
            .option("batchsize", "10000")
            .option("socket_timeout", "300000")
            .option("numPartitions", "4")
            # NOTE(review): this looks like a MySQL Connector/J option; confirm
            # the ClickHouse driver actually uses it before relying on it.
            .option("rewriteBatchedStatements", "true")
            .mode("append")
            .save()
        )
        print(f"Batch {batch_id} written successfully to ClickHouse")
    finally:
        batch_df.unpersist()

# Start the streaming query; each micro-batch is handed to write_to_clickhouse.
query = (
    processed_df.writeStream
    .foreachBatch(write_to_clickhouse)
    .outputMode("append")
    .option("checkpointLocation", "/tmp/checkpoint/kafka_to_clickhouse")
    .start()
)

# Keep the cell running while the query is active.
query.awaitTermination()

25/10/22 00:18:43 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


Batch 0 is empty, skipping


                                                                                


Processing batch 1 with 1 records


                                                                                

Batch 1 written successfully to ClickHouse


                                                                                


Processing batch 2 with 5 records


                                                                                

Batch 2 written successfully to ClickHouse


                                                                                


Processing batch 3 with 1 records
Batch 3 written successfully to ClickHouse


                                                                                


Processing batch 4 with 1 records


                                                                                

Batch 4 written successfully to ClickHouse

Processing batch 5 with 2 records


                                                                                

Batch 5 written successfully to ClickHouse

Processing batch 6 with 1 records


                                                                                

Batch 6 written successfully to ClickHouse


                                                                                


Processing batch 7 with 1 records


                                                                                

Batch 7 written successfully to ClickHouse


                                                                                


Processing batch 8 with 1 records


                                                                                

Batch 8 written successfully to ClickHouse


                                                                                


Processing batch 9 with 1 records
Batch 9 written successfully to ClickHouse


                                                                                


Processing batch 10 with 1 records


                                                                                

Batch 10 written successfully to ClickHouse

Processing batch 11 with 1 records
Batch 11 written successfully to ClickHouse

Processing batch 12 with 1 records
Batch 12 written successfully to ClickHouse

Processing batch 13 with 1 records


                                                                                

Batch 13 written successfully to ClickHouse


                                                                                


Processing batch 14 with 1 records
Batch 14 written successfully to ClickHouse

Processing batch 15 with 1 records
Batch 15 written successfully to ClickHouse

Processing batch 16 with 1 records
Batch 16 written successfully to ClickHouse

Processing batch 17 with 1 records


                                                                                

Batch 17 written successfully to ClickHouse


                                                                                


Processing batch 18 with 1 records
Batch 18 written successfully to ClickHouse

Processing batch 19 with 1 records
Batch 19 written successfully to ClickHouse

Processing batch 20 with 1 records
Batch 20 written successfully to ClickHouse

Processing batch 21 with 1 records
Batch 21 written successfully to ClickHouse


                                                                                


Processing batch 22 with 1 records


                                                                                

Batch 22 written successfully to ClickHouse

Processing batch 23 with 1 records


                                                                                

Batch 23 written successfully to ClickHouse


                                                                                


Processing batch 24 with 1 records
Batch 24 written successfully to ClickHouse

Processing batch 25 with 1 records


                                                                                

Batch 25 written successfully to ClickHouse

Processing batch 26 with 1 records
Batch 26 written successfully to ClickHouse

Processing batch 27 with 1 records


                                                                                

Batch 27 written successfully to ClickHouse

Processing batch 28 with 1 records
Batch 28 written successfully to ClickHouse

Processing batch 29 with 1 records
Batch 29 written successfully to ClickHouse

Processing batch 30 with 1 records
Batch 30 written successfully to ClickHouse

Processing batch 31 with 1 records
Batch 31 written successfully to ClickHouse

Processing batch 32 with 1 records
Batch 32 written successfully to ClickHouse


                                                                                


Processing batch 33 with 1 records


                                                                                

Batch 33 written successfully to ClickHouse

Processing batch 34 with 1 records


                                                                                

Batch 34 written successfully to ClickHouse


                                                                                


Processing batch 35 with 1 records
Batch 35 written successfully to ClickHouse

Processing batch 36 with 1 records


                                                                                

Batch 36 written successfully to ClickHouse

Processing batch 37 with 1 records


                                                                                

Batch 37 written successfully to ClickHouse


                                                                                


Processing batch 38 with 1 records


                                                                                

Batch 38 written successfully to ClickHouse

Processing batch 39 with 1 records
Batch 39 written successfully to ClickHouse

Processing batch 40 with 1 records


                                                                                

Batch 40 written successfully to ClickHouse


                                                                                


Processing batch 41 with 1 records
Batch 41 written successfully to ClickHouse


                                                                                


Processing batch 42 with 1 records
Batch 42 written successfully to ClickHouse

Processing batch 43 with 1 records
Batch 43 written successfully to ClickHouse

Processing batch 44 with 1 records
Batch 44 written successfully to ClickHouse


                                                                                


Processing batch 45 with 1 records
Batch 45 written successfully to ClickHouse

Processing batch 46 with 1 records
Batch 46 written successfully to ClickHouse

Processing batch 47 with 1 records
Batch 47 written successfully to ClickHouse

Processing batch 48 with 1 records
Batch 48 written successfully to ClickHouse

Processing batch 49 with 1 records


                                                                                

Batch 49 written successfully to ClickHouse

Processing batch 50 with 1 records
Batch 50 written successfully to ClickHouse


                                                                                


Processing batch 51 with 1 records
Batch 51 written successfully to ClickHouse

Processing batch 52 with 1 records
Batch 52 written successfully to ClickHouse

Processing batch 53 with 1 records
Batch 53 written successfully to ClickHouse

Processing batch 54 with 1 records


                                                                                

Batch 54 written successfully to ClickHouse

Processing batch 55 with 1 records


                                                                                

Batch 55 written successfully to ClickHouse


                                                                                


Processing batch 56 with 1 records
Batch 56 written successfully to ClickHouse

Processing batch 57 with 1 records
Batch 57 written successfully to ClickHouse

Processing batch 58 with 1 records
Batch 58 written successfully to ClickHouse

Processing batch 59 with 1 records


                                                                                

Batch 59 written successfully to ClickHouse

Processing batch 60 with 1 records


                                                                                

Batch 60 written successfully to ClickHouse

Processing batch 61 with 1 records
Batch 61 written successfully to ClickHouse

Processing batch 62 with 1 records


                                                                                

Batch 62 written successfully to ClickHouse


                                                                                


Processing batch 63 with 1 records


                                                                                

Batch 63 written successfully to ClickHouse


                                                                                


Processing batch 64 with 1 records


                                                                                

Batch 64 written successfully to ClickHouse

Processing batch 65 with 1 records


                                                                                

Batch 65 written successfully to ClickHouse


                                                                                


Processing batch 66 with 1 records
Batch 66 written successfully to ClickHouse


                                                                                


Processing batch 67 with 1 records
Batch 67 written successfully to ClickHouse

Processing batch 68 with 1 records
Batch 68 written successfully to ClickHouse

Processing batch 69 with 1 records
Batch 69 written successfully to ClickHouse

Processing batch 70 with 1 records
Batch 70 written successfully to ClickHouse


                                                                                


Processing batch 71 with 1 records


                                                                                

Batch 71 written successfully to ClickHouse


                                                                                


Processing batch 72 with 1 records
Batch 72 written successfully to ClickHouse


                                                                                


Processing batch 73 with 1 records
Batch 73 written successfully to ClickHouse


                                                                                


Processing batch 74 with 1 records
Batch 74 written successfully to ClickHouse


                                                                                


Processing batch 75 with 1 records
Batch 75 written successfully to ClickHouse

Processing batch 76 with 1 records


                                                                                

Batch 76 written successfully to ClickHouse

Processing batch 77 with 1 records
Batch 77 written successfully to ClickHouse

Processing batch 78 with 1 records


                                                                                

Batch 78 written successfully to ClickHouse


                                                                                


Processing batch 79 with 1 records


                                                                                

Batch 79 written successfully to ClickHouse


                                                                                


Processing batch 80 with 1 records


                                                                                

Batch 80 written successfully to ClickHouse

Processing batch 81 with 1 records


                                                                                

Batch 81 written successfully to ClickHouse


                                                                                


Processing batch 82 with 1 records
Batch 82 written successfully to ClickHouse


                                                                                


Processing batch 83 with 1 records
Batch 83 written successfully to ClickHouse


                                                                                


Processing batch 84 with 1 records
Batch 84 written successfully to ClickHouse


                                                                                


Processing batch 85 with 1 records
Batch 85 written successfully to ClickHouse


                                                                                


Processing batch 86 with 1 records


                                                                                

Batch 86 written successfully to ClickHouse

Processing batch 87 with 1 records


                                                                                

Batch 87 written successfully to ClickHouse

Processing batch 88 with 1 records


                                                                                

Batch 88 written successfully to ClickHouse

Processing batch 89 with 1 records


                                                                                

Batch 89 written successfully to ClickHouse

Processing batch 90 with 1 records


                                                                                

Batch 90 written successfully to ClickHouse

Processing batch 91 with 1 records


                                                                                

Batch 91 written successfully to ClickHouse

Processing batch 92 with 1 records


                                                                                

Batch 92 written successfully to ClickHouse

Processing batch 93 with 1 records


                                                                                

Batch 93 written successfully to ClickHouse


                                                                                


Processing batch 94 with 1 records
Batch 94 written successfully to ClickHouse
