In [1]:
import os
import sys
import traceback
import logging
from pyspark.sql import SparkSession
from pyspark.sql.functions import  explode,col, expr,when,to_date, sum, from_json
from pyspark.sql.types import  ArrayType,StructType, StructField, BooleanType, StringType, IntegerType, DateType, FloatType,DoubleType, LongType
import threading

import time
from pyspark.sql import functions as F

In [2]:

spark = SparkSession.builder \
    .appName("MinIO with Delta Lake") \
    .config("spark.jars", "jars/hadoop-aws-3.3.4.jar,jars/kafka-clients-3.3.2.jar,jars/spark-sql-kafka-0-10_2.12-3.2.1.jar,jars/aws-java-sdk-bundle-1.12.262.jar,jars/delta-core_2.12-2.2.0.jar,jars/delta-storage-2.2.0.jar")\
    .config("spark.hadoop.fs.s3a.endpoint", "http://minio:9000") \
    .config("spark.hadoop.fs.s3a.access.key", "conbo123") \
    .config("spark.hadoop.fs.s3a.secret.key", "123conbo") \
    .config("spark.hadoop.fs.s3a.path.style.access", "true") \
    .config("spark.hadoop.fs.s3a.impl", "org.apache.hadoop.fs.s3a.S3AFileSystem") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .config("spark.delta.logStore.class", "org.apache.spark.sql.delta.storage.S3SingleDriverLogStore") \
    .config("delta.enable-non-concurrent-writes", "true") \
    .config('spark.sql.warehouse.dir', "s3a://lakehouse/") \
    .getOrCreate()

In [3]:
kafka_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "tmdb_movies") \
    .load()

# Chuyển đổi dữ liệu từ binary sang string nếu cần
kafka_df_str = kafka_df.selectExpr(
    "CAST(key AS STRING) as key", 
    "CAST(value AS STRING) as value", 
    "topic", 
    "partition", 
    "offset", 
    "timestamp"
)

# Tạo streaming query với sink kiểu memory
query = kafka_df_str.writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("kafka_memory_table") \
    .start()

Py4JJavaError: An error occurred while calling o51.load.
: java.lang.NoClassDefFoundError: org/apache/kafka/common/serialization/ByteArraySerializer
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$.<init>(KafkaSourceProvider.scala:599)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider$.<clinit>(KafkaSourceProvider.scala)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.org$apache$spark$sql$kafka010$KafkaSourceProvider$$validateStreamOptions(KafkaSourceProvider.scala:338)
	at org.apache.spark.sql.kafka010.KafkaSourceProvider.sourceSchema(KafkaSourceProvider.scala:71)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceSchema(DataSource.scala:236)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo$lzycompute(DataSource.scala:118)
	at org.apache.spark.sql.execution.datasources.DataSource.sourceInfo(DataSource.scala:118)
	at org.apache.spark.sql.execution.streaming.StreamingRelation$.apply(StreamingRelation.scala:34)
	at org.apache.spark.sql.streaming.DataStreamReader.loadInternal(DataStreamReader.scala:168)
	at org.apache.spark.sql.streaming.DataStreamReader.load(DataStreamReader.scala:144)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at java.base/jdk.internal.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:77)
	at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.base/java.lang.reflect.Method.invoke(Method.java:568)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
	at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
	at java.base/java.lang.Thread.run(Thread.java:833)
Caused by: java.lang.ClassNotFoundException: org.apache.kafka.common.serialization.ByteArraySerializer
	at java.base/java.net.URLClassLoader.findClass(URLClassLoader.java:445)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:587)
	at java.base/java.lang.ClassLoader.loadClass(ClassLoader.java:520)
	... 22 more


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, BooleanType, ArrayType, MapType

# Tạo Spark Session
spark = SparkSession.builder \
    .appName("KafkaToDelta") \
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension") \
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog") \
    .getOrCreate()

# Định nghĩa Schema cho dữ liệu JSON từ Kafka
movie_schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("title", StringType(), True),
    StructField("original_title", StringType(), True),
    StructField("original_language", StringType(), True),
    StructField("overview", StringType(), True),
    StructField("budget", IntegerType(), True),
    StructField("popularity", DoubleType(), True),
    StructField("release_date", StringType(), True),
    StructField("revenue", IntegerType(), True),
    StructField("runtime", IntegerType(), True),
    StructField("status", StringType(), True),
    StructField("tagline", StringType(), True),
    StructField("vote_average", DoubleType(), True),
    StructField("vote_count", IntegerType(), True),
    StructField("genres", ArrayType(StructType([
        StructField("id", IntegerType(), True),
        StructField("name", StringType(), True)
    ])), True),
    StructField("homepage", StringType(), True),
])

# Đọc dữ liệu từ Kafka
kafka_df = spark.readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "localhost:9092") \
    .option("subscribe", "tmdb_movies") \
    .option("startingOffsets", "earliest") \
    .load()

# Chuyển đổi dữ liệu từ JSON
movies_df = kafka_df.select(from_json(col("value").cast("string"), movie_schema).alias("data")).select("data.*")

movies_df.printSchema()


In [19]:
kafka_df = spark \
    .readStream \
    .format("kafka") \
    .option("kafka.bootstrap.servers", "kafka:9092") \
    .option("subscribe", "tmdb_movies") \
    .load()

# Chuyển đổi dữ liệu từ binary sang string nếu cần
kafka_df_str = kafka_df.selectExpr(
    "CAST(key AS STRING) as key", 
    "CAST(value AS STRING) as value", 
    "topic", 
    "partition", 
    "offset", 
    "timestamp"
)

# Tạo streaming query với sink kiểu memory
query = kafka_df_str.writeStream \
    .outputMode("append") \
    .format("memory") \
    .queryName("kafka_memory_table") \
    .start()


In [20]:
# Truy vấn và hiển thị dữ liệu trong bảng tạm
spark.sql("SELECT * FROM kafka_memory_table").show()


+---+-----+-----+---------+------+---------+
|key|value|topic|partition|offset|timestamp|
+---+-----+-----+---------+------+---------+
+---+-----+-----+---------+------+---------+



In [9]:
import time
kafka_string_df = kafka_df.selectExpr(
    "CAST(key AS STRING) as key",
    "CAST(value AS STRING) as value",
    "timestamp"
)

# 4. Ghi stream dữ liệu vào memory sink để có thể truy vấn trực tiếp trên Jupyter
# Tạo một bảng tạm có tên "kafka_table"
query = kafka_string_df.writeStream \
    .format("memory") \
    .queryName("kafka_table") \
    .outputMode("append") \
    .start()

# 5. Chờ một chút để streaming query có thời gian thu thập dữ liệu (ví dụ: 10 giây)
time.sleep(10)

# 6. Hiển thị dữ liệu đã thu thập được từ memory sink thông qua Spark SQL
spark.sql("SELECT * FROM kafka_table").show(truncate=False)

# 7. Khi không cần streaming nữa, dừng query
query.stop()

+---+-----+---------+
|key|value|timestamp|
+---+-----+---------+
+---+-----+---------+



In [23]:
from pyspark.streaming.kafka import KafkaUtils
from pyspark.streaming import StreamingContext

ssc = StreamingContext(spark.sparkContext, 1)

kafkaParams = {
  "bootstrap.servers": "kafka:9092",
  "auto.offset.reset": "smallest"
}

directStream = KafkaUtils.createDirectStream(
  ssc,
  topics=['tmdb_movies'],
  kafkaParams=kafkaParams
)

directStream.print()

ssc.start()
ssc.awaitTermination()

ModuleNotFoundError: No module named 'pyspark.streaming.kafka'

In [11]:
from pyspark.sql import SparkSession

def main():
    
    spark = SparkSession.builder \
    .appName("Kafka Streaming") \
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1,"
            "org.apache.kafka:kafka-clients:3.3.2") \
    .getOrCreate()
    # Đọc dữ liệu từ Kafka
    kafka_df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:2933092") \
        .option("subscribe", "tmdb_movies") \
        .option("startingOffsets", "earliest") \
        .load()

    # Chuyển đổi dữ liệu từ binary sang string
    json_df = kafka_df.selectExpr("CAST(value AS STRING)")

    # Ghi dữ liệu ra console để kiểm tra
    query = json_df.writeStream \
        .outputMode("append") \
        .format("console") \
        .start()
    

In [2]:
spark = SparkSession.builder \
    .appName("Kafka Streaming") \
    .config("spark.jars.packages", 
            "org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1,"
            "org.apache.kafka:kafka-clients:3.3.2") \
    .getOrCreate()

In [3]:
kafka_df = spark.readStream \
        .format("kafka") \
        .option("kafka.bootstrap.servers", "kafka:2933092") \
        .option("subscribe", "tmdb_movies") \
        .option("startingOffsets", "earliest") \
        .load()