In [44]:
import os
import warnings

from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col, year, month
from pyspark.sql.types import StructType, StringType, IntegerType, FloatType, TimestampType, MapType
from pyspark.sql.types import DoubleType

In [None]:
# Build (or reuse) the Spark session for this notebook and request the
# spark-sql-kafka connector through spark.jars.packages.
# NOTE(review): the coordinate is pinned to Scala 2.11 / Spark 2.4.8 —
# confirm it matches the Spark build this notebook actually runs on.
session_builder = SparkSession.builder.appName("KafkaStreamingExample")
session_builder = session_builder.config(
    "spark.jars.packages",
    "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.8",
)
spark = session_builder.getOrCreate()

In [46]:
# Kafka connection settings used by the streaming reader below.
bootstrap_servers = "pkc-56d1g.eastus.azure.confluent.cloud:9092"
kafka_topic = "Eslam_topic"  # topic the stream subscribes to

# SECURITY: the API key and secret must never be stored in the notebook.
# They are read from the environment instead; export KAFKA_USERNAME and
# KAFKA_PASSWORD before starting the kernel. Any key that was previously
# committed in this notebook should be treated as leaked and rotated.
kafka_username = os.environ.get("KAFKA_USERNAME")
kafka_password = os.environ.get("KAFKA_PASSWORD")

if not kafka_username or not kafka_password:
    # Warn here, next to the configuration, rather than surfacing later as an
    # opaque SASL authentication failure inside the streaming query.
    warnings.warn(
        "KAFKA_USERNAME / KAFKA_PASSWORD are not set; "
        "the Kafka stream will fail to authenticate."
    )

In [48]:
# Nested struct stored under each event's "metadata" field.
metadata_schema = (
    StructType()
    .add("category", StringType())
    .add("source", StringType())
)

# Schema applied when parsing the JSON payload of each Kafka record.
# Field order is significant: it determines the column order after parsing.
schema = (
    StructType()
    .add("eventType", StringType())
    .add("customerId", StringType())
    .add("productId", StringType())
    .add("timestamp", TimestampType())
    .add("metadata", metadata_schema)
    .add("quantity", IntegerType())
    .add("totalAmount", DoubleType())
    .add("paymentMethod", StringType())
    .add("recommendedProductId", StringType())
    .add("algorithm", StringType())
)

In [49]:
# JAAS login line for SASL/PLAIN, built from the configured credentials.
jaas_config = f'org.apache.kafka.common.security.plain.PlainLoginModule required username="{kafka_username}" password="{kafka_password}";'

# All Kafka source options gathered in one place; "earliest" makes the
# stream start from the beginning of the topic.
kafka_options = {
    "kafka.bootstrap.servers": bootstrap_servers,
    "subscribe": kafka_topic,
    "startingOffsets": "earliest",
    "kafka.security.protocol": "SASL_SSL",
    "kafka.sasl.mechanism": "PLAIN",
    "kafka.sasl.jaas.config": jaas_config,
}

# Streaming DataFrame over the Kafka topic.
df = spark.readStream.format("kafka").options(**kafka_options).load()

In [50]:
# Decode the Kafka record value to a string, parse it as JSON using
# `schema`, then flatten the parsed struct into top-level columns.
raw_payload_df = df.selectExpr("CAST(value AS STRING)")
parsed_df = raw_payload_df.select(from_json("value", schema).alias("data"))
json_df = parsed_df.select("data.*")


In [54]:
# Column handles reused by the derivations below.
event_time_col = col("timestamp")
metadata_col = col("metadata")

# Add calendar parts of the event time and lift the two metadata fields to
# top-level columns; the nested "metadata" struct itself is kept.
transformed_df = (
    json_df
    .withColumn("timestamp", event_time_col.cast(TimestampType()))
    .withColumn("year", year(event_time_col))
    .withColumn("month", month(event_time_col))
    .withColumn("category", metadata_col["category"])
    .withColumn("source", metadata_col["source"])
)


In [51]:
# Append-only Parquet sink on HDFS, with its own checkpoint directory so the
# stream can track progress.
parquet_sink = (
    transformed_df.writeStream
    .outputMode("append")
    .format("parquet")
    .option("path", "hdfs://localhost:9000//user/streaming")
    .option("checkpointLocation", "hdfs://localhost:9000//user/streaming_check")
)

# Start the query; it keeps running in the background.
query = parquet_sink.start()

# Blocking on the query is left disabled; uncomment to hold this cell until
# the stream terminates.
# query.awaitTermination()

In [None]:
# Batch-read the Parquet files matching the glob below.
# NOTE(review): presumably these are the files written by the streaming
# query; the path carries no scheme, so it resolves against the default
# filesystem — confirm that this is hdfs://localhost:9000.
streaming_path = "/user/streaming/*.parquet"
streaming_df = spark.read.format("parquet").load(streaming_path)

In [None]:
# Top five products by total sales amount.
#
# The query reads from the view `streaming_data`, so the batch DataFrame is
# registered under that name first (createOrReplaceTempView is idempotent,
# so re-running this cell is safe). The SQL is run through spark.sql because
# bare SQL is not valid in a Python code cell.
streaming_df.createOrReplaceTempView("streaming_data")

top_products_df = spark.sql("""
    SELECT
        productId,
        SUM(totalAmount) AS total_sales
    FROM
        streaming_data
    GROUP BY
        productId
    ORDER BY
        total_sales DESC
    LIMIT 5
""")
top_products_df.show()


In [None]:
# Monthly sales amount and quantity per product category.
#
# The query reads from the view `streaming_data`, so the batch DataFrame is
# registered under that name first (createOrReplaceTempView is idempotent,
# so re-running this cell is safe). The SQL is run through spark.sql because
# bare SQL is not valid in a Python code cell.
streaming_df.createOrReplaceTempView("streaming_data")

# `timestamp` is backtick-quoted so it is read as the column. With single
# quotes it is the string literal 'timestamp', for which YEAR()/MONTH()
# return NULL on every row and collapse all months into one group.
monthly_category_sales_df = spark.sql("""
    SELECT
        metadata.category,
        YEAR(`timestamp`) AS year,
        MONTH(`timestamp`) AS month,
        SUM(totalAmount) AS total_sales,
        SUM(quantity) AS total_quantity
    FROM
        streaming_data
    GROUP BY
        metadata.category,
        YEAR(`timestamp`),
        MONTH(`timestamp`)
    ORDER BY
        year,
        month,
        metadata.category
""")
monthly_category_sales_df.show()


In [None]:
# Persist the batch snapshot as the table default.streaming_table.
#
# The table is given its own directory. It must not be the
# "/user/streaming/*.parquet" glob that streaming_df is lazily read from:
# a glob is not a usable output location, and overwrite mode replaces
# whatever is at the target path, which would endanger the source files.
# NOTE(review): "/user/streaming_table" is a chosen location — adjust it if
# the table is expected to live elsewhere.
streaming_df.write \
    .mode("overwrite") \
    .option("path", "/user/streaming_table") \
    .saveAsTable("default.streaming_table")

In [43]:
# Stop any streaming queries that are still running before the session is
# torn down, so they end cleanly instead of being cut off when the
# underlying context goes away.
for active_query in spark.streams.active:
    active_query.stop()

spark.stop()