# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Structured Streaming (Kafka producer)** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Create SparkSession

In [None]:
# findspark must be initialised before pyspark is imported.
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the session against the standalone master. The Kafka
# connector is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Examples on Structured Streaming (Kafka)")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying context and set its log verbosity.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization: use a small number of shuffle partitions.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Create a data stream from a Kafka topic

In [None]:
# Open a streaming reader on the "pkg-telemetry" topic served by the
# broker at kafka:9093.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("subscribe", "pkg-telemetry")
    .load()
)

# Show the columns exposed by the Kafka source.
kafka_df.printSchema()

In [None]:
from pyspark.sql.functions import year, month, day, from_json, col
from pcamarillor.spark_utils import SparkUtils
# Decode the Kafka record value into a string column holding the raw JSON.
vg_telemetry_df = kafka_df.select(kafka_df.value.cast("string").alias("value_str"))

# We need to extract the columns from the input JSON
# NOTE(review): "struct" carries no field definitions here -- confirm that
# SparkUtils.generate_schema maps it to the intended type for "location".
schema_columns = [("player_id", "string"),
                  ("timestamp", "string"),
                  ("device", "string"),
                  ("action", "string"),
                  ("location", "struct"),
                  ("speed_m_s", "float"),
                  ("battery_level", "float")]
pkg_schema = SparkUtils.generate_schema(schema_columns)

# Parse the JSON into a single struct column and then flatten it, so every
# JSON field (player_id, timestamp, ...) becomes a top-level column.
# Assigning the parsed struct to a column named "player_id" would leave no
# "timestamp" column for the date extraction below.
vg_extracted_df = (
    vg_telemetry_df
    .select(from_json(col("value_str"), pkg_schema).alias("data"))
    .select("data.*")
)

# Extract the year, month, and day from the timestamp column
vg_extracted_time_df = (
    vg_extracted_df
    .withColumn("year", year(col("timestamp")))
    .withColumn("month", month(col("timestamp")))
    .withColumn("day", day(col("timestamp")))
)

# Preview the parsed stream on the console.
query_console = (
    vg_extracted_time_df.writeStream
    .format("console")
    .outputMode("append")
    .start()
)

# Let the preview run for up to 30 seconds; always stop the query afterwards,
# even if the wait is interrupted or the query fails.
try:
    query_console.awaitTermination(30)
finally:
    query_console.stop()
                                        

In [None]:
# Finally, we send the stream to a file sink: one micro-batch every
# 10 seconds, written as CSV under /opt/workdir and partitioned by
# player and date.
#
# Notes:
# - partitionBy() takes column *names*, not Column objects.
# - The file sink needs a checkpoint location; it is kept in an
#   underscore-prefixed directory so it is not mistaken for output data.
# NOTE(review): "location" is declared as "struct" in the JSON schema; the CSV
# format cannot store struct columns, so it may need to be flattened or
# serialised before this sink will accept it -- confirm against real data.
query_files = (
    vg_extracted_time_df.writeStream
    .trigger(processingTime="10 seconds")
    .partitionBy("player_id", "year", "month", "day")
    .format("csv")
    .option("header", "true")
    .option("path", "/opt/workdir")
    .option("checkpointLocation", "/opt/workdir/_checkpoints/pkg-telemetry-csv")
    .start()
)

In [None]:
# Shut down the SparkContext obtained from the session in the first cell.
sc.stop()