# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>

#### <center> **Final Project: Structured Streaming** </center>
---

**Date**: October, 2025

**Student Name**: Vicente Sebastian Serrano Cabrera

**Professor**: Pablo Camarillo Ramirez

**Producer command**: `python3 lib/Modulo_Sebas/producerValo.py kafka:9093 valorant-topic data/valorant_events.csv`

In [None]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session for this project. The Kafka connector
# package is requested through "spark.jars.packages".
spark = (
    SparkSession.builder
    .appName("Structured Streaming (Project)")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle to the underlying SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Session-level SQL setting applied after the session exists.
spark.conf.set("spark.sql.shuffle.partitions", "5")

# Consumer

## Dataset and Stream creation

In [None]:
# Streaming DataFrame subscribed to the "valorant-topic" topic on the
# "kafka:9093" broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("subscribe", "valorant-topic")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .load()
)

# Show the schema of the raw records returned by the Kafka source.
kafka_df.printSchema()

In [None]:
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructField, StringType
from pcamarillor.spark_utils import SparkUtils
# Keep only the record value, cast to a string column named "value_str".
valo_df = kafka_df.select(col("value").cast("string").alias("value_str"))

# (field name, type name) pairs for the JSON payload; every field is a string.
schema_columns = [
    (field_name, "string")
    for field_name in (
        "event_id",
        "match_id",
        "player_id",
        "player_name",
        "rank",
        "map_name",
        "game_mode",
        "event_type",
        "weapon_or_ability",
        "timestamp",
    )
]

# The helper's result is used below as the schema argument of from_json.
pkg_schema = SparkUtils.generate_schema(schema_columns)

# Parse the JSON string into a "telemetry" column next to "value_str".
parsed_payload = from_json(col("value_str"), pkg_schema)
valo_extracted_df = valo_df.withColumn("telemetry", parsed_payload)
valo_extracted_df.printSchema()

## Transformations and Actions

In [None]:
from pyspark.sql.functions import to_timestamp, to_date, hour, when, count, avg, year, month, day

# Derive time-based columns and a 0/1 "critical_event" flag from the parsed
# telemetry struct.
raw_event_time = col("telemetry.timestamp")
is_critical = col("telemetry.event_type").isin(["Kill", "SpikePlant", "SpikeDefuse"])

valo_transformed_df = (
    valo_extracted_df
    .withColumn("timestamp", to_timestamp(raw_event_time))
    .withColumn("event_date", to_date(raw_event_time))
    .withColumn("event_hour", hour(raw_event_time))
    # 1 for Kill / SpikePlant / SpikeDefuse events, 0 for everything else.
    .withColumn("critical_event", when(is_critical, 1).otherwise(0))
)

# Telemetry fields that must be non-null for an event to be kept.
required_fields = [
    "telemetry.player_id",
    "telemetry.match_id",
    "telemetry.event_type",
    "telemetry.timestamp",
]

# Drop rows missing any required field, then drop rows that are duplicates
# across all columns (no column subset is given).
# NOTE(review): no watermark is defined before this deduplication; on a stream
# the dedup state is presumably unbounded -- confirm this is acceptable.
rows_with_required_fields = valo_transformed_df.dropna(subset=required_fields)
valo_clean_df = rows_with_required_fields.dropDuplicates()

# Per-player number of critical events, grouped by player id, name, rank and
# event timestamp, with a 5-minute watermark on "timestamp".
#
# "critical_event" is a 0/1 flag that is never null, and count() counts
# non-null values, so count("critical_event") would count every event.
# Counting a when() expression without otherwise() yields null for
# non-critical rows, so only rows with the flag set to 1 are counted.
# NOTE(review): the grouping key is the raw event timestamp, not a time
# window, so each group only spans events sharing the exact same timestamp --
# confirm this is intended.
df_player_stats = (
    valo_clean_df
    .withWatermark("timestamp", "5 minutes")
    .groupBy(
        col("telemetry.player_id"),
        col("telemetry.player_name"),
        col("telemetry.rank"),
        col("timestamp"),
    )
    .agg(
        count(when(col("critical_event") == 1, 1)).alias("total_critical_events")
    )
)

# Per-map mean of the 0/1 "critical_event" flag, grouped by map name and event
# timestamp, with a 5-minute watermark on "timestamp".
df_map_stats = (
    valo_clean_df
    .withWatermark("timestamp", "5 minutes")
    .groupBy(col("telemetry.map_name"), col("timestamp"))
    .agg(avg(col("critical_event")).alias("avg_critical_events"))
)

# Print the schema of the cleaned stream and of both aggregates, in that order.
for stream_df in (valo_clean_df, df_player_stats, df_map_stats):
    stream_df.printSchema()


## Data Persistence

In [None]:
# Append the cleaned events to Parquet files partitioned by "event_date",
# triggering a micro-batch every 10 seconds.
query_clean = (
    valo_clean_df.writeStream
    .format("parquet")
    .outputMode("append")
    .partitionBy("event_date")
    .option("checkpointLocation", "/opt/spark/work-dir/data/valorant/valo_df/clean/")
    .option("path", "/opt/spark/work-dir/data/valorant/paquet/valorant_clean/")
    .trigger(processingTime="10 seconds")
    .start()
)

# Append the per-player statistics to Parquet files partitioned by "rank",
# triggering a micro-batch every 10 seconds.
query_player_stats = (
    df_player_stats.writeStream
    .format("parquet")
    .outputMode("append")
    .partitionBy("rank")
    .option("checkpointLocation", "/opt/spark/work-dir/data/valorant/valo_df/player_stats/")
    .option("path", "/opt/spark/work-dir/data/valorant/paquet/player_stats/")
    .trigger(processingTime="10 seconds")
    .start()
)

# Append the per-map statistics to Parquet files partitioned by "map_name",
# triggering a micro-batch every 10 seconds.
query_map_stats = (
    df_map_stats.writeStream
    .format("parquet")
    .outputMode("append")
    .partitionBy("map_name")
    .option("checkpointLocation", "/opt/spark/work-dir/data/valorant/valo_df/map_stats/")
    .option("path", "/opt/spark/work-dir/data/valorant/paquet/map_stats/")
    .trigger(processingTime="10 seconds")
    .start()
)

# Let the streaming queries run until one of them terminates or 300 seconds
# elapse, then stop all three. Without the explicit stop, the queries keep
# running in the background after the timeout (or after one of them fails)
# and keep holding their checkpoint locations.
try:
    spark.streams.awaitAnyTermination(300)
finally:
    for running_query in (query_clean, query_player_stats, query_map_stats):
        running_query.stop()