# <center> <img src="../../labs/img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Procesamiento de Datos Masivos** </center>
---
### <center> **Primavera 2025** </center>
---
### <center> **Ejemplos de Spark: Structured Streaming (Kafka consumer application)** </center>

---
**Profesor**: Dr. Pablo Camarillo Ramirez

In [11]:
import findspark

# findspark.init() makes the Spark installation's Python bindings importable,
# so it must run before pyspark (and anything that may import it, such as
# equipo.spark_utils) is imported.
findspark.init()

from equipo.spark_utils import SparkUtils
from pyspark.sql.functions import from_json

In [12]:
import os

# Host identifiers used below to build the Spark master URL and the Kafka
# bootstrap address. The defaults look like Docker container IDs (TODO confirm),
# which change whenever the containers are recreated, so each one can be
# overridden through an environment variable of the same name instead of
# editing the notebook.
SPARK_ID = os.environ.get("SPARK_ID", "3c4c7def4de3")
KAFKA_ID = os.environ.get("KAFKA_ID", "66817d0dbacb")

#### Creación de la conexión con el clúster de Spark


In [13]:
from pyspark.sql import SparkSession

# Create (or reuse) the session on the standalone master identified by SPARK_ID.
# spark.jars.packages requests the Kafka connector that the "kafka" stream
# format used later in this notebook depends on.
spark = (
    SparkSession.builder
    .appName("Equipo-kafka-structured-stream")
    .master(f"spark://{SPARK_ID}:7077")
    .config("spark.ui.port", "4040")
    .config(
        "spark.jars.packages",
        "org.apache.spark:spark-sql-kafka-0-10_2.13:3.5.4",
    )
    .getOrCreate()
)

# Handle on the underlying SparkContext; the last cell uses it to shut down.
sc = spark.sparkContext

### Creación del Kafka Stream

In [14]:
# Streaming DataFrame that reads from the Kafka broker at KAFKA_ID:9093,
# subscribed to the four tweet topics (comma-separated list).
kafka_lines = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", f"{KAFKA_ID}:9093")
    .option("subscribe", "tweet-1, tweet-2, tweet-3, tweet-4")
    .load()
)

# Show the raw record schema; key and value arrive as binary columns.
kafka_lines.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [15]:
# (column name, type name) pairs describing one tweet record, in column order.
headers = [
    # String-typed columns plus the integer user id.
    ("tweet_id", "string"),
    ("user_id", "integer"),
    ("timestamp", "string"),
    ("text", "string"),
    ("hashtags", "string"),
    ("mentions", "string"),
    # Remaining columns are all integer-typed counts.
    *(
        (count_column, "integer")
        for count_column in (
            "retweet_count",
            "favorite_count",
            "reply_count",
            "quote_count",
            "views",
        )
    ),
]

# Project helper that turns the pairs into the schema later passed to from_json.
tweet_schema = SparkUtils.generate_schema(headers)

### Transform binary data into string and parse the JSON payload

In [16]:
# Decode the binary Kafka value to text, parse it as JSON using tweet_schema,
# and flatten the parsed struct so each tweet field becomes its own column.
kafka_df = (
    kafka_lines
    .withColumn("value_str", kafka_lines["value"].cast("string"))
    .select(from_json("value_str", tweet_schema).alias("data"))
    .select("data.*")
)

### Configuración del "Sink" del stream

In [17]:

# Write the parsed tweets as parquet files, starting a micro-batch every
# 5 seconds. The checkpoint directory stores the stream's progress so a
# restarted query can resume from it.
# Note: the former "truncate" option was dropped; it is a console-sink display
# option and has no effect on the parquet file sink.
query = (
    kafka_df.writeStream
    .outputMode("append")
    .trigger(processingTime="5 seconds")
    .format("parquet")
    .option("path", "/home/jovyan/notebooks/data/parquet")
    .option("checkpointLocation", "/home/jovyan/notebooks/data/checkpoint")
    .start()
)

# Block for at most 600 seconds. The returned boolean says whether the query
# terminated within that time (False means it is still running).
query.awaitTermination(600)


25/05/13 01:31:09 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
25/05/13 01:31:16 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 7227 milliseconds
25/05/13 01:31:16 WARN AdminClientConfig: These configurations '[key.deserializer, value.deserializer, enable.auto.commit, max.poll.records, auto.offset.reset]' were supplied but are not used yet.
25/05/13 01:37:45 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 5022 milliseconds
25/05/13 01:38:35 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 5000 milliseconds, but spent 5011 milliseconds
                                                                                

False

In [19]:
# Stop the streaming query first, then release the Spark context. The
# try/finally guarantees the context is stopped even if stopping the query
# raises.
try:
    query.stop()
finally:
    sc.stop()