# Producer (30 points)

The producer is implemented in a separate file in this repository.

# Consumer

## Dataset and Stream creation (10 points)

In [1]:
import findspark
findspark.init()

from pyspark.sql import SparkSession

# Build (or reuse) the Spark session against the standalone cluster master.
# The Kafka source connector is requested through spark.jars.packages.
spark = (
    SparkSession.builder
    .appName("Final project: Structured Streaming")
    .master("spark://spark-master:7077")
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.13:4.0.0")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and set its log level.
sc = spark.sparkContext
sc.setLogLevel("INFO")

# Optimization: run shuffles with only 5 partitions.
spark.conf.set("spark.sql.shuffle.partitions", "5")

:: loading settings :: url = jar:file:/opt/spark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /root/.ivy2.5.2/cache
The jars for the packages stored in: /root/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-0d6cd9f0-b500-4087-9940-d9bc9352073d;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;4.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;4.0.0 in central
	found org.apache.kafka#kafka-clients;3.9.0 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.7 in central
	found org.slf4j#slf4j-api;2.0.16 in central
	found org.apache.hadoop#hadoop-client-runtime;3.4.1 in central
	found org.apache.hadoop#hadoop-client-api;3.4.1 in central
	found com.google.code.findbugs#jsr305;3.0.0 in central
	found org.scala-lang.modules#scala-parallel-collections_2.13;1.2.0

In [2]:
# Create the remote connection: a streaming reader subscribed to the
# "telemetry-project" topic on the Kafka broker.
kafka_df = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")
    .option("subscribe", "telemetry-project")
    .load()
)

# Preview only: this projection is the cell's displayed value and is not
# assigned, so kafka_df keeps its original columns.
kafka_df.selectExpr(
    "CAST(key AS STRING)",
    "CAST(value AS STRING)",
    "offset as row_index",
    "timestamp as source_timestamp",
)


DataFrame[key: string, value: string, row_index: bigint, source_timestamp: timestamp]

In [3]:
# Print the columns of the Kafka source frame as loaded, before any casting or parsing.
kafka_df.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



In [4]:
from regalado_floriano.spark_utils import SparkUtils

In [5]:
from pyspark.sql import functions as F

In [6]:
# (field name, type name) pairs describing one house record, in the order
# they are handed to SparkUtils.generate_schema.
schema_keys = (
    ("property_id", "int"),
    ("country", "string"),
    ("city", "string"),
    ("property_type", "string"),
    ("furnishing_status", "string"),
    ("property_size_sqft", "int"),
    ("price", "int"),
    ("constructed_year", "int"),
    ("previous_owners", "int"),
    ("rooms", "int"),
    ("bathrooms", "int"),
    ("garage", "bool"),
    ("garden", "bool"),
    ("crime_cases_reported", "int"),
    ("legal_cases_on_property", "bool"),
    ("customer_salary", "int"),
    ("loan_amount", "int"),
    ("loan_tenure_years", "int"),
    ("monthly_expenses", "int"),
    ("down_payment", "int"),
    ("emi_to_income_ratio", "float"),
    ("satisfaction_score", "int"),
    ("neighbourhood_rating", "int"),
    ("connectivity_score", "int"),
    ("decision", "bool"),
)
# Build the schema object from the (name, type) pairs; it is passed to from_json
# further down to parse the Kafka message value.
houses_schema = SparkUtils.generate_schema(schema_keys)

## Transformations and Actions (15 points)

In [7]:
from pyspark.sql.functions import explode, split, window

In [8]:
from pyspark.sql.functions import year, month, day, from_json, col 
from pyspark.sql.types import StructField, StringType, ArrayType
from regalado_floriano.spark_utils import SparkUtils

## Transformation 1: Reading JSON

In [9]:
from pyspark.sql.functions import year, month, day, from_json, col 
from pyspark.sql.types import StructField, StringType
from regalado_floriano.spark_utils import SparkUtils

# Decode the Kafka payload into a string column named "value_str".
raw_string_column = kafka_df["value"].cast("string").alias("value_str")
raw_string_df = kafka_df.select(raw_string_column)

# Keep the Kafka offset and timestamp next to the decoded payload.
raw_telemetry_df = kafka_df.select(raw_string_column, "offset", "timestamp")

# Parse the JSON text into a struct column ("telemetry") using the houses schema.
vg_telemetry_df = raw_telemetry_df.withColumn(
    "telemetry", from_json("value_str", houses_schema)
)

# Derive calendar parts from the Kafka record timestamp.
record_ts = vg_telemetry_df.timestamp
vg_extracted_df = (
    vg_telemetry_df
    .withColumn("year", year(record_ts))
    .withColumn("month", month(record_ts))
    .withColumn("day", day(record_ts))
)

In [10]:
# Flatten the parsed struct: one "telemetry.<field>" path per schema entry,
# followed by the Kafka metadata, the raw payload and the derived date columns.
to_select = [f"telemetry.{key}" for key, _ in schema_keys]
to_select += ["timestamp", "offset", "value_str", "year", "month", "day"]
house_df = vg_extracted_df.select(to_select)

In [16]:
from pyspark.sql.functions import year, month, day, from_json, col

## Transformation 2: Add sale info

In [17]:
# Multiply the price by the decision flag cast to int, so (assuming decision
# is parsed as a boolean) sale_total is the price when true and 0 when false.
houses_with_sales = house_df.withColumn(
    "sale_total",
    F.col("price") * F.col("decision").cast("int"),
)

## Transformation 3: Window and Group By Country and Property Type

In [36]:
# Watermark the Kafka timestamp at 1 minute, then bucket rows into 30-second
# windows sliding every 15 seconds, grouped per country and property type.
watermarked_houses = houses_with_sales.withWatermark("timestamp", "1 minute")
sliding_window = window("timestamp", "30 seconds", "15 seconds")
windowed_houses = watermarked_houses.groupBy(
    sliding_window,
    "country",
    "property_type",
)


## Transformation 4: Aggregate Sales

In [44]:
# Per-window metrics for each (country, property_type) group.
# NOTE(review): average_sale is taken over every row in the group, including
# rows whose sale_total is 0 -- confirm that this is the intended definition.
sale_total_col = F.col("sale_total")
window_metrics = windowed_houses.agg(
    F.sum(sale_total_col).alias("sale_total"),
    F.max(sale_total_col).alias("biggest_sale"),
    F.avg(sale_total_col).alias("average_sale"),
    F.avg(F.col("satisfaction_score")).alias("average_satisfaction"),
)

# Replace the window struct with flat start/end columns, and add the
# year/month of the window start (used later to partition the output).
aggregated_houses = (
    window_metrics
    .withColumn("window_start", F.col("window.start"))
    .withColumn("window_end", F.col("window.end"))
    .withColumn("year", F.year("window.start"))
    .withColumn("month", F.month("window.start"))
    .drop("window")
)

# Data Persistence

In [39]:
# Where the CSV files and the streaming checkpoint are written.
csv_output_dir = "/opt/spark/work-dir/data/final_streaming_csv/"
checkpoint_dir = "/opt/spark/work-dir/data/final_streaming_csv_delta"

# CSV sink with a header row, partitioned by year/month, triggered every 10 seconds.
csv_sink = (
    aggregated_houses.writeStream
    .format("csv")
    .option("header", "true")
    .option("path", csv_output_dir)
    .option("checkpointLocation", checkpoint_dir)
    .partitionBy("year", "month")
    .trigger(processingTime="10 seconds")
)

# Start the streaming query and keep its handle for monitoring.
query_files = csv_sink.start()


25/11/24 05:10:20 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [48]:
# Wait on the running query with a timeout argument of 15; the returned value
# is shown as the cell output.
query_files.awaitTermination(15)

False

In [46]:
# Display the most recent progress report of the streaming query
# (batch id, input rates, durations, event-time watermark, state metrics).
query_files.lastProgress

{
  "id" : "6210ade4-fc16-4d11-ae14-621fedd3e657",
  "runId" : "aff74035-903d-4d07-9bb9-f4dd368f0043",
  "name" : null,
  "timestamp" : "2025-11-24T05:16:20.001Z",
  "batchId" : 10,
  "batchDuration" : 617,
  "numInputRows" : 902,
  "inputRowsPerSecond" : 90.2,
  "processedRowsPerSecond" : 1461.9124797406807,
  "durationMs" : {
    "addBatch" : 558,
    "commitOffsets" : 21,
    "getBatch" : 0,
    "latestOffset" : 3,
    "queryPlanning" : 11,
    "triggerExecution" : 617,
    "walCommit" : 23
  },
  "eventTime" : {
    "avg" : "2025-11-24T05:16:15.008Z",
    "max" : "2025-11-24T05:16:19.996Z",
    "min" : "2025-11-24T05:16:10.013Z",
    "watermark" : "2025-11-24T05:15:10.002Z"
  },
  "stateOperators" : [ {
    "operatorName" : "stateStoreSave",
    "numRowsTotal" : 232,
    "numRowsUpdated" : 232,
    "allUpdatesTimeMs" : 48,
    "numRowsRemoved" : 468,
    "allRemovalsTimeMs" : 9,
    "commitTimeMs" : 88,
    "memoryUsedBytes" : 215624,
    "numRowsDroppedByWatermark" : 0,
    "numSh

# Power BI Report

In [49]:
from powerbiclient import Report, models

In [50]:
from powerbiclient.authentication import DeviceCodeLoginAuthentication

In [53]:
# Start the interactive device-code login; the sign-in instructions are printed
# as cell output and the resulting auth object is passed to Report below.
device_auth = DeviceCodeLoginAuthentication()

Performing device flow authentication. Please follow the instructions below.
To sign in, use a web browser to open the page https://microsoft.com/devicelogin and enter the code F54DYK7XQ to authenticate.

Device flow authentication successfully completed.
You are now logged in .

The result should be passed only to trusted code in your notebook.


In [55]:
# Power BI workspace (group) that hosts the report.
group_id = "ab09a2e8-9e06-41f3-8b0c-333cd6d5d1d4"

# The report id must be the bare GUID. The original value had an extra
# "/80b19268b93e0e02d088" URL segment appended, which produces an invalid
# path when the embed URL is requested and results in a 404.
report_id = "31aeecd4-2363-4024-ba80-02d7da8ac74c"

# Trailing segment that was attached to the report id; presumably the report
# page name copied from the browser URL -- TODO confirm. Kept for reference.
report_page_name = "80b19268b93e0e02d088"

In [57]:
# Build the embedded report widget from the workspace/report ids and the
# device-code authentication obtained earlier.
report = Report(
    group_id=group_id,
    report_id=report_id,
    auth=device_auth,
)

# Leave the widget as the cell's last expression so it is rendered.
report

Exception: Could not get embed URL: Get embed URL failed with status code 404

Embedding the report did not work: the embed URL request returned a 404, which I attribute to not having admin permissions in Power BI.