# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Generate schemas dynamically** </center>
---
**Professor**: Pablo Camarillo Ramirez <br>
**Student**: Jaime Enrique Galindo Villegas

# Find the PySpark Installation

In [20]:
import findspark
# Locate the Spark installation first so the `pyspark` imports in the
# following cells can be resolved.
findspark.init()

In [21]:
from jaime_galindo.spark_utils import SparkUtils

In [22]:
# Each column is described as a (column_name, type_name) pair; the helper
# turns the list into the StructType displayed below.
people_columns = [("name", "string"), ("age", "int"), ("city", "string")]
schema = SparkUtils.generate_schema(people_columns)

# Bare expression so the notebook renders the resulting schema.
schema

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('city', StringType(), True)])

# Create SparkSession

In [23]:
import os

from pyspark.sql import SparkSession

# The master URL was hard-coded to what looks like an auto-generated container
# hostname, which stops resolving as soon as the cluster is recreated.
# Allow overriding it through the SPARK_MASTER_URL environment variable; the
# previous value remains the default so existing setups keep working.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://a8d253fcd1c5:7077")

# Parenthesized chain instead of backslash continuations: a stray trailing
# space after a backslash would otherwise be a syntax error.
spark = (
    SparkSession.builder
    .appName("Examples on SparkSQL")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

sc = spark.sparkContext
# Only report errors so Spark log lines do not clutter the cell outputs.
sc.setLogLevel("ERROR")

In [24]:
from datetime import datetime

# Compact table of sensor readings: (machine id, minute past 08:00, temperature).
# All readings were taken on 2025-04-26 between 08:00 and 08:45.
sensor_readings = [
    ("M001", 0, 75.3),
    ("M002", 5, 68.7),
    ("M001", 10, 76.1),
    ("M003", 15, 72.4),
    ("M002", 20, 69.8),
    ("M001", 25, 77.5),
    ("M003", 30, 73.2),
    ("M002", 35, 70.1),
    ("M001", 40, 78.0),
    ("M003", 45, 74.6),
]

# Expand every row into the (machine_id, sensor_timestamp, temp) tuple that
# matches the schema declared below.
factory_data = [
    (machine, datetime(2025, 4, 26, 8, minute, 0), temperature)
    for machine, minute, temperature in sensor_readings
]

factory_columns = [
    ("machine_id", "string"),
    ("sensor_timestamp", "timestamp"),
    ("temp", "float"),
]
factory_schema = SparkUtils.generate_schema(factory_columns)

df_factory = spark.createDataFrame(factory_data, factory_schema)
df_factory.show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M002|2025-04-26 08:05:00|68.7|
|      M001|2025-04-26 08:10:00|76.1|
|      M003|2025-04-26 08:15:00|72.4|
|      M002|2025-04-26 08:20:00|69.8|
|      M001|2025-04-26 08:25:00|77.5|
|      M003|2025-04-26 08:30:00|73.2|
|      M002|2025-04-26 08:35:00|70.1|
|      M001|2025-04-26 08:40:00|78.0|
|      M003|2025-04-26 08:45:00|74.6|
+----------+-------------------+----+



# Explore the schema of the DataFrame

In [25]:
df_factory.printSchema()

root
 |-- machine_id: string (nullable = true)
 |-- sensor_timestamp: timestamp (nullable = true)
 |-- temp: float (nullable = true)



# Get the average temperature per machine


In [26]:
from pyspark.sql.functions import avg, col

# One row per machine with the mean of its temperature readings.
# `col` stays imported here because the following cells rely on it.
avg_temp_by_machine = df_factory.groupBy(col("machine_id")).agg(avg("temp"))
avg_temp_by_machine.show()

+----------+-----------------+
|machine_id|        avg(temp)|
+----------+-----------------+
|      M002|69.53333282470703|
|      M003|73.39999898274739|
|      M001|76.72500038146973|
+----------+-----------------+



# Find the maximum and minimum temperature per machine

In [27]:
# Import the functions module under a namespace instead of
# `from pyspark.sql.functions import min, max`: that form overwrites Python's
# built-in min() and max() for every later cell in the kernel session.
from pyspark.sql import functions as F

# Lowest and highest temperature recorded for each machine. The output column
# names are unchanged: `min_temp` and Spark's default `max(temp)`.
df_factory.groupBy(col("machine_id")).agg(
    F.min("temp").alias("min_temp"),
    F.max("temp"),
).show()

+----------+--------+---------+
|machine_id|min_temp|max(temp)|
+----------+--------+---------+
|      M002|    68.7|     70.1|
|      M003|    72.4|     74.6|
|      M001|    75.3|     78.0|
+----------+--------+---------+



# Filter records above a temperature threshold (temp > 75).


In [28]:
# Keep only the readings strictly hotter than the threshold.
temp_threshold = 75
hot_readings = df_factory.filter(col("temp") > temp_threshold)
hot_readings.show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+



# Count of readings per machine


In [29]:
# Number of sensor readings recorded for each machine.
readings_per_machine = df_factory.groupBy(col("machine_id")).count()
readings_per_machine.show()

+----------+-----+
|machine_id|count|
+----------+-----+
|      M002|    3|
|      M003|    3|
|      M001|    4|
+----------+-----+



# Machine with the highest temperature


In [32]:
# Sort the readings from hottest to coldest, then show only the top row:
# the reading (and therefore the machine) with the highest temperature.
readings_hottest_first = df_factory.orderBy(col("temp").desc())
readings_hottest_first.limit(1).show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+

