# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Program: _Ingeniería en Sistemas Computacionales_  </center>
---
### <center> **Autumn 2025** </center>
---

**Activity**: Generate Schema

**Date**: Aug 29, 2025

**Student Name**: Luis Angel Santana Hernandez

**Professor**: Pablo Camarillo Ramirez 

# Find the PySpark Installation

In [95]:
# findspark.init() is expected to locate the local Spark installation and make
# `pyspark` importable for the cells below (see the findspark docs), so run this first.
import findspark
findspark.init()

# Create SparkSession

In [96]:
import os

from pyspark.sql import SparkSession

# The master URL was hard-coded to a container-specific hostname, which ties the
# notebook to one particular cluster. Allow overriding it through the
# SPARK_MASTER_URL environment variable; the original URL stays as the default so
# the notebook behaves exactly as before when the variable is not set.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://4dcc0176a67d:7077")

# Build (or reuse) the session for this notebook.
spark = (
    SparkSession.builder
    .appName("Examples on SparkSQL")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and silence everything below ERROR
# so the cell outputs are not flooded with log lines.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [97]:
from luis_santana.spark_utils import SparkUtils

# (column name, type name) pairs handed to the project's schema helper.
schema = SparkUtils.generate_schema(
    [
        ("name", "string"),
        ("age", "int"),
        ("city", "string"),
    ]
)
# Bare last expression so the notebook displays the generated schema.
schema

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('city', StringType(), True)])

## Example: Smart Factory Sensor Data

In [98]:
from datetime import datetime

# Compact sensor readings: (machine id, minute past 08:00 on 2025-04-26, temperature).
_raw_readings = [
    ("M001", 0, 75.3),
    ("M002", 5, 68.7),
    ("M001", 10, 76.1),
    ("M003", 15, 72.4),
    ("M002", 20, 69.8),
    ("M001", 25, 77.5),
    ("M003", 30, 73.2),
    ("M002", 35, 70.1),
    ("M001", 40, 78.0),
    ("M003", 45, 74.6),
]

# Expand each reading into the (machine_id, timestamp, temp) tuple Spark will ingest.
factory_data = [
    (machine, datetime(2025, 4, 26, 8, minute, 0), temperature)
    for machine, minute, temperature in _raw_readings
]

# Column names and type names for the sensor table, in the same order as the tuples.
factory_schema = SparkUtils.generate_schema(
    [
        ("machine_id", "string"),
        ("sensor_timestamp", "timestamp"),
        ("temp", "float"),
    ]
)
df_factory = spark.createDataFrame(factory_data, factory_schema)
df_factory.show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M002|2025-04-26 08:05:00|68.7|
|      M001|2025-04-26 08:10:00|76.1|
|      M003|2025-04-26 08:15:00|72.4|
|      M002|2025-04-26 08:20:00|69.8|
|      M001|2025-04-26 08:25:00|77.5|
|      M003|2025-04-26 08:30:00|73.2|
|      M002|2025-04-26 08:35:00|70.1|
|      M001|2025-04-26 08:40:00|78.0|
|      M003|2025-04-26 08:45:00|74.6|
+----------+-------------------+----+



## Filtering and counting Data

In [99]:
from pyspark.sql.functions import col

# Keep only the readings hotter than 100 (`where` is an alias of `filter`).
is_overheated = col("temp") > 100
filtered_df = df_factory.where(is_overheated)
filtered_df.show()

+----------+----------------+----+
|machine_id|sensor_timestamp|temp|
+----------+----------------+----+
+----------+----------------+----+



In [100]:
# count() is a Spark action: it runs a job over df_factory and returns the number of rows.
record_count = df_factory.count()
print(f"Total records: {record_count}")



Total records: 10


                                                                                

## Sorting and Grouping data

In [101]:
# Hottest readings first: sort on the temperature column in descending order.
temp_descending = col("temp").desc()
ordered_df = df_factory.sort(temp_descending)
ordered_df.show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:40:00|78.0|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:00:00|75.3|
|      M003|2025-04-26 08:45:00|74.6|
|      M003|2025-04-26 08:30:00|73.2|
|      M003|2025-04-26 08:15:00|72.4|
|      M002|2025-04-26 08:35:00|70.1|
|      M002|2025-04-26 08:20:00|69.8|
|      M002|2025-04-26 08:05:00|68.7|
+----------+-------------------+----+



In [102]:
# Bucket the readings by machine, then count the rows in each bucket.
readings_by_machine = df_factory.groupBy("machine_id")
grouped_df = readings_by_machine.count()
grouped_df.show()

+----------+-----+
|machine_id|count|
+----------+-----+
|      M002|    3|
|      M003|    3|
|      M001|    4|
+----------+-----+



                                                                                

## Aggregations

In [103]:
# NOTE: this import shadows Python's built-in min() and max() for the rest of the
# notebook; the later activity cells rely on these Spark versions being in scope.
from pyspark.sql.functions import avg, min, max

# One aggregate expression per statistic, each given an explicit output column name.
temp_stats = [
    avg("temp").alias("avg_temp"),
    min("temp").alias("min_temp"),
    max("temp").alias("max_temp"),
]
agg_df = df_factory.groupBy("machine_id").agg(*temp_stats)
agg_df.show()





+----------+-----------------+--------+--------+
|machine_id|         avg_temp|min_temp|max_temp|
+----------+-----------------+--------+--------+
|      M002|69.53333282470703|    68.7|    70.1|
|      M003|73.39999898274739|    72.4|    74.6|
|      M001|76.72500038146973|    75.3|    78.0|
+----------+-----------------+--------+--------+



                                                                                

## Activity
Using SparkSQL, you need to:
1. Explore the schema of the DataFrame
2. Get the average temperature per machine
3. Find the maximum and minimum temperature per machine
4. Filter records above a temperature threshold (temp > 75).
5. Count of readings per machine
6. Machine with the highest temperature


### 1. Explore the schema of the DataFrame


In [104]:
# 1. Explore the schema of the DataFrame
# printSchema() prints every column's name, type and nullability as a tree.
df_factory.printSchema()


root
 |-- machine_id: string (nullable = true)
 |-- sensor_timestamp: timestamp (nullable = true)
 |-- temp: float (nullable = true)



### 2. Get the average temperature per machine


In [105]:
# 2. Get the average temperature per machine
# Name the aggregate expression first, then apply it to each machine's group.
mean_temp = avg("temp").alias("avg_temp")
avg_df = df_factory.groupBy("machine_id").agg(mean_temp)
avg_df.show()




+----------+-----------------+
|machine_id|         avg_temp|
+----------+-----------------+
|      M002|69.53333282470703|
|      M003|73.39999898274739|
|      M001|76.72500038146973|
+----------+-----------------+



                                                                                

### 3. Find the maximum and minimum temperature per machine

In [106]:
# 3. Find the maximum and minimum temperature per machine
# `max` and `min` here are the Spark column functions imported earlier, not the builtins.
temp_extremes = [
    max("temp").alias("max_temp"),
    min("temp").alias("min_temp"),
]
max_min = df_factory.groupBy("machine_id").agg(*temp_extremes)
max_min.show()


+----------+--------+--------+
|machine_id|max_temp|min_temp|
+----------+--------+--------+
|      M002|    70.1|    68.7|
|      M003|    74.6|    72.4|
|      M001|    78.0|    75.3|
+----------+--------+--------+



                                                                                

### 4. Filter records above a temperature threshold (temp > 75).


In [107]:
# 4. Filter records above a temperature threshold (temp > 75).
# Strictly greater than 75, so a reading of exactly 75 would be excluded.
exceeds_threshold = col("temp") > 75
above_threshold = df_factory.where(exceeds_threshold)
above_threshold.show()



+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+



### 5. Count of readings per machine


In [108]:
# 5. Count of readings per machine
# Group the rows by machine, then count how many readings each machine produced.
per_machine = df_factory.groupBy("machine_id")
count_readings = per_machine.count()
count_readings.show()


+----------+-----+
|machine_id|count|
+----------+-----+
|      M002|    3|
|      M003|    3|
|      M001|    4|
+----------+-----+



                                                                                

### 6. Machine with the highest temperature

In [111]:
# 6. Machine with the highest temperature
# First the overall maximum on its own (a single-row, single-column result).
overall_max = max("temp").alias("highest_temp")
highest_temp = df_factory.agg(overall_max)
highest_temp.show()


# Then the full reading that holds it: sort hottest-first and keep the top row,
# which also reveals the machine id and the timestamp.
hottest_first = df_factory.sort(col("temp").desc())
highest_temp_machine = hottest_first.limit(1)
highest_temp_machine.show()

                                                                                

+------------+
|highest_temp|
+------------+
|        78.0|
+------------+

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+



                                                                                

In [112]:
# Stop the SparkContext obtained from the session above now that the notebook is done.
sc.stop()