# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Spark SQL** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Find the PySpark Installation

In [1]:
# Run findspark before any pyspark import in the cells below.
# Per the findspark documentation, init() locates the Spark installation and
# makes its pyspark package importable -- TODO confirm SPARK_HOME is set in this environment.
import findspark
findspark.init()

# Create SparkSession

In [2]:
import os

from pyspark.sql import SparkSession

# The master URL was hard-coded to what looks like a container-generated
# hostname, which would stop resolving if that host is recreated. Allow it to
# be overridden through the SPARK_MASTER_URL environment variable; the
# original address remains the default, so existing setups keep working.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://7d7f1cd1d5d2:7077")

# The application name now matches this notebook's topic (Spark SQL) so the
# job can be identified by name in the Spark UI.
spark = (
    SparkSession.builder
    .appName("Examples on Spark SQL")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and lower the log verbosity to errors only.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/12 15:16:45 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from lib.bernardoorozco.spark_utils import SparkUtils

# Build a schema from (column name, type name) pairs with the project helper.
# NOTE(review): `schema` is not referenced by any later cell in this notebook.
schema = SparkUtils.generate_schema(
    [
        ("name", "string"),
        ("age", "int"),
        ("city", "string"),
    ]
)

## Example: Smart Factory Sensor Data

### Explore the schema of the DataFrame

In [4]:
from datetime import datetime, timedelta

# Readings in arrival order as (machine_id, temp). One reading was taken every
# five minutes, starting at 08:00:00 on 2025-04-26.
first_reading_at = datetime(2025, 4, 26, 8, 0, 0)
sensor_readings = [
    ("M001", 75.3),
    ("M002", 68.7),
    ("M001", 76.1),
    ("M003", 72.4),
    ("M002", 69.8),
    ("M001", 77.5),
    ("M003", 73.2),
    ("M002", 70.1),
    ("M001", 78.0),
    ("M003", 74.6),
]

# Rebuild the (machine_id, sensor_timestamp, temp) rows in the column order
# declared by the schema below.
factory_data = [
    (machine_id, first_reading_at + timedelta(minutes=5 * position), temp)
    for position, (machine_id, temp) in enumerate(sensor_readings)
]

factory_schema = SparkUtils.generate_schema(
    [
        ("machine_id", "string"),
        ("sensor_timestamp", "timestamp"),
        ("temp", "float"),
    ]
)
df_factory = spark.createDataFrame(factory_data, factory_schema)
df_factory.show()

                                                                                

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M002|2025-04-26 08:05:00|68.7|
|      M001|2025-04-26 08:10:00|76.1|
|      M003|2025-04-26 08:15:00|72.4|
|      M002|2025-04-26 08:20:00|69.8|
|      M001|2025-04-26 08:25:00|77.5|
|      M003|2025-04-26 08:30:00|73.2|
|      M002|2025-04-26 08:35:00|70.1|
|      M001|2025-04-26 08:40:00|78.0|
|      M003|2025-04-26 08:45:00|74.6|
+----------+-------------------+----+



In [5]:
# Aggregate functions used by the cells below.
# NOTE: importing max and min by name shadows Python's built-in max()/min()
# in every cell that runs after this one.
from pyspark.sql.functions import avg, max, min, count

### Get the average temperature per machine

In [6]:
# Group the readings by machine and average the temperature column.
dffilter = (
    df_factory
    .groupBy("machine_id")
    .agg(avg("temp").alias("avg_temp"))
)
dffilter.show()

[Stage 2:>                                                          (0 + 1) / 2]

+----------+-----------------+
|machine_id|         avg_temp|
+----------+-----------------+
|      M002|69.53333282470703|
|      M003|73.39999898274739|
|      M001|76.72500038146973|
+----------+-----------------+



                                                                                

### Find the maximum and minimum temperature per machine

In [7]:
# Highest and lowest temperature recorded for each machine.
# max/min here are the Spark column functions imported earlier, not the built-ins.
# NOTE(review): the alias "maxtem" looks like a typo for "maxtemp"; it is kept
# unchanged so the column names match the recorded output.
dffilter2 = (
    df_factory
    .groupBy("machine_id")
    .agg(
        max("temp").alias("maxtem"),
        min("temp").alias("mintemp"),
    )
)
dffilter2.show()

+----------+------+-------+
|machine_id|maxtem|mintemp|
+----------+------+-------+
|      M002|  70.1|   68.7|
|      M003|  74.6|   72.4|
|      M001|  78.0|   75.3|
+----------+------+-------+



### Filter records above a temperature threshold (temp > 75)

In [8]:
# Temperature threshold used by the filter below.
TEMP_THRESHOLD = 75

# DataFrame.show() only prints and returns None, so chaining it into the
# assignment left dffilter3 bound to None. Keep the filtered DataFrame in
# dffilter3 and display it in a separate statement.
dffilter3 = df_factory.filter(df_factory.temp > TEMP_THRESHOLD)
dffilter3.show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+



### Count of readings per machine

In [9]:
# Number of readings reported by each machine.
dffilter4 = (
    df_factory
    .groupBy("machine_id")
    .agg(count("*").alias("readings"))
)
dffilter4.show()

+----------+--------+
|machine_id|readings|
+----------+--------+
|      M002|       3|
|      M003|       3|
|      M001|       4|
+----------+--------+



### Machine with the highest temperature

In [10]:
# Take each machine's peak temperature, sort descending and keep the top row.
dffilter5 = (
    df_factory
    .groupBy("machine_id")
    .agg(max("temp").alias("max_temp"))
    .orderBy("max_temp", ascending=False)
    .limit(1)
)
dffilter5.show()

+----------+--------+
|machine_id|max_temp|
+----------+--------+
|      M001|    78.0|
+----------+--------+



In [11]:
# Shut down through the SparkSession created at the top of the notebook;
# stopping the session also stops its underlying SparkContext.
spark.stop()