# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Spark SQL** </center>
---
**Profesor**: Pablo Camarillo Ramirez
**Alumno**: Aura Melina Gutierrez Jimenez (Ing. en Sistemas Computacionales)

# Find the PySpark Installation

In [1]:
import findspark
# Locate the Spark installation so that the `pyspark` imports in the following cells resolve.
findspark.init()

# Create SparkSession

In [2]:
import os

from pyspark.sql import SparkSession

# The default master host looks like a container-generated hostname, which changes
# whenever the cluster is recreated. Allow overriding it through the SPARK_MASTER_URL
# environment variable instead of editing the notebook; the original value stays the default.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://b1ca502cde8a:7077")

# Build (or reuse) the session for this notebook; the Spark UI is pinned to port 4040.
spark = (
    SparkSession.builder
    .appName("Examples on SparkSQL")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and silence everything below ERROR in the cell output.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/12 14:23:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [5]:
from auragutierrez.spark_utils import SparkUtils

# Describe the columns as (column name, type name) pairs and let the project helper
# turn them into a Spark schema.
schema = SparkUtils.generate_schema(
    [
        ("name", "string"),
        ("age", "int"),
        ("city", "string"),
    ]
)

In [6]:
# Show three views of the same schema, one per line: its repr, its field names, and its JSON form.
print(schema, schema.fieldNames(), schema.json(), sep="\n")

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('city', StringType(), True)])
['name', 'age', 'city']
{"fields":[{"metadata":{},"name":"name","nullable":true,"type":"string"},{"metadata":{},"name":"age","nullable":true,"type":"integer"},{"metadata":{},"name":"city","nullable":true,"type":"string"}],"type":"struct"}


## Example: Smart Factory Sensor Data

In [7]:
from datetime import datetime

# Sensor readings taken on 2025-04-26 during the 08:00 hour, listed as
# (machine id, minute of the hour, temperature).
raw_readings = [
    ("M001", 0, 75.3),
    ("M002", 5, 68.7),
    ("M001", 10, 76.1),
    ("M003", 15, 72.4),
    ("M002", 20, 69.8),
    ("M001", 25, 77.5),
    ("M003", 30, 73.2),
    ("M002", 35, 70.1),
    ("M001", 40, 78.0),
    ("M003", 45, 74.6),
]

# Expand each minute into a full timestamp so every row lines up with the schema columns.
factory_data = [
    (machine_id, datetime(2025, 4, 26, 8, minute, 0), temp)
    for machine_id, minute, temp in raw_readings
]

# Column layout of the sensor table: machine id, reading timestamp, temperature.
factory_schema = SparkUtils.generate_schema(
    [
        ("machine_id", "string"),
        ("sensor_timestamp", "timestamp"),
        ("temp", "float"),
    ]
)
df_factory = spark.createDataFrame(data=factory_data, schema=factory_schema)
df_factory.show()

                                                                                

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M002|2025-04-26 08:05:00|68.7|
|      M001|2025-04-26 08:10:00|76.1|
|      M003|2025-04-26 08:15:00|72.4|
|      M002|2025-04-26 08:20:00|69.8|
|      M001|2025-04-26 08:25:00|77.5|
|      M003|2025-04-26 08:30:00|73.2|
|      M002|2025-04-26 08:35:00|70.1|
|      M001|2025-04-26 08:40:00|78.0|
|      M003|2025-04-26 08:45:00|74.6|
+----------+-------------------+----+



## Get the average temperature per machine

In [8]:
from pyspark.sql.functions import avg, col

# Group the readings by machine and display the mean temperature of each group.
(
    df_factory
    .groupBy(col("machine_id"))
    .agg(avg(col("temp")))
    .show()
)



+----------+-----------------+
|machine_id|        avg(temp)|
+----------+-----------------+
|      M002|69.53333282470703|
|      M003|73.39999898274739|
|      M001|76.72500038146973|
+----------+-----------------+



                                                                                

## Find the maximum and minimum temperature per machine

In [9]:
from pyspark.sql import functions as F

# Reach the aggregates through the module alias: importing `min` and `max` by name would
# shadow Python's built-in min()/max() for every cell executed afterwards.
# Per machine, show the lowest reading (aliased as min_temp) and the highest reading.
df_factory.groupBy(col("machine_id")).agg(
    F.min("temp").alias("min_temp"),
    F.max("temp"),
).show()



+----------+--------+---------+
|machine_id|min_temp|max(temp)|
+----------+--------+---------+
|      M002|    68.7|     70.1|
|      M003|    72.4|     74.6|
|      M001|    75.3|     78.0|
+----------+--------+---------+



                                                                                

## Count of readings per machine

In [10]:
# Number of sensor readings recorded for each machine.
df_factory.groupBy("machine_id").count().show()

+----------+-----+
|machine_id|count|
+----------+-----+
|      M002|    3|
|      M003|    3|
|      M001|    4|
+----------+-----+



## Machine with the highest temperature

In [12]:
# Sort the readings from hottest to coldest and display only the first row.
df_factory.sort(col("temp").desc()).show(n=1)

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+
only showing top 1 row


In [None]:
# Shut down through the session object rather than the raw context: SparkSession.stop()
# stops the underlying SparkContext as part of tearing the session down.
spark.stop()