# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025 - Juan Carlos Alonso Gonzalez** </center>
---
### <center> **Examples on Spark SQL** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Find the PySpark Installation

In [1]:
import findspark
findspark.init()

# Create SparkSession

In [11]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on SparkSQL") \
    .master("spark://13d9d31b0e77:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()


sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [3]:
from juanalonso.spark_utils import SparkUtils
schema = SparkUtils.generate_schema([("name", "string"), ("age", "int"), ("city", "string")])

StructType([StructField('name', StringType(), True), StructField('age', IntegerType(), True), StructField('city', StringType(), True)])

## Example: Smart Factory Sensor Data

In [12]:
print("A")
import juanalonso.spark_utils as spark_utils
print("B")
import importlib
importlib.reload(spark_utils)  # comment this out if it hangs here
print("C")
SparkUtils = spark_utils.SparkUtils
print("D")

from datetime import datetime
print("E")

factory_data = [("M001", datetime(2025,4,26,8,0,0), 75.3)]
print("F")

factory_schema = SparkUtils.generate_schema([
    ("machine_id","StringType"),
    ("sensor_timestamp","TimestampType"),
    ("temp","FloatType")
])
print("G")

df_factory = spark.createDataFrame(factory_data, factory_schema)
print("H")
df_factory.show()
print("I")

A
B
C
D
E
F
G
H


ERROR:root:KeyboardInterrupt while sending command.                 (0 + 0) / 1]
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
ERROR:py4j.clientserver:Exception occurred while shutting down connection
Traceback (most recent call last):
  File "/opt/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/opt/spark/python/lib/py4j-0.10.9.9-src.zip/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705

KeyboardInterrupt: 

## Explore Schema

In [25]:
from pyspark.sql import functions as F

df_factory.printSchema()

root
 |-- machine_id: string (nullable = true)
 |-- sensor_timestamp: timestamp (nullable = true)
 |-- temp: float (nullable = true)



## Average temperature per machine


### Explore the schema of the DataFrame

In [13]:
df_factory.printSchema()

root
 |-- machine_id: string (nullable = true)
 |-- sensor_timestamp: timestamp (nullable = true)
 |-- temp: float (nullable = true)



### Get the average temperature per machine

In [16]:
from pyspark.sql.functions import col, avg
df_factory.groupBy(col("machine_id")).agg(avg("temp")).show()

+----------+-----------------+
|machine_id|        avg(temp)|
+----------+-----------------+
|      M002|69.53333282470703|
|      M003|73.39999898274739|
|      M001|76.72500038146973|
+----------+-----------------+



### Find the maximum and minimum temperature per machine

In [19]:
from pyspark.sql.functions import min, max
df_factory.groupBy(col("machine_id")).agg(
    min("temp").alias("min_temp"),
    max("temp")
).show()

+----------+--------+---------+
|machine_id|min_temp|max(temp)|
+----------+--------+---------+
|      M002|    68.7|     70.1|
|      M003|    72.4|     74.6|
|      M001|    75.3|     78.0|
+----------+--------+---------+



### Filter records above a temperature threshold temp > 75)

In [20]:
df_factory.filter(col("temp") > 75).show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+



### Count of readings per machine

In [None]:
df_factory.groupBy(col(" machine_id")).count().show()

+----------+-----+
|machine_id|count|
+----------+-----+
|      M002|    3|
|      M003|    3|
|      M001|    4|
+----------+-----+



### Machine with the highest temperature

In [26]:
df_factory.orderBy(col("temp"), ascending=False).show()


+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:40:00|78.0|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:00:00|75.3|
|      M003|2025-04-26 08:45:00|74.6|
|      M003|2025-04-26 08:30:00|73.2|
|      M003|2025-04-26 08:15:00|72.4|
|      M002|2025-04-26 08:35:00|70.1|
|      M002|2025-04-26 08:20:00|69.8|
|      M002|2025-04-26 08:05:00|68.7|
+----------+-------------------+----+



In [27]:
df_factory.filter(col("machine_id") == "M001").show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+



In [28]:
df_factory.groupBy("machine_id").agg(F.avg("temp")).show()

## Maximum and minimum temperature per machine

In [None]:
df_factory.groupBy("machine_id").agg(F.max("temp"), F.min("temp")).show()

## Filter records above a temperature threshold

In [None]:
df_factory.filter(df_factory.temp > 75).show()


## Count the number of readings per machine

In [None]:
df_factory.groupBy("machine_id").count().show()

## Find the machine with the highest temperature

### Explore the schema of the DataFrame

In [13]:
df_factory.printSchema()

root
 |-- machine_id: string (nullable = true)
 |-- sensor_timestamp: timestamp (nullable = true)
 |-- temp: float (nullable = true)



### Get the average temperature per machine

In [16]:
from pyspark.sql.functions import col, avg
df_factory.groupBy(col("machine_id")).agg(avg("temp")).show()

+----------+-----------------+
|machine_id|        avg(temp)|
+----------+-----------------+
|      M002|69.53333282470703|
|      M003|73.39999898274739|
|      M001|76.72500038146973|
+----------+-----------------+



### Find the maximum and minimum temperature per machine

In [19]:
from pyspark.sql.functions import min, max
df_factory.groupBy(col("machine_id")).agg(
    min("temp").alias("min_temp"),
    max("temp")
).show()

+----------+--------+---------+
|machine_id|min_temp|max(temp)|
+----------+--------+---------+
|      M002|    68.7|     70.1|
|      M003|    72.4|     74.6|
|      M001|    75.3|     78.0|
+----------+--------+---------+



### Filter records above a temperature threshold temp > 75)

In [20]:
df_factory.filter(col("temp") > 75).show()

+----------+-------------------+----+
|machine_id|   sensor_timestamp|temp|
+----------+-------------------+----+
|      M001|2025-04-26 08:00:00|75.3|
|      M001|2025-04-26 08:10:00|76.1|
|      M001|2025-04-26 08:25:00|77.5|
|      M001|2025-04-26 08:40:00|78.0|
+----------+-------------------+----+



### Count of readings per machine

In [None]:
df_factory.orderBy(F.desc("temp")).limit(1).show()

In [14]:
sc.stop()