# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 19th 2025

**Student Name**: Luis Adrian Bravo Ramirez

**Professor**: Pablo Camarillo Ramirez

# Find the PySpark Installation & Create SparkSession

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on data sources (Files)") \
    .master("spark://4f3c6067fdf1:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/19 14:18:58 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Define the Schema & Load CSV

In [3]:
from pcamarillor.spark_utils import SparkUtils
airlines_schema_columns = [("index", "int"), 
     ("airline", "string"), 
     ("flight", "string"),
     ("source_city", "string"),
     ("departure_time", "string"),
     ("stops", "string"),
     ("arrival_time", "string"),
     ("destination_city", "string"),
     ("class", "string"),
     ("duration", "float"),
     ("days_left", "int"),
     ("price", "int")
     ]
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

In [18]:
df_airlines = spark.read \
                .option("header", "true") \
                .schema(airlines_schema) \
                .csv("/opt/spark/work-dir/data/airline/")

df_airlines.show(n=10)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
|    5| Vistara| UK-945|

# Data Cleaning, Columns Creations and Transformation

- Drop unnecessary columns. Count how many null values dataset have before/after
cleaning process.

In [19]:
from pyspark.sql.functions import trim, col, count, isnull, when

# Get number of null values for each column before cleaning 
print(f"Number of records before cleaning: {df_airlines.count()}")
df_airlines.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

# Perform data cleaning with trim (column by column)
airlines_clean = df_airlines \
        .dropDuplicates(["index"]) \
        .withColumn("airline", trim("airline")) \
        .withColumn("source_city", trim("source_city")) \
        .withColumn("destination_city", trim("destination_city")) \
        .filter(col("price").isNotNull())

# Perform data cleaning with dropna (make sure that it erases null values)
airlines_clean = airlines_clean.dropna()

print(f"\nNumber of records after cleaning with trim & dropna: {airlines_clean.count()}\n")

airlines_clean.show(n=10)

Number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                


Number of records after cleaning with trim & dropna: 300153





+-----+---------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|index|  airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|
+-----+---------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|    1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|     Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    3|  Vistara| UK-995|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    5|  Vistara| UK-945|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.33|        1| 5955|
|    6|  Vistara| UK-927|      Delhi|       Morning| zero|     Morning|          Mumbai|Economy|    2.08|        1| 6060|
|    9| GO_FIRST| G8-336|      Delhi|     Afternoon| zero|     Evening|          Mumbai|Economy|    2.25|        1| 5954|
|   12|   Indigo|6E-5001

                                                                                

- Normalize categorical values: map ”zero” → 0, ”one” → 1, etc. in stops.

In [20]:
from pyspark.sql.functions import when, lit

airlines_clean = airlines_clean.withColumn(
    "stops",
    when(col("stops") == "zero", lit(0))
    .when(col("stops") == "one", lit(1))
    .when(col("stops") == "two", lit(2))
    .otherwise(lit(3))
)

airlines_clean.show(n=20)

airlines_clean.select("stops").distinct().show()


                                                                                

+-----+---------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|index|  airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|
+-----+---------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|    1| SpiceJet|SG-8157|      Delhi| Early_Morning|    0|     Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    3|  Vistara| UK-995|      Delhi|       Morning|    0|   Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    5|  Vistara| UK-945|      Delhi|       Morning|    0|   Afternoon|          Mumbai|Economy|    2.33|        1| 5955|
|    6|  Vistara| UK-927|      Delhi|       Morning|    0|     Morning|          Mumbai|Economy|    2.08|        1| 6060|
|    9| GO_FIRST| G8-336|      Delhi|     Afternoon|    0|     Evening|          Mumbai|Economy|    2.25|        1| 5954|
|   12|   Indigo|6E-5001



+-----+
|stops|
+-----+
|    1|
|    3|
|    0|
+-----+



                                                                                

- Create a new column called route: ”Delhi → Mumbai” from source city and
destination city.

- Transform departure time and arrival time to numerical category (Morning,
Afternoon, etc.), then encode as numbers (0=Early Morning, 1=Morning, etc.)

- Add a new column is expensive: when(price > 6000, True).otherwise(False).

# Aggregations

- Get the average price per airline

- Average duration per route

- Minimum and maximum price per airline

- Count flights by departure time category