# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 21st, 2025

**Student Name**: Luis Angel Santana Hernandez

**Professor**: Pablo Camarillo Ramirez

# Find the PySpark Installation

In [73]:
# Locate the Spark installation before anything from `pyspark` is imported;
# the `pyspark` imports in the following cells come after this call.
import findspark
findspark.init()

# Create SparkSession

In [74]:
import os

from pyspark.sql import SparkSession

# The master host below looks like a generated container hostname, which would
# change whenever the cluster is recreated. Allow overriding it through the
# SPARK_MASTER_URL environment variable; the original address stays the default.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://4dcc0176a67d:7077")

# Application name matches this lab so the job is identifiable in the Spark UI.
spark = (
    SparkSession.builder
    .appName("Lab 03 - Data Cleaning and Transformation Pipeline")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying context and silence everything below ERROR.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

## Define the schema

In [75]:
from luis_santana.spark_utils import SparkUtils

# (column name, type name) pairs describing the airline dataset. Later cells
# iterate over this list to build one null-count aggregate per column.
airlines_schema_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]

# Turn the pairs into the schema object passed to the CSV reader, then display it.
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

## Load CSV

In [76]:
# Configure the reader: the files carry a header row, and the explicit schema
# defined above is applied to the columns.
airline_csv_reader = (
    spark.read
    .option("header", "true")
    .schema(airlines_schema)
)

# Load the airline data directory into a DataFrame.
df_airlines = airline_csv_reader.csv("/opt/spark/work-dir/data/airline/")

# Preview the first five rows.
df_airlines.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

## Data Cleaning

In [77]:
from pyspark.sql.functions import trim, col, count, isnull, when

print(f"number of records before cleaning: {df_airlines.count()}")

# Number of null values per column before cleaning.
# when() without otherwise() yields null for non-matching rows and count()
# skips nulls, so each aggregate counts the rows where that column is null.
# A single isNull() test is enough: isnull(c) and col(c).isNull() are the same check.
null_counts_before = [
    count(when(col(name).isNull(), name)).alias(name)
    for name, _dtype in airlines_schema_columns
]
df_airlines.select(null_counts_before).show()



                                                                                

number of records before cleaning: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

In [78]:
# Column-by-column cleaning: keep one row per `index`, strip surrounding
# whitespace from the airline and city names, and drop rows without a price.
airlines_clean = (
    df_airlines
    .dropDuplicates(["index"])
    .withColumn("airline", trim("airline"))
    .withColumn("source_city", trim("source_city"))
    .withColumn("destination_city", trim("destination_city"))
    .filter(col("price").isNotNull())
)

print(f"number of records after cleaning with trim: {airlines_clean.count()}")

# Null count per column after cleaning. One isNull() test per column is
# sufficient; isnull(c) and col(c).isNull() express the same condition.
null_counts_after_trim = [
    count(when(col(name).isNull(), name)).alias(name)
    for name, _dtype in airlines_schema_columns
]
airlines_clean.select(null_counts_after_trim).show()



                                                                                

number of records after cleaning with trim: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

In [79]:
# Alternative cleaning: drop every row that has a null in any column.
# NOTE(review): this starts again from df_airlines, and the transformation
# cells below build on airlines_clean_v2, so the de-duplicated and trimmed
# `airlines_clean` frame is not part of the downstream pipeline. Confirm that
# is intended.
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")

# Null count per column after dropna(). One isNull() test per column is
# sufficient; isnull(c) and col(c).isNull() express the same condition.
null_counts_after_dropna = [
    count(when(col(name).isNull(), name)).alias(name)
    for name, _dtype in airlines_schema_columns
]
airlines_clean_v2.select(null_counts_after_dropna).show()

                                                                                

number of records after cleaning with dropna: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

## Normalize categorical values: map "zero" → 0, "one" → 1, etc. in the `stops` column

In [None]:
# List the distinct labels present in `stops` before mapping them to numbers.
distinct_stops = airlines_clean_v2.select("stops").distinct()
distinct_stops.show()



+-----------+
|      stops|
+-----------+
|two_or_more|
|        one|
|       zero|
+-----------+



                                                                                

In [81]:
from pyspark.sql.functions import when, lit

# 1st transformation: replace the textual stop labels with integers.
# "two_or_more" is collapsed to 2. There is no otherwise() branch, so a label
# outside these three would become null.
stops_label = airlines_clean_v2.stops
stops_as_number = (
    when(stops_label == "zero", lit(0))
    .when(stops_label == "one", lit(1))
    .when(stops_label == "two_or_more", lit(2))
)
airlines_t1 = airlines_clean_v2.withColumn("stops", stops_as_number)

# Check the values that remain after the mapping.
airlines_t1.select("stops").distinct().show()



+-----+
|stops|
+-----+
|    1|
|    2|
|    0|
+-----+



                                                                                

# Create a new column called `route` (e.g. "Delhi → Mumbai") from the source city and destination city.

In [None]:
from pyspark.sql.functions import concat_ws

# 2nd transformation: join source and destination city with " -> " into `route`.
route_label = concat_ws(" -> ", col("source_city"), col("destination_city"))
airlines_t2 = airlines_t1.withColumn("route", route_label)

# Preview the first five rows, including the new column.
airlines_t2.show(n=5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+---------------+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|          route|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+---------------+
|    0|SpiceJet|SG-8709|      Delhi|       Evening|    0|        Night|          Mumbai|Economy|    2.17|        1| 5953|Delhi -> Mumbai|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning|    0|      Morning|          Mumbai|Economy|    2.33|        1| 5953|Delhi -> Mumbai|
|    2| AirAsia| I5-764|      Delhi| Early_Morning|    0|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|Delhi -> Mumbai|
|    3| Vistara| UK-995|      Delhi|       Morning|    0|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|Delhi -> Mumbai|
|    4| Vistara| UK-963|      Delh

# Transform departure time and arrival time into time-of-day categories (Morning, Afternoon, etc.), then encode them as numbers (0 = Early Morning, 1 = Morning, etc.).

In [83]:
# List the distinct departure_time labels that need a numeric code.
departure_labels = airlines_t2.select("departure_time").distinct()
departure_labels.show()




+--------------+
|departure_time|
+--------------+
|       Evening|
|       Morning|
|    Late_Night|
|     Afternoon|
| Early_Morning|
|         Night|
+--------------+



                                                                                

In [84]:
# List the distinct arrival_time labels that need a numeric code.
arrival_labels = airlines_t2.select("arrival_time").distinct()
arrival_labels.show()



+-------------+
| arrival_time|
+-------------+
|      Evening|
|      Morning|
|   Late_Night|
|    Afternoon|
|Early_Morning|
|        Night|
+-------------+



                                                                                

Encoding order used in the next cells (codes start at 0):

0. Early_Morning
1. Morning
2. Evening
3. Afternoon
4. Night
5. Late_Night

In [85]:
# Time-of-day labels; a label's position in this list is its numeric code.
# NOTE(review): the name `list` shadows the Python builtin. It is kept here
# because the arrival_time cell also reads `list`; rename both cells together.
# NOTE(review): "Evening" (code 2) comes before "Afternoon" (code 3). Confirm
# this order is intended if the codes are meant to be chronological.
list = ["Early_Morning", "Morning", "Evening", "Afternoon", "Night", "Late_Night"]

# Build one when() branch per label (label at position i -> code i) instead of
# spelling out the six branches by hand.
departure_code = None
for position, label in enumerate(list):
    is_label = airlines_t2.departure_time == label
    if departure_code is None:
        departure_code = when(is_label, lit(position))
    else:
        departure_code = departure_code.when(is_label, lit(position))

# 3rd transformation (part 1): overwrite departure_time with its numeric code.
airlines_t3 = airlines_t2.withColumn("departure_time", departure_code)

# Check the codes that were produced.
airlines_t3.select("departure_time").distinct().show()




+--------------+
|departure_time|
+--------------+
|             1|
|             3|
|             5|
|             4|
|             2|
|             0|
+--------------+



                                                                                

In [86]:
# 3rd transformation (part 2): encode arrival_time with the same label order
# used for departure_time (position in `list` -> numeric code).
arrival_code = None
for position, label in enumerate(list):
    is_label = airlines_t3.arrival_time == label
    if arrival_code is None:
        arrival_code = when(is_label, lit(position))
    else:
        arrival_code = arrival_code.when(is_label, lit(position))

# NOTE(review): airlines_t3 is rebuilt from itself, so this cell is not
# idempotent. Re-run the departure_time cell first before running this one
# a second time.
airlines_t3 = airlines_t3.withColumn("arrival_time", arrival_code)

# Check the codes that were produced.
airlines_t3.select("arrival_time").distinct().show()




+------------+
|arrival_time|
+------------+
|           1|
|           3|
|           5|
|           4|
|           2|
|           0|
+------------+



                                                                                

# Add a new column `is_expensive`: `when(price > 6000, True).otherwise(False)`.

In [87]:
# 4th transformation: flag flights whose price is above 6000.
# otherwise(False) makes every row that does not satisfy the condition False.
price_above_threshold = airlines_t3.price > 6000
is_expensive_flag = when(price_above_threshold, lit(True)).otherwise(lit(False))
airlines_t4 = airlines_t3.withColumn("is_expensive", is_expensive_flag)

# Preview the first five rows, including the new flag.
airlines_t4.show(n=5)

+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+---------------+------------+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|          route|is_expensive|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+---------------+------------+
|    0|SpiceJet|SG-8709|      Delhi|             2|    0|           4|          Mumbai|Economy|    2.17|        1| 5953|Delhi -> Mumbai|       false|
|    1|SpiceJet|SG-8157|      Delhi|             0|    0|           1|          Mumbai|Economy|    2.33|        1| 5953|Delhi -> Mumbai|       false|
|    2| AirAsia| I5-764|      Delhi|             0|    0|           0|          Mumbai|Economy|    2.17|        1| 5956|Delhi -> Mumbai|       false|
|    3| Vistara| UK-995|      Delhi|             1|    0|           3|          Mumbai|Economy|    2

# Results
# Get the average price per airline

In [88]:
from pyspark.sql.functions import avg

# Mean ticket price for each airline.
average_price = avg("price").alias("average_price")
airlines_t4.groupBy("airline").agg(average_price).show()




+---------+------------------+
|  airline|     average_price|
+---------+------------------+
|   Indigo| 5324.216303339517|
| SpiceJet| 6179.278881367218|
|Air_India| 23507.01911190229|
|  AirAsia|4091.0727419555224|
| GO_FIRST| 5652.007595045959|
|  Vistara| 30396.53630170735|
+---------+------------------+



                                                                                

# Average duration per route

In [89]:
# Mean flight duration for each source -> destination route.
average_duration = avg("duration").alias("average_duration")
airlines_t4.groupBy("route").agg(average_duration).show()




+--------------------+------------------+
|               route|  average_duration|
+--------------------+------------------+
|    Delhi -> Chennai|12.433964745763944|
|  Hyderabad -> Delhi|10.829816602522587|
|   Mumbai -> Chennai|12.665900287564627|
|Hyderabad -> Kolkata|13.535322410033165|
| Hyderabad -> Mumbai|11.962923295795918|
| Mumbai -> Bangalore|11.612022516178817|
|    Delhi -> Kolkata| 12.73596614766045|
|   Mumbai -> Kolkata|12.836848115489666|
|Bangalore -> Kolkata|13.099143404859825|
| Mumbai -> Hyderabad|13.263310412247066|
|    Kolkata -> Delhi| 11.60498857561711|
|Hyderabad -> Chennai|13.293238468912078|
|     Delhi -> Mumbai|10.367774213738123|
|   Kolkata -> Mumbai|12.991932481150478|
|     Mumbai -> Delhi|  9.81805726844943|
|Kolkata -> Hyderabad|13.853107514948396|
|Bangalore -> Chennai|14.480207509137166|
|  Bangalore -> Delhi|  9.77995566082195|
|Bangalore -> Hyde...|14.162432783513621|
|Hyderabad -> Bang...| 12.09331678643705|
+--------------------+------------

                                                                                

# Minimum and maximum price per airline

In [90]:
# Import the functions module under an alias instead of `min` / `max` by name:
# importing those names directly would replace Python's builtin min() and
# max() for every cell that runs afterwards in this kernel.
from pyspark.sql import functions as F

# Cheapest and most expensive ticket for each airline.
airlines_t4.groupBy("airline").agg(
    F.min("price").alias("minimum_price"),
    F.max("price").alias("maximum_price"),
).show()



+---------+-------------+-------------+
|  airline|minimum_price|maximum_price|
+---------+-------------+-------------+
|   Indigo|         1105|        31952|
| SpiceJet|         1106|        34158|
|Air_India|         1526|        90970|
|  AirAsia|         1105|        31917|
| GO_FIRST|         1105|        32803|
|  Vistara|         1714|       123071|
+---------+-------------+-------------+



                                                                                

# Count flights by departure time category

In [91]:
# Number of flights in each encoded departure_time category.
flights_per_departure_slot = airlines_t4.groupBy("departure_time").count()
flights_per_departure_slot.show()



+--------------+-----+
|departure_time|count|
+--------------+-----+
|             1|71146|
|             3|47794|
|             5| 1306|
|             4|48015|
|             2|65102|
|             0|66790|
+--------------+-----+



                                                                                

In [92]:
# Stop the session itself rather than only its context: SparkSession.stop()
# also stops the underlying SparkContext, so `sc` is shut down as well.
spark.stop()