# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Aura Melina Gutierrez Jimenez (Ing. Sistemas Computacionales)

**Professor**: Pablo Camarillo Ramirez

In [8]:
import findspark
findspark.init()

In [9]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on data sources (Files)") \
    .master("spark://b1ca502cde8a:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [10]:
from pcamarillor.spark_utils import SparkUtils
airlines_schema_columns = [("index", "int"), 
     ("airline", "string"), 
     ("flight", "string"),
     ("source_city", "string"),
     ("departure_time", "string"),
     ("stops", "string"),
     ("arrival_time", "string"),
     ("destination_city", "string"),
     ("class", "string"),
     ("duration", "float"),
     ("days_left", "int"),
     ("price", "int")
     ]
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

In [11]:
df_airlines = spark.read \
                .option("header", "true") \
                .schema(airlines_schema) \
                .csv("/opt/spark/work-dir/data/airline/")

df_airlines.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

## Drop unnecessary columns.

In [12]:
df_airlines = df_airlines.drop("flight", "days_left")

## Count how many null values the dataset has before/after the cleaning process.

In [13]:
from pyspark.sql.functions import col, when, count, isnull

print("Number of records before cleaning:", df_airlines.count())
df_airlines.select([count(when(isnull(col(c.name)), c.name)).alias(c.name) for c in df_airlines.schema]).show()

airlines_clean = df_airlines.dropna()

print("Number of records after cleaning:", airlines_clean.count())
airlines_clean.select([count(when(isnull(col(c.name)), c.name)).alias(c.name) for c in airlines_clean.schema]).show()

                                                                                

Number of records before cleaning: 300153


                                                                                

+-----+-------+-----------+--------------+-----+------------+----------------+-----+--------+-----+
|index|airline|source_city|departure_time|stops|arrival_time|destination_city|class|duration|price|
+-----+-------+-----------+--------------+-----+------------+----------------+-----+--------+-----+
|    0|      0|          0|             0|    0|           0|               0|    0|       0|    0|
+-----+-------+-----------+--------------+-----+------------+----------------+-----+--------+-----+



                                                                                

Number of records after cleaning: 300153




+-----+-------+-----------+--------------+-----+------------+----------------+-----+--------+-----+
|index|airline|source_city|departure_time|stops|arrival_time|destination_city|class|duration|price|
+-----+-------+-----------+--------------+-----+------------+----------------+-----+--------+-----+
|    0|      0|          0|             0|    0|           0|               0|    0|       0|    0|
+-----+-------+-----------+--------------+-----+------------+----------------+-----+--------+-----+



                                                                                

## Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops.

In [22]:
from pyspark.sql.functions import lower, lit
df_transformed = airlines_clean.withColumn("stops",
                                           when(lower(col("stops")) == "zero", lit(0))
                                           .when(lower(col("stops")) == "one", lit(1))
                                           .otherwise(None))
df_transformed.show(n=5)

+-----+--------+-----------+--------------+-----+-------------+----------------+-------+--------+-----+
|index| airline|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|price|
+-----+--------+-----------+--------------+-----+-------------+----------------+-------+--------+-----+
|    0|SpiceJet|      Delhi|       Evening|    0|        Night|          Mumbai|Economy|    2.17| 5953|
|    1|SpiceJet|      Delhi| Early_Morning|    0|      Morning|          Mumbai|Economy|    2.33| 5953|
|    2| AirAsia|      Delhi| Early_Morning|    0|Early_Morning|          Mumbai|Economy|    2.17| 5956|
|    3| Vistara|      Delhi|       Morning|    0|    Afternoon|          Mumbai|Economy|    2.25| 5955|
|    4| Vistara|      Delhi|       Morning|    0|      Morning|          Mumbai|Economy|    2.33| 5955|
+-----+--------+-----------+--------------+-----+-------------+----------------+-------+--------+-----+
only showing top 5 rows


## Create a new column called route: “Delhi → Mumbai” from source_city and destination_city

In [24]:
from pyspark.sql.functions import concat_ws
df_transformed = df_transformed.withColumn("route", concat_ws(" -> ", col("source_city"), col("destination_city")))
df_transformed.show(n=5)

+-----+--------+-----------+--------------+-----+-------------+----------------+-------+--------+-----+---------------+
|index| airline|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|price|          route|
+-----+--------+-----------+--------------+-----+-------------+----------------+-------+--------+-----+---------------+
|    0|SpiceJet|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17| 5953|Delhi -> Mumbai|
|    1|SpiceJet|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33| 5953|Delhi -> Mumbai|
|    2| AirAsia|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17| 5956|Delhi -> Mumbai|
|    3| Vistara|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25| 5955|Delhi -> Mumbai|
|    4| Vistara|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33| 5955|Delhi -> Mumbai|
+-----+--------+-----------+------------

## Transform departure_time and arrival_time to numerical categories (Morning, Afternoon, etc.), then encode as numbers (0=Early_Morning, 1=Morning, etc.).

In [25]:
df_transformed = df_transformed.withColumn("departure_time",
                                           when(lower(col("departure_time")) == "early_morning", 0)
                                           .when(lower(col("departure_time")) == "morning", 1)
                                           .when(lower(col("departure_time")) == "afternoon", 2)
                                           .when(lower(col("departure_time")) == "evening", 3)
                                           .when(lower(col("departure_time")) == "night", 4)
                                           .otherwise(None))
df_transformed = df_transformed.withColumn("arrival_time",
                                           when(lower(col("arrival_time")) == "early_morning", 0)
                                           .when(lower(col("arrival_time")) == "morning", 1)
                                           .when(lower(col("arrival_time")) == "afternoon", 2)
                                           .when(lower(col("arrival_time")) == "evening", 3)
                                           .when(lower(col("arrival_time")) == "night", 4)
                                           .otherwise(None))
df_transformed.show(n=5)

+-----+--------+-----------+--------------+-----+------------+----------------+-------+--------+-----+---------------+
|index| airline|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|price|          route|
+-----+--------+-----------+--------------+-----+------------+----------------+-------+--------+-----+---------------+
|    0|SpiceJet|      Delhi|             3| zero|           4|          Mumbai|Economy|    2.17| 5953|Delhi -> Mumbai|
|    1|SpiceJet|      Delhi|             0| zero|           1|          Mumbai|Economy|    2.33| 5953|Delhi -> Mumbai|
|    2| AirAsia|      Delhi|             0| zero|           0|          Mumbai|Economy|    2.17| 5956|Delhi -> Mumbai|
|    3| Vistara|      Delhi|             1| zero|           2|          Mumbai|Economy|    2.25| 5955|Delhi -> Mumbai|
|    4| Vistara|      Delhi|             1| zero|           1|          Mumbai|Economy|    2.33| 5955|Delhi -> Mumbai|
+-----+--------+-----------+--------------+-----

## Add a new column is_expensive: when(price > 6000, True).otherwise(False).

In [27]:
df_transformed = df_transformed.withColumn("is_expensive", when(col("price") > 6000, lit(True)).otherwise(False))
df_transformed.show(n=5)

+-----+--------+-----------+--------------+-----+------------+----------------+-------+--------+-----+---------------+------------+
|index| airline|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|price|          route|is_expensive|
+-----+--------+-----------+--------------+-----+------------+----------------+-------+--------+-----+---------------+------------+
|    0|SpiceJet|      Delhi|             3| zero|           4|          Mumbai|Economy|    2.17| 5953|Delhi -> Mumbai|       false|
|    1|SpiceJet|      Delhi|             0| zero|           1|          Mumbai|Economy|    2.33| 5953|Delhi -> Mumbai|       false|
|    2| AirAsia|      Delhi|             0| zero|           0|          Mumbai|Economy|    2.17| 5956|Delhi -> Mumbai|       false|
|    3| Vistara|      Delhi|             1| zero|           2|          Mumbai|Economy|    2.25| 5955|Delhi -> Mumbai|       false|
|    4| Vistara|      Delhi|             1| zero|           1|          Mumb

## Get the average price per airline.

In [29]:
from pyspark.sql.functions import avg
df_transformed.groupBy("airline").agg(avg("price").alias("avg_price")).orderBy(col("avg_price").desc()).show()



+---------+------------------+
|  airline|         avg_price|
+---------+------------------+
|  Vistara| 30396.53630170735|
|Air_India| 23507.01911190229|
| SpiceJet| 6179.278881367218|
| GO_FIRST| 5652.007595045959|
|   Indigo| 5324.216303339517|
|  AirAsia|4091.0727419555224|
+---------+------------------+



                                                                                

## Average duration per route

In [30]:
df_transformed.groupBy("route").agg(avg("duration").alias("avg_duration")).orderBy(col("avg_duration").desc()).show()



+--------------------+------------------+
|               route|      avg_duration|
+--------------------+------------------+
|  Kolkata -> Chennai|14.774181563782903|
|  Chennai -> Kolkata|14.515774035955694|
|Bangalore -> Chennai|14.480207509137166|
|Bangalore -> Hyde...|14.162432783513621|
|Chennai -> Bangalore|13.952593563812163|
|Kolkata -> Hyderabad|13.853107514948396|
|Kolkata -> Bangalore| 13.79294687524098|
|Hyderabad -> Kolkata|13.535322410033165|
|Hyderabad -> Chennai|13.293238468912078|
| Mumbai -> Hyderabad|13.263310412247066|
|Chennai -> Hyderabad|13.153984931732971|
|Bangalore -> Kolkata|13.099143404859825|
|   Kolkata -> Mumbai|12.991932481150478|
|   Mumbai -> Kolkata|12.836848115489666|
|    Delhi -> Kolkata| 12.73596614766045|
|   Mumbai -> Chennai|12.665900287564627|
|  Delhi -> Hyderabad|12.518350118710492|
|    Delhi -> Chennai|12.433964745763944|
|   Chennai -> Mumbai|12.374656244132625|
|Hyderabad -> Bang...| 12.09331678643705|
+--------------------+------------

                                                                                

## Minimum and maximum price per airline.

In [34]:
from pyspark.sql.functions import min, max, col

df_transformed.groupBy("airline").agg(min(col("price")).alias("min_price"),max(col("price")).alias("max_price")).show()



+---------+---------+---------+
|  airline|min_price|max_price|
+---------+---------+---------+
|   Indigo|     1105|    31952|
| SpiceJet|     1106|    34158|
|Air_India|     1526|    90970|
|  AirAsia|     1105|    31917|
| GO_FIRST|     1105|    32803|
|  Vistara|     1714|   123071|
+---------+---------+---------+



                                                                                

## Count flights by departure_time category.

In [37]:
df_transformed.groupBy("departure_time").agg(count("departure_time").alias("flight_count")).orderBy("departure_time").show()



+--------------+------------+
|departure_time|flight_count|
+--------------+------------+
|          NULL|           0|
|             0|       66790|
|             1|       71146|
|             2|       47794|
|             3|       65102|
|             4|       48015|
+--------------+------------+



                                                                                