# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**:

**Professor**: Pablo Camarillo Ramirez

#### **By:** Fernando Ramos

In [1]:
import findspark
findspark.init()

In [158]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg, trim, col, count, isnull, when, lit, concat, round, asc, desc
from datetime import datetime
from fernandoramos.spark_utils import SparkUtils

spark = SparkSession.builder \
    .appName("Examples on SparkSQL") \
    .master("spark://d3eb0343c341:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [159]:
airlines_schema_columns = [("index", "int"), 
     ("airline", "string"), 
     ("flight", "string"),
     ("source_city", "string"),
     ("departure_time", "string"),
     ("stops", "string"),
     ("arrival_time", "string"),
     ("destination_city", "string"),
     ("class", "string"),
     ("duration", "float"),
     ("days_left", "int"),
     ("price", "double")
     ]

airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', DoubleType(), True)])

In [160]:
airlines_dataset = spark \
    .read \
    .format("csv") \
    .option("key", "value") \
    .option("header", "true") \
    .option("schema", airlines_schema) \
    .csv("/opt/spark/work-dir/data/airline/") 

airlines_dataset.show()

+-----+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index|  airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0| SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2|  AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3|  Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4|  Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
|    5|  Vistara

In [161]:
print(f"number of records before cleaning: {airlines_dataset.count()}")

# Get number of null values for each column before cleaning 
airlines_dataset.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

# Perform data cleaning with trim (column by column)
airlines_clean = airlines_dataset \
        .dropDuplicates(["index"]) \
        .withColumn("airline", trim("airline")) \
        .withColumn("source_city", trim("source_city")) \
        .withColumn("destination_city", trim("destination_city")) \
        .filter(col("price").isNotNull())

# Simply using dropna()
airlines_clean_v2 = airlines_clean.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
airlines_clean.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
airlines_clean_v2.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with dropna: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

## 1st transformation: Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops

In [162]:
# Perform the 1st transformation: Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops.

airlines_t1 = airlines_clean_v2.withColumn("stops_numeric",
                                           when(airlines_clean_v2.stops == "zero", lit(0)) \
                                           .when(airlines_clean_v2.stops == "one", lit(1)) \
                                           .when(airlines_clean_v2.stops == "two_or_more", lit(2))
)
airlines_t1.select("index", "airline", "flight", "stops", "stops_numeric").show(n=5)



+------+--------+-------+-----+-------------+
| index| airline| flight|stops|stops_numeric|
+------+--------+-------+-----+-------------+
|     0|SpiceJet|SG-8709| zero|            0|
|   100| Vistara| UK-705|  one|            1|
|  1000|  Indigo|6E-2373|  one|            1|
| 10000|GO_FIRST| G8-392|  one|            1|
|100000| Vistara| UK-810|  one|            1|
+------+--------+-------+-----+-------------+
only showing top 5 rows


                                                                                

## 2nd transformation: Create a new column called route: “Delhi → Mumbai” from source_city and destination_city

In [163]:
# Perform the 2nd transformation: Create a new column called route: “Delhi → Mumbai” from source_city and destination_city

airlines_t2 = airlines_t1.withColumn("route", 
                                     concat(col("source_city"), lit("->"), col("destination_city"))
)
airlines_t2.select("index", "airline", "flight", "source_city", "destination_city", "route").show(n=5)



+------+--------+-------+-----------+----------------+-----------------+
| index| airline| flight|source_city|destination_city|            route|
+------+--------+-------+-----------+----------------+-----------------+
|     0|SpiceJet|SG-8709|      Delhi|          Mumbai|    Delhi->Mumbai|
|   100| Vistara| UK-705|      Delhi|          Mumbai|    Delhi->Mumbai|
|  1000|  Indigo|6E-2373|      Delhi|          Mumbai|    Delhi->Mumbai|
| 10000|GO_FIRST| G8-392|      Delhi|       Bangalore| Delhi->Bangalore|
|100000| Vistara| UK-810|  Bangalore|          Mumbai|Bangalore->Mumbai|
+------+--------+-------+-----------+----------------+-----------------+
only showing top 5 rows


                                                                                

## 3rd transformation: Transform departure_time and arrival_time to numerical categories (Morning, Afternoon, etc.), then encode as numbers (0=Early_Morning, 1=Morning, etc.)

In [164]:
# Perform the 3rd transformation: Transform departure_time and arrival_time to numerical categories (Morning, Afternoon, etc.), 
# then encode as numbers (0=Early_Morning, 1=Morning, etc.)

airlines_t3_p1 = airlines_t2.withColumn("departure_time_numeric",
                                     when(airlines_t2.departure_time == "Early_Morning", lit(0)) \
                                     .when(airlines_t2.departure_time == "Morning", lit(1)) \
                                     .when(airlines_t2.departure_time == "Evening", lit(2)) \
                                     .when(airlines_t2.departure_time == "Afternoon", lit(3)) \
                                     .when(airlines_t2.departure_time == "Night", lit(4)) \
                                     .when(airlines_t2.departure_time == "Late_Night", lit(5)) 
)
airlines_t3_p1.select("index", "airline", "flight", "departure_time", "departure_time_numeric").show(n=5)



+------+--------+-------+--------------+----------------------+
| index| airline| flight|departure_time|departure_time_numeric|
+------+--------+-------+--------------+----------------------+
|     0|SpiceJet|SG-8709|       Evening|                     2|
|   100| Vistara| UK-705| Early_Morning|                     0|
|  1000|  Indigo|6E-2373|     Afternoon|                     3|
| 10000|GO_FIRST| G8-392|     Afternoon|                     3|
|100000| Vistara| UK-810| Early_Morning|                     0|
+------+--------+-------+--------------+----------------------+
only showing top 5 rows


                                                                                

In [173]:
airlines_t3_p2 = airlines_t3_p1.withColumn("arrival_time_numeric",
                                     when(airlines_t3_p1.arrival_time == "Early_Morning", lit(0)) \
                                     .when(airlines_t3_p1.arrival_time == "Morning", lit(1)) \
                                     .when(airlines_t3_p1.arrival_time == "Evening", lit(2)) \
                                     .when(airlines_t3_p1.arrival_time == "Afternoon", lit(3)) \
                                     .when(airlines_t3_p1.arrival_time == "Night", lit(4)) \
                                     .when(airlines_t3_p1.arrival_time == "Late_Night", lit(5)) 
)
airlines_t3_p2.select("index", "airline", "flight", "arrival_time", "arrival_time_numeric").show(n=5)



+------+--------+-------+------------+--------------------+
| index| airline| flight|arrival_time|arrival_time_numeric|
+------+--------+-------+------------+--------------------+
|     0|SpiceJet|SG-8709|       Night|                   4|
|   100| Vistara| UK-705|       Night|                   4|
|  1000|  Indigo|6E-2373|     Evening|                   2|
| 10000|GO_FIRST| G8-392|       Night|                   4|
|100000| Vistara| UK-810|       Night|                   4|
+------+--------+-------+------------+--------------------+
only showing top 5 rows


                                                                                

## 4th transformation: Add a new column is_expensive: when(price > 6000, True).otherwise(False)

In [166]:
# Perform the 4th transformation: Add a new column is_expensive: when(price > 6000, True).otherwise(False)

airlines_t4 = airlines_t3_p2.withColumn("is_expensive",
                                        when(airlines_t3_p2.price > 6000, lit(True)) \
                                        .when(airlines_t3_p2.price <= 6000, lit(False))
)
airlines_t4.select("index", "airline", "flight", "price", "is_expensive").show(n=5)



+------+--------+-------+-----+------------+
| index| airline| flight|price|is_expensive|
+------+--------+-------+-----+------------+
|     0|SpiceJet|SG-8709| 5953|       false|
|   100| Vistara| UK-705|18450|        true|
|  1000|  Indigo|6E-2373| 9373|        true|
| 10000|GO_FIRST| G8-392| 7424|        true|
|100000| Vistara| UK-810| 7212|        true|
+------+--------+-------+-----+------------+
only showing top 5 rows


                                                                                

## Get the average price per airline.

In [167]:
airlines_t4.groupBy(col("airline")) \
    .agg(round(avg("price"), 2).alias("average_price")) \
    .orderBy("average_price") \
    .show()



+---------+-------------+
|  airline|average_price|
+---------+-------------+
|  AirAsia|      4091.07|
|   Indigo|      5324.22|
| GO_FIRST|      5652.01|
| SpiceJet|      6179.28|
|Air_India|     23507.02|
|  Vistara|     30396.54|
+---------+-------------+



                                                                                

## Get the average duration per route.

In [169]:
airlines_t4.groupBy(col("route")) \
    .agg(round(avg("duration"), 2).alias("average_duration")) \
    .orderBy(desc("average_duration")) \
    .show()



+--------------------+----------------+
|               route|average_duration|
+--------------------+----------------+
|    Kolkata->Chennai|           14.77|
|    Chennai->Kolkata|           14.52|
|  Bangalore->Chennai|           14.48|
|Bangalore->Hyderabad|           14.16|
|  Chennai->Bangalore|           13.95|
|  Kolkata->Hyderabad|           13.85|
|  Kolkata->Bangalore|           13.79|
|  Hyderabad->Kolkata|           13.54|
|  Hyderabad->Chennai|           13.29|
|   Mumbai->Hyderabad|           13.26|
|  Chennai->Hyderabad|           13.15|
|  Bangalore->Kolkata|            13.1|
|     Kolkata->Mumbai|           12.99|
|     Mumbai->Kolkata|           12.84|
|      Delhi->Kolkata|           12.74|
|     Mumbai->Chennai|           12.67|
|    Delhi->Hyderabad|           12.52|
|      Delhi->Chennai|           12.43|
|     Chennai->Mumbai|           12.37|
|Hyderabad->Bangalore|           12.09|
+--------------------+----------------+
only showing top 20 rows


                                                                                

## Get the minimum and maximum price per airline.

In [170]:
from pyspark.sql.types import IntegerType

airlines_t4.groupBy(col("airline")) \
    .agg(min(col("price").cast(IntegerType())).alias("min_price"), \
         max(col("price").cast(IntegerType())).alias("max_price")) \
    .show()



+---------+---------+---------+
|  airline|min_price|max_price|
+---------+---------+---------+
|   Indigo|     1105|    31952|
| SpiceJet|     1106|    34158|
|Air_India|     1526|    90970|
|  AirAsia|     1105|    31917|
| GO_FIRST|     1105|    32803|
|  Vistara|     1714|   123071|
+---------+---------+---------+



                                                                                

## Get the count flights by departure_time category.

In [171]:
airlines_t4.groupBy(col("departure_time")) \
    .agg(count("index").alias("total_flights")) \
    .orderBy("total_flights") \
    .show()



+--------------+-------------+
|departure_time|total_flights|
+--------------+-------------+
|    Late_Night|         1306|
|     Afternoon|        47794|
|         Night|        48015|
|       Evening|        65102|
| Early_Morning|        66790|
|       Morning|        71146|
+--------------+-------------+



                                                                                