# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Mateo Garcia Lopez

**Professor**: Pablo Camarillo Ramirez

In [5]:
import os

# findspark.init() has to run before the pyspark imports below.
import findspark
findspark.init()

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count, when, isnull, trim, lit, concat_ws, avg, min, max

from pcamarillor.spark_utils import SparkUtils

In [7]:
# - 1. Spark Session Initialization -
# The cluster master can be overridden with the SPARK_MASTER_URL environment
# variable. The default is the URL this notebook was written against; its host
# part looks like a container ID, so it will likely differ on another machine.
spark_master_url = os.environ.get("SPARK_MASTER_URL", "spark://00a67c339fa3:7077")

spark = (
    SparkSession.builder
    .appName("Airline Data Processing")
    .master(spark_master_url)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Only log errors so cell outputs are not flooded with Spark log lines.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [9]:
# Column names and Spark SQL types of the airline CSV files, in file order.
# Declared in this cell so that loading the data does not depend on a schema
# variable that is only defined further down the notebook (which fails with a
# NameError on a fresh kernel run from top to bottom).
airline_csv_fields = [
    ("index", "INT"),
    ("airline", "STRING"),
    ("flight", "STRING"),
    ("source_city", "STRING"),
    ("departure_time", "STRING"),
    ("stops", "STRING"),
    ("arrival_time", "STRING"),
    ("destination_city", "STRING"),
    ("class", "STRING"),
    ("duration", "FLOAT"),
    ("days_left", "INT"),
    ("price", "INT"),
]
# Back-quote every name so none of them can be parsed as a SQL keyword.
airline_csv_ddl = ", ".join(
    f"`{name}` {sql_type}" for name, sql_type in airline_csv_fields
)

# Read every CSV file in the directory with the explicit schema (no inference).
df_airlines = (
    spark.read
    .option("header", "true")
    .schema(airline_csv_ddl)
    .csv("/opt/spark/work-dir/data/airline/")
)

df_airlines.show(n=5)

[Stage 25:>                                                         (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+

                                                                                

In [11]:
# (column name, type name) pairs for the airline dataset, in CSV column order.
# The column names are also used by the null-count checks in the cleaning cell.
airlines_schema_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]

# Build the Spark schema from the pairs above and display it as the cell output.
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

In [12]:
def show_null_counts(df):
    """Display a single row with the number of null values in each column of ``df``.

    ``when`` without ``otherwise`` yields null for non-matching rows, and
    ``count`` skips nulls, so each aggregated cell counts only the null rows.
    """
    df.select([
        count(when(col(name).isNull(), name)).alias(name)
        for name in df.columns
    ]).show()


print(f"number of records before cleaning: {df_airlines.count()}")
# Get number of null values for each column before cleaning
show_null_counts(df_airlines)

# Cleaning strategy 1: keep one row per `index`, strip surrounding whitespace
# from the airline and city columns, and drop rows without a price.
airlines_clean = (
    df_airlines
    .dropDuplicates(["index"])
    .withColumn("airline", trim("airline"))
    .withColumn("source_city", trim("source_city"))
    .withColumn("destination_city", trim("destination_city"))
    .filter(col("price").isNotNull())
)

# Cleaning strategy 2: simply drop every row that has a null in any column.
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
show_null_counts(airlines_clean)

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
show_null_counts(airlines_clean_v2)

                                                                                

number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with dropna: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

In [14]:
# - 4. Data Transformations -

# Prices strictly above this value are flagged as expensive.
EXPENSIVE_PRICE_THRESHOLD = 6000

# Ordinal codes for the time-of-day labels; any other value gets LATE_NIGHT_CODE.
TIME_OF_DAY_CODES = {
    "Early_Morning": 0,
    "Morning": 1,
    "Afternoon": 2,
    "Evening": 3,
    "Night": 4,
}
LATE_NIGHT_CODE = 5


def encode_time_of_day(column_name):
    """Return a Column that maps the time-of-day labels in ``column_name`` to integers.

    Labels listed in TIME_OF_DAY_CODES get their code. Everything else is
    mapped to LATE_NIGHT_CODE: that is meant for "Late_Night", but it also
    catches nulls and any unexpected label.
    """
    encoded = None
    for label, code in TIME_OF_DAY_CODES.items():
        matches_label = col(column_name) == label
        encoded = (
            when(matches_label, code)
            if encoded is None
            else encoded.when(matches_label, code)
        )
    return encoded.otherwise(LATE_NIGHT_CODE)


# Normalize 'stops': "zero" -> 0, "one" -> 1, anything else -> 2.
# NOTE: the fallback assumes "two_or_more" is the only other value; nulls and
# unexpected labels are silently mapped to 2 as well.
stops_as_int = (
    when(col("stops") == "zero", 0)
    .when(col("stops") == "one", 1)
    .otherwise(2)
)

# Applying all transformations sequentially
transformed_df = (
    airlines_clean
    .withColumn("stops", stops_as_int)
    # Create a new column 'route' of the form "<source> → <destination>"
    .withColumn("route", concat_ws(" → ", col("source_city"), col("destination_city")))
    # Encode 'departure_time' and 'arrival_time' numerically
    .withColumn("departure_time_encoded", encode_time_of_day("departure_time"))
    .withColumn("arrival_time_encoded", encode_time_of_day("arrival_time"))
    # Add a boolean column 'is_expensive'; when/otherwise keeps it False
    # (rather than null) should a price ever be null.
    .withColumn(
        "is_expensive",
        when(col("price") > EXPENSIVE_PRICE_THRESHOLD, True).otherwise(False),
    )
)

In [15]:
# Report the resulting schema, then preview the columns touched by the transformations.
print("\nAll transformations applied successfully. Final DataFrame schema and sample:")
transformed_df.printSchema()

preview_columns = [
    "airline",
    "stops",
    "route",
    "departure_time",
    "departure_time_encoded",
    "price",
    "is_expensive",
]
transformed_df.select(*preview_columns).show(n=5)


All transformations applied successfully. Final DataFrame schema and sample:
root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: integer (nullable = false)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)
 |-- route: string (nullable = false)
 |-- departure_time_encoded: integer (nullable = false)
 |-- arrival_time_encoded: integer (nullable = false)
 |-- is_expensive: boolean (nullable = false)





+--------+-----+--------------+--------------+----------------------+-----+------------+
| airline|stops|         route|departure_time|departure_time_encoded|price|is_expensive|
+--------+-----+--------------+--------------+----------------------+-----+------------+
|SpiceJet|    0|Delhi → Mumbai| Early_Morning|                     0| 5953|       false|
| Vistara|    0|Delhi → Mumbai|       Morning|                     1| 5955|       false|
| Vistara|    0|Delhi → Mumbai|       Morning|                     1| 5955|       false|
| Vistara|    0|Delhi → Mumbai|       Morning|                     1| 6060|        true|
|GO_FIRST|    0|Delhi → Mumbai|     Afternoon|                     2| 5954|       false|
+--------+-----+--------------+--------------+----------------------+-----+------------+
only showing top 5 rows


                                                                                

In [16]:
# --- 5. Aggregations ---
# Each section prints a heading, builds one grouped summary and shows it.

print("\n- Aggregation Results -")

# Both per-airline summaries start from the same grouping.
flights_by_airline = transformed_df.groupBy("airline")

## Average price per airline, most expensive airline first
print("\nAverage price per airline:")
avg_price_per_airline = (
    flights_by_airline
    .agg(avg("price").alias("average_price"))
    .orderBy("average_price", ascending=False)
)
avg_price_per_airline.show()

## Average duration per route, longest route first
print("\nAverage duration per route:")
avg_duration_per_route = (
    transformed_df
    .groupBy("route")
    .agg(avg("duration").alias("average_duration"))
    .orderBy("average_duration", ascending=False)
)
avg_duration_per_route.show()

## Cheapest and most expensive ticket per airline, sorted by airline name
print("\nMinimum and maximum price per airline:")
min_max_price_per_airline = (
    flights_by_airline
    .agg(
        min("price").alias("min_price"),
        max("price").alias("max_price"),
    )
    .orderBy("airline")
)
min_max_price_per_airline.show()

## Number of flights in each departure_time category, busiest first
print("\nCount of flights by departure time category:")
flights_by_departure = (
    transformed_df
    .groupBy("departure_time")
    .count()
    .orderBy("count", ascending=False)
)
flights_by_departure.show()


- Aggregation Results -

Average price per airline:


                                                                                

+---------+------------------+
|  airline|     average_price|
+---------+------------------+
|  Vistara| 30396.53630170735|
|Air_India| 23507.01911190229|
| SpiceJet| 6179.278881367218|
| GO_FIRST| 5652.007595045959|
|   Indigo| 5324.216303339517|
|  AirAsia|4091.0727419555224|
+---------+------------------+


Average duration per route:


                                                                                

+--------------------+------------------+
|               route|  average_duration|
+--------------------+------------------+
|   Kolkata → Chennai|14.774181563782903|
|   Chennai → Kolkata|14.515774035955694|
| Bangalore → Chennai|14.480207509137166|
|Bangalore → Hyder...|14.162432783513621|
| Chennai → Bangalore|13.952593563812163|
| Kolkata → Hyderabad|13.853107514948396|
| Kolkata → Bangalore| 13.79294687524098|
| Hyderabad → Kolkata|13.535322410033165|
| Hyderabad → Chennai|13.293238468912078|
|  Mumbai → Hyderabad|13.263310412247066|
| Chennai → Hyderabad|13.153984931732971|
| Bangalore → Kolkata|13.099143404859825|
|    Kolkata → Mumbai|12.991932481150478|
|    Mumbai → Kolkata|12.836848115489666|
|     Delhi → Kolkata| 12.73596614766045|
|    Mumbai → Chennai|12.665900287564627|
|   Delhi → Hyderabad|12.518350118710492|
|     Delhi → Chennai|12.433964745763944|
|    Chennai → Mumbai|12.374656244132625|
|Hyderabad → Banga...| 12.09331678643705|
+--------------------+------------------+
only showing top 20 rows


Minimum and maximum price per airline:

                                                                                

+---------+---------+---------+
|  airline|min_price|max_price|
+---------+---------+---------+
|  AirAsia|     1105|    31917|
|Air_India|     1526|    90970|
| GO_FIRST|     1105|    32803|
|   Indigo|     1105|    31952|
| SpiceJet|     1106|    34158|
|  Vistara|     1714|   123071|
+---------+---------+---------+


Count of flights by departure time category:




+--------------+-----+
|departure_time|count|
+--------------+-----+
|       Morning|71146|
| Early_Morning|66790|
|       Evening|65102|
|         Night|48015|
|     Afternoon|47794|
|    Late_Night| 1306|
+--------------+-----+



                                                                                

In [17]:
# Stop the SparkSession created at the top of the notebook; this is the last cell.
spark.stop()