# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Ana Carolina Arellano Valdez

**Professor**: Pablo Camarillo Ramirez

In [46]:
import findspark
findspark.init()

In [47]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Lab 03: Data Cleaning and Transformation Pipeline") \
    .master("spark://2b8212f158e8:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [48]:
from carolinarellano.spark_utils import SparkUtils
airlines_schema_columns = [("index", "int"), 
     ("airline", "string"), 
     ("flight", "string"),
     ("source_city", "string"),
     ("departure_time", "string"),
     ("stops", "string"),
     ("arrival_time", "string"),
     ("destination_city", "string"),
     ("class", "string"),
     ("duration", "float"),
     ("days_left", "int"),
     ("price", "int")
     ]
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

In [49]:
df_airlines = spark.read \
                .option("header", "true") \
                .schema(airlines_schema) \
                .csv("/opt/spark/work-dir/data/airline/")

df_airlines.show(n=7)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
|    5| Vistara| UK-945|

                                                                                

In [50]:
from pyspark.sql.functions import trim, col, count, isnull, when
print(f"number of records before cleaning: {df_airlines.count()}")
# Get number of null values for each column before cleaning 
df_airlines.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

# Perform data cleaning with trim (column by column)
airlines_clean = df_airlines \
        .dropDuplicates(["index"]) \
        .withColumn("airline", trim("airline")) \
        .withColumn("source_city", trim("source_city")) \
        .withColumn("destination_city", trim("destination_city")) \
        .filter(col("price").isNotNull())

# Simply using dropna()
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
airlines_clean.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
airlines_clean_v2.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

                                                                                

number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with dropna: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

### Delete unnecessary columns

In [56]:
# Necessary columns
necessary_columns = ["airline", "source_city", "destination_city", "class", "duration", "price", "stops", "departure_time", "arrival_time"]
airlines_final = airlines_clean.select(necessary_columns)
airlines_final.show(n=7)



+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+
| airline|source_city|destination_city|  class|duration|price|stops|departure_time|arrival_time|
+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+
|SpiceJet|      Delhi|          Mumbai|Economy|    2.33| 5953| zero| Early_Morning|     Morning|
| Vistara|      Delhi|          Mumbai|Economy|    2.25| 5955| zero|       Morning|   Afternoon|
| Vistara|      Delhi|          Mumbai|Economy|    2.33| 5955| zero|       Morning|   Afternoon|
| Vistara|      Delhi|          Mumbai|Economy|    2.08| 6060| zero|       Morning|     Morning|
|GO_FIRST|      Delhi|          Mumbai|Economy|    2.25| 5954| zero|     Afternoon|     Evening|
|  Indigo|      Delhi|          Mumbai|Economy|    2.17| 5955| zero| Early_Morning|     Morning|
|  Indigo|      Delhi|          Mumbai|Economy|    2.17| 5955| zero|       Morning|   Afternoon|
+--------+-----------+--------

                                                                                

### Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops

In [57]:
from pyspark.sql.functions import when, col
from pyspark.sql.types import IntegerType

print("Unique values in stops column:")
airlines_final.select("stops").distinct().show()

airlines_final = airlines_final.withColumn("stops", 
    when(col("stops") == "zero", 0)
    .when(col("stops") == "one", 1)
    .when(col("stops") == "two_or_more", 2)
    .otherwise(col("stops").cast(IntegerType()))
)

airlines_final.show(n=7)

Unique values in stops column:


                                                                                

+-----------+
|      stops|
+-----------+
|two_or_more|
|        one|
|       zero|
+-----------+





+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+
| airline|source_city|destination_city|  class|duration|price|stops|departure_time|arrival_time|
+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+
|SpiceJet|      Delhi|          Mumbai|Economy|    2.33| 5953|    0| Early_Morning|     Morning|
| Vistara|      Delhi|          Mumbai|Economy|    2.25| 5955|    0|       Morning|   Afternoon|
| Vistara|      Delhi|          Mumbai|Economy|    2.33| 5955|    0|       Morning|   Afternoon|
| Vistara|      Delhi|          Mumbai|Economy|    2.08| 6060|    0|       Morning|     Morning|
|GO_FIRST|      Delhi|          Mumbai|Economy|    2.25| 5954|    0|     Afternoon|     Evening|
|  Indigo|      Delhi|          Mumbai|Economy|    2.17| 5955|    0| Early_Morning|     Morning|
|  Indigo|      Delhi|          Mumbai|Economy|    2.17| 5955|    0|       Morning|   Afternoon|
+--------+-----------+--------

                                                                                

### Create a new column called route: “Delhi → Mumbai” from source_city and destination_city

In [58]:
from pyspark.sql.functions import col, concat_ws

airlines_final = airlines_final.withColumn(
    "Route: Delhi -> Mumbai",
    when(
        (col("source_city") == "Delhi") & (col("destination_city") == "Mumbai"), 1
    ).otherwise(0)
)

# Show the result
airlines_final.show(n=5, truncate=False)



+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+----------------------+
|airline |source_city|destination_city|class  |duration|price|stops|departure_time|arrival_time|Route: Delhi -> Mumbai|
+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+----------------------+
|SpiceJet|Delhi      |Mumbai          |Economy|2.33    |5953 |0    |Early_Morning |Morning     |1                     |
|Vistara |Delhi      |Mumbai          |Economy|2.25    |5955 |0    |Morning       |Afternoon   |1                     |
|Vistara |Delhi      |Mumbai          |Economy|2.33    |5955 |0    |Morning       |Afternoon   |1                     |
|Vistara |Delhi      |Mumbai          |Economy|2.08    |6060 |0    |Morning       |Morning     |1                     |
|GO_FIRST|Delhi      |Mumbai          |Economy|2.25    |5954 |0    |Afternoon     |Evening     |1                     |
+--------+-----------+----------------+-

                                                                                

### Transform departure_time and arrival_time to numerical categories (Morning, Afternoon, etc.), then encode as numbers (0=Early_Morning, 1=Morning, etc.)

In [60]:
airlines_final = airlines_final.withColumn("departure_time", 
    when(col("departure_time") == "Early_Morning", 0)
    .when(col("departure_time") == "Morning", 1)
    .when(col("departure_time") == "Afternoon", 2)
    .otherwise(3)
)

airlines_final = airlines_final.withColumn("arrival_time", 
    when(col("arrival_time") == "Early_Morning", 0)
    .when(col("arrival_time") == "Morning", 1)
    .when(col("arrival_time") == "Afternoon", 2)
    .otherwise(3)
)

airlines_final.show(n=7)



+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+----------------------+
| airline|source_city|destination_city|  class|duration|price|stops|departure_time|arrival_time|Route: Delhi -> Mumbai|
+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+----------------------+
|SpiceJet|      Delhi|          Mumbai|Economy|    2.33| 5953|    0|             0|           1|                     1|
| Vistara|      Delhi|          Mumbai|Economy|    2.25| 5955|    0|             1|           2|                     1|
| Vistara|      Delhi|          Mumbai|Economy|    2.33| 5955|    0|             1|           2|                     1|
| Vistara|      Delhi|          Mumbai|Economy|    2.08| 6060|    0|             1|           1|                     1|
|GO_FIRST|      Delhi|          Mumbai|Economy|    2.25| 5954|    0|             2|           3|                     1|
|  Indigo|      Delhi|          Mumbai|E

                                                                                

### Add a new column is_expensive: when(price > 6000, True).otherwise(False)

In [61]:
airlines_final = airlines_final.withColumn("is_expensive", 
    when(col("price") > 600, 1).otherwise(0)
)

airlines_final.show(n=7)



+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+----------------------+------------+
| airline|source_city|destination_city|  class|duration|price|stops|departure_time|arrival_time|Route: Delhi -> Mumbai|is_expensive|
+--------+-----------+----------------+-------+--------+-----+-----+--------------+------------+----------------------+------------+
|SpiceJet|      Delhi|          Mumbai|Economy|    2.33| 5953|    0|             0|           1|                     1|           1|
| Vistara|      Delhi|          Mumbai|Economy|    2.25| 5955|    0|             1|           2|                     1|           1|
| Vistara|      Delhi|          Mumbai|Economy|    2.33| 5955|    0|             1|           2|                     1|           1|
| Vistara|      Delhi|          Mumbai|Economy|    2.08| 6060|    0|             1|           1|                     1|           1|
|GO_FIRST|      Delhi|          Mumbai|Economy|    2.25| 5954|    0| 

                                                                                

### Get the average price per airline

In [64]:
average_price_by_airline = airlines_final.groupBy("airline").avg("price").withColumnRenamed("avg(price)", "average_price")
average_price_by_airline.show(n=10, truncate=False)



+---------+------------------+
|airline  |average_price     |
+---------+------------------+
|Indigo   |5324.216303339517 |
|SpiceJet |6179.278881367218 |
|Air_India|23507.01911190229 |
|AirAsia  |4091.0727419555224|
|GO_FIRST |5652.007595045959 |
|Vistara  |30396.53630170735 |
+---------+------------------+



                                                                                

### Average price per route

In [65]:
average_price_per_route = airlines_final.groupBy("source_city", "destination_city").avg("price").withColumnRenamed("avg(price)", "average_price")
average_price_per_route.show(n=10, truncate=False)



+-----------+----------------+------------------+
|source_city|destination_city|average_price     |
+-----------+----------------+------------------+
|Chennai    |Bangalore       |25081.85045433544 |
|Delhi      |Hyderabad       |17347.288379073758|
|Kolkata    |Chennai         |23660.36104013227 |
|Chennai    |Kolkata         |22669.93240727481 |
|Bangalore  |Kolkata         |23500.061228560033|
|Delhi      |Chennai         |19369.881354359924|
|Mumbai     |Kolkata         |22379.146722742422|
|Chennai    |Mumbai          |22765.849646605267|
|Hyderabad  |Mumbai          |20080.865759141496|
|Hyderabad  |Kolkata         |20823.89320145236 |
+-----------+----------------+------------------+
only showing top 10 rows


                                                                                

### Minimum and maximum price per airline

In [71]:
from pyspark.sql.functions import max, min

max_and_min_price = airlines_final.groupBy("airline").agg(
    max("price").alias("max_price"),
    min("price").alias("min_price")
)
max_and_min_price.show(n=10, truncate=False)



+---------+---------+---------+
|airline  |max_price|min_price|
+---------+---------+---------+
|Indigo   |31952    |1105     |
|SpiceJet |34158    |1106     |
|Air_India|90970    |1526     |
|AirAsia  |31917    |1105     |
|GO_FIRST |32803    |1105     |
|Vistara  |123071   |1714     |
+---------+---------+---------+



                                                                                

### Count flights by departure_time category

In [74]:
flights_by_departure = df_airlines.groupBy("departure_time").count()

flights_by_departure.show(n=5)



+--------------+-----+
|departure_time|count|
+--------------+-----+
|       Evening|65102|
|       Morning|71146|
|    Late_Night| 1306|
|     Afternoon|47794|
| Early_Morning|66790|
+--------------+-----+
only showing top 5 rows


                                                                                

In [75]:
sc.stop()