# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Andre Jair Sanchez Contreras

**Professor**: Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql.functions import col, when, count, expr, min , max, avg, round

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType

In [4]:
from pyspark.sql import SparkSession

In [5]:
spark = SparkSession.builder \
    .appName("AirlinesDataCleaning") \
    .getOrCreate()

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/22 03:19:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
airlines_schema = StructType([
    StructField("index", IntegerType(), True),
    StructField("airline", StringType(), True),
    StructField("flight", StringType(), True),
    StructField("source_city", StringType(), True),
    StructField("departure_time", StringType(), True),
    StructField("stops", StringType(), True),
    StructField("arrival_time", StringType(), True),
    StructField("destination_city", StringType(), True),
    StructField("class", StringType(), True),
    StructField("duration", DoubleType(), True),
    StructField("days_left", IntegerType(), True),
    StructField("price", IntegerType(), True)
])

In [7]:
#Upload dataset
df_airlines = spark.read \
    .option("header", "true") \
    .csv("/opt/Dataset/airlines_flights_data.csv")

df_airlines.show(10)

                                                                                

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
|    5| Vistara| UK-945|

In [8]:
#count null values 
df_airlines.select([
    count(when(col(dataset).isNull(), dataset)).alias(dataset) for dataset in df_airlines.columns
]).show()

[Stage 2:>                                                          (0 + 6) / 6]

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

In [10]:
#categorical values "zero" -> 0 and "one" -> 1 etc
#using expr function spark because allows me to execute expressions directly 
df_airlines = df_airlines.withColumn(
    "stops_normalized",
    expr("""
        CASE 
            WHEN stops = 'zero' THEN 0
            WHEN stops = 'one' THEN 1
            WHEN stops = 'two' THEN 2
            WHEN stops = 'three' THEN 3
            ELSE NULL
        END
    """)
)

df_airlines.select("stops", "stops_normalized").show(20)


+-----+----------------+
|stops|stops_normalized|
+-----+----------------+
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
| zero|               0|
|  one|               1|
|  one|               1|
+-----+----------------+
only showing top 20 rows


In [11]:
#Route column using concat in both string and the function expr
df_airlines = df_airlines.withColumn(
    "route",
    expr("concat(source_city, ' → ', destination_city)")
)

df_airlines.select("source_city", "destination_city", "route").show(5)


+-----------+----------------+--------------+
|source_city|destination_city|         route|
+-----------+----------------+--------------+
|      Delhi|          Mumbai|Delhi → Mumbai|
|      Delhi|          Mumbai|Delhi → Mumbai|
|      Delhi|          Mumbai|Delhi → Mumbai|
|      Delhi|          Mumbai|Delhi → Mumbai|
|      Delhi|          Mumbai|Delhi → Mumbai|
+-----------+----------------+--------------+
only showing top 5 rows


In [12]:
#Create column is_expensive when the price is > 6000
df_airlines = df_airlines.withColumn(
    "is_expensive",
    expr("CASE WHEN price > 6000 THEN True ELSE False END")
)

df_airlines.select("price", "is_expensive").show(10)


+-----+------------+
|price|is_expensive|
+-----+------------+
| 5953|       false|
| 5953|       false|
| 5956|       false|
| 5955|       false|
| 5955|       false|
| 5955|       false|
| 6060|        true|
| 6060|        true|
| 5954|       false|
| 5954|       false|
+-----+------------+
only showing top 10 rows


In [14]:
#Average price, using the rounding function to display the result with 2 decimal points
df_airlines.groupBy("airline") \
    .agg(round(avg("price"), 2).alias("avg_price")) \
    .orderBy("avg_price") \
    .show(10)

+---------+---------+
|  airline|avg_price|
+---------+---------+
|  AirAsia|  4091.07|
|   Indigo|  5324.22|
| GO_FIRST|  5652.01|
| SpiceJet|  6179.28|
|Air_India| 23507.02|
|  Vistara| 30396.54|
+---------+---------+



In [16]:
#Average duration, using the rounding function to display the result with 2 decimal points
df_airlines.groupBy("route") \
    .agg(round(avg("duration"), 2).alias("avg_duration")) \
    .orderBy("avg_duration") \
    .show(10)

+------------------+------------+
|             route|avg_duration|
+------------------+------------+
| Bangalore → Delhi|        9.78|
|    Mumbai → Delhi|        9.82|
| Delhi → Bangalore|       10.35|
|    Delhi → Mumbai|       10.37|
| Hyderabad → Delhi|       10.83|
|Bangalore → Mumbai|       10.91|
|   Chennai → Delhi|       11.15|
|   Kolkata → Delhi|        11.6|
|Mumbai → Bangalore|       11.61|
|Hyderabad → Mumbai|       11.96|
+------------------+------------+
only showing top 10 rows


In [17]:
#Min and max of the prices of each airline
df_airlines.groupBy("airline") \
    .agg(
        min("price").alias("min_price"),
        max("price").alias("max_price")
    ) \
    .orderBy("airline") \
    .show(10)

[Stage 21:>                                                         (0 + 6) / 6]

+---------+---------+---------+
|  airline|min_price|max_price|
+---------+---------+---------+
|  AirAsia|    10001|     9922|
|Air_India|    10005|     9998|
| GO_FIRST|    10011|     9999|
|   Indigo|    10003|     9998|
| SpiceJet|    10012|     9971|
|  Vistara|    10000|     9998|
+---------+---------+---------+



                                                                                

In [18]:
#count flights by departure time
df_airlines.groupBy("departure_time") \
    .agg(count("*").alias("flight_count")) \
    .orderBy("flight_count") \
    .show()




+--------------+------------+
|departure_time|flight_count|
+--------------+------------+
|    Late_Night|        1306|
|     Afternoon|       47794|
|         Night|       48015|
|       Evening|       65102|
| Early_Morning|       66790|
|       Morning|       71146|
+--------------+------------+



                                                                                