# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Sergio Villa Rodríguez

**Professor**: Pablo Camarillo Ramirez

## Find PySpark Installation

In [1]:
import findspark
findspark.init()

## Create Spark Session

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on data sources (Files)") \
    .master("spark://f0e377311925:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/20 19:29:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Define the schema

In [3]:
from sergiovillaa.spark_utils import SparkUtils
airlines_schema_columns = [("index", "int"), 
     ("airline", "string"), 
     ("flight", "string"),
     ("source_city", "string"),
     ("departure_time", "string"),
     ("stops", "string"),
     ("arrival_time", "string"),
     ("destination_city", "string"),
     ("class", "string"),
     ("duration", "float"),
     ("days_left", "int"),
     ("price", "int")
     ]
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

## Load CSV

In [4]:
df_airlines = spark.read \
                .option("header", "true") \
                .schema(airlines_schema) \
                .csv("/opt/spark/work-dir/data/airline/")

df_airlines.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

## Data cleaning

In [5]:
from pyspark.sql.functions import trim, col, count, isnull, when
print(f"number of records before cleaning: {df_airlines.count()}")
# Get number of null values for each column before cleaning 
df_airlines.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

# Perform data cleaning with trim (column by column)
airlines_clean = df_airlines \
        .dropDuplicates(["index"]) \
        .withColumn("airline", trim("airline")) \
        .withColumn("source_city", trim("source_city")) \
        .withColumn("destination_city", trim("destination_city")) \
        .filter(col("price").isNotNull())

# Simply using dropna()
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
airlines_clean.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
airlines_clean_v2.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

                                                                                

number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with dropna: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

## Normalize categorical values

In [8]:
#Different values on column "stops"
airlines_clean_v2.select("stops").distinct().show()



+-----------+
|      stops|
+-----------+
|two_or_more|
|        one|
|       zero|
+-----------+



                                                                                

In [12]:
from pyspark.sql.functions import when, lit
airlines_t1 = airlines_clean_v2.withColumn("stops_numeric",
                                           when(airlines_clean_v2.stops == "zero", lit(0))
                                               .when(airlines_clean_v2.stops == "one", lit(1))
                                          .when(airlines_clean_v2.stops == "two_or_more", lit(2)))
airlines_t1.show(n=20)

+-----+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+-------------+
|index|  airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|stops_numeric|
+-----+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+-------------+
|    0| SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|            0|
|    1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|            0|
|    2|  AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|            0|
|    3|  Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|            0|
|    4|  Vistara| UK-963|      Delhi|    

## Creating new column Route

In [13]:
from pyspark.sql.functions import concat_ws
airlines_t2 = airlines_t1.withColumn("route", concat_ws("->", "source_city", "destination_city"))
airlines_t2.show()

+-----+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+-------------+-------------+
|index|  airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|stops_numeric|        route|
+-----+---------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+-------------+-------------+
|    0| SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|            0|Delhi->Mumbai|
|    1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|            0|Delhi->Mumbai|
|    2|  AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|            0|Delhi->Mumbai|
|    3|  Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Econo

## Transforming departure_time and arrival_time to numeric

In [14]:
#Different values on column "stops"
airlines_t2.select("departure_time").distinct().show()
airlines_t2.select("arrival_time").distinct().show()

                                                                                

+--------------+
|departure_time|
+--------------+
|       Evening|
|       Morning|
|    Late_Night|
|     Afternoon|
| Early_Morning|
|         Night|
+--------------+





+-------------+
| arrival_time|
+-------------+
|      Evening|
|      Morning|
|   Late_Night|
|    Afternoon|
|Early_Morning|
|        Night|
+-------------+



                                                                                

In [16]:
# Early_Morning = 0
# Morning = 1
# Afternoon = 2
# Evening = 3
# Night = 4
# Late_Night = 5

airlines_t3 = airlines_t2.withColumn("departure_time_numerical",
                                     when(airlines_t2.departure_time == "Early_Morning", lit(0))
                                     .when(airlines_t2.departure_time == "Morning", lit(1))
                                     .when(airlines_t2.departure_time == "Afternoon", lit(2))
                                     .when(airlines_t2.departure_time == "Evening", lit(3))
                                     .when(airlines_t2.departure_time == "Night", lit(4))
                                     .when(airlines_t2.departure_time == "Late_Night", lit(5)))
airlines_t3.select("departure_time", "departure_time_numerical").show()

+--------------+------------------------+
|departure_time|departure_time_numerical|
+--------------+------------------------+
|       Evening|                       3|
| Early_Morning|                       0|
| Early_Morning|                       0|
|       Morning|                       1|
|       Morning|                       1|
|       Morning|                       1|
|       Morning|                       1|
|     Afternoon|                       2|
| Early_Morning|                       0|
|     Afternoon|                       2|
|     Afternoon|                       2|
|       Morning|                       1|
| Early_Morning|                       0|
|       Morning|                       1|
|     Afternoon|                       2|
|       Morning|                       1|
| Early_Morning|                       0|
| Early_Morning|                       0|
|       Evening|                       3|
|       Evening|                       3|
+--------------+------------------

In [17]:
airlines_t4 = airlines_t3.withColumn("arrival_time_numerical",
                                     when(airlines_t2.arrival_time == "Early_Morning", lit(0))
                                     .when(airlines_t2.arrival_time == "Morning", lit(1))
                                     .when(airlines_t2.arrival_time == "Afternoon", lit(2))
                                     .when(airlines_t2.arrival_time == "Evening", lit(3))
                                     .when(airlines_t2.arrival_time == "Night", lit(4))
                                     .when(airlines_t2.arrival_time == "Late_Night", lit(5)))
airlines_t4.select("arrival_time", "arrival_time_numerical").show()

+-------------+----------------------+
| arrival_time|arrival_time_numerical|
+-------------+----------------------+
|        Night|                     4|
|      Morning|                     1|
|Early_Morning|                     0|
|    Afternoon|                     2|
|      Morning|                     1|
|    Afternoon|                     2|
|      Morning|                     1|
|      Evening|                     3|
|      Morning|                     1|
|      Evening|                     3|
|      Evening|                     3|
|    Afternoon|                     2|
|      Morning|                     1|
|    Afternoon|                     2|
|      Evening|                     3|
|      Morning|                     1|
|      Morning|                     1|
|      Morning|                     1|
|Early_Morning|                     0|
|      Morning|                     1|
+-------------+----------------------+
only showing top 20 rows


## Adding new column is_expensive

In [18]:
airlines_t5 = airlines_t4.withColumn("is_expensive", when(airlines_t4.price > 6000, True).otherwise(False))
airlines_t5.select("class", "price", "is_expensive").show()

+-------+-----+------------+
|  class|price|is_expensive|
+-------+-----+------------+
|Economy| 5953|       false|
|Economy| 5953|       false|
|Economy| 5956|       false|
|Economy| 5955|       false|
|Economy| 5955|       false|
|Economy| 5955|       false|
|Economy| 6060|        true|
|Economy| 6060|        true|
|Economy| 5954|       false|
|Economy| 5954|       false|
|Economy| 5954|       false|
|Economy| 5954|       false|
|Economy| 5955|       false|
|Economy| 5955|       false|
|Economy| 5955|       false|
|Economy| 5955|       false|
|Economy| 5955|       false|
|Economy| 5955|       false|
|Economy| 5949|       false|
|Economy| 5949|       false|
+-------+-----+------------+
only showing top 20 rows


# Aggregations

## Average price per airline

In [19]:
from pyspark.sql.functions import avg
airlines_t5.groupBy(col("airline")).agg(avg("price")).show()



+---------+------------------+
|  airline|        avg(price)|
+---------+------------------+
|   Indigo| 5324.216303339517|
| SpiceJet| 6179.278881367218|
|Air_India| 23507.01911190229|
|  AirAsia|4091.0727419555224|
| GO_FIRST| 5652.007595045959|
|  Vistara| 30396.53630170735|
+---------+------------------+



                                                                                

## Average duration per route

In [20]:
airlines_t5.groupBy(col("route")).agg(avg("duration")).show()



+--------------------+------------------+
|               route|     avg(duration)|
+--------------------+------------------+
|Hyderabad->Bangalore| 12.09331678643705|
|    Delhi->Hyderabad|12.518350118710492|
|      Kolkata->Delhi| 11.60498857561711|
|      Delhi->Kolkata| 12.73596614766045|
|   Bangalore->Mumbai| 10.90507225639642|
|    Bangalore->Delhi|  9.77995566082195|
|     Mumbai->Chennai|12.665900287564627|
|  Hyderabad->Kolkata|13.535322410033165|
|  Kolkata->Bangalore| 13.79294687524098|
|  Bangalore->Kolkata|13.099143404859825|
|  Kolkata->Hyderabad|13.853107514948396|
|       Delhi->Mumbai|10.367774213738123|
|  Bangalore->Chennai|14.480207509137166|
|    Kolkata->Chennai|14.774181563782903|
|  Hyderabad->Chennai|13.293238468912078|
|     Kolkata->Mumbai|12.991932481150478|
|    Delhi->Bangalore| 10.35412503844018|
|      Delhi->Chennai|12.433964745763944|
|   Mumbai->Bangalore|11.612022516178817|
|Bangalore->Hyderabad|14.162432783513621|
+--------------------+------------

                                                                                

## Minimun and maximun price per airline

In [21]:
from pyspark.sql.functions import min, max
airlines_t5.groupBy(col("airline")).agg(
    min("price").alias("min_price"),
    max("price").alias("max_temp")
).show()



+---------+---------+--------+
|  airline|min_price|max_temp|
+---------+---------+--------+
|   Indigo|     1105|   31952|
| SpiceJet|     1106|   34158|
|Air_India|     1526|   90970|
|  AirAsia|     1105|   31917|
| GO_FIRST|     1105|   32803|
|  Vistara|     1714|  123071|
+---------+---------+--------+



                                                                                

## Count flights by departure time category

In [24]:
airlines_t5.groupBy(col("departure_time_numerical")).count().show()
# Early_Morning = 0
# Morning = 1
# Afternoon = 2
# Evening = 3
# Night = 4
# Late_Night = 5



+------------------------+-----+
|departure_time_numerical|count|
+------------------------+-----+
|                       1|71146|
|                       3|65102|
|                       5| 1306|
|                       4|48015|
|                       2|47794|
|                       0|66790|
+------------------------+-----+



                                                                                

In [None]:
sc.stop