# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**:

**Professor**: Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on data sources (Files)") \
    .master("spark://38f00c0193b4:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/21 17:38:59 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Define airlines schema

In [3]:
from axel2293.spark_utils import SparkUtils

airline_schema_cols = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]

airline_schema = SparkUtils.generate_schema(airline_schema_cols)
airline_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

# Load Dataset CSV

In [4]:
df_airline = spark.read \
                .option("header", "true") \
                .schema(airline_schema) \
                .csv("/opt/spark/work-dir/data/airline/")

df_airline.show(n=4)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
only showing top 4 rows


                                                                                

## 1
Drop unnecessary columns. Count how many null values the dataset has before/after the cleaning process.

In [5]:
from pyspark.sql.functions import trim, col, count, isnull, when

# Show null values in each column
df_airline.select([count(when(col(c).isNull(), c)).alias(c) for c in df_airline.columns]).show()



+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

In [6]:
print(f"Records before cleaning: {df_airline.count()}")

Records before cleaning: 300153


In [11]:
airline_clean = df_airline \
                .dropDuplicates(["index"]) \
                .withColumn("airline", trim("airline")) \
                .withColumn("source_city", trim("source_city")) \
                .withColumn("destination_city", trim("destination_city")) \
                .filter(col("price").isNotNull())
airline_clean = df_airline.dropna()

In [12]:
print(f"Records after cleaning: {df_airline.count()}")

Records after cleaning: 300153


## 2
Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops.

In [14]:
from pyspark.sql.functions import when, lit

airline_t2 = airline_clean.withColumn("stops_num",
                                            when(airline_clean.stops == "zero", lit(0))
                                            .when(airline_clean.stops == "one", lit(1))
                                           )
airline_t2.show(n=5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+---------+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|stops_num|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+---------+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|        0|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|        0|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|        0|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|        0|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|     

## 3
Create a new column called route: “Delhi → Mumbai” from source_city and destination_city.

In [24]:
from pyspark.sql.functions import concat, col

airline_t3 = airline_clean.withColumn("route",
                                            concat(col("source_city"), lit(" -> "), col("destination_city"))
                                           )
airline_t3.select("source_city", "destination_city", "route").distinct().show(n=5)
    

+-----------+----------------+-------------------+
|source_city|destination_city|              route|
+-----------+----------------+-------------------+
|     Mumbai|         Chennai|  Mumbai -> Chennai|
|    Kolkata|          Mumbai|  Kolkata -> Mumbai|
|  Bangalore|           Delhi| Bangalore -> Delhi|
|     Mumbai|         Kolkata|  Mumbai -> Kolkata|
|     Mumbai|       Bangalore|Mumbai -> Bangalore|
+-----------+----------------+-------------------+
only showing top 5 rows


## 4
Transform departure_time and arrival_time to numerical categories (Morning, Afternoon, etc.), then encode as numbers (0=Early_Morning, 1=Morning, etc.).


In [16]:
airline_clean.select("departure_time").distinct().show()
airline_clean.select("arrival_time").distinct().show()

                                                                                

+--------------+
|departure_time|
+--------------+
|       Evening|
|       Morning|
|    Late_Night|
|     Afternoon|
| Early_Morning|
|         Night|
+--------------+

+-------------+
| arrival_time|
+-------------+
|      Evening|
|      Morning|
|   Late_Night|
|    Afternoon|
|Early_Morning|
|        Night|
+-------------+



                                                                                

In [17]:
from pyspark.sql.functions import col, when, lit

airline_clean.printSchema()

def encode_time(column):
    return when(col(column) == "Early_Morning", lit(0)) \
            .when(col(column) == "Morning", lit(1)) \
            .when(col(column) == "Afternoon", lit(2)) \
            .when(col(column) == "Evening", lit(3)) \
            .when(col(column) == "Night", lit(4)) \
            .when(col(column) == "Late_Night", lit(5))

root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)



In [25]:
airline_t4 = airline_clean \
    .withColumn("departure_time_encoded", encode_time("departure_time")) \
    .withColumn("arrival_time_encoded", encode_time("arrival_time"))
airline_t4.select("departure_time_encoded", "arrival_time_encoded").show()

+----------------------+--------------------+
|departure_time_encoded|arrival_time_encoded|
+----------------------+--------------------+
|                     3|                   4|
|                     0|                   1|
|                     0|                   0|
|                     1|                   2|
|                     1|                   1|
|                     1|                   2|
|                     1|                   1|
|                     2|                   3|
|                     0|                   1|
|                     2|                   3|
|                     2|                   3|
|                     1|                   2|
|                     0|                   1|
|                     1|                   2|
|                     2|                   3|
|                     1|                   1|
|                     0|                   1|
|                     0|                   1|
|                     3|          

## 5
Add a new column is_expensive: when(price > 6000, True).otherwise(False).

In [26]:
airline_t5 = airline_clean.withColumn(
    "is_expensive",
    when(col("price") > 6000, True).otherwise(False)
)
airline_t5.select("index", "price", "is_expensive").show(n=10)

+-----+-----+------------+
|index|price|is_expensive|
+-----+-----+------------+
|    0| 5953|       false|
|    1| 5953|       false|
|    2| 5956|       false|
|    3| 5955|       false|
|    4| 5955|       false|
|    5| 5955|       false|
|    6| 6060|        true|
|    7| 6060|        true|
|    8| 5954|       false|
|    9| 5954|       false|
+-----+-----+------------+
only showing top 10 rows


# Agregations
Get the average price per airline.

In [27]:
from pyspark.sql.functions import col, avg

airline_avg_price = airline_clean.groupBy(col("airline")) \
    .agg(avg(col("price")))

airline_avg_price.show()
        

+---------+------------------+
|  airline|        avg(price)|
+---------+------------------+
|   Indigo| 5324.216303339517|
| SpiceJet| 6179.278881367218|
|Air_India| 23507.01911190229|
|  AirAsia|4091.0727419555224|
| GO_FIRST| 5652.007595045959|
|  Vistara| 30396.53630170735|
+---------+------------------+



Average duration per route.

In [30]:


airline_avg_route = airline_t3.groupBy(col("route"))\
    .agg(avg(col("duration")).alias("avg_duration")) \
    .orderBy(col("avg_duration").asc())

airline_avg_route.show()

+--------------------+------------------+
|               route|      avg_duration|
+--------------------+------------------+
|  Bangalore -> Delhi|  9.77995566082195|
|     Mumbai -> Delhi|  9.81805726844943|
|  Delhi -> Bangalore| 10.35412503844018|
|     Delhi -> Mumbai|10.367774213738123|
|  Hyderabad -> Delhi|10.829816602522587|
| Bangalore -> Mumbai| 10.90507225639642|
|    Chennai -> Delhi|  11.1493744312541|
|    Kolkata -> Delhi| 11.60498857561711|
| Mumbai -> Bangalore|11.612022516178817|
| Hyderabad -> Mumbai|11.962923295795918|
|Hyderabad -> Bang...| 12.09331678643705|
|   Chennai -> Mumbai|12.374656244132625|
|    Delhi -> Chennai|12.433964745763944|
|  Delhi -> Hyderabad|12.518350118710492|
|   Mumbai -> Chennai|12.665900287564627|
|    Delhi -> Kolkata| 12.73596614766045|
|   Mumbai -> Kolkata|12.836848115489666|
|   Kolkata -> Mumbai|12.991932481150478|
|Bangalore -> Kolkata|13.099143404859825|
|Chennai -> Hyderabad|13.153984931732971|
+--------------------+------------

Minimum and maximum price per airline.

In [33]:
from pyspark.sql.functions import min, max

airline_max_min_price = airline_clean.groupby(col("airline")) \
                        .agg(
                            min(col("price")).alias("min_price"),
                            max(col("price")).alias("max_price")
                        )
airline_max_min_price.show()

+---------+---------+---------+
|  airline|min_price|max_price|
+---------+---------+---------+
|   Indigo|     1105|    31952|
| SpiceJet|     1106|    34158|
|Air_India|     1526|    90970|
|  AirAsia|     1105|    31917|
| GO_FIRST|     1105|    32803|
|  Vistara|     1714|   123071|
+---------+---------+---------+



                                                                                

Count flights by departure_time category.

In [34]:
airline_count_departure = airline_clean.groupby(col("departure_time")) \
                            .agg(
                                count(col("departure_time")).alias("count_departure")
                            )
airline_count_departure.show()

+--------------+---------------+
|departure_time|count_departure|
+--------------+---------------+
|       Evening|          65102|
|       Morning|          71146|
|    Late_Night|           1306|
|     Afternoon|          47794|
| Early_Morning|          66790|
|         Night|          48015|
+--------------+---------------+



In [35]:
sc.stop()