# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Renata Tejeda Mercado

**Professor**: Pablo Camarillo Ramirez

# Find PySpark Installation

In [1]:
# Initialise findspark ahead of the `pyspark` import in the next cell; per the
# section title, this is the step that finds the PySpark installation.
import findspark
findspark.init()

# Create SparkSession

In [2]:
import os

from pyspark.sql import SparkSession

# The default master host name looks machine-generated (NOTE(review): presumably
# a container id - confirm), so it stops resolving as soon as the cluster is
# recreated. Let the environment override it; the original value stays the
# default so existing setups keep working unchanged.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://34c8a8d7a9e7:7077")

# Build (or reuse) the session used by every cell below.
spark = (
    SparkSession.builder
    .appName("Examples on data sources (Files)")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and silence everything below ERROR.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/22 06:33:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Define the Schema

In [3]:
from codrenatat.spark_utils import SparkUtils

# (column name, type name) pairs, listed in the order the schema is built in.
airlines_schema_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]

# Convert the pairs into the schema object passed to the CSV reader, and
# display it as the cell result.
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

# Load CSV

In [4]:
# Read the airline CSV data with a header row and the explicit schema defined
# above (no schema inference).
df_airlines = (
    spark.read
    .option("header", "true")
    .schema(airlines_schema)
    .csv("/opt/spark/work-dir/data/airline/")
)

# Preview the first five rows.
df_airlines.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

# Count how many null values the dataset has before/after the cleaning process.

In [7]:
from pyspark.sql.functions import trim, col, count, isnull, when


def show_null_counts(df, label):
    """Print the row count of ``df`` and show its null count per column.

    Args:
        df: DataFrame holding every column named in ``airlines_schema_columns``.
        label: Text placed after "number of records" in the printed line.
    """
    print(f"number of records {label}: {df.count()}")
    # when() without otherwise() yields null for rows that do not match, and
    # count() skips nulls, so each aggregate counts only the null cells.
    df.select([
        count(when(isnull(col(name)), name)).alias(name)
        for name, _ in airlines_schema_columns
    ]).show()


show_null_counts(df_airlines, "before cleaning")

# Cleaning variant 1: de-duplicate on `index`, trim the free-text columns one
# by one, and keep only rows that have a price.
airlines_clean = (
    df_airlines
    .dropDuplicates(["index"])
    .withColumn("airline", trim("airline"))
    .withColumn("source_city", trim("source_city"))
    .withColumn("destination_city", trim("destination_city"))
    .filter(col("price").isNotNull())
)

# Cleaning variant 2: simply drop every row that contains a null.
airlines_clean_v2 = df_airlines.dropna()

show_null_counts(airlines_clean, "after cleaning with trim")
show_null_counts(airlines_clean_v2, "after cleaning with dropna")

number of records before cleaning: 300153
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+

number of records after cleaning with dropna: 300153
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|   

# Drop unnecessary columns.

In [8]:
# Work from the trimmed frame and drop the columns that none of the following
# cells read (index, flight, class, days_left).
cleaned = airlines_clean.drop("index", "flight", "class", "days_left")
cleaned.printSchema()

root
 |-- airline: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- price: integer (nullable = true)



# Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops.

In [9]:
from pyspark.sql.functions import col, trim, lower, when, lit, concat

# Integer encoding of the normalised stop label: "zero" -> 0, "one" -> 1, and
# anything else (nulls included) is assumed to mean 2.
stops_code = (
    when(col("stops_lower") == "zero", lit(0))
    .when(col("stops_lower") == "one", lit(1))
    .otherwise(lit(2))
)

# Normalise `stops` in two helper columns (trimmed, then lower-cased) before
# attaching the numeric code.
tmp = (
    cleaned
    .withColumn("stops_trim", trim(col("stops")))
    .withColumn("stops_lower", lower(col("stops_trim")))
    .withColumn("stops_numeric", stops_code)
)

# Create a new column called route: “Delhi → Mumbai” from source_city and destination_city.

In [10]:
# Build the route label from the trimmed city names, e.g. "Delhi -> Mumbai".
origin_city = trim(col("source_city"))
arrival_city = trim(col("destination_city"))
tmp = tmp.withColumn("route", concat(origin_city, lit(" -> "), arrival_city))

# Normalize departure_time and arrival_time to time-of-day categories (Early_Morning, Morning, Afternoon, etc.), then encode them as numbers (0=Early_Morning, 1=Morning, etc.).

In [11]:
# Time-of-day label -> numeric code, in chronological order.
TIME_OF_DAY_CODES = {
    "early_morning": 0,
    "morning": 1,
    "afternoon": 2,
    "evening": 3,
    "night": 4,
    "late_night": 5,
}


def encode_time_of_day(column_name):
    """Return a Column holding the numeric code for a time-of-day label column.

    Args:
        column_name: Name of a column containing lower-cased, trimmed labels
            such as "morning" or "late_night".

    Returns:
        A Column with the code from ``TIME_OF_DAY_CODES`` for known labels and
        an integer-typed null for any other value.
    """
    labels = col(column_name)
    pairs = iter(TIME_OF_DAY_CODES.items())
    first_label, first_code = next(pairs)
    # Build one when(...).when(...) chain so the same mapping serves both
    # the departure and the arrival column.
    encoded = when(labels == first_label, lit(first_code))
    for label, code in pairs:
        encoded = encoded.when(labels == label, lit(code))
    return encoded.otherwise(lit(None).cast("int"))


# Normalised (trimmed, lower-cased) copies of the raw labels.
tmp = tmp.withColumn("departure_time_low", lower(trim(col("departure_time"))))
tmp = tmp.withColumn("arrival_time_low", lower(trim(col("arrival_time"))))

# Numeric categories derived from the normalised labels.
tmp = tmp.withColumn("departure_time_cat", encode_time_of_day("departure_time_low"))
tmp = tmp.withColumn("arrival_time_cat", encode_time_of_day("arrival_time_low"))

# Add a new column is_expensive: when(price > 6000, True).otherwise(False).

In [12]:
# Flag every fare strictly above the threshold as expensive.
price_threshold = 6000
tmp = tmp.withColumn("is_expensive", col("price") > price_threshold)

# This is the fully transformed frame used by the aggregations.
airlines_final = tmp

# Sanity check: show each raw column next to the value derived from it.
preview_columns = [
    "airline", "route",
    "stops", "stops_numeric",
    "departure_time", "departure_time_cat",
    "arrival_time", "arrival_time_cat",
    "price", "is_expensive",
]
airlines_final.select(*preview_columns).show(15, truncate=False)



+---------+---------------+-----+-------------+--------------+------------------+------------+----------------+-----+------------+
|airline  |route          |stops|stops_numeric|departure_time|departure_time_cat|arrival_time|arrival_time_cat|price|is_expensive|
+---------+---------------+-----+-------------+--------------+------------------+------------+----------------+-----+------------+
|SpiceJet |Delhi -> Mumbai|zero |0            |Early_Morning |0                 |Morning     |1               |5953 |false       |
|Vistara  |Delhi -> Mumbai|zero |0            |Morning       |1                 |Afternoon   |2               |5955 |false       |
|Vistara  |Delhi -> Mumbai|zero |0            |Morning       |1                 |Afternoon   |2               |5955 |false       |
|Vistara  |Delhi -> Mumbai|zero |0            |Morning       |1                 |Morning     |1               |6060 |true        |
|GO_FIRST |Delhi -> Mumbai|zero |0            |Afternoon     |2                 |Ev

                                                                                

In [13]:

# Import the aggregate functions in this cell so it runs on a fresh kernel.
# They are aliased so the Python builtins round/min/max are not shadowed.
from pyspark.sql.functions import (
    avg,
    max as spark_max,
    min as spark_min,
    round as spark_round,
)

# 1) Get the average price per airline (rounded to 2 decimals).
avg_price_per_airline = (
    airlines_final.groupBy("airline")
    .agg(spark_round(avg("price"), 2).alias("avg_price"))
    .orderBy("airline")
)
avg_price_per_airline.show(truncate=False)

# 2) Average duration per route (rounded to 2 decimals); only the first 10 are shown.
avg_duration_per_route = (
    airlines_final.groupBy("route")
    .agg(spark_round(avg("duration"), 2).alias("avg_duration_hours"))
    .orderBy("route")
)
avg_duration_per_route.show(10, truncate=False)

# 3) Minimum and maximum price per airline.
min_max_price_per_airline = (
    airlines_final.groupBy("airline")
    .agg(
        spark_min("price").alias("min_price"),
        spark_max("price").alias("max_price"),
    )
    .orderBy("airline")
)
min_max_price_per_airline.show(truncate=False)

# 4) Count flights by departure_time category, ordered by the numeric code.
count_by_dep_time = (
    airlines_final.groupBy("departure_time", "departure_time_cat")
    .count()
    .orderBy("departure_time_cat")
)
count_by_dep_time.show(truncate=False)


                                                                                

+---------+---------+
|airline  |avg_price|
+---------+---------+
|AirAsia  |4091.07  |
|Air_India|23507.02 |
|GO_FIRST |5652.01  |
|Indigo   |5324.22  |
|SpiceJet |6179.28  |
|Vistara  |30396.54 |
+---------+---------+

+----------------------+------------------+
|route                 |avg_duration_hours|
+----------------------+------------------+
|Bangalore -> Chennai  |14.48             |
|Bangalore -> Delhi    |9.78              |
|Bangalore -> Hyderabad|14.16             |
|Bangalore -> Kolkata  |13.1              |
|Bangalore -> Mumbai   |10.91             |
|Chennai -> Bangalore  |13.95             |
|Chennai -> Delhi      |11.15             |
|Chennai -> Hyderabad  |13.15             |
|Chennai -> Kolkata    |14.52             |
|Chennai -> Mumbai     |12.37             |
+----------------------+------------------+
only showing top 10 rows
+---------+---------+---------+
|airline  |min_price|max_price|
+---------+---------+---------+
|AirAsia  |1105     |31917    |
|Air_India

In [14]:
# Stop the SparkContext now that every result has been displayed.
sc.stop()