# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th, 2025

**Student Name**: Francisco Delgado

**Professor**: Pablo Camarillo Ramirez

In [1]:
import findspark
# Run this before importing pyspark: init() makes the Spark installation
# importable from this kernel.
findspark.init()

In [2]:
import os

from pyspark.sql import SparkSession

# The default master hostname looks like a Docker container id, which is only
# valid on the machine where the cluster was started. Allow overriding it via
# the SPARK_MASTER_URL environment variable so the notebook can run elsewhere
# without editing code; the original URL stays as the fallback.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://9eae4bedccb3:7077")

# The application name is what shows up in the Spark UI, so it should
# identify this lab rather than a previous notebook.
spark = (
    SparkSession.builder
    .appName("Lab 03 - Data Cleaning and Transformation Pipeline")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

sc = spark.sparkContext
# Only report errors from here on, to keep cell outputs readable.
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/10/02 04:42:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
from pyspark.sql.types import (
    StructType, StructField, StringType, IntegerType, FloatType,
    DoubleType, LongType, ShortType, DecimalType
)
from pyspark.sql.functions import col, when, count, isnan, lit, concat, udf

# Spark SQL numeric types: count_nulls() additionally applies isnan() to
# columns whose data type is one of these.
NUMERIC_TYPES = (DoubleType, FloatType, IntegerType, LongType, ShortType, DecimalType)

def count_nulls(df):
    """Count missing values per column of a Spark DataFrame.

    For columns whose data type is listed in NUMERIC_TYPES a value counts as
    missing when it is NULL or NaN; for every other column only NULL counts.

    Args:
        df: Spark DataFrame to inspect.

    Returns:
        A single-row DataFrame with one column per input column (same names),
        each holding the number of missing values found in that column.
    """
    exprs = []
    for field in df.schema.fields:
        name = field.name
        # Backtick-quote the name so that columns containing dots or spaces are
        # resolved literally instead of being parsed as nested-field access.
        # A backtick inside the name is escaped by doubling it.
        column = col("`{}`".format(name.replace("`", "``")))
        is_missing = column.isNull()
        if isinstance(field.dataType, NUMERIC_TYPES):
            is_missing = is_missing | isnan(column)
        # when() without otherwise() yields NULL for non-matching rows, and
        # count() skips NULLs, so only the missing rows are counted.
        exprs.append(count(when(is_missing, name)).alias(name))
    return df.select(exprs)



In [4]:
# Explicit schema for the airline CSV data, declared as (column name, Spark type)
# pairs. Every column is nullable.
airlines_schema = StructType([
    StructField(column_name, column_type, True)
    for column_name, column_type in (
        ("index", IntegerType()),
        ("airline", StringType()),
        ("flight", StringType()),
        ("source_city", StringType()),
        ("departure_time", StringType()),
        ("stops", StringType()),
        ("arrival_time", StringType()),
        ("destination_city", StringType()),
        ("class", StringType()),
        ("duration", FloatType()),
        ("days_left", IntegerType()),
        ("price", IntegerType()),
    )
])
airlines_schema


StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

In [5]:
# Location of the airline CSV data.
DATA_PATH = "/opt/spark/work-dir/data/airline/"  # change if needed

# Load with the explicit schema (no type inference); the first line of each
# file is a header row.
df = spark.read.csv(DATA_PATH, schema=airlines_schema, header=True)

# Quick sanity check: a few rows plus the applied schema.
df.show(5)
df.printSchema()

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

In [6]:
# Report missing values per column in the raw data.
print("Nulls BEFORE cleaning:")
count_nulls(df).show(truncate=False)

# Columns to discard entirely; an empty list keeps every column.
cols_to_drop = []  # e.g., ["index", "flight"]

# Drop the unwanted columns, then remove every row that still has a missing
# value in any remaining column. Without the row filter this step changes
# nothing when cols_to_drop is empty, and the "AFTER" report below could never
# differ from the "BEFORE" one.
df_clean = df.drop(*cols_to_drop).dropna(how="any")

# Report again on the cleaned data; every count is expected to be 0.
print("Nulls AFTER cleaning:")
count_nulls(df_clean).show(truncate=False)

Nulls BEFORE cleaning:


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|0    |0      |0     |0          |0             |0    |0           |0               |0    |0       |0        |0    |
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+

Nulls AFTER cleaning:




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|0    |0      |0     |0          |0             |0    |0           |0               |0    |0       |0        |0    |
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

In [7]:
# Textual stop counts -> integer. Keys are matched after strip() + lower(),
# so they must be written in lower case.
stops_map = {
    "non-stop": 0, "nonstop": 0, "zero": 0, "0": 0,
    "one": 1, "1": 1,
    "two": 2, "2": 2,
    # Combined "two or more stops" label, mapped to its lower bound so such
    # rows do not silently get a NULL stops_n.
    # NOTE(review): confirm this label against the distinct values of "stops".
    "two_or_more": 2,
    "three": 3, "3": 3
}


def norm_stops(s):
    """Convert a textual stop count into an integer.

    Args:
        s: Raw value of the "stops" column (string) or None.

    Returns:
        The integer stop count from stops_map, or None when the input is None
        or the label is not in the map. Matching ignores surrounding
        whitespace and letter case.
    """
    if s is None:
        return None
    key = s.strip().lower()
    return stops_map.get(key)
from pyspark.sql.types import IntegerType
# Wrap norm_stops as a Spark UDF whose result column is an integer.
norm_stops_udf = udf(f=norm_stops, returnType=IntegerType())

# 5.2 Time category encoding
# Day-part labels in encoding order: a label's position in this list is its id.
time_order = [
    "Early_Morning",
    "Morning",
    "Afternoon",
    "Evening",
    "Night",
    "Late_Night",
]
time_to_id = dict(zip(time_order, range(len(time_order))))


def encode_time_bucket(s):
    """Return the integer id of a day-part label.

    Surrounding whitespace is ignored; matching is case-sensitive. Returns
    None when the input is None or the label is not in time_order.
    """
    return None if s is None else time_to_id.get(s.strip())
# Wrap encode_time_bucket as a Spark UDF whose result column is an integer.
encode_time_udf = udf(f=encode_time_bucket, returnType=IntegerType())

# Column expressions for the derived features.
stops_n_expr = norm_stops_udf(col("stops"))                # textual stops -> integer
route_expr = concat(col("source_city"), lit(" → "), col("destination_city"))
dep_time_id_expr = encode_time_udf(col("departure_time"))  # day-part label -> id
arr_time_id_expr = encode_time_udf(col("arrival_time"))    # day-part label -> id
is_expensive_expr = col("price") > 6000                    # boolean flag

# Append the derived columns to the cleaned data, in this order.
df_tr = (
    df_clean
    .withColumn("stops_n", stops_n_expr)
    .withColumn("route", route_expr)
    .withColumn("dep_time_id", dep_time_id_expr)
    .withColumn("arr_time_id", arr_time_id_expr)
    .withColumn("is_expensive", is_expensive_expr)
)

# Preview each source column next to the value derived from it.
df_tr.select(
    "airline", "flight", "route",
    "stops", "stops_n",
    "departure_time", "dep_time_id",
    "arrival_time", "arr_time_id",
    "price", "is_expensive",
).show(10, truncate=False)



+--------+-------+--------------+-----+-------+--------------+-----------+-------------+-----------+-----+------------+
|airline |flight |route         |stops|stops_n|departure_time|dep_time_id|arrival_time |arr_time_id|price|is_expensive|
+--------+-------+--------------+-----+-------+--------------+-----------+-------------+-----------+-----+------------+
|SpiceJet|SG-8709|Delhi → Mumbai|zero |0      |Evening       |3          |Night        |4          |5953 |false       |
|SpiceJet|SG-8157|Delhi → Mumbai|zero |0      |Early_Morning |0          |Morning      |1          |5953 |false       |
|AirAsia |I5-764 |Delhi → Mumbai|zero |0      |Early_Morning |0          |Early_Morning|0          |5956 |false       |
|Vistara |UK-995 |Delhi → Mumbai|zero |0      |Morning       |1          |Afternoon    |2          |5955 |false       |
|Vistara |UK-963 |Delhi → Mumbai|zero |0      |Morning       |1          |Morning      |1          |5955 |false       |
|Vistara |UK-945 |Delhi → Mumbai|zero |0

                                                                                

In [8]:
from pyspark.sql.functions import avg, min as spark_min, max as spark_max

# Mean ticket price per airline, highest average first.
avg_price_per_airline = (
    df_tr
    .groupBy("airline")
    .agg(avg("price").alias("avg_price"))
)
avg_price_per_airline.orderBy("avg_price", ascending=False).show(20, truncate=False)

# Mean flight duration per route, longest average first.
avg_duration_per_route = (
    df_tr
    .groupBy("route")
    .agg(avg("duration").alias("avg_duration"))
)
avg_duration_per_route.orderBy("avg_duration", ascending=False).show(20, truncate=False)

# Cheapest and most expensive ticket per airline, listed alphabetically.
minmax_price_per_airline = (
    df_tr
    .groupBy("airline")
    .agg(
        spark_min("price").alias("min_price"),
        spark_max("price").alias("max_price"),
    )
)
minmax_price_per_airline.sort("airline").show(50, truncate=False)

# Number of flights per departure day-part id, in id order.
count_by_dep_cat = (
    df_tr
    .groupBy("dep_time_id")
    .count()
    .sort("dep_time_id")
)
count_by_dep_cat.show(10, truncate=False)

                                                                                

+---------+------------------+
|airline  |avg_price         |
+---------+------------------+
|Vistara  |30396.53630170735 |
|Air_India|23507.01911190229 |
|SpiceJet |6179.278881367218 |
|GO_FIRST |5652.007595045959 |
|Indigo   |5324.216303339517 |
|AirAsia  |4091.0727419555224|
+---------+------------------+



                                                                                

+---------------------+------------------+
|route                |avg_duration      |
+---------------------+------------------+
|Kolkata → Chennai    |14.774181563782903|
|Chennai → Kolkata    |14.515774035955694|
|Bangalore → Chennai  |14.480207509137166|
|Bangalore → Hyderabad|14.162432783513621|
|Chennai → Bangalore  |13.952593563812163|
|Kolkata → Hyderabad  |13.853107514948396|
|Kolkata → Bangalore  |13.79294687524098 |
|Hyderabad → Kolkata  |13.535322410033165|
|Hyderabad → Chennai  |13.293238468912078|
|Mumbai → Hyderabad   |13.263310412247066|
|Chennai → Hyderabad  |13.153984931732971|
|Bangalore → Kolkata  |13.099143404859825|
|Kolkata → Mumbai     |12.991932481150478|
|Mumbai → Kolkata     |12.836848115489666|
|Delhi → Kolkata      |12.73596614766045 |
|Mumbai → Chennai     |12.665900287564627|
|Delhi → Hyderabad    |12.518350118710492|
|Delhi → Chennai      |12.433964745763944|
|Chennai → Mumbai     |12.374656244132625|
|Hyderabad → Bangalore|12.09331678643705 |
+----------

                                                                                

+---------+---------+---------+
|airline  |min_price|max_price|
+---------+---------+---------+
|AirAsia  |1105     |31917    |
|Air_India|1526     |90970    |
|GO_FIRST |1105     |32803    |
|Indigo   |1105     |31952    |
|SpiceJet |1106     |34158    |
|Vistara  |1714     |123071   |
+---------+---------+---------+





+-----------+-----+
|dep_time_id|count|
+-----------+-----+
|0          |66790|
|1          |71146|
|2          |47794|
|3          |65102|
|4          |48015|
|5          |1306 |
+-----------+-----+



                                                                                