# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Luis Roberto Chávez Mancilla

**Professor**: Pablo Camarillo Ramirez

## Find Spark Installation

In [180]:
import findspark
findspark.init()

## Create SparkSession

In [181]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on data sources (Files)") \
    .master("spark://9835fefe4923:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

## Define the schema

In [182]:
from robertoman.spark_utils import SparkUtils

airlines_schema_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

## Load CSV

In [183]:
df_airlines = (
    spark.read.option("header", "true")
    .schema(airlines_schema)
    .csv("/opt/spark/work-dir/data/airline/")
)

df_airlines.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

## Data Cleanning
- Drop unnecessary columns. Count how many null values the dataset has before/after the cleaning process.

In [184]:
from pyspark.sql.functions import trim, col, count, isnull, when

# Show null values in each column
df_airlines.select(
    [count(when(col(c).isNull(), c)).alias(c) for c in df_airlines.columns]
).show()



+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

In [185]:
print(f"Records after cleaning: {df_airlines.count()}")



Records after cleaning: 300153


                                                                                

In [186]:
airlines_df_clean = (
    df_airlines.dropDuplicates(["index"])
    .withColumn("airline", trim("airline"))
    .withColumn("source_city", trim("source_city"))
    .withColumn("destination_city", trim("destination_city"))
    .filter(col("price").isNotNull())
)
airlines_df_clean = df_airlines.dropna()

print(f"Records after cleaning: {df_airlines.count()}")



Records after cleaning: 300153


                                                                                

- Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops.

In [187]:
from pyspark.sql.functions import when, col

#mostrar los unique values para poder normalizar los valores.
airlines_df_clean.select("stops").distinct().show()

airline_categorical = airlines_df_clean.withColumn(
    "stops",
    when(col("stops") == "zero", 0)
    .when(col("stops") == "one", 1)
    .when(col("stops") == "two_or_more", 2),
)

airline_categorical.show(n=7)

                                                                                

+-----------+
|      stops|
+-----------+
|two_or_more|
|        one|
|       zero|
+-----------+

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening|    0|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning|    0|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning|    0|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning|    0|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morn

- Create a new column called route: “Delhi → Mumbai” from source_city and destination_city.

In [188]:
from pyspark.sql.functions import concat_ws

airline_routes = airlines_df_clean.withColumn(
    "route", concat_ws(" → ", col("source_city"), col("destination_city"))
)

airline_routes.show(n=10, truncate=False)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+--------------+
|index|airline |flight |source_city|departure_time|stops|arrival_time |destination_city|class  |duration|days_left|price|route         |
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+--------------+
|0    |SpiceJet|SG-8709|Delhi      |Evening       |zero |Night        |Mumbai          |Economy|2.17    |1        |5953 |Delhi → Mumbai|
|1    |SpiceJet|SG-8157|Delhi      |Early_Morning |zero |Morning      |Mumbai          |Economy|2.33    |1        |5953 |Delhi → Mumbai|
|2    |AirAsia |I5-764 |Delhi      |Early_Morning |zero |Early_Morning|Mumbai          |Economy|2.17    |1        |5956 |Delhi → Mumbai|
|3    |Vistara |UK-995 |Delhi      |Morning       |zero |Afternoon    |Mumbai          |Economy|2.25    |1        |5955 |Delhi → Mumbai|
|4    |Vistara |UK-963 |Delhi      |Morni

- Transform departure_time and arrival_time to numerical categories (Morning, Afternoon, etc.), then encode as numbers (0=Early_Morning, 1=Morning, etc.).

In [189]:
# ver los distintos
airlines_df_clean.select("departure_time").distinct().show()
airlines_df_clean.select("arrival_time").distinct().show()

                                                                                

+--------------+
|departure_time|
+--------------+
|       Evening|
|       Morning|
|    Late_Night|
|     Afternoon|
| Early_Morning|
|         Night|
+--------------+





+-------------+
| arrival_time|
+-------------+
|      Evening|
|      Morning|
|   Late_Night|
|    Afternoon|
|Early_Morning|
|        Night|
+-------------+



                                                                                

In [190]:
airlines_df_clean = airlines_df_clean.withColumn("departure_time", 
    when(col("departure_time") == "Early_Morning", 0)
    .when(col("departure_time") == "Morning", 1)
    .when(col("departure_time") == "Afternoon", 2)
    .otherwise(3)
)

airlines_df_clean = airlines_df_clean.withColumn("arrival_time", 
    when(col("arrival_time") == "Early_Morning", 0)
    .when(col("arrival_time") == "Morning", 1)
    .when(col("arrival_time") == "Afternoon", 2)
    .otherwise(3)
)

airlines_df_clean.show(n=10)

+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|             3| zero|           3|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi|             0| zero|           1|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi|             0| zero|           0|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|             1| zero|           2|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|             1| zero|           1|          Mumbai|Economy|    2.33|        1| 5955|
|    5| Vistara| UK-945|      De

- Add a new column is_expensive: when(price > 6000, True).otherwise(False).

In [191]:

airlines_is_expensive = airlines_df_clean.withColumn(
    "is_expensive",
    when(col("price") > 6000, True).otherwise(False)
)

airlines_is_expensive.select("index","airline","price", "is_expensive").show(10, truncate=False)


+-----+--------+-----+------------+
|index|airline |price|is_expensive|
+-----+--------+-----+------------+
|0    |SpiceJet|5953 |false       |
|1    |SpiceJet|5953 |false       |
|2    |AirAsia |5956 |false       |
|3    |Vistara |5955 |false       |
|4    |Vistara |5955 |false       |
|5    |Vistara |5955 |false       |
|6    |Vistara |6060 |true        |
|7    |Vistara |6060 |true        |
|8    |GO_FIRST|5954 |false       |
|9    |GO_FIRST|5954 |false       |
+-----+--------+-----+------------+
only showing top 10 rows


- Get the average price per airline.

In [192]:
from pyspark.sql.functions import avg

airlines_df_clean.groupBy("airline").agg(
    avg("price").alias("avg_price")
).show(truncate=False)




+---------+------------------+
|airline  |avg_price         |
+---------+------------------+
|Indigo   |5324.216303339517 |
|SpiceJet |6179.278881367218 |
|Air_India|23507.01911190229 |
|AirAsia  |4091.0727419555224|
|GO_FIRST |5652.007595045959 |
|Vistara  |30396.53630170735 |
+---------+------------------+



                                                                                

- Average duration per route.

In [193]:
airline_routes.groupBy("route").agg(avg("duration").alias("avg_duration")).orderBy(
    col("avg_duration").desc()
).show(truncate=False)



+---------------------+------------------+
|route                |avg_duration      |
+---------------------+------------------+
|Kolkata → Chennai    |14.774181563782903|
|Chennai → Kolkata    |14.515774035955694|
|Bangalore → Chennai  |14.480207509137166|
|Bangalore → Hyderabad|14.162432783513621|
|Chennai → Bangalore  |13.952593563812163|
|Kolkata → Hyderabad  |13.853107514948396|
|Kolkata → Bangalore  |13.79294687524098 |
|Hyderabad → Kolkata  |13.535322410033165|
|Hyderabad → Chennai  |13.293238468912078|
|Mumbai → Hyderabad   |13.263310412247066|
|Chennai → Hyderabad  |13.153984931732971|
|Bangalore → Kolkata  |13.099143404859825|
|Kolkata → Mumbai     |12.991932481150478|
|Mumbai → Kolkata     |12.836848115489666|
|Delhi → Kolkata      |12.73596614766045 |
|Mumbai → Chennai     |12.665900287564627|
|Delhi → Hyderabad    |12.518350118710492|
|Delhi → Chennai      |12.433964745763944|
|Chennai → Mumbai     |12.374656244132625|
|Hyderabad → Bangalore|12.09331678643705 |
+----------

                                                                                

- Minimum and maximum price per airline.

In [194]:
from pyspark.sql.functions import min, max

airlines_df_clean.groupBy("airline").agg(
    min("price").alias("min_price"),
    max("price").alias("max_price")
).show(truncate=False)




+---------+---------+---------+
|airline  |min_price|max_price|
+---------+---------+---------+
|Indigo   |1105     |31952    |
|SpiceJet |1106     |34158    |
|Air_India|1526     |90970    |
|AirAsia  |1105     |31917    |
|GO_FIRST |1105     |32803    |
|Vistara  |1714     |123071   |
+---------+---------+---------+



                                                                                

- Count flights by departure_time category.

In [195]:
df_airlines.groupBy("departure_time").count().show(truncate=False)



+--------------+-----+
|departure_time|count|
+--------------+-----+
|Evening       |65102|
|Morning       |71146|
|Late_Night    |1306 |
|Afternoon     |47794|
|Early_Morning |66790|
|Night         |48015|
+--------------+-----+



                                                                                

## Stop

In [196]:
sc.stop()