# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Luis Daniel Arellano Núñez

**Professor**: Pablo Camarillo Ramirez

In [1]:
# findspark locates the Spark installation and puts its Python bindings on
# sys.path, so this has to run before the first `pyspark` import.
import findspark
findspark.init()

In [2]:
import os

from pyspark.sql import SparkSession

# The default master hostname looks like a Docker container ID, which changes
# whenever the cluster container is recreated. Setting SPARK_MASTER_URL in the
# environment overrides it; the original address stays as the fallback.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://8c99a6c586f5:7077")

spark = (
    SparkSession.builder
    .appName("Examples on storage solutions with PosgreSQL")
    .master(SPARK_MASTER_URL)
    # PostgreSQL JDBC driver jar, needed by the JDBC write/read cells.
    .config("spark.jars", "/opt/spark/work-dir/jars/postgresql-42.7.8.jar")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

sc = spark.sparkContext
# Only report errors so Spark's INFO/WARN lines do not flood the cell outputs.
sc.setLogLevel("ERROR")

25/10/01 01:26:35 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


### Define the Schema

In [3]:
from Daniel_Arellano.sql_im import SparkUtils

# (column name, Spark type name) pairs, in the same order as the CSV columns.
airlines_schema_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]

# Build the schema object that the CSV reader is given, then display it.
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

### Load the CSV

In [4]:
# Read the airline CSV data with the header option enabled and the explicit
# schema applied, instead of letting Spark infer the column types.
airline_csv_dir = "/opt/spark/work-dir/data/airline/"
airlines_reader = spark.read.option("header", "true").schema(airlines_schema)
df_airlines = airlines_reader.csv(airline_csv_dir)

df_airlines.show(5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

### Data cleaning

In [6]:
from pyspark.sql.functions import trim, col, count, isnull, when
def count_nulls(df):
    """Return a one-row DataFrame with the number of NULL values per schema column.

    The columns inspected are the names listed in `airlines_schema_columns`.
    """
    return df.select([
        # when() without otherwise() yields NULL for non-matching rows, and
        # count() skips NULLs, so only the NULL cells of the column are counted.
        count(when(col(name).isNull(), name)).alias(name)
        for name, _ in airlines_schema_columns
    ])


# Count once and reuse the number: every .count() runs a separate Spark job.
records_before = df_airlines.count()
print(f"number of records before cleaning: {records_before}")
# Get number of null values for each column before cleaning
count_nulls(df_airlines).show()

# Perform data cleaning with trim (column by column)
airlines_clean = (
    df_airlines
    .dropDuplicates(["index"])
    .withColumn("airline", trim("airline"))
    .withColumn("source_city", trim("source_city"))
    .withColumn("destination_city", trim("destination_city"))
    .filter(col("price").isNotNull())
)

# Simply using dropna()
# NOTE(review): this starts again from df_airlines, so the de-duplication and
# trimming above are not part of airlines_clean_v2, which is the DataFrame the
# following cells build on. Confirm that this is intended.
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
count_nulls(airlines_clean).show()

records_after_dropna = airlines_clean_v2.count()
print(f"number of records after cleaning with dropna: {records_after_dropna}")
count_nulls(airlines_clean_v2).show()

# The difference is a number of rows (records removed by dropna), not columns.
print(f"Number of rows dropped because they contained null values: {records_before - records_after_dropna}")

                                                                                

number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with dropna: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+





Number of null columns is: 0


                                                                                

### Eliminate unnecessary columns for the lab (class and days_left)

In [7]:
# Remove the two columns the lab does not use in a single drop() call.
df_lab03_clean = airlines_clean_v2.drop("class", "days_left")

df_lab03_clean.show(5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|duration|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|    2.17| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|    2.33| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|    2.17| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|    2.25| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|    2.33| 5955|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+
only showing top 5 rows


### Normalize categorical values: map “zero” → 0, “one” → 1, etc. in stops.

In [8]:
from pyspark.sql.functions import when, lit, col

# Replace the spelled-out stop counts with integers. The chain has no
# otherwise() branch, so any label other than these three becomes NULL.
stops_label = col("stops")
stops_as_int = (
    when(stops_label == "zero", lit(0))
    .when(stops_label == "one", lit(1))
    .when(stops_label == "two_or_more", lit(2))
)
df_stops = df_lab03_clean.withColumn("stops", stops_as_int)

df_stops.show(5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|duration|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening|    0|        Night|          Mumbai|    2.17| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning|    0|      Morning|          Mumbai|    2.33| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning|    0|Early_Morning|          Mumbai|    2.17| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning|    0|    Afternoon|          Mumbai|    2.25| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning|    0|      Morning|          Mumbai|    2.33| 5955|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+
only showing top 5 rows


### Create a new column called route: “Delhi → Mumbai” from source_city and destination_city.

In [9]:
from pyspark.sql.functions import concat_ws, col

# Join source and destination city into one "source → destination" label.
route_label = concat_ws(" → ", col("source_city"), col("destination_city"))
df_routes = df_stops.withColumn("route", route_label)

df_routes.show(n=5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+--------------+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|duration|price|         route|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+--------+-----+--------------+
|    0|SpiceJet|SG-8709|      Delhi|       Evening|    0|        Night|          Mumbai|    2.17| 5953|Delhi → Mumbai|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning|    0|      Morning|          Mumbai|    2.33| 5953|Delhi → Mumbai|
|    2| AirAsia| I5-764|      Delhi| Early_Morning|    0|Early_Morning|          Mumbai|    2.17| 5956|Delhi → Mumbai|
|    3| Vistara| UK-995|      Delhi|       Morning|    0|    Afternoon|          Mumbai|    2.25| 5955|Delhi → Mumbai|
|    4| Vistara| UK-963|      Delhi|       Morning|    0|      Morning|          Mumbai|    2.33| 5955|Delhi → Mumbai|
+-----+--------+-------+-----------+------------

### Transform departure_time and arrival_time to numerical categories (Morning, Afternoon, etc.), then encode as numbers (0=Early_Morning, 1=Morning, etc.).

In [10]:
from pyspark.sql.functions import when, lit, col

# Integer code for each time-of-day label (0=Early_Morning ... 5=Late_Night).
# Kept in one place so departure_time and arrival_time cannot drift apart.
TIME_OF_DAY_CODES = {
    "Early_Morning": 0,
    "Morning": 1,
    "Afternoon": 2,
    "Evening": 3,
    "Night": 4,
    "Late_Night": 5,
}


def encode_time_of_day(column_name):
    """Return a Column that maps the labels in `column_name` to integer codes.

    The codes come from TIME_OF_DAY_CODES. The generated when() chain has no
    otherwise() branch, so a label that is not a key of that table becomes NULL.
    """
    label = col(column_name)
    encoded = None
    for name, code in TIME_OF_DAY_CODES.items():
        matches = label == name
        # The first branch starts the chain with functions.when(); the rest
        # extend it through Column.when().
        encoded = when(matches, lit(code)) if encoded is None else encoded.when(matches, lit(code))
    return encoded


df_times = (
    df_routes
    .withColumn("departure_time", encode_time_of_day("departure_time"))
    .withColumn("arrival_time", encode_time_of_day("arrival_time"))
)

df_times.show(n=5)

+-----+--------+-------+-----------+--------------+-----+------------+----------------+--------+-----+--------------+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|duration|price|         route|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+--------+-----+--------------+
|    0|SpiceJet|SG-8709|      Delhi|             3|    0|           4|          Mumbai|    2.17| 5953|Delhi → Mumbai|
|    1|SpiceJet|SG-8157|      Delhi|             0|    0|           1|          Mumbai|    2.33| 5953|Delhi → Mumbai|
|    2| AirAsia| I5-764|      Delhi|             0|    0|           0|          Mumbai|    2.17| 5956|Delhi → Mumbai|
|    3| Vistara| UK-995|      Delhi|             1|    0|           2|          Mumbai|    2.25| 5955|Delhi → Mumbai|
|    4| Vistara| UK-963|      Delhi|             1|    0|           1|          Mumbai|    2.33| 5955|Delhi → Mumbai|
+-----+--------+-------+-----------+--------------+-----

### Add a new column is_expensive: when(price > 6000, True).otherwise(False).

In [11]:
from pyspark.sql.functions import when

# Flights priced strictly above this amount are flagged as expensive.
expensive_price_threshold = 6000

is_expensive = when(df_times["price"] > expensive_price_threshold, True).otherwise(False)
df_expensive = df_times.withColumn("is_expensive", is_expensive)

df_expensive.show(10)

# Name under which the aggregation and JDBC write cells use the finished DataFrame.
df_final = df_expensive

+-----+--------+-------+-----------+--------------+-----+------------+----------------+--------+-----+--------------+------------+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|duration|price|         route|is_expensive|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+--------+-----+--------------+------------+
|    0|SpiceJet|SG-8709|      Delhi|             3|    0|           4|          Mumbai|    2.17| 5953|Delhi → Mumbai|       false|
|    1|SpiceJet|SG-8157|      Delhi|             0|    0|           1|          Mumbai|    2.33| 5953|Delhi → Mumbai|       false|
|    2| AirAsia| I5-764|      Delhi|             0|    0|           0|          Mumbai|    2.17| 5956|Delhi → Mumbai|       false|
|    3| Vistara| UK-995|      Delhi|             1|    0|           2|          Mumbai|    2.25| 5955|Delhi → Mumbai|       false|
|    4| Vistara| UK-963|      Delhi|             1|    0|           1|          Mum

### Average price per airline.

In [18]:
# SELECT airline, AVG(price) AS "Average Price" FROM df_final GROUP BY airline
# Use the functions module through an alias: importing `min`/`max` by name
# would shadow Python's built-ins in the notebook namespace.
from pyspark.sql import functions as F

df_final.groupBy("airline").agg(F.avg("price").alias("Average Price")).show()



+---------+------------------+
|  airline|     Average Price|
+---------+------------------+
|   Indigo| 5324.216303339517|
| SpiceJet| 6179.278881367218|
|Air_India| 23507.01911190229|
|  AirAsia|4091.0727419555224|
| GO_FIRST| 5652.007595045959|
|  Vistara| 30396.53630170735|
+---------+------------------+



                                                                                

### Average duration per route.

In [19]:
# SELECT route, AVG(duration) AS "Average duration" FROM df_final GROUP BY route
# Use the functions module through an alias: importing `min`/`max` by name
# would shadow Python's built-ins in the notebook namespace.
from pyspark.sql import functions as F

df_final.groupBy("route").agg(F.avg("duration").alias("Average duration")).show()



+--------------------+------------------+
|               route|  Average duration|
+--------------------+------------------+
|Hyderabad → Banga...| 12.09331678643705|
|    Mumbai → Kolkata|12.836848115489666|
|    Mumbai → Chennai|12.665900287564627|
|  Mumbai → Hyderabad|13.263310412247066|
|  Mumbai → Bangalore|11.612022516178817|
|   Bangalore → Delhi|  9.77995566082195|
| Kolkata → Bangalore| 13.79294687524098|
|   Hyderabad → Delhi|10.829816602522587|
| Bangalore → Chennai|14.480207509137166|
|  Bangalore → Mumbai| 10.90507225639642|
|      Mumbai → Delhi|  9.81805726844943|
|  Hyderabad → Mumbai|11.962923295795918|
|   Kolkata → Chennai|14.774181563782903|
| Kolkata → Hyderabad|13.853107514948396|
|   Delhi → Bangalore| 10.35412503844018|
|      Delhi → Mumbai|10.367774213738123|
| Hyderabad → Chennai|13.293238468912078|
|Bangalore → Hyder...|14.162432783513621|
|     Kolkata → Delhi| 11.60498857561711|
|   Delhi → Hyderabad|12.518350118710492|
+--------------------+------------

                                                                                

### Minimum and maximum price per airline.

In [20]:
# SELECT airline, MIN(price) AS "Min price", MAX(price) AS "Max price"
# FROM df_final GROUP BY airline
# F.min / F.max are reached through the module alias so that Python's built-in
# min() and max() are not shadowed by the Spark functions of the same name.
from pyspark.sql import functions as F

df_final.groupBy("airline").agg(
    F.min("price").alias("Min price"),
    F.max("price").alias("Max price"),
).show()



+---------+---------+---------+
|  airline|Min price|Max price|
+---------+---------+---------+
|   Indigo|     1105|    31952|
| SpiceJet|     1106|    34158|
|Air_India|     1526|    90970|
|  AirAsia|     1105|    31917|
| GO_FIRST|     1105|    32803|
|  Vistara|     1714|   123071|
+---------+---------+---------+



                                                                                

### Count flights by departure_time category.

In [24]:
# SELECT departure_time, COUNT(flight) AS "# of flights" FROM df_routes GROUP BY departure_time
from pyspark.sql.functions import col, avg, count

# df_routes is used here on purpose instead of df_final: its departure_time is
# still the original text label, which is easier to read than the numeric code.
flights_per_departure_time = df_routes.groupBy(col("departure_time")).agg(
    count("flight").alias("# of flights")
)
flights_per_departure_time.show()



+--------------+------------+
|departure_time|# of flights|
+--------------+------------+
|       Evening|       65102|
|       Morning|       71146|
|    Late_Night|        1306|
|     Afternoon|       47794|
| Early_Morning|       66790|
|         Night|       48015|
+--------------+------------+



                                                                                

In [12]:
import os
from getpass import getpass

jdbc_url = "jdbc:postgresql://postgres-iteso:5432/postgres"
table_name = "airlines_Daniel_Arellano"

# Keep credentials out of the notebook: read them from the environment and
# prompt for the password when POSTGRES_PASSWORD is not set.
postgres_user = os.environ.get("POSTGRES_USER", "postgres")
postgres_password = os.environ.get("POSTGRES_PASSWORD") or getpass("PostgreSQL password: ")

(
    df_final.write
    .format("jdbc")
    .option("url", jdbc_url)
    .option("dbtable", table_name)
    .option("user", postgres_user)
    .option("password", postgres_password)
    .option("driver", "org.postgresql.Driver")
    # The default save mode raises an error when the table already exists, so
    # re-running this cell would fail. "overwrite" replaces the table instead.
    .mode("overwrite")
    .save()
)

print("DataFrame successfully written into a PosgreSQL DB !")



DataFrame successfully written into a PosgreSQL DB !


                                                                                

In [13]:
import os
from getpass import getpass

jdbc_url = "jdbc:postgresql://postgres-iteso:5432/postgres"
# Keep credentials out of the notebook: read them from the environment and
# prompt for the password when POSTGRES_PASSWORD is not set.
db_properties = {
    "user": os.environ.get("POSTGRES_USER", "postgres"),
    "password": os.environ.get("POSTGRES_PASSWORD") or getpass("PostgreSQL password: "),
    "driver": "org.postgresql.Driver",
}

# `table_name` is the variable assigned in the JDBC write cell, so that cell
# has to run before this one.
df = spark.read.jdbc(url=jdbc_url, table=table_name, properties=db_properties)

# Check that the column types survived the round trip through PostgreSQL.
df.printSchema()
df.show(5, truncate=False)

root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: integer (nullable = true)
 |-- stops: integer (nullable = true)
 |-- arrival_time: integer (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- price: integer (nullable = true)
 |-- route: string (nullable = true)
 |-- is_expensive: boolean (nullable = true)



[Stage 37:>                                                         (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+------------+----------------+--------+-----+--------------+------------+
|index|airline |flight |source_city|departure_time|stops|arrival_time|destination_city|duration|price|route         |is_expensive|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+--------+-----+--------------+------------+
|0    |SpiceJet|SG-8709|Delhi      |3             |0    |4           |Mumbai          |2.17    |5953 |Delhi → Mumbai|false       |
|1    |SpiceJet|SG-8157|Delhi      |0             |0    |1           |Mumbai          |2.33    |5953 |Delhi → Mumbai|false       |
|2    |AirAsia |I5-764 |Delhi      |0             |0    |0           |Mumbai          |2.17    |5956 |Delhi → Mumbai|false       |
|3    |Vistara |UK-995 |Delhi      |1             |0    |2           |Mumbai          |2.25    |5955 |Delhi → Mumbai|false       |
|4    |Vistara |UK-963 |Delhi      |1             |0    |1           |Mumbai       

                                                                                

In [14]:
# Shut down the SparkContext now that the lab is finished; any further Spark
# call in this kernel needs a new session to be created first.
sc.stop()