# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 30th 2025

**Student Name**: Juan Alonso

**Professor**: Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

# Create SparkSession

In [2]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Airlines with PostgreSQL") \
    .master("spark://5fded284cb17:7077") \
    .config("spark.jars", "/opt/spark/work-dir/jars/postgresql-42.7.8.jar") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

25/09/30 14:45:41 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


# Define schema

In [3]:
from juanalonso.spark_utils import SparkUtils
airlines_schema_columns = [("index", "int"), 
     ("airline", "string"), 
     ("flight", "string"),
     ("source_city", "string"),
     ("departure_time", "string"),
     ("stops", "string"),
     ("arrival_time", "string"),
     ("destination_city", "string"),
     ("class", "string"),
     ("duration", "float"),
     ("days_left", "int"),
     ("price", "int")
     ]
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema 

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

# Load CSV

In [5]:
df_airlines = spark.read \
                .option("header", "true") \
                .schema(airlines_schema) \
                .csv("/opt/spark/work-dir/data/airline/")

# Data Cleaning

In [6]:
from pyspark.sql.functions import trim, col, count, isnull, when
print(f"number of records before cleaning: {df_airlines.count()}")

df_airlines.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

airlines_clean = df_airlines \
        .dropDuplicates(["index"]) \
        .withColumn("airline", trim("airline")) \
        .withColumn("source_city", trim("source_city")) \
        .withColumn("destination_city", trim("destination_city")) \
        .filter(col("price").isNotNull())

airlines_clean_v2 = df_airlines.dropna()
print(f"number of records after cleaning with trim: {airlines_clean.count()}")
airlines_clean.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()
print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
airlines_clean_v2.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

                                                                                

number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with dropna: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

# MAPING STOPS

In [7]:
from pyspark.sql.functions import when, lit

airlines_t1 = airlines_clean_v2.withColumn("stops",
                                           when(airlines_clean_v2.stops == "zero", lit(0))
                                           .when(airlines_clean_v2.stops == "one", lit(1))
                                           .when(airlines_clean_v2.stops == "two_or_more", lit(2))
                                          ) 
airlines_t1.show(n=5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening|    0|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning|    0|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning|    0|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning|    0|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning|    0|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

# New Column Route

In [8]:
from pyspark.sql.functions import concat_ws

airlines_clean_v3 = airlines_t1.withColumn("route",
                                           concat_ws("→",col("source_city"),col("destination_city")))
airlines_clean_v3.show(n=5)

                                

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+------------+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|       route|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+------------+
|    0|SpiceJet|SG-8709|      Delhi|       Evening|    0|        Night|          Mumbai|Economy|    2.17|        1| 5953|Delhi→Mumbai|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning|    0|      Morning|          Mumbai|Economy|    2.33|        1| 5953|Delhi→Mumbai|
|    2| AirAsia| I5-764|      Delhi| Early_Morning|    0|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|Delhi→Mumbai|
|    3| Vistara| UK-995|      Delhi|       Morning|    0|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|Delhi→Mumbai|
|    4| Vistara| UK-963|      Delhi|       Morning|    

# Departure and Arrival to numerical

In [9]:
airlines_clean_v3.select("departure_time").distinct().show()
airlines_clean_v3.select("arrival_time").distinct().show()

                                                                                

+--------------+
|departure_time|
+--------------+
|       Evening|
|       Morning|
|    Late_Night|
|     Afternoon|
| Early_Morning|
|         Night|
+--------------+





+-------------+
| arrival_time|
+-------------+
|      Evening|
|      Morning|
|   Late_Night|
|    Afternoon|
|Early_Morning|
|        Night|
+-------------+



                                                                                

In [10]:

airlines_t1 = airlines_clean_v3 .withColumn("departure_time",
                                            when(airlines_clean_v3.departure_time == "Early_Morning", lit(0)).when(airlines_clean_v3.departure_time == "Morning", lit(1))
                                            .when(airlines_clean_v3.departure_time == "Afternoon", lit(2)).when(airlines_clean_v3.departure_time == "Evening", lit(3))
                                            .when(airlines_clean_v3.departure_time == "Night", lit(4)).when(airlines_clean_v3.departure_time == "Late_Night", lit(5))
) 
airlines_t1.show(n=5)
airlines_t1 = airlines_t1 .withColumn("arrival_time",
                                            when(airlines_t1.arrival_time == "Early_Morning", lit(0)).when(airlines_t1.arrival_time == "Morning", lit(1))
                                            .when(airlines_t1.arrival_time == "Afternoon", lit(2)).when(airlines_t1.arrival_time == "Evening", lit(3))
                                            .when(airlines_t1.arrival_time == "Night", lit(4)).when(airlines_t1.arrival_time == "Late_Night", lit(5))
) 
airlines_t1.show(n=5)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+------------+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|       route|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+------------+
|    0|SpiceJet|SG-8709|      Delhi|             3|    0|        Night|          Mumbai|Economy|    2.17|        1| 5953|Delhi→Mumbai|
|    1|SpiceJet|SG-8157|      Delhi|             0|    0|      Morning|          Mumbai|Economy|    2.33|        1| 5953|Delhi→Mumbai|
|    2| AirAsia| I5-764|      Delhi|             0|    0|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|Delhi→Mumbai|
|    3| Vistara| UK-995|      Delhi|             1|    0|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|Delhi→Mumbai|
|    4| Vistara| UK-963|      Delhi|             1|    

# Expensive column

In [11]:
airlines_t1 = airlines_t1 .withColumn("is_expensive",when(col("price")>6000,True).otherwise(False))
airlines_t1.show(n=5)

+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+------------+------------+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|       route|is_expensive|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+------------+------------+
|    0|SpiceJet|SG-8709|      Delhi|             3|    0|           4|          Mumbai|Economy|    2.17|        1| 5953|Delhi→Mumbai|       false|
|    1|SpiceJet|SG-8157|      Delhi|             0|    0|           1|          Mumbai|Economy|    2.33|        1| 5953|Delhi→Mumbai|       false|
|    2| AirAsia| I5-764|      Delhi|             0|    0|           0|          Mumbai|Economy|    2.17|        1| 5956|Delhi→Mumbai|       false|
|    3| Vistara| UK-995|      Delhi|             1|    0|           2|          Mumbai|Economy|    2.25|        1| 595

## POSTRESQL DataFrame

In [12]:
jdbc_url = "jdbc:postgresql://postgres-iteso:5432/postgres"
table_name = "airlines_transformed"

df_airlines.write \
    .format("jdbc") \
    .option("url", jdbc_url) \
    .option("dbtable", table_name) \
    .option("user", "postgres") \
    .option("password", "Admin@1234") \
    .option("driver", "org.postgresql.Driver") \
    .save()

print("DataFrame successfully written into a PosgreSQL DB !")



DataFrame successfully written into a PosgreSQL DB !


                                                                                

In [13]:
jdbc_url = "jdbc:postgresql://postgres-iteso:5432/postgres"
db_properties = {
      "user": "postgres",      
      "password": "Admin@1234",
      "driver": "org.postgresql.Driver"
  }

df = spark.read \
    .jdbc(url=jdbc_url, table=table_name, properties=db_properties)

df.printSchema()
df.show(5, truncate=False)

root
 |-- index: integer (nullable = true)
 |-- airline: string (nullable = true)
 |-- flight: string (nullable = true)
 |-- source_city: string (nullable = true)
 |-- departure_time: string (nullable = true)
 |-- stops: string (nullable = true)
 |-- arrival_time: string (nullable = true)
 |-- destination_city: string (nullable = true)
 |-- class: string (nullable = true)
 |-- duration: float (nullable = true)
 |-- days_left: integer (nullable = true)
 |-- price: integer (nullable = true)

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index|airline |flight |source_city|departure_time|stops|arrival_time |destination_city|class  |duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|0    |SpiceJet|SG-8709|Delhi      |Evening       |zero |Night        |Mumbai          |Economy|2.17    |1        |5953 |
|1    |SpiceJet|S