# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Francisco Delgado

**Professor**: Pablo Camarillo Ramirez

In [1]:
import findspark
findspark.init()

ModuleNotFoundError: No module named 'findspark'

In [None]:
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .appName("Examples on data sources (Files)") \
    .master("spark://c88eb4f6e4be:7077") \
    .config("spark.ui.port", "4040") \
    .getOrCreate()

sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [None]:
from pcamarillor.spark_utils import SparkUtils
airlines_schema_columns = [("index", "int"), 
     ("airline", "string"), 
     ("flight", "string"),
     ("source_city", "string"),
     ("departure_time", "string"),
     ("stops", "string"),
     ("arrival_time", "string"),
     ("destination_city", "string"),
     ("class", "string"),
     ("duration", "float"),
     ("days_left", "int"),
     ("price", "int")
     ]
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

In [None]:
df_airlines = spark.read \
                .option("header", "true") \
                .schema(airlines_schema) \
                .csv("/opt/spark/work-dir/data/Airline/")

df_airlines.show(n=5)

In [None]:
from pyspark.sql.functions import trim, col, count, isnull, when
print(f"number of records before cleaning: {df_airlines.count()}")
# Get number of null values for each column before cleaning 
df_airlines.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

# Perform data cleaning with trim (column by column)
airlines_clean = df_airlines \
        .dropDuplicates(["index"]) \
        .withColumn("airline", trim("airline")) \
        .withColumn("source_city", trim("source_city")) \
        .withColumn("destination_city", trim("destination_city")) \
        .filter(col("price").isNotNull())

# Simply using dropna()
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
airlines_clean.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
airlines_clean_v2.select([count(when(isnull(c[0]) | col(c[0]).isNull(), c[0])).alias(c[0]) for c in airlines_schema_columns]).show()

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, concat_ws, avg, min, max, count

# Crear sesión Spark
spark = SparkSession.builder.appName("Lab03_DataCleaning").getOrCreate()

# Cargar dataset (ajusta la ruta según donde lo tengas guardado)
df = spark.read.csv("airlines.csv", header=True, inferSchema=True)

# Vista inicial
df.show(5)
df.printSchema()


In [None]:
# Contar nulos antes
df.select([count(when(col(c).isNull(), c)).alias(c) for c in df.columns]).show()

# Eliminar columnas innecesarias (ejemplo: 'flight', 'additional_info')
df_clean = df.drop("flight", "additional_info")

# Contar nulos después
df_clean.select([count(when(col(c).isNull(), c)).alias(c) for c in df_clean.columns]).show()


In [None]:
# Normalizar stops
df_clean = df_clean.withColumn("stops", 
    when(col("stops") == "zero", 0)
    .when(col("stops") == "one", 1)
    .when(col("stops") == "two_or_more", 2)
    .otherwise(col("stops"))
)

# Crear route
df_clean = df_clean.withColumn("route", concat_ws(" → ", col("source_city"), col("destination_city")))

# Categorizar departure_time
df_clean = df_clean.withColumn("departure_cat",
    when(col("departure_time").isin("Early_Morning"), 0)
    .when(col("departure_time").isin("Morning"), 1)
    .when(col("departure_time").isin("Afternoon"), 2)
    .when(col("departure_time").isin("Evening"), 3)
    .when(col("departure_time").isin("Night"), 4)
)

# Categorizar arrival_time
df_clean = df_clean.withColumn("arrival_cat",
    when(col("arrival_time").isin("Early_Morning"), 0)
    .when(col("arrival_time").isin("Morning"), 1)
    .when(col("arrival_time").isin("Afternoon"), 2)
    .when(col("arrival_time").isin("Evening"), 3)
    .when(col("arrival_time").isin("Night"), 4)
)

# Columna is_expensive
df_clean = df_clean.withColumn("is_expensive", when(col("price") > 6000, True).otherwise(False))


In [None]:
# Promedio de precio por aerolínea
df_clean.groupBy("airline").agg(avg("price").alias("avg_price")).show()

# Duración promedio por ruta
df_clean.groupBy("route").agg(avg("duration").alias("avg_duration")).show()

# Precio min y max por aerolínea
df_clean.groupBy("airline").agg(min("price").alias("min_price"), max("price").alias("max_price")).show()

# Conteo de vuelos por categoría de horario de salida
df_clean.groupBy("departure_cat").count().show()
