# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Computer Systems Engineering  </center>
---
### <center> Big Data Processing </center>
---
#### <center> **Autumn 2025** </center>
---

**Lab 03**: Data Cleaning and Transformation Pipeline

**Date**: September 18th 2025

**Student Name**: Juan Carlos Alonso

**Professor**: Pablo Camarillo Ramirez

In [1]:
# Initialise findspark before the pyspark imports in the cells below.
# NOTE(review): presumably this makes the local Spark installation importable
# from this kernel — confirm against the findspark documentation.
import findspark
findspark.init()

: 

: 

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the session for this lab: application name, cluster master
# URL and UI port are set on the builder before getOrCreate() is called.
spark = (
    SparkSession.builder
    .appName("Lab 3: Juan ALonso")
    .master("spark://5fded284cb17:7077")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the SparkContext and restrict its log level to ERROR.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

In [None]:
from pyspark.sql import SparkSession, functions as F
from juanalonso.spark_utils import SparkUtils

In [None]:
def drop_unnecessary(df):
    """Drop the ``flight`` and ``class`` columns from ``df``.

    Each column is handled independently, so a frame that holds only one of
    the two still has that one removed.

    Args:
        df: DataFrame exposing ``columns`` and ``drop(*names)``.

    Returns:
        The result of dropping whichever of the two columns are present, or
        ``df`` itself when it contains neither.
    """
    present = [name for name in ("flight", "class") if name in df.columns]
    return df.drop(*present) if present else df

def count_nulls(df):
    """Select, for every column of ``df``, the number of rows where it is null.

    Each output column keeps the name of the input column it describes.
    """
    null_counters = []
    for name in df.columns:
        # Only rows where the column is null receive a value from when(),
        # and that value is what count() tallies.
        flagged = F.when(F.col(name).isNull(), name)
        null_counters.append(F.count(flagged).alias(name))
    return df.select(null_counters)

def normalize_stops(df):
    """Replace the textual ``stops`` column with an integer stop count.

    The lower-cased ``stops`` label is looked up in a literal map column
    built from the table below and the result is cast to ``int``.

    Args:
        df: DataFrame containing a ``stops`` column of textual labels.

    Returns:
        ``df`` with ``stops`` overwritten by the mapped integer value.
        NOTE(review): labels absent from the table presumably come back as
        null — confirm against the Spark map-lookup semantics in use.
    """
    stop_counts = {
        "zero": 0, "non-stop": 0,
        "one": 1,
        "two_or_more": 2, "2_or_more": 2, "two or more": 2,
    }
    # create_map expects a flat key, value, key, value, ... sequence of columns.
    flat_entries = [F.lit(item) for pair in stop_counts.items() for item in pair]
    lookup = F.create_map(flat_entries)
    label = F.lower(F.col("stops"))
    # Index with [] rather than getItem(): passing a Column as the key to
    # getItem() has been deprecated since Spark 3.0.
    return df.withColumn("stops", lookup[label].cast("int"))

def add_route(df):
    """Add a ``route`` column joining ``source_city`` and ``destination_city`` with " → "."""
    origin = F.col("source_city")
    destination = F.col("destination_city")
    route = F.concat_ws(" → ", origin, destination)
    return df.withColumn("route", route)

def encode_times(df):
    """Add numeric ids for the ``departure_time`` and ``arrival_time`` labels.

    Every label in ``order`` is mapped (case-insensitively) to its position in
    that list, and the result is stored in ``departure_time_id`` and
    ``arrival_time_id`` respectively.

    Args:
        df: DataFrame containing ``departure_time`` and ``arrival_time``.

    Returns:
        ``df`` with the two ``*_id`` columns appended.
    """
    order = ["Early_Morning", "Morning", "Afternoon", "Evening", "Night", "Late_Night"]

    # create_map expects a flat key, value, key, value, ... sequence of columns;
    # keys are lower-cased so the lookup below can be case-insensitive.
    flat_entries = []
    for position, label in enumerate(order):
        flat_entries.append(F.lit(label.lower()))
        flat_entries.append(F.lit(position))
    lookup = F.create_map(flat_entries)

    # Index with [] rather than getItem(): passing a Column as the key to
    # getItem() has been deprecated since Spark 3.0.
    df = df.withColumn("departure_time_id", lookup[F.lower(F.col("departure_time"))])
    df = df.withColumn("arrival_time_id", lookup[F.lower(F.col("arrival_time"))])
    return df

def add_is_expensive(df, threshold=6000):
    """Add a boolean ``is_expensive`` column flagging rows priced above ``threshold``.

    Args:
        df: DataFrame containing a ``price`` column.
        threshold: Price strictly above which a row is flagged. Defaults to
            6000, the cut-off this function previously hard-coded.

    Returns:
        ``df`` with the ``is_expensive`` column appended.
    """
    return df.withColumn("is_expensive", F.col("price") > threshold)

def avg_price_per_airline(df):
    """Group ``df`` by ``airline`` and compute the mean ``price`` as ``avg_price``."""
    by_airline = df.groupBy("airline")
    mean_price = F.avg("price").alias("avg_price")
    return by_airline.agg(mean_price)

def avg_duration_per_route(df):
    """Group ``df`` by ``route`` and compute the mean ``duration`` as ``avg_duration``."""
    by_route = df.groupBy("route")
    mean_duration = F.avg("duration").alias("avg_duration")
    return by_route.agg(mean_duration)

def min_max_price_per_airline(df):
    """Group ``df`` by ``airline`` and report ``min_price`` and ``max_price``."""
    price_bounds = [
        F.min("price").alias("min_price"),
        F.max("price").alias("max_price"),
    ]
    by_airline = df.groupBy("airline")
    return by_airline.agg(*price_bounds)

def count_by_departure(df):
    """Count the rows of ``df`` for each ``departure_time`` value."""
    by_departure = df.groupBy("departure_time")
    return by_departure.count()


In [None]:
import pandas as pd

# The CSV is parsed with pandas first; the resulting frame is converted into
# a Spark DataFrame further down.
airlines_data = pd.read_csv('../../data/airlines_flights_data.csv')

# (column name, type name) pairs handed to SparkUtils.generate_schema.
airline_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "double"),
    ("days_left", "int"),
    ("price", "int"),
]
airlines_schema = SparkUtils.generate_schema(airline_columns)

# Build the Spark DataFrame from the pandas frame using the explicit schema.
df = spark.createDataFrame(airlines_data, schema=airlines_schema)
