# <center> <img src="../../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> Program: _Procesamiento de Datos Masivos_  </center>
---
### <center> **Autumn 2025** </center>
---

**Lab 03**:

**Date**: September 20, 2025

**Student**: Luis Alberto González Escamilla

**Professor**: Pablo Camarillo Ramirez

In [1]:
# Bootstrap step: findspark.init() is expected to make the Spark installation's
# `pyspark` package importable in this kernel, so it must run before the
# `pyspark` import in the next cell.
import findspark
findspark.init()

In [2]:
import os

from pyspark.sql import SparkSession

# The master URL was hard-coded to what looks like a generated container
# hostname, which stops resolving as soon as the cluster is recreated.
# Allow overriding it through the environment; the original value stays as
# the default so the notebook behaves the same when the variable is unset.
SPARK_MASTER_URL = os.environ.get("SPARK_MASTER_URL", "spark://2abf04f86c87:7077")

# Build (or reuse) the session for this notebook.
spark = (
    SparkSession.builder
    .appName("Examples on data sources (Files)")
    .master(SPARK_MASTER_URL)
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the underlying SparkContext and only log errors.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/22 08:51:42 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
25/09/22 08:51:43 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
from luis_gonzalez.spark_utils import SparkUtils

# Columns that are read as plain strings, in schema order.
_string_columns = [
    "airline",
    "flight",
    "source_city",
    "departure_time",
    "stops",
    "arrival_time",
    "destination_city",
    "class",
]

# (column name, type name) pairs handed to the project's schema helper.
airlines_schema_columns = (
    [("index", "int")]
    + [(column_name, "string") for column_name in _string_columns]
    + [("duration", "float"), ("days_left", "int"), ("price", "int")]
)

airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

In [None]:
# Load the airline CSV data with the explicit schema; the first row of each
# file is treated as a header.
airline_reader = spark.read.option("header", "true").schema(airlines_schema)
df_airlines = airline_reader.csv("/opt/spark/work-dir/data/airline/")

# Preview the first five rows.
df_airlines.show(5)

[Stage 0:>                                                          (0 + 0) / 1]

In [None]:
# NOTE: `min` and `max` imported here shadow the Python builtins for the rest
# of the notebook. Later cells use these names, so the import list is kept as is.
from pyspark.sql.functions import trim, col, count, isnull, when, avg, min, max, lit, concat


def show_null_counts(df):
    """Show a single row with the number of null values per schema column.

    Args:
        df: DataFrame that contains every column named in
            ``airlines_schema_columns``.
    """
    # `when` without `otherwise` yields null for non-matching rows, and
    # `count` skips nulls, so each cell ends up counting only the null rows.
    df.select([
        count(when(col(name).isNull(), name)).alias(name)
        for name, _ in airlines_schema_columns
    ]).show()


print(f"number of records before cleaning: {df_airlines.count()}")
# Get number of null values for each column before cleaning
show_null_counts(df_airlines)

# Perform data cleaning with trim (column by column): drop repeated `index`
# values, strip whitespace from the text columns and discard rows with no price.
airlines_clean = (
    df_airlines
    .dropDuplicates(["index"])
    .withColumn("airline", trim("airline"))
    .withColumn("source_city", trim("source_city"))
    .withColumn("destination_city", trim("destination_city"))
    .filter(col("price").isNotNull())
)

# Simply using dropna(): discard every row that has a null in any column.
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
show_null_counts(airlines_clean)

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
show_null_counts(airlines_clean_v2)

### #1. Stops transformation

In [None]:
# Translate the textual `stops` category into a number of stops.
stops_label = col("stops")
stops_as_number = (
    when(stops_label == "zero", lit(0))
    .when(stops_label == "one", lit(1))
    .when(stops_label == "two_or_more", lit(2))
    .when(stops_label == "two", lit(2))
    # NOTE(review): any other label is also encoded as 0 (same as "zero");
    # confirm that unknown values should not be kept apart.
    .otherwise(lit(0))
)

airlines_t1 = airlines_clean_v2.withColumn("stops_numeric", stops_as_number)

airlines_t1.show()

### #2. Create route column

In [None]:
# Build a "<source> → <destination>" label for every flight.
route_label = concat(
    col("source_city"),
    lit(" → "),
    col("destination_city"),
)
airlines_t2 = airlines_t1.withColumn("route", route_label)

airlines_t2.show()

### #3. Transform time columns to numerical

In [None]:
# Numeric code assigned to each time-of-day label.
time_mapping = {
    "Early_Morning": 0,
    "Morning": 1,
    "Afternoon": 2,
    "Evening": 3,
    "Night": 4
}


def encode_time_of_day(column_name):
    """Build a Column expression that encodes a time-of-day label as a number.

    The expression is derived from ``time_mapping`` so the label/code pairs
    live in one place instead of being repeated for every column.

    Args:
        column_name: Name of the string column holding the time-of-day label.

    Returns:
        A Column with the mapped code, or 0 for labels missing from the mapping.
    """
    encoded = None
    for label, code in time_mapping.items():
        is_label = col(column_name) == label
        # The first pair starts the CASE WHEN chain, the rest extend it.
        encoded = (
            when(is_label, lit(code))
            if encoded is None
            else encoded.when(is_label, lit(code))
        )
    # NOTE(review): labels absent from `time_mapping` fall back to 0, the same
    # code as "Early_Morning"; confirm that this is intended.
    return encoded.otherwise(lit(0))


airlines_t3 = (
    airlines_t2
    .withColumn("departure_time_numeric", encode_time_of_day("departure_time"))
    .withColumn("arrival_time_numeric", encode_time_of_day("arrival_time"))
)

airlines_t3.show()

### #4. Add is_expensive column

In [None]:
# Flights priced strictly above this value are flagged as expensive.
expensive_price_threshold = 6000

# when/otherwise (rather than the bare comparison) so that a null price
# produces False instead of null.
is_expensive_flag = (
    when(col("price") > expensive_price_threshold, lit(True))
    .otherwise(lit(False))
)
airlines_final = airlines_t3.withColumn("is_expensive", is_expensive_flag)
airlines_final.show()

## **Aggregations**

### #1. Average price per airline

In [None]:
# Mean ticket price for each airline, most expensive airline first.
avg_price_per_airline = (
    airlines_final
    .groupBy("airline")
    .agg(avg("price").alias("avg_price"))
)
avg_price_per_airline.orderBy(col("avg_price").desc()).show()

### #2. Average duration per route

In [None]:
# Mean flight duration for each route, longest route first.
avg_duration_per_route = (
    airlines_final
    .groupBy("route")
    .agg(avg("duration").alias("avg_duration"))
)
avg_duration_per_route.orderBy(col("avg_duration").desc()).show()

### #3. Minimum and maximum price per airline

In [None]:
# Cheapest and most expensive ticket per airline. `min` / `max` here are the
# Spark SQL aggregate functions imported earlier, not the Python builtins.
lowest_price = min("price").alias("min_price")
highest_price = max("price").alias("max_price")

min_max_price_per_airline = airlines_final.groupBy("airline").agg(lowest_price, highest_price)
min_max_price_per_airline.orderBy(col("min_price").asc()).show()

### #4. Count flights by departure_time category

In [None]:
# Number of flights in each departure_time category, busiest category first.
flights_by_departure = (
    airlines_final
    .groupBy("departure_time")
    .agg(count("*").alias("flight_count"))
)
flights_by_departure.orderBy(col("flight_count").desc()).show()

In [None]:
# Stop through the session rather than the bare context: this stops the
# underlying SparkContext too and also clears the registered session.
spark.stop()