# <center> <img src="../img/ITESOLogo.png" alt="ITESO" width="480" height="130"> </center>
# <center> **Departamento de Electrónica, Sistemas e Informática** </center>
---
## <center> **Big Data** </center>
---
### <center> **Autumn 2025** </center>
---
### <center> **Examples on Spark Data Sources (Files) and Transformations** </center>
---
**Profesor**: Pablo Camarillo Ramirez

# Find the PySpark Installation

In [1]:
# Locate the local Spark installation so that the `pyspark` imports in the
# following cells resolve; this must run before anything imports pyspark.
import findspark
findspark.init()

# Create SparkSession

In [2]:
from pyspark.sql import SparkSession

# Build (or reuse) the session used by every cell below.
# NOTE(review): the master URL embeds what looks like a container hostname;
# confirm it matches the running cluster before executing on another machine.
spark = (
    SparkSession.builder
    .master("spark://9d7a5c0179a1:7077")
    .appName("Examples on data sources (Files)")
    .config("spark.ui.port", "4040")
    .getOrCreate()
)

# Keep a handle on the context and silence everything below ERROR level.
sc = spark.sparkContext
sc.setLogLevel("ERROR")

Using Spark's default log4j profile: org/apache/spark/log4j2-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/09/19 15:24:00 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


## Define the schema

In [3]:
from pcamarillor.spark_utils import SparkUtils

# (column name, Spark type name) pairs describing the airline CSV files.
# The list is reused later in the notebook to iterate over the column names.
airlines_schema_columns = [
    ("index", "int"),
    ("airline", "string"),
    ("flight", "string"),
    ("source_city", "string"),
    ("departure_time", "string"),
    ("stops", "string"),
    ("arrival_time", "string"),
    ("destination_city", "string"),
    ("class", "string"),
    ("duration", "float"),
    ("days_left", "int"),
    ("price", "int"),
]

# Turn the pairs into a StructType; the bare last line displays it.
airlines_schema = SparkUtils.generate_schema(airlines_schema_columns)
airlines_schema

StructType([StructField('index', IntegerType(), True), StructField('airline', StringType(), True), StructField('flight', StringType(), True), StructField('source_city', StringType(), True), StructField('departure_time', StringType(), True), StructField('stops', StringType(), True), StructField('arrival_time', StringType(), True), StructField('destination_city', StringType(), True), StructField('class', StringType(), True), StructField('duration', FloatType(), True), StructField('days_left', IntegerType(), True), StructField('price', IntegerType(), True)])

## Load CSV

In [4]:
# Read every CSV file in the airline directory, treating the first line of
# each file as a header and applying the explicit schema instead of inferring.
df_airlines = spark.read.csv(
    "/opt/spark/work-dir/data/airline/",
    schema=airlines_schema,
    header=True,
)

df_airlines.show(n=5)

[Stage 0:>                                                          (0 + 1) / 1]

+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|index| airline| flight|source_city|departure_time|stops| arrival_time|destination_city|  class|duration|days_left|price|
+-----+--------+-------+-----------+--------------+-----+-------------+----------------+-------+--------+---------+-----+
|    0|SpiceJet|SG-8709|      Delhi|       Evening| zero|        Night|          Mumbai|Economy|    2.17|        1| 5953|
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5953|
|    2| AirAsia| I5-764|      Delhi| Early_Morning| zero|Early_Morning|          Mumbai|Economy|    2.17|        1| 5956|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|    Afternoon|          Mumbai|Economy|    2.25|        1| 5955|
|    4| Vistara| UK-963|      Delhi|       Morning| zero|      Morning|          Mumbai|Economy|    2.33|        1| 5955|
+-----+--------+-------+

                                                                                

## Data Cleaning

In [5]:
from pyspark.sql.functions import trim, col, count, isnull, when


def count_nulls_per_column(df, column_names):
    """Return a one-row DataFrame holding the number of nulls in each column.

    Args:
        df: DataFrame to inspect.
        column_names: names of the columns to check; each becomes one output
            column with the same name.

    Returns:
        A DataFrame with a single row of null counts.
    """
    # `when` without `otherwise` yields null for non-matching rows and `count`
    # ignores nulls, so each aggregate counts only the rows where the column
    # is null. A single isNull() check is enough.
    return df.select(
        [count(when(col(name).isNull(), name)).alias(name) for name in column_names]
    )


schema_column_names = [name for name, _ in airlines_schema_columns]

print(f"number of records before cleaning: {df_airlines.count()}")
# Null values per column before cleaning
count_nulls_per_column(df_airlines, schema_column_names).show()

# Variant 1: drop repeated `index` values, trim whitespace around the
# free-text columns (column by column) and discard rows without a price.
airlines_clean = (
    df_airlines
    .dropDuplicates(["index"])
    .withColumn("airline", trim("airline"))
    .withColumn("source_city", trim("source_city"))
    .withColumn("destination_city", trim("destination_city"))
    .filter(col("price").isNotNull())
)

# Variant 2: simply drop every row that contains a null in any column.
airlines_clean_v2 = df_airlines.dropna()

print(f"number of records after cleaning with trim: {airlines_clean.count()}")
count_nulls_per_column(airlines_clean, schema_column_names).show()

print(f"number of records after cleaning with dropna: {airlines_clean_v2.count()}")
count_nulls_per_column(airlines_clean_v2, schema_column_names).show()

number of records before cleaning: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with trim: 300153


                                                                                

+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

number of records after cleaning with dropna: 300153




+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|index|airline|flight|source_city|departure_time|stops|arrival_time|destination_city|class|duration|days_left|price|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+
|    0|      0|     0|          0|             0|    0|           0|               0|    0|       0|        0|    0|
+-----+-------+------+-----------+--------------+-----+------------+----------------+-----+--------+---------+-----+



                                                                                

## Create columns

## Normalize categorical values: map "zero" → 0, "one" → 1, etc. in stops

In [6]:
from pyspark.sql.functions import when, lit

# 1st transformation: numeric encoding of the textual `stops` column.
# NOTE(review): every value other than "zero"/"one" (including null) is
# encoded as 3 by the catch-all branch - confirm that this is intended.
airlines_t1 = airlines_clean.withColumn(
    "stops_numeric",
    when(airlines_clean["stops"] == "zero", lit(0))
    .when(airlines_clean["stops"] == "one", lit(1))
    .otherwise(lit(3)),
)

In [7]:
# Preview the first 15 rows, including the new stops_numeric column.
airlines_t1.show(n=15)



+-----+---------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+
|index|  airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|stops_numeric|
+-----+---------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+
|    1| SpiceJet|SG-8157|      Delhi| Early_Morning| zero|     Morning|          Mumbai|Economy|    2.33|        1| 5953|            0|
|    3|  Vistara| UK-995|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.25|        1| 5955|            0|
|    5|  Vistara| UK-945|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.33|        1| 5955|            0|
|    6|  Vistara| UK-927|      Delhi|       Morning| zero|     Morning|          Mumbai|Economy|    2.08|        1| 6060|            0|
|    9| GO_FIRST| G8-336|      Delhi|     Aftern

                                                                                

## Create a new column called route: "Delhi → Mumbai" from source city and destination city

In [8]:
from pyspark.sql.functions import concat_ws

# 2nd transformation: join origin and destination into a single route label.
airlines_t2 = airlines_t1.withColumn(
    "route",
    concat_ws(" → ", "source_city", "destination_city"),
)



In [9]:
# Preview the first 5 rows, including the new route column.
airlines_t2.show(n=5)



+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+--------------+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|stops_numeric|         route|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+--------------+
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|     Morning|          Mumbai|Economy|    2.33|        1| 5953|            0|Delhi → Mumbai|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.25|        1| 5955|            0|Delhi → Mumbai|
|    5| Vistara| UK-945|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.33|        1| 5955|            0|Delhi → Mumbai|
|    6| Vistara| UK-927|      Delhi|       Morning| zero|     Morning|          Mumbai|Economy|    2

                                                                                

## Map departure time and arrival time to time-of-day categories (Morning, Afternoon, etc.), then encode them as numbers (0 = Early Morning, 1 = Morning, etc.).

In [10]:

from pyspark.sql.functions import col, lit, when

# Numeric code for each time-of-day label. "Late_Night" is present in the
# departure_time data (see the count per departure time at the end of the
# notebook) and previously fell through to the -1 "unknown" code.
# NOTE(review): assumes arrival_time uses the same set of labels - confirm.
TIME_OF_DAY_CODES = {
    "Early_Morning": 0,
    "Morning": 1,
    "Afternoon": 2,
    "Evening": 3,
    "Night": 4,
    "Late_Night": 5,
}


def encode_time_of_day(column_name):
    """Build a Column that maps a time-of-day label column to its numeric code.

    Args:
        column_name: name of the string column holding the labels.

    Returns:
        A Column expression yielding the code from TIME_OF_DAY_CODES, or -1
        for any value that is not a known label (including null).
    """
    labels = col(column_name)
    encoded = None
    for label, code in TIME_OF_DAY_CODES.items():
        matches = labels == label
        # The first branch starts the CASE expression, later ones extend it.
        encoded = when(matches, lit(code)) if encoded is None else encoded.when(matches, lit(code))
    return encoded.otherwise(lit(-1))


# 3rd transformation: encode both time columns with the same mapping.
airlines_t3 = (
    airlines_t2
    .withColumn("departure_time_numeric", encode_time_of_day("departure_time"))
    .withColumn("arrival_time_numeric", encode_time_of_day("arrival_time"))
)


In [11]:
# Preview the first 5 rows, including the two encoded time columns.
airlines_t3.show(n=5)



+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+--------------+----------------------+--------------------+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|stops_numeric|         route|departure_time_numeric|arrival_time_numeric|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+--------------+----------------------+--------------------+
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|     Morning|          Mumbai|Economy|    2.33|        1| 5953|            0|Delhi → Mumbai|                     0|                   1|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.25|        1| 5955|            0|Delhi → Mumbai|                     1|                   2|
|    5| Vistara| UK-945|      

                                                                                

## Add a new column `is_expensive`: `when(price > 6000, True).otherwise(False)`.

In [12]:
# 4th transformation: boolean flag that is True only when price exceeds 6000.
airlines_t4 = airlines_t3.withColumn(
    "is_expensive",
    when(airlines_t3["price"] > lit(6000), True).otherwise(False),
)

In [13]:
# Preview the first 5 rows, including the new is_expensive flag.
airlines_t4.show(n=5)



+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+--------------+----------------------+--------------------+------------+
|index| airline| flight|source_city|departure_time|stops|arrival_time|destination_city|  class|duration|days_left|price|stops_numeric|         route|departure_time_numeric|arrival_time_numeric|is_expensive|
+-----+--------+-------+-----------+--------------+-----+------------+----------------+-------+--------+---------+-----+-------------+--------------+----------------------+--------------------+------------+
|    1|SpiceJet|SG-8157|      Delhi| Early_Morning| zero|     Morning|          Mumbai|Economy|    2.33|        1| 5953|            0|Delhi → Mumbai|                     0|                   1|       false|
|    3| Vistara| UK-995|      Delhi|       Morning| zero|   Afternoon|          Mumbai|Economy|    2.25|        1| 5955|            0|Delhi → Mumbai|                     1|

                                                                                

### • Get the average price per airline.


In [14]:
from pyspark.sql.functions import avg

# Mean ticket price for each airline; the result column is named avg(price).
airlines_average = (
    airlines_t4
    .groupBy("airline")
    .agg(avg("price"))
)

airlines_average.show()



+---------+------------------+
|  airline|        avg(price)|
+---------+------------------+
|   Indigo| 5324.216303339517|
| SpiceJet| 6179.278881367218|
|Air_India| 23507.01911190229|
|  AirAsia|4091.0727419555224|
| GO_FIRST| 5652.007595045959|
|  Vistara| 30396.53630170735|
+---------+------------------+



                                                                                


### • Average duration per route.


In [15]:
# Mean flight duration for each route; the result column is named avg(duration).
duration_average = (
    airlines_t4
    .groupBy("route")
    .agg(avg("duration"))
)

duration_average.show()



+--------------------+------------------+
|               route|     avg(duration)|
+--------------------+------------------+
|Hyderabad → Banga...| 12.09331678643705|
|    Mumbai → Kolkata|12.836848115489666|
|    Mumbai → Chennai|12.665900287564627|
|  Mumbai → Hyderabad|13.263310412247066|
|  Mumbai → Bangalore|11.612022516178817|
|   Bangalore → Delhi|  9.77995566082195|
| Kolkata → Bangalore| 13.79294687524098|
|   Hyderabad → Delhi|10.829816602522587|
| Bangalore → Chennai|14.480207509137166|
|    Chennai → Mumbai|12.374656244132625|
|  Bangalore → Mumbai| 10.90507225639642|
| Chennai → Bangalore|13.952593563812163|
|      Mumbai → Delhi|  9.81805726844943|
|  Hyderabad → Mumbai|11.962923295795918|
|   Chennai → Kolkata|14.515774035955694|
|   Kolkata → Chennai|14.774181563782903|
| Kolkata → Hyderabad|13.853107514948396|
|   Delhi → Bangalore| 10.35412503844018|
|      Delhi → Mumbai|10.367774213738123|
| Hyderabad → Chennai|13.293238468912078|
+--------------------+------------

                                                                                


### • Minimum and maximum price per airline.


In [16]:
# Use the namespaced functions module: importing `max` and `min` by name would
# rebind the Python builtins for every cell executed afterwards.
from pyspark.sql import functions as F

# Highest and lowest ticket price observed for each airline.
min_max_airline = airlines_t4.groupBy("airline").agg(
    F.max("price").alias("precio_maximo"),
    F.min("price").alias("precio_minimo"),
)

min_max_airline.show()



+---------+-------------+-------------+
|  airline|precio_maximo|precio_minimo|
+---------+-------------+-------------+
|   Indigo|        31952|         1105|
| SpiceJet|        34158|         1106|
|Air_India|        90970|         1526|
|  AirAsia|        31917|         1105|
| GO_FIRST|        32803|         1105|
|  Vistara|       123071|         1714|
+---------+-------------+-------------+



                                                                                


### • Count flights by departure time category


In [17]:

# Number of flights in each departure-time category.
count_flights_per_departure = (
    airlines_t4
    .groupBy("departure_time")
    .count()
)

count_flights_per_departure.show()



+--------------+-----+
|departure_time|count|
+--------------+-----+
|       Evening|65102|
|       Morning|71146|
|    Late_Night| 1306|
|     Afternoon|47794|
| Early_Morning|66790|
|         Night|48015|
+--------------+-----+



                                                                                

In [18]:
# Release the cluster resources. Stopping through the session also stops the
# underlying SparkContext (`sc`).
spark.stop()