# Project 3: Big graphs

The objective of this project is to use Spark’s APIs to analyze the flight interconnected data to understand the popularity of the airports and flight patterns.

## Task 1: Data Ingestion and Preparation

### Session Setup

In [1]:
import pyspark

# Prepare the Spark builder
spark = pyspark.sql.SparkSession.builder.appName("Project_4") \
    .config("spark.driver.memory", "6g") \
    .config("spark.driver.maxResultSize", "2g") \
    .config("spark.executor.memory", "6g") \
    .getOrCreate()

spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")
spark.conf.set("spark.sql.shuffle.partitions", 8)
spark.conf.set("spark.sql.repl.eagerEval.truncate", 500)

In [2]:
import pyspark.sql.functions as F
from pyspark.sql.types import *

### Data Ingestion

In [3]:
schema = StructType([
    StructField("FL_DATE", DateType(), True),
    StructField("OP_CARRIER", StringType(), True),
    StructField("OP_CARRIER_FL_NUM", IntegerType(), True),
    StructField("ORIGIN", StringType(), True),
    StructField("DEST", StringType(), True),
    StructField("CRS_DEP_TIME", IntegerType(), True),
    StructField("DEP_TIME", DoubleType(), True),
    StructField("DEP_DELAY", DoubleType(), True),
    StructField("TAXI_OUT", DoubleType(), True),
    StructField("WHEELS_OFF", DoubleType(), True),
    StructField("WHEELS_ON", DoubleType(), True),
    StructField("TAXI_IN", DoubleType(), True),
    StructField("CRS_ARR_TIME", IntegerType(), True),
    StructField("ARR_TIME", DoubleType(), True),
    StructField("ARR_DELAY", DoubleType(), True),
    StructField("CANCELLED", DoubleType(), True),
    StructField("CANCELLATION_CODE", StringType(), True),
    StructField("DIVERTED", DoubleType(), True),
    StructField("CRS_ELAPSED_TIME", DoubleType(), True),
    StructField("ACTUAL_ELAPSED_TIME", DoubleType(), True),
    StructField("AIR_TIME", DoubleType(), True),
    StructField("DISTANCE", DoubleType(), True),
    StructField("CARRIER_DELAY", DoubleType(), True),
    StructField("WEATHER_DELAY", DoubleType(), True),
    StructField("NAS_DELAY", DoubleType(), True),
    StructField("SECURITY_DELAY", DoubleType(), True),
    StructField("LATE_AIRCRAFT_DELAY", DoubleType(), True),
    StructField("Unnamed: 27", StringType(), True)
])

# Reading in the datasets
flight_df = spark.read.csv("input/2009.csv", header=True, schema=schema)
test_df = spark.read.csv("input/2010.csv", header=True, schema=schema)

flight_df.cache()
test_df.cache()

DataFrame[FL_DATE: date, OP_CARRIER: string, OP_CARRIER_FL_NUM: int, ORIGIN: string, DEST: string, CRS_DEP_TIME: int, DEP_TIME: double, DEP_DELAY: double, TAXI_OUT: double, WHEELS_OFF: double, WHEELS_ON: double, TAXI_IN: double, CRS_ARR_TIME: int, ARR_TIME: double, ARR_DELAY: double, CANCELLED: double, CANCELLATION_CODE: string, DIVERTED: double, CRS_ELAPSED_TIME: double, ACTUAL_ELAPSED_TIME: double, AIR_TIME: double, DISTANCE: double, CARRIER_DELAY: double, WEATHER_DELAY: double, NAS_DELAY: double, SECURITY_DELAY: double, LATE_AIRCRAFT_DELAY: double, Unnamed: 27: string]

### Partitioned Parquet

TBA

## Task 2: Cleaning and Preprocessing 

### Renaming columns for consistency

In [4]:
renamed_columns = [
    "Date", "UniqueCarrier", "FlightNumber", "Origin", "Destination",
    "CRSDepTime", "DepartureTime", "DepartureDelay", "TaxiOut", "WheelsOff",
    "WheelsOn", "TaxiIn", "CRSArrivalTime", "ArrivalTime", "ArrivalDelay",
    "Cancelled", "CancellationCode", "Diverted", "CRSElapsedTime",
    "ActualElapsedTime", "AirTime", "Distance", "CarrierDelay",
    "WeatherDelay", "NASDelay", "SecurityDelay", "LateAircraftDelay",
    "UnusedColumn"
]

flight_df = flight_df.toDF(*renamed_columns)
test_df = test_df.toDF(*renamed_columns)

### Creating day of week and month columns

In [5]:
flight_df = flight_df.withColumn("DayofWeek", F.dayofweek("Date")) \
                     .withColumn("Month", F.month("Date"))

test_df = test_df.withColumn("DayofWeek", F.dayofweek("Date")) \
                     .withColumn("Month", F.month("Date"))

### Handindling empty values

As the goal of the model is to predict flight cancellation, only attributes that relate to events which have happened before the departure/ cancellation can be used for predictions. Therefore, columns such as WheelsOff, ArrivalTime etc. are removed from the training data.

In [6]:
# Empty values in dataset
flight_df.select([
    F.count(F.when(F.col(c).isNull() | (F.isnan(c) if dict(flight_df.dtypes)[c] in ('double', 'float') else F.lit(False)), c)).alias(c)
    for c in flight_df.columns
]).show()

test_df.select([
    F.count(F.when(F.col(c).isNull() | (F.isnan(c) if dict(test_df.dtypes)[c] in ('double', 'float') else F.lit(False)), c)).alias(c)
    for c in test_df.columns
]).show()

+----+-------------+------------+------+-----------+----------+-------------+--------------+-------+---------+--------+------+--------------+-----------+------------+---------+----------------+--------+--------------+-----------------+-------+--------+------------+------------+--------+-------------+-----------------+------------+---------+-----+
|Date|UniqueCarrier|FlightNumber|Origin|Destination|CRSDepTime|DepartureTime|DepartureDelay|TaxiOut|WheelsOff|WheelsOn|TaxiIn|CRSArrivalTime|ArrivalTime|ArrivalDelay|Cancelled|CancellationCode|Diverted|CRSElapsedTime|ActualElapsedTime|AirTime|Distance|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|UnusedColumn|DayofWeek|Month|
+----+-------------+------------+------+-----------+----------+-------------+--------------+-------+---------+--------+------+--------------+-----------+------------+---------+----------------+--------+--------------+-----------------+-------+--------+------------+------------+--------+-------------+-

In [7]:
flight_df_c = flight_df.filter(F.col("Cancelled") == 1)

print("Cancelled flights:", flight_df_c.count())

flight_df_c.select([
    F.count(F.when(F.col(c).isNull() | (F.isnan(c) if dict(flight_df.dtypes)[c] in ('double', 'float') else F.lit(False)), c)).alias(c)
    for c in flight_df_c.columns
]).show()


test_df_c = test_df.filter(F.col("Cancelled") == 1)

print("Cancelled flights:", test_df_c.count())

test_df_c.select([
    F.count(F.when(F.col(c).isNull() | (F.isnan(c) if dict(test_df.dtypes)[c] in ('double', 'float') else F.lit(False)), c)).alias(c)
    for c in test_df_c.columns
]).show()

Cancelled flights: 87038
+----+-------------+------------+------+-----------+----------+-------------+--------------+-------+---------+--------+------+--------------+-----------+------------+---------+----------------+--------+--------------+-----------------+-------+--------+------------+------------+--------+-------------+-----------------+------------+---------+-----+
|Date|UniqueCarrier|FlightNumber|Origin|Destination|CRSDepTime|DepartureTime|DepartureDelay|TaxiOut|WheelsOff|WheelsOn|TaxiIn|CRSArrivalTime|ArrivalTime|ArrivalDelay|Cancelled|CancellationCode|Diverted|CRSElapsedTime|ActualElapsedTime|AirTime|Distance|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|UnusedColumn|DayofWeek|Month|
+----+-------------+------------+------+-----------+----------+-------------+--------------+-------+---------+--------+------+--------------+-----------+------------+---------+----------------+--------+--------------+-----------------+-------+--------+------------+------------

In [8]:
# Removing columns which have only NULL values for all cancelled fligths (events which occur after successful take-off)
# Including these would give an unfair advantage to the model
flight_df = flight_df.drop("UnusedColumn", "LateAircraftDelay", "SecurityDelay", "NASDelay", "WeatherDelay", "CarrierDelay", "AirTime", "ActualElapsedTime", "ArrivalDelay", "ArrivalTime", "TaxiIn", "WheelsOn", "CancellationCode")

test_df =   test_df  .drop("UnusedColumn", "LateAircraftDelay", "SecurityDelay", "NASDelay", "WeatherDelay", "CarrierDelay", "AirTime", "ActualElapsedTime", "ArrivalDelay", "ArrivalTime", "TaxiIn", "WheelsOn", "CancellationCode")

In [9]:
flight_df.describe().toPandas()

test_df.describe().toPandas()

Unnamed: 0,summary,UniqueCarrier,FlightNumber,Origin,Destination,CRSDepTime,DepartureTime,DepartureDelay,TaxiOut,WheelsOff,CRSArrivalTime,Cancelled,Diverted,CRSElapsedTime,Distance,DayofWeek,Month
0,count,6450117,6450117.0,6450117,6450117,6450117.0,6341340.0,6341340.0,6338168.0,6338168.0,6450117.0,6450117.0,6450117.0,6450100.0,6450117.0,6450117.0,6450117.0
1,mean,,2433.45956034596,,,1323.0126616307891,1330.040559250884,8.261324893476774,15.61777093948914,1353.4626994740436,1500.009710056422,0.0175585962239134,0.0023990262502215,130.3933080107285,734.9153317684005,3.955937698494461,6.5456626600726775
2,stddev,,2082.356979268585,,,461.4298704225147,473.0878523556648,31.860019423969987,9.517364200434738,474.409697593094,475.4615038390932,0.1313403768723643,0.0489210720887771,70.26835129543119,566.3835288820059,1.9605693011586849,3.4148049664755558
3,min,9E,1.0,ABE,ABE,3.0,1.0,-206.0,1.0,1.0,1.0,0.0,0.0,1.0,31.0,1.0,1.0
4,max,YV,8942.0,YUM,YUM,2359.0,2400.0,1626.0,278.0,2400.0,2359.0,1.0,1.0,655.0,4962.0,7.0,12.0


In [10]:
# Replacing missing values with dummy values as mean/median may be too misleading for the model
flight_df = flight_df.fillna({
    "DepartureTime": -1,
    "DepartureDelay": -999,
    "TaxiOut": -1,
    "WheelsOff": -1
})


test_df = test_df.fillna({
    "DepartureTime": -1,
    "DepartureDelay": -999,
    "TaxiOut": -1,
    "WheelsOff": -1
})

### Filtering out diverted flights

In [11]:
flight_df = flight_df.filter(F.col("Diverted") != 1)
flight_df = flight_df.drop("Diverted")


test_df = test_df.filter(F.col("Diverted") != 1)
test_df = test_df.drop("Diverted")

### Result of data cleaning and preprocessing

In [12]:
flight_df.describe().toPandas()

test_df.describe().toPandas()

Unnamed: 0,summary,UniqueCarrier,FlightNumber,Origin,Destination,CRSDepTime,DepartureTime,DepartureDelay,TaxiOut,WheelsOff,CRSArrivalTime,Cancelled,CRSElapsedTime,Distance,DayofWeek,Month
0,count,6434643,6434643.0,6434643,6434643,6434643.0,6434643.0,6434643.0,6434643.0,6434643.0,6434643.0,6434643.0,6434635.0,6434643.0,6434643.0,6434643.0
1,mean,,2433.2922154344847,,,1322.9812953104004,1307.471532142498,-8.794337929858735,15.323183586097938,1329.8237198240836,1499.8885931977889,0.0176008210556514,130.34318667647815,734.5328631285372,3.9560480666915008,6.546284075122738
2,stddev,,2082.097773831356,,,461.48885097670336,499.50716281426935,133.62067734874591,9.671766408220584,502.536249300171,475.5088218056595,0.1314953795423883,70.24118534540357,566.1727283232726,1.9605737402995127,3.415030616508686
3,min,9E,1.0,ABE,ABE,3.0,-1.0,-999.0,-1.0,-1.0,1.0,0.0,1.0,31.0,1.0,1.0
4,max,YV,8942.0,YUM,YUM,2359.0,2400.0,1626.0,278.0,2400.0,2359.0,1.0,655.0,4962.0,7.0,12.0


## Task 4: Feature Engineering

### Processing categorical features with StringIndexer + OneHotEncoder

In [13]:
from pyspark.ml.feature import StringIndexer, OneHotEncoder

categorical_cols = ['Origin', 'Destination', 'CRSDepTime', 'DayofWeek', 'Month'] # FlightNumber

indexers_and_encoders = []
print("Defining stages for categorical features...")
for cat_col in categorical_cols:
    index_col = cat_col + "_Index"
    vec_col = cat_col + "_Vec"

    # Indexer Stage
    string_indexer = StringIndexer(inputCol=cat_col, outputCol=index_col, handleInvalid='keep')
    indexers_and_encoders.append(string_indexer)
    print(f"  Added StringIndexer ({cat_col} -> {index_col})")

    # Encoder Stage
    encoder = OneHotEncoder(inputCols=[index_col], outputCols=[vec_col])
    indexers_and_encoders.append(encoder)
    print(f"  Added OneHotEncoder ({index_col} -> {vec_col})")

# Keep track of the output vector column names for the assembler
categorical_vec_cols = [col + "_Vec" for col in categorical_cols]

Defining stages for categorical features...
  Added StringIndexer (Origin -> Origin_Index)
  Added OneHotEncoder (Origin_Index -> Origin_Vec)
  Added StringIndexer (Destination -> Destination_Index)
  Added OneHotEncoder (Destination_Index -> Destination_Vec)
  Added StringIndexer (CRSDepTime -> CRSDepTime_Index)
  Added OneHotEncoder (CRSDepTime_Index -> CRSDepTime_Vec)
  Added StringIndexer (DayofWeek -> DayofWeek_Index)
  Added OneHotEncoder (DayofWeek_Index -> DayofWeek_Vec)
  Added StringIndexer (Month -> Month_Index)
  Added OneHotEncoder (Month_Index -> Month_Vec)


### Label Indexing

In [14]:
label_col = 'Cancelled'
label_output_col = "label"

label_indexer = StringIndexer(inputCol=label_col, outputCol=label_output_col)
print(f"Added StringIndexer for label ({label_col} -> {label_output_col})")

Added StringIndexer for label (Cancelled -> label)


### Combining features with VectorAssembler

In [15]:
from pyspark.ml.feature import VectorAssembler

numerical_cols = ['CRSArrivalTime', 'CRSElapsedTime', 'Distance'] # 'CRSArrivalTime', 'CRSElapsedTime', 'TaxiOut', 'WheelsOff', 'DepartureTime'

feature_output_col = "features"
assembler_inputs = categorical_vec_cols + numerical_cols

vector_assembler = VectorAssembler(inputCols=assembler_inputs, outputCol=feature_output_col, handleInvalid='skip')
print(f"Added VectorAssembler to create '{feature_output_col}' from {len(assembler_inputs)} columns")

Added VectorAssembler to create 'features' from 8 columns


### Preprocessing Pipeline

In [16]:
from pyspark.ml import Pipeline

# Combine all stages: categorical processing, label indexing, feature assembling
all_stages = indexers_and_encoders + [label_indexer, vector_assembler]
preprocessing_pipeline = Pipeline(stages=all_stages)

print(f"Total stages in preprocessing pipeline: {len(all_stages)}")

print("Fitting preprocessing pipeline...")
# Fit the preprocessing pipeline to the data
preprocessing_model = preprocessing_pipeline.fit(flight_df)
print("Pipeline fitting complete.")

print("Transforming data with fitted pipeline...")
# Transform the data
processed_df = preprocessing_model.transform(flight_df) 
print("Data transformation complete.")

# Select only the relevant columns for modeling: 'label' and 'features'
model_input_df = processed_df.select(label_output_col, feature_output_col)
model_input_df.cache() # Cache the final data for modeling

print("Schema of data ready for modeling:")
model_input_df.printSchema()

# Show a sample row with the final 'label' and 'features' columns
print("\nSample row for modeling:")

Total stages in preprocessing pipeline: 12
Fitting preprocessing pipeline...
Pipeline fitting complete.
Transforming data with fitted pipeline...
Data transformation complete.
Schema of data ready for modeling:
root
 |-- label: double (nullable = false)
 |-- features: vector (nullable = true)


Sample row for modeling:


In [17]:
processed_df.printSchema()

root
 |-- Date: date (nullable = true)
 |-- UniqueCarrier: string (nullable = true)
 |-- FlightNumber: integer (nullable = true)
 |-- Origin: string (nullable = true)
 |-- Destination: string (nullable = true)
 |-- CRSDepTime: integer (nullable = true)
 |-- DepartureTime: double (nullable = false)
 |-- DepartureDelay: double (nullable = false)
 |-- TaxiOut: double (nullable = false)
 |-- WheelsOff: double (nullable = false)
 |-- CRSArrivalTime: integer (nullable = true)
 |-- Cancelled: double (nullable = true)
 |-- CRSElapsedTime: double (nullable = true)
 |-- Distance: double (nullable = true)
 |-- DayofWeek: integer (nullable = true)
 |-- Month: integer (nullable = true)
 |-- Origin_Index: double (nullable = false)
 |-- Origin_Vec: vector (nullable = true)
 |-- Destination_Index: double (nullable = false)
 |-- Destination_Vec: vector (nullable = true)
 |-- CRSDepTime_Index: double (nullable = false)
 |-- CRSDepTime_Vec: vector (nullable = true)
 |-- DayofWeek_Index: double (nullable 

### Task 5: Modeling

In [18]:
# Split data (70% training, 30% testing)
split_ratio = [0.7, 0.3]
seed = 42 # Use a fixed seed for reproducibility

train_data, test_data = model_input_df.randomSplit(split_ratio, seed=seed)

# Cache the splits for performance during training and evaluation
train_data.cache()
test_data.cache()

#print(f"Training data count: {train_data.count()} ({(train_data.count() / model_input_df.count())*100:.1f}%)")
#print(f"Test data count: {test_data.count()} ({(test_data.count() / model_input_df.count())*100:.1f}%)")

DataFrame[label: double, features: vector]

### Logistic Regression, Decision Tree , Random Forest and GBT models

In [19]:
from pyspark.ml.classification import LogisticRegression, DecisionTreeClassifier, RandomForestClassifier, GBTClassifier

# Define 4 models
models = {
    "Logistic Regression": LogisticRegression(labelCol=label_output_col, featuresCol=feature_output_col, maxIter=10, regParam=0.1, elasticNetParam=0.0),
    "Decision Tree": DecisionTreeClassifier(labelCol=label_output_col, featuresCol=feature_output_col, maxDepth=5),
    "Random Forest": RandomForestClassifier(labelCol=label_output_col, featuresCol=feature_output_col, numTrees=50),
    "Gradient Boosted Trees": GBTClassifier(labelCol=label_output_col, featuresCol=feature_output_col, maxIter=20)
}

### Training models and Evaluating Accuracy and AUC

In [20]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator, MulticlassClassificationEvaluator

binary_evaluator = BinaryClassificationEvaluator(labelCol="label", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
multi_evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# Train and evaluate
results = []
for name, model in models.items():
    trained_model = model.fit(train_data)
    predictions = trained_model.transform(test_data)
    
    auc = binary_evaluator.evaluate(predictions)
    accuracy = multi_evaluator.evaluate(predictions)
    
    print(f"{name} - Accuracy: {accuracy:.4f}, AUC: {auc:.4f}")
    results.append((name, accuracy, auc))

Logistic Regression - Accuracy: 0.9865, AUC: 0.7364
Decision Tree - Accuracy: 0.9865, AUC: 0.4549
Random Forest - Accuracy: 0.9865, AUC: 0.6660
Gradient Boosted Trees - Accuracy: 0.9865, AUC: 0.7153
