# set up 


In [0]:
secret_scope = "team_3-4"
secret_key = "key_3_4" # The name of your container created in https://portal.azure.com
blob_container = "team3-4"  # The name of your Storage account created in https://portal.azure.com
storage_account = "daphnelin" 
team_blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
from pyspark.sql.functions import col, count, when, isnan, lit, udf, desc, mean, year, month, date_format, to_date, sum, expr, round, unix_timestamp, from_unixtime, avg, date_add, date_sub, when, to_date
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, DoubleType, StructType, StructField, StringType
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline

from pyspark.sql import DataFrame

from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col, unix_timestamp, avg
from pyspark.sql.window import Window
import time
from pyspark.mllib.evaluation import MulticlassMetrics

# feature selection


## import data and define features

In [0]:
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
# df_otpw_12m = spark.read.option("header", "true").option("compression", "gzip").csv(f"{data_BASE_DIR}/OTPW_12M/OTPW_12M_2015.csv.gz")


df_otpw_60m = spark.read.format("csv").option("header", "true").option("inferSchema", "true").option("compression", "gzip").load(f"{data_BASE_DIR}/OTPW_60M/")

#df_otpw_60m_sample = df_otpw_60m.limit(500)

#display(df_otpw_60m_sample)


In [0]:

# columns_to_select = [
#     "DEP_DEL15", "CANCELLED", "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
#     "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM","TAIL_NUM",
#     "sched_depart_date_time_UTC", "DISTANCE", 'FL_DATE',

#     "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
#     "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",

#     'HourlyAltimeterSetting',
#     'HourlyDewPointTemperature',
#     'HourlyDryBulbTemperature',
#     'HourlyRelativeHumidity',
#     'HourlyStationPressure',
#     'HourlyVisibility',
#     'HourlyWetBulbTemperature',
#     'HourlyWindDirection',
#     'HourlyWindSpeed',
# ]

#take out TAIL_NUM AND OP_CARRIER_AIRLINE_ID

#daphne's features
# df = df.select('DEP_DELAY', 'sched_depart_date_time_UTC', 'ORIGIN', 'DEST', 'TAIL_NUM', 'YEAR', 'QUARTER', 'MONTH', 'FL_DATE', 'DEP_DEL15')

columns_to_select = [
    "DEP_DEL15", "CANCELLED","QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "sched_depart_date_time_UTC", "DISTANCE", 'FL_DATE',
    "ORIGIN", 
    "DEST",
    "DEP_DELAY",

    'HourlyAltimeterSetting',
    'HourlyDewPointTemperature',
    'HourlyDryBulbTemperature',
    'HourlyRelativeHumidity',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWetBulbTemperature',
    'HourlyWindDirection',
    'HourlyWindSpeed',
]

# categorical_columns = [
#     "QUARTER", "MONTH","DAY_OF_MONTH","DAY_OF_WEEK",
#     "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM","TAIL_NUM",
#     "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
#     "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",
#     ]

categorical_columns = [
    "QUARTER", "MONTH","DAY_OF_MONTH","DAY_OF_WEEK",
    "ORIGIN",
    "DEST"
    ]

columns_to_cast = [
    'DEP_DEL15',
    'DISTANCE',
    'HourlyAltimeterSetting',
    'HourlyDewPointTemperature',
    'HourlyDryBulbTemperature',
    'HourlyRelativeHumidity',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWetBulbTemperature',
    'HourlyWindDirection',
    'HourlyWindSpeed',
]

datetime_column = 'sched_depart_date_time_UTC'

columns_to_drop = [datetime_column] + ["CANCELLED","hour","hourIndex","DEP_DELAY","DEP_UNIX_TIME"]

## feature engineering

In [0]:
def filter_and_select_columns(df: DataFrame, columns_to_select: list, columns_to_cast: list) -> DataFrame:
    for column in columns_to_cast:
        df = df.withColumn(column, col(column).cast('integer'))
    
    df = df.select(*columns_to_select).filter(col("CANCELLED") != "1.0").cache()
    return df

In [0]:
def encode_categorical_features(df: DataFrame, categorical_columns: list) -> DataFrame:
    stages = []
    for categorical_col in categorical_columns:
        string_indexer = StringIndexer(inputCol=categorical_col, outputCol=categorical_col + "Index",handleInvalid="keep")
        encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()], outputCols=[categorical_col + "classVec"])
        stages += [string_indexer, encoder]
    
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df)
    return pipeline_model.transform(df).cache()

In [0]:
def hour_to_bin(hour):
    return (int(hour) // 4) + 1

def bin_and_encode_hour(df: DataFrame, datetime_column: str) -> DataFrame:
    hour_to_bin_udf = udf(hour_to_bin, IntegerType())
    df = df.withColumn("hour", hour_to_bin_udf(df[datetime_column].substr(12, 2)))
    
    string_indexer = StringIndexer(inputCol="hour", outputCol="hourIndex").setHandleInvalid("skip")
    encoder = OneHotEncoder(inputCols=["hourIndex"], outputCols=["hourVec"])
    pipeline = Pipeline(stages=[string_indexer, encoder])
    pipeline_model = pipeline.fit(df)
    
    return pipeline_model.transform(df).cache()

In [0]:

def add_feature_engineering(df):
    # # including only necessary columns -- trying to minimize shuffle size
    # df = df.select('DEP_DELAY', 'sched_depart_date_time_UTC', 'ORIGIN', 'DEST', 'TAIL_NUM', 'YEAR', 'QUARTER', 'MONTH', 'FL_DATE', 'DEP_DEL15')

    # # cache df
    # df.cache()

    # Feature 1: Is near a major holiday
    holidays = ['2015-12-25', '2016-12-25', '2017-12-25', '2018-12-25', '2019-12-25','2015-11-26', '2016-11-24', '2017-11-23', '2018-11-22', '2019-11-28','2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01', '2019-01-01','2015-07-04', '2016-07-04', '2017-07-04', '2018-07-04', '2019-07-04']

    # generate dates for 3 days before and after each major holiday
    holiday_dates = []
    for holiday in holidays:
        holiday_date = to_date(lit(holiday))
        #holiday_dates += [date_add(holiday_date, offset) for offset in range(-3, 4)]
        holiday_dates += [date_add(holiday_date, offset) for offset in range(-3, 4) if offset != 0]

    
    # add the is_near_holiday column to df   
    df = df.withColumn('is_near_holiday', when(col('FL_DATE').isin(holiday_dates), 1).otherwise(0))

    # add a departure unix timestamp column for use in windows
    df = df.withColumn('DEP_UNIX_TIME', unix_timestamp('sched_depart_date_time_UTC'))

    # optimize shuffle partitions based on how many cores
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.shuffle.partitions", "200") 

    # repartition the df based on origin to optimize things
    df = df.repartition("ORIGIN")

    # windows for features (using unix time column)
    window_origin_4_to_2h = Window.partitionBy('ORIGIN').orderBy('DEP_UNIX_TIME').rangeBetween(-14400, -7200)
    window_dest_4_to_2h = Window.partitionBy('DEST').orderBy('DEP_UNIX_TIME').rangeBetween(-14400, -7200)
    #window_tail_4_flights = Window.partitionBy('TAIL_NUM').orderBy('DEP_UNIX_TIME').rangeBetween(Window.unboundedPreceding, -7200)

    # calculate and add to df
    # Feature 2: % delays 2 hours before departure at origin
    df = df.withColumn(
        'percent_delays_2h_origin',
        avg('DEP_DEL15').over(window_origin_4_to_2h ) * 100)
    df = df.withColumn(
        'percent_delays_2h_origin',
        when(count('DEP_DEL15').over(window_origin_4_to_2h) == 0, 0)
    .otherwise((sum('DEP_DEL15').over(window_origin_4_to_2h) / count('DEP_DEL15').over(window_origin_4_to_2h)) * 100))
    # # Feature 3: % delays at destination airport before departure at origin
    df = df.withColumn(
        'percent_delays_2h_dest_before_depart',
        when(count('DEP_DEL15').over(window_dest_4_to_2h) == 0, 0)
    .otherwise((sum('DEP_DEL15').over(window_dest_4_to_2h) / count('DEP_DEL15').over(window_dest_4_to_2h)) * 100))
    # Feature 4: last 4 flights avg delay by tail number
    #df = df.withColumn(
    #    'last_4_avg_delay_tail',
    #    avg('DEP_DELAY').over(window_tail_4_flights))


    # # unpersist cached df
    # df.unpersist()
    return df


In [0]:
def drop_and_select_columns(df: DataFrame, categorical_columns: list, columns_to_drop: list) -> DataFrame:
    categorical_columns_indexed = [col + "Index" for col in categorical_columns]
    all_columns_to_drop = categorical_columns + categorical_columns_indexed + columns_to_drop
    remaining_columns = [col for col in df.columns if col not in all_columns_to_drop]
    return df.select(*remaining_columns).cache()

In [0]:
def remove_nan_labels(data: DataFrame, label_column: str) -> DataFrame:
    clean_data = data.filter(col(label_column).isNotNull() & ~col(label_column).isNaN())
    return clean_data

In [0]:
# Step 1: Filter and select columns
df_filtered = filter_and_select_columns(df_otpw_60m, columns_to_select, columns_to_cast)

In [0]:
# Step 2: Encode categorical features
df_encoded_categorical = encode_categorical_features(df_filtered, categorical_columns)

Downloading artifacts:   0%|          | 0/86 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Step 3: Include new features 
df_encoded_new = add_feature_engineering(df_encoded_categorical)

In [0]:
df_encoded_new.columns

In [0]:
# # Step 3: Bin and encode the hour
# df_encoded_hour = bin_and_encode_hour(df_encoded_categorical, datetime_column)

In [0]:
# Step 4: Drop unnecessary columns and finalize the DataFrame
df_final = drop_and_select_columns(df_encoded_new, categorical_columns, columns_to_drop)

In [0]:
#Step5: clean data so that labels are not NA
df_final = remove_nan_labels(df_final,"DEP_DEL15")

In [0]:
display(df_final)

In [0]:
df_final.columns

In [0]:
df_final.count()

In [0]:
#CHANGE persistent store
transformed_data_path = f"{team_blob_url}/data_OPTW_60MON_NN_V1"
df_final.write.format("parquet").mode("overwrite").save(transformed_data_path)

In [0]:
transformed_data_path = f"{team_blob_url}/data_OPTW_60MON_NN_V1"

# Read the Parquet file into a DataFrame
df_final = spark.read.parquet(transformed_data_path)

# Show the DataFrame to verify it's loaded correctly
df_final.show()

+---------+--------+----------+----------------------+-------------------------+------------------------+----------------------+---------------------+----------------+------------------------+-------------------+---------------+---------------+---------------+--------------------+-------------------+-----------------+---------------+---------------+------------------------+------------------------------------+
|DEP_DEL15|DISTANCE|   FL_DATE|HourlyAltimeterSetting|HourlyDewPointTemperature|HourlyDryBulbTemperature|HourlyRelativeHumidity|HourlyStationPressure|HourlyVisibility|HourlyWetBulbTemperature|HourlyWindDirection|HourlyWindSpeed|QUARTERclassVec|  MONTHclassVec|DAY_OF_MONTHclassVec|DAY_OF_WEEKclassVec|   ORIGINclassVec|   DESTclassVec|is_near_holiday|percent_delays_2h_origin|percent_delays_2h_dest_before_depart|
+---------+--------+----------+----------------------+-------------------------+------------------------+----------------------+---------------------+----------------+-----

In [0]:
df_final

DataFrame[DEP_DEL15: int, DISTANCE: int, FL_DATE: date, HourlyAltimeterSetting: int, HourlyDewPointTemperature: int, HourlyDryBulbTemperature: int, HourlyRelativeHumidity: int, HourlyStationPressure: int, HourlyVisibility: int, HourlyWetBulbTemperature: int, HourlyWindDirection: int, HourlyWindSpeed: int, QUARTERclassVec: vector, MONTHclassVec: vector, DAY_OF_MONTHclassVec: vector, DAY_OF_WEEKclassVec: vector, ORIGINclassVec: vector, DESTclassVec: vector, is_near_holiday: int, percent_delays_2h_origin: double, percent_delays_2h_dest_before_depart: double]

# split data and compute class weights 

In [0]:
split_date = "2019-01-01"
train_data = df_final.filter(F.col("FL_DATE") < split_date)
test_data = df_final.filter(F.col("FL_DATE") >= split_date)
train_data = train_data.drop('FL_DATE')
test_data = test_data.drop('FL_DATE')


In [0]:
#extract feature list
feature_list = df_final.columns
feature_list.remove('DEP_DEL15')
feature_list.remove('FL_DATE')
feature_list

['DISTANCE',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyRelativeHumidity',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'QUARTERclassVec',
 'MONTHclassVec',
 'DAY_OF_MONTHclassVec',
 'DAY_OF_WEEKclassVec',
 'ORIGINclassVec',
 'DESTclassVec',
 'is_near_holiday',
 'percent_delays_2h_origin',
 'percent_delays_2h_dest_before_depart']


## downsampling

In [0]:
def downsample_majority_class(df: DataFrame) -> DataFrame:
    # Separate the DataFrame by class
    majority_df = df.filter(col("DEP_DEL15") == 0)
    minority_df = df.filter(col("DEP_DEL15") == 1)

    # Count the number of instances in the minority class
    minority_class_count = minority_df.count()

    # Calculate the fraction needed to sample the majority class to match the minority class size
    total_majority_count = majority_df.count()
    sample_fraction = minority_class_count / float(total_majority_count) if total_majority_count != 0 else 0

    # Sample the majority class
    downsampled_majority_df = majority_df.sample(withReplacement=False, fraction=sample_fraction)

    # Combine the downsampled majority class DataFrame with the minority class DataFrame
    balanced_df = downsampled_majority_df.union(minority_df)

    return balanced_df


In [0]:
train_data_balanced = downsample_majority_class(train_data)

In [0]:
# def calculate_class_weights(df: DataFrame, label_column: str, weight_column_name: str = 'classWeight') -> DataFrame:
#     """
#     Calculate class weights based on the frequencies of labels in a specified column and add a weight column to the DataFrame.

#     Parameters:
#     df (DataFrame): The input DataFrame.
#     label_column (str): The name of the column to compute class weights for.
#     weight_column_name (str): The name of the new column that will contain the computed weights.

#     Returns:
#     DataFrame: The DataFrame with an additional column containing the weights for each class.
#     """
#     # Calculate class frequencies
#     class_freqs = df.groupBy(label_column).count().collect()

#     # Total count of the dataset
#     total_count = df.count()

#     # Create a dictionary with class weights
#     weight_dict = {row[label_column]: float(total_count) / row['count'] for row in class_freqs}

#     # Broadcast the dictionary to use in withColumn operation
#     broadcast_weights = spark.sparkContext.broadcast(weight_dict)

#     # Function to apply the weight based on the class
#     def get_weight(class_label):
#         return broadcast_weights.value.get(class_label, 0)  # return 0 if the class_label is not found

#     # Register UDF
#     get_weight_udf = udf(get_weight, DoubleType())

#     # Add weight column to DataFrame
#     return df.withColumn(weight_column_name, get_weight_udf(col(label_column)))


# train_data_with_weights = calculate_class_weights(train_data, 'DEP_DEL15')



# model training

In [0]:
train_data_balanced.columns

In [0]:
feature_list

In [0]:
assembler = VectorAssembler(inputCols=feature_list, outputCol="features",handleInvalid="skip")
train_data_assembled = assembler.transform(train_data_balanced)
test_data_assembled = assembler.transform(test_data)


In [0]:
display(train_data_assembled)

DEP_DEL15,DISTANCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyRelativeHumidity,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,QUARTERclassVec,MONTHclassVec,DAY_OF_MONTHclassVec,DAY_OF_WEEKclassVec,ORIGINclassVec,DESTclassVec,is_near_holiday,percent_delays_2h_origin,percent_delays_2h_dest_before_depart,features
0,399,30,21,25,86,30,9,24,0,0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(199), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 13, 24, 49, 59, 263, 433), values -> List(399.0, 30.0, 21.0, 25.0, 86.0, 30.0, 9.0, 24.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,666,30,9,31,40,30,10,25,230,13,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(21), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 24, 49, 59, 85, 433), values -> List(666.0, 30.0, 9.0, 31.0, 40.0, 30.0, 10.0, 25.0, 230.0, 13.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,214,30,24,29,82,29,9,27,0,0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(28), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 13, 24, 49, 59, 92, 433), values -> List(214.0, 30.0, 24.0, 29.0, 82.0, 29.0, 9.0, 27.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,406,30,54,59,82,30,10,56,30,8,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(26), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 24, 49, 59, 90, 433), values -> List(406.0, 30.0, 54.0, 59.0, 82.0, 30.0, 10.0, 56.0, 30.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,674,30,14,27,58,29,10,23,0,0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(70), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 13, 24, 49, 59, 134, 433), values -> List(674.0, 30.0, 14.0, 27.0, 58.0, 29.0, 10.0, 23.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,341,30,32,40,73,29,10,37,30,3,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(104), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 24, 49, 59, 168, 433), values -> List(341.0, 30.0, 32.0, 40.0, 73.0, 29.0, 10.0, 37.0, 30.0, 3.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,516,30,22,34,61,30,10,30,230,7,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(68), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 24, 49, 59, 132, 433), values -> List(516.0, 30.0, 22.0, 34.0, 61.0, 30.0, 10.0, 30.0, 230.0, 7.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,356,30,25,30,82,29,10,28,0,0,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(39), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 13, 24, 49, 59, 103, 433), values -> List(356.0, 30.0, 25.0, 30.0, 82.0, 29.0, 10.0, 28.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,821,30,8,14,77,29,10,13,150,8,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(56), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 24, 49, 59, 120, 433), values -> List(821.0, 30.0, 8.0, 14.0, 77.0, 29.0, 10.0, 13.0, 150.0, 8.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"
0,134,30,30,41,65,29,10,37,20,5,"Map(vectorType -> sparse, length -> 4, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 12, indices -> List(10), values -> List(1.0))","Map(vectorType -> sparse, length -> 31, indices -> List(23), values -> List(1.0))","Map(vectorType -> sparse, length -> 7, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 369, indices -> List(71), values -> List(1.0))","Map(vectorType -> sparse, length -> 368, indices -> List(0), values -> List(1.0))",0,0.0,0.0,"Map(vectorType -> sparse, length -> 804, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 13, 24, 49, 59, 135, 433), values -> List(134.0, 30.0, 30.0, 41.0, 65.0, 29.0, 10.0, 37.0, 20.0, 5.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))"


In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.feature import VectorAssembler
from pyspark.ml import Pipeline

inputLayerSize = 804
outputLayerSize = 2 

# Model 1: One hidden layer with 4 nodes
layers1 = [inputLayerSize, 4, outputLayerSize]
mlp1 = MultilayerPerceptronClassifier(maxIter = 50, stepSize = 0.03, layers=layers1, featuresCol='features', blockSize = 128, labelCol='DEP_DEL15', predictionCol='prediction')

# Model 2: Two hidden layers with 4 nodes each
layers2 = [inputLayerSize, 4, 4, outputLayerSize]
mlp2 = MultilayerPerceptronClassifier(maxIter = 50, stepSize = 0.03, layers=layers2, featuresCol='features', blockSize = 128, labelCol='DEP_DEL15', predictionCol='prediction')

nn_model1 = mlp1.fit(train_data_assembled)
nn_model2 = mlp2.fit(train_data_assembled)

#nn_test1_predictions = nn_model1.transform(test_data_assembled)
#nn_test2_predictions = nn_model2.transform(test_data_assembled)


# Evaluation

In [0]:
def evaluate_model(train_data_assembled, test_data_assembled, model):
   
    def get_metrics(dataset):
        # Make predictions
        predictions = model.transform(dataset)

        # Select the prediction and label columns and convert to RDD
        predictionAndLabels = predictions.select(
            col('prediction').cast('float'),
            col('DEP_DEL15').cast('float')
        ).rdd

        # Compute metrics using MulticlassMetrics
        metrics = MulticlassMetrics(predictionAndLabels)
        confusion_matrix = metrics.confusionMatrix().toArray()

        # Calculate metrics
        TN = confusion_matrix[0,0]
        FP = confusion_matrix[0,1]
        FN = confusion_matrix[1,0]
        TP = confusion_matrix[1,1]

        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        beta = 2
        f2 = ((1 + beta**2) * precision * recall) / ((beta**2 * precision) + recall) if ((beta**2 * precision) + recall) > 0 else 0

        return {
            'TN': TN, 'FP': FP, 'FN': FN, 'TP': TP,
            'Recall': recall, 'Precision': precision, 'F1 Score': f1, 'F2 Score': f2
        }

    # Evaluate on both training and test data
    train_metrics = get_metrics(train_data_assembled)
    test_metrics = get_metrics(test_data_assembled)

    return {
        'Training': train_metrics,
        'Testing': test_metrics
    }




In [0]:
results_nn1 = evaluate_model(train_data_assembled, test_data_assembled, nn_model1)

#nn_test1_predictions = nn_model1.transform(test_data_assembled)
#nn_test2_predictions = nn_model2.transform(test_data_assembled)

print(results_nn1)



{'Training': {'TN': 6.0, 'FP': 3893543.0, 'FN': 2.0, 'TP': 3964771.0, 'Recall': 0.999999495557501, 'Precision': 0.5045320153915968, 'F1 Score': 0.6706828766463445, 'F2 Score': 0.8358357149175588}, 'Testing': {'TN': 3.0, 'FP': 5267559.0, 'FN': 0.0, 'TP': 1225795.0, 'Recall': 1.0, 'Precision': 0.18877686323585624, 'F1 Score': 0.31759848138700264, 'F2 Score': 0.5377928938745762}}


In [0]:
results_nn2 = evaluate_model(train_data_assembled, test_data_assembled, nn_model2)

#nn_test1_predictions = nn_model1.transform(test_data_assembled)
#nn_test2_predictions = nn_model2.transform(test_data_assembled)

print(results_nn2)

{'Training': {'TN': 0.0, 'FP': 3893549.0, 'FN': 0.0, 'TP': 3964773.0, 'Recall': 1.0, 'Precision': 0.5045317562706135, 'F1 Score': 0.6706827611551798, 'F2 Score': 0.8358358546172023}, 'Testing': {'TN': 0.0, 'FP': 5267562.0, 'FN': 0.0, 'TP': 1225795.0, 'Recall': 1.0, 'Precision': 0.1887767760189375, 'F1 Score': 0.31759835795434527, 'F2 Score': 0.5377927523071263}}


In [0]:

# predictionAndLabels = test_predictions.select(
#     col('prediction').cast('float'),
#     col('DEP_DEL15').cast('float')
# ).rdd

# metrics = MulticlassMetrics(predictionAndLabels)
# confusion_matrix = metrics.confusionMatrix().toArray()

# print("Confusion Matrix:\n", confusion_matrix)

# print("details:")
# print("          Predicted: No    Predicted: Yes")
# print(f"Actual: No  TN = {confusion_matrix[0,0]:7.0f}    FP = {confusion_matrix[0,1]:7.0f}")
# print(f"Actual: Yes FN = {confusion_matrix[1,0]:7.0f}    TP = {confusion_matrix[1,1]:7.0f}")


In [0]:
# #calculating recall and precision manually
# TP = confusion_matrix[1, 1]
# FN = confusion_matrix[1, 0]
# FP = confusion_matrix[0, 1]
# recall_for_class_1 = TP / (TP + FN) if (TP + FN) > 0 else 0
# precision_for_class_1 = TP / (TP + FP) if (TP + FP) > 0 else 0
# beta = 2
# f_beta_for_class_1 = ((1 + beta**2) * precision_for_class_1 * recall_for_class_1) / ((beta**2 * precision_for_class_1) + recall_for_class_1) if ((beta**2 * precision_for_class_1) + recall_for_class_1) > 0 else 0

# print("Recall for class 1 (Delays):", recall_for_class_1)
# print("Precision for class 1 (Delays):", precision_for_class_1)
# print("F2 Score for class 1 (Delays):", f_beta_for_class_1)

## grid search


In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

mlp = MultilayerPerceptronClassifier(featuresCol='features', labelCol='DEP_DEL15')
paramGrid = ParamGridBuilder() \
    .addGrid(mlp.layers, [[804, hidden_layer_size, 2] for hidden_layer_size in [5, 10]]) \
    .addGrid(mlp.blockSize, [128]) \
    .addGrid(mlp.maxIter, [50, 100]) \
    .addGrid(mlp.stepSize, [0.01, 0.03, 0.05]) \
    .build()

evaluator = MulticlassClassificationEvaluator(metricName='weightedRecall', labelCol='DEP_DEL15')

crossval = CrossValidator(estimator=mlp,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=3)

cvModel = crossval.fit(train_data_assembled)

In [0]:
results_gs_nn = evaluate_model(train_data_assembled, test_data_assembled, cvModel)

#nn_test1_predictions = nn_model1.transform(test_data_assembled)
#nn_test2_predictions = nn_model2.transform(test_data_assembled)

print(results_gs_nn)



{'Training': {'TN': 2825751.0, 'FP': 1071219.0, 'FN': 1781137.0, 'TP': 2183636.0, 'Recall': 0.5507594003490238, 'Precision': 0.6708857998282566, 'F1 Score': 0.604916486001772, 'F2 Score': 0.5712153538983864}, 'Testing': {'TN': 3719337.0, 'FP': 1548225.0, 'FN': 539495.0, 'TP': 686300.0, 'Recall': 0.5598815462618137, 'Precision': 0.30713462592721047, 'F1 Score': 0.3966685162065936, 'F2 Score': 0.4807567698580987}}


In [0]:
nn_gridsearch_predictions = cvModel.transform(test_data_assembled)

In [0]:
nn_gridsearch_predictions

DataFrame[DEP_DEL15: int, DISTANCE: int, HourlyAltimeterSetting: int, HourlyDewPointTemperature: int, HourlyDryBulbTemperature: int, HourlyRelativeHumidity: int, HourlyStationPressure: int, HourlyVisibility: int, HourlyWetBulbTemperature: int, HourlyWindDirection: int, HourlyWindSpeed: int, QUARTERclassVec: vector, MONTHclassVec: vector, DAY_OF_MONTHclassVec: vector, DAY_OF_WEEKclassVec: vector, ORIGINclassVec: vector, DESTclassVec: vector, is_near_holiday: int, percent_delays_2h_origin: double, percent_delays_2h_dest_before_depart: double, features: vector, rawPrediction: vector, probability: vector, prediction: double]

In [0]:

# Persistent store
nn_gridsearch_predictions_path = f"{team_blob_url}/data_OPTW_60MON_NN_V1_predictions"
nn_gridsearch_predictions.write.format("parquet").mode("overwrite").save(nn_gridsearch_predictions_path)

In [0]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
# paramGrid = ParamGridBuilder() \
#     .addGrid(classifier.regParam, [0.01, 0.1, 1.0]) \
#     .addGrid(classifier.elasticNetParam, [0.0, 0.5, 1.0]) \
#     .addGrid(classifier.maxIter, [10, 50, 100]) \
#     .build()
# evaluator = MulticlassClassificationEvaluator(labelCol='DEP_DEL15', predictionCol="prediction", metricName="recallByLabel", metricLabel=1)


In [0]:
# crossval = CrossValidator(estimator=classifier,
#                           estimatorParamMaps=paramGrid,
#                           evaluator=evaluator,
#                           numFolds=5)  
# cvModel = crossval.fit(train_data_assembled)


In [0]:
# predictions = cvModel.bestModel.transform(test_data_assembled)
# final_score = evaluator.evaluate(predictions)
# print(f"Best model's score on test data: {final_score}")


In [0]:
# # Access the best model
# best_model = cvModel.bestModel

# # Print the hyperparameters of the best model
# print("Best Model's Hyperparameters:")
# print(f" - Regularization Parameter (regParam): {best_model._java_obj.getRegParam()}")
# print(f" - Elastic Net Parameter (elasticNetParam): {best_model._java_obj.getElasticNetParam()}")
# print(f" - Maximum Iterations (maxIter): {best_model._java_obj.getMaxIter()}")

