# set up 


In [0]:
secret_scope = "team_3-4"
secret_key = "key_3_4" # The name of your container created in https://portal.azure.com
blob_container = "team3-4"  # The name of your Storage account created in https://portal.azure.com
storage_account = "daphnelin" 
team_blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
from pyspark.sql.functions import col, count, when, isnan, lit, udf, desc, mean, year, month, date_format, to_date, sum, expr, round, unix_timestamp, from_unixtime, avg, date_add, date_sub, when, to_date
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, DoubleType, StructType, StructField, StringType
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline

from pyspark.sql import DataFrame

from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col, unix_timestamp, avg
from pyspark.sql.window import Window
import time
from pyspark.mllib.evaluation import MulticlassMetrics

# feature selection


## import data and define features

In [0]:
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
# df_otpw_12m = spark.read.option("header", "true").option("compression", "gzip").csv(f"{data_BASE_DIR}/OTPW_12M/OTPW_12M_2015.csv.gz")


df_otpw_60m = spark.read.format("csv").option("header", "true").option("inferSchema", "true").option("compression", "gzip").load(f"{data_BASE_DIR}/OTPW_60M/")

# df_otpw_60m_sample = df_otpw_60m.limit(500)

# display(df_otpw_60m_sample)


In [0]:

# columns_to_select = [
#     "DEP_DEL15", "CANCELLED", "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
#     "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM","TAIL_NUM",
#     "sched_depart_date_time_UTC", "DISTANCE", 'FL_DATE',

#     "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
#     "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",

#     'HourlyAltimeterSetting',
#     'HourlyDewPointTemperature',
#     'HourlyDryBulbTemperature',
#     'HourlyRelativeHumidity',
#     'HourlyStationPressure',
#     'HourlyVisibility',
#     'HourlyWetBulbTemperature',
#     'HourlyWindDirection',
#     'HourlyWindSpeed',
# ]

#take out TAIL_NUM AND OP_CARRIER_AIRLINE_ID

#daphne's features
# df = df.select('DEP_DELAY', 'sched_depart_date_time_UTC', 'ORIGIN', 'DEST', 'TAIL_NUM', 'YEAR', 'QUARTER', 'MONTH', 'FL_DATE', 'DEP_DEL15')

columns_to_select = [
    "DEP_DEL15", "CANCELLED", "YEAR","QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "sched_depart_date_time_UTC", "DISTANCE", 'FL_DATE',
    "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
    "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",
    "DEP_DELAY","TAIL_NUM",

    'HourlyAltimeterSetting',
    'HourlyDewPointTemperature',
    'HourlyDryBulbTemperature',
    'HourlyRelativeHumidity',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWetBulbTemperature',
    'HourlyWindDirection',
    'HourlyWindSpeed',
]

# categorical_columns = [
#     "QUARTER", "MONTH","DAY_OF_MONTH","DAY_OF_WEEK",
#     "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM","TAIL_NUM",
#     "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
#     "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",
#     ]

categorical_columns = [
    "YEAR","QUARTER", "MONTH","DAY_OF_MONTH","DAY_OF_WEEK",
    "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
    "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",
    ]

columns_to_cast = [
    'DEP_DEL15',
    'DISTANCE',
    'HourlyAltimeterSetting',
    'HourlyDewPointTemperature',
    'HourlyDryBulbTemperature',
    'HourlyRelativeHumidity',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWetBulbTemperature',
    'HourlyWindDirection',
    'HourlyWindSpeed',
]

datetime_column = 'sched_depart_date_time_UTC'

columns_to_drop = [datetime_column] + ["CANCELLED","hour","hourIndex","DEP_DELAY","TAIL_NUM","DEP_UNIX_TIME"]

## feature engineering

In [0]:
def filter_and_select_columns(df: DataFrame, columns_to_select: list, columns_to_cast: list) -> DataFrame:
    for column in columns_to_cast:
        df = df.withColumn(column, col(column).cast('integer'))
    
    df = df.select(*columns_to_select).filter(col("CANCELLED") != "1.0").cache()
    return df

In [0]:
def encode_categorical_features(df: DataFrame, categorical_columns: list) -> DataFrame:
    stages = []
    for categorical_col in categorical_columns:
        string_indexer = StringIndexer(inputCol=categorical_col, outputCol=categorical_col + "Index",handleInvalid="keep")
        encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()], outputCols=[categorical_col + "classVec"])
        stages += [string_indexer, encoder]
    
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df)
    return pipeline_model.transform(df).cache()

In [0]:
def hour_to_bin(hour):
    return (int(hour) // 4) + 1

def bin_and_encode_hour(df: DataFrame, datetime_column: str) -> DataFrame:
    hour_to_bin_udf = udf(hour_to_bin, IntegerType())
    df = df.withColumn("hour", hour_to_bin_udf(df[datetime_column].substr(12, 2)))
    
    string_indexer = StringIndexer(inputCol="hour", outputCol="hourIndex").setHandleInvalid("skip")
    encoder = OneHotEncoder(inputCols=["hourIndex"], outputCols=["hourVec"])
    pipeline = Pipeline(stages=[string_indexer, encoder])
    pipeline_model = pipeline.fit(df)
    
    return pipeline_model.transform(df).cache()

In [0]:

def add_feature_engineering(df):
    # # including only necessary columns -- trying to minimize shuffle size
    # df = df.select('DEP_DELAY', 'sched_depart_date_time_UTC', 'ORIGIN', 'DEST', 'TAIL_NUM', 'YEAR', 'QUARTER', 'MONTH', 'FL_DATE', 'DEP_DEL15')

    # # cache df
    # df.cache()

    # Feature 1: Is near a major holiday
    holidays = ['2015-12-25', '2016-12-25', '2017-12-25', '2018-12-25', '2019-12-25','2015-11-26', '2016-11-24', '2017-11-23', '2018-11-22', '2019-11-28','2015-01-01', '2016-01-01', '2017-01-01', '2018-01-01', '2019-01-01','2015-07-04', '2016-07-04', '2017-07-04', '2018-07-04', '2019-07-04']

    # generate dates for 3 days before and after each major holiday
    holiday_dates = []
    for holiday in holidays:
        holiday_date = to_date(lit(holiday))
        holiday_dates += [date_add(holiday_date, offset) for offset in range(-3, 4)]

    
    # add the is_near_holiday column to df   
    df = df.withColumn('is_near_holiday', when(col('FL_DATE').isin(holiday_dates), 1).otherwise(0))

    # add a departure unix timestamp column for use in windows
    df = df.withColumn('DEP_UNIX_TIME', unix_timestamp('sched_depart_date_time_UTC'))

    # optimize shuffle partitions based on how many cores
    spark = SparkSession.builder.getOrCreate()
    spark.conf.set("spark.sql.shuffle.partitions", "200") 

    # repartition the df based on origin to optimize things
    df = df.repartition("ORIGIN")

    # windows for features (using unix time column)
    window_origin_4_to_2h = Window.partitionBy('ORIGIN').orderBy('DEP_UNIX_TIME').rangeBetween(-14400, -7200)
    window_dest_4_to_2h = Window.partitionBy('DEST').orderBy('DEP_UNIX_TIME').rangeBetween(-14400, -7200)
    window_tail_4_flights = Window.partitionBy('TAIL_NUM').orderBy('DEP_UNIX_TIME').rangeBetween(Window.unboundedPreceding, -7200)

    # calculate and add to df
    # Feature 2: % delays 2 hours before departure at origin
    df = df.withColumn(
        'percent_delays_2h_origin',
        avg('DEP_DEL15').over(window_origin_4_to_2h ) * 100)
    df = df.withColumn(
        'percent_delays_2h_origin',
        when(count('DEP_DEL15').over(window_origin_4_to_2h) == 0, 0)
    .otherwise((sum('DEP_DEL15').over(window_origin_4_to_2h) / count('DEP_DEL15').over(window_origin_4_to_2h)) * 100))
    # # Feature 3: % delays at destination airport before departure at origin
    df = df.withColumn(
        'percent_delays_2h_dest_before_depart',
        when(count('DEP_DEL15').over(window_dest_4_to_2h) == 0, 0)
    .otherwise((sum('DEP_DEL15').over(window_dest_4_to_2h) / count('DEP_DEL15').over(window_dest_4_to_2h)) * 100))
    # Feature 4: last 4 flights avg delay by tail number
    df = df.withColumn(
        'last_4_avg_delay_tail',
        avg('DEP_DELAY').over(window_tail_4_flights))


    # # unpersist cached df
    # df.unpersist()
    return df


In [0]:
def drop_and_select_columns(df: DataFrame, categorical_columns: list, columns_to_drop: list) -> DataFrame:
    categorical_columns_indexed = [col + "Index" for col in categorical_columns]
    all_columns_to_drop = categorical_columns + categorical_columns_indexed + columns_to_drop
    remaining_columns = [col for col in df.columns if col not in all_columns_to_drop]
    return df.select(*remaining_columns).cache()

In [0]:
def remove_nan_labels(data: DataFrame, label_column: str) -> DataFrame:
    clean_data = data.filter(col(label_column).isNotNull() & ~col(label_column).isNaN())
    return clean_data

In [0]:
# Step 1: Filter and select columns
df_filtered = filter_and_select_columns(df_otpw_60m, columns_to_select, columns_to_cast)

In [0]:
# Step 2: Encode categorical features
df_encoded_categorical = encode_categorical_features(df_filtered, categorical_columns)

Downloading artifacts:   0%|          | 0/156 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Step 3: Include new features 
df_encoded_new = add_feature_engineering(df_encoded_categorical)

In [0]:
df_encoded_new.columns

['DEP_DEL15',
 'CANCELLED',
 'YEAR',
 'QUARTER',
 'MONTH',
 'DAY_OF_MONTH',
 'DAY_OF_WEEK',
 'sched_depart_date_time_UTC',
 'DISTANCE',
 'FL_DATE',
 'ORIGIN',
 'ORIGIN_CITY_MARKET_ID',
 'ORIGIN_STATE_ABR',
 'DEST',
 'DEST_CITY_MARKET_ID',
 'DEST_STATE_ABR',
 'DEP_DELAY',
 'TAIL_NUM',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyRelativeHumidity',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'YEARIndex',
 'YEARclassVec',
 'QUARTERIndex',
 'QUARTERclassVec',
 'MONTHIndex',
 'MONTHclassVec',
 'DAY_OF_MONTHIndex',
 'DAY_OF_MONTHclassVec',
 'DAY_OF_WEEKIndex',
 'DAY_OF_WEEKclassVec',
 'ORIGINIndex',
 'ORIGINclassVec',
 'ORIGIN_CITY_MARKET_IDIndex',
 'ORIGIN_CITY_MARKET_IDclassVec',
 'ORIGIN_STATE_ABRIndex',
 'ORIGIN_STATE_ABRclassVec',
 'DESTIndex',
 'DESTclassVec',
 'DEST_CITY_MARKET_IDIndex',
 'DEST_CITY_MARKET_IDclassVec',
 'DEST_STATE_ABRIndex',
 'DEST_STATE_ABRcl

In [0]:
# # Step 3: Bin and encode the hour
# df_encoded_hour = bin_and_encode_hour(df_encoded_categorical, datetime_column)

In [0]:
# Step 4: Drop unnecessary columns and finalize the DataFrame
df_final = drop_and_select_columns(df_encoded_new, categorical_columns, columns_to_drop)

In [0]:
#Step5: clean data so that labels are not NA
df_final = remove_nan_labels(df_final,"DEP_DEL15")

In [0]:
# display(df_final)

In [0]:
df_final.columns

['DEP_DEL15',
 'DISTANCE',
 'FL_DATE',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyRelativeHumidity',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'YEARclassVec',
 'QUARTERclassVec',
 'MONTHclassVec',
 'DAY_OF_MONTHclassVec',
 'DAY_OF_WEEKclassVec',
 'ORIGINclassVec',
 'ORIGIN_CITY_MARKET_IDclassVec',
 'ORIGIN_STATE_ABRclassVec',
 'DESTclassVec',
 'DEST_CITY_MARKET_IDclassVec',
 'DEST_STATE_ABRclassVec',
 'is_near_holiday',
 'percent_delays_2h_origin',
 'percent_delays_2h_dest_before_depart',
 'last_4_avg_delay_tail']

In [0]:
df_final.count()

31179957

In [0]:
#persistent store
transformed_data_path = f"{team_blob_url}/data_OPTW_60MON_LR"
df_final.write.format("parquet").mode("overwrite").save(transformed_data_path)

In [0]:
transformed_data_path = f"{team_blob_url}/data_OPTW_60MON_LR"
df_final = spark.read.format("parquet").load(transformed_data_path)



## run from below after transformed data stored

In [0]:
display(dbutils.fs.ls(f"{team_blob_url}"))

path,name,size,modificationTime
wasbs://team3-4@daphnelin.blob.core.windows.net/TP/,TP/,0,1713057733000
wasbs://team3-4@daphnelin.blob.core.windows.net/airport_codes/,airport_codes/,0,1713482499000
wasbs://team3-4@daphnelin.blob.core.windows.net/avg_delay_by_time/,avg_delay_by_time/,0,1713126605000
wasbs://team3-4@daphnelin.blob.core.windows.net/data_OPTW_12MON_LR/,data_OPTW_12MON_LR/,0,1713229256000
wasbs://team3-4@daphnelin.blob.core.windows.net/data_OPTW_12MON_LR_GS_preds/,data_OPTW_12MON_LR_GS_preds/,0,1713357494000
wasbs://team3-4@daphnelin.blob.core.windows.net/data_OPTW_12MON_LR_less_cat/,data_OPTW_12MON_LR_less_cat/,0,1713357396000
wasbs://team3-4@daphnelin.blob.core.windows.net/data_OPTW_12MON_XGB/,data_OPTW_12MON_XGB/,0,1713310615000
wasbs://team3-4@daphnelin.blob.core.windows.net/data_OPTW_60MON_LR/,data_OPTW_60MON_LR/,0,1713551036000
wasbs://team3-4@daphnelin.blob.core.windows.net/data_OPTW_60MON_LR_less_cat/,data_OPTW_60MON_LR_less_cat/,0,1713500651000
wasbs://team3-4@daphnelin.blob.core.windows.net/data_OPTW_60MON_NN_V1/,data_OPTW_60MON_NN_V1/,0,1713525202000


In [0]:
# #persistent store load

# transformed_data_path = f"{team_blob_url}/data_OPTW_60MON_LR"
# df_final = spark.read.format("parquet").load(transformed_data_path)


# split data and downsample

In [0]:
split_date = "2019-01-01"
train_data = df_final.filter(F.col("FL_DATE") < split_date)
test_data = df_final.filter(F.col("FL_DATE") >= split_date)
train_data = train_data.drop('FL_DATE')
test_data = test_data.drop('FL_DATE')


In [0]:
#extract feature list
feature_list = df_final.columns
feature_list.remove('DEP_DEL15')
feature_list.remove('FL_DATE')
feature_list

['DISTANCE',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyRelativeHumidity',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'YEARclassVec',
 'QUARTERclassVec',
 'MONTHclassVec',
 'DAY_OF_MONTHclassVec',
 'DAY_OF_WEEKclassVec',
 'ORIGINclassVec',
 'ORIGIN_CITY_MARKET_IDclassVec',
 'ORIGIN_STATE_ABRclassVec',
 'DESTclassVec',
 'DEST_CITY_MARKET_IDclassVec',
 'DEST_STATE_ABRclassVec',
 'is_near_holiday',
 'percent_delays_2h_origin',
 'percent_delays_2h_dest_before_depart',
 'last_4_avg_delay_tail']


## downsampling

In [0]:
def downsample_majority_class(df: DataFrame) -> DataFrame:
    # Separate the DataFrame by class
    majority_df = df.filter(col("DEP_DEL15") == 0)
    minority_df = df.filter(col("DEP_DEL15") == 1)

    # Count the number of instances in the minority class
    minority_class_count = minority_df.count()

    # Calculate the fraction needed to sample the majority class to match the minority class size
    total_majority_count = majority_df.count()
    sample_fraction = minority_class_count / float(total_majority_count) if total_majority_count != 0 else 0

    # Sample the majority class
    downsampled_majority_df = majority_df.sample(withReplacement=False, fraction=sample_fraction)

    # Combine the downsampled majority class DataFrame with the minority class DataFrame
    balanced_df = downsampled_majority_df.union(minority_df)

    return balanced_df


In [0]:
train_data_balanced = downsample_majority_class(train_data)


# model training

In [0]:
assembler = VectorAssembler(inputCols=feature_list, outputCol="features",handleInvalid="skip")
train_data_assembled = assembler.transform(train_data_balanced)
test_data_assembled = assembler.transform(test_data)


In [0]:
classifier = LogisticRegression(featuresCol='features', labelCol='DEP_DEL15', 
                                predictionCol='prediction', regParam = 0.1, elasticNetParam = 0.5, maxIter = 50)
    #                             regParam=0.1,          # Regularization parameter
    # elasticNetParam=1.0,   # Elastic Net parameter (L1 norm)
    # maxIter=10)             # Maximum number of iterations

fitted_classifier = classifier.fit(train_data_assembled)


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]


## save model

In [0]:
# #loading model next time 
# from xgboost.spark import SparkXGBClassifier

# # Load the model
# loaded_model = SparkXGBClassifier.load(model_path)


# Evaluation

In [0]:
total_train = train_data_balanced.count()
label_distribution_train = train_data_balanced.groupBy('DEP_DEL15').count()
label_distribution_train = label_distribution_train.withColumn('percentage', (col('count') / lit(total_train) * 100))
label_distribution_train.show()


+---------+-------+------------------+
|DEP_DEL15|  count|        percentage|
+---------+-------+------------------+
|        0|4313629|50.004462998151745|
|        1|4312859| 49.99553700184826|
+---------+-------+------------------+



In [0]:
total_test = test_data.count()
label_distribution_test = test_data.groupBy('DEP_DEL15').count()
label_distribution_test = label_distribution_test.withColumn('percentage', (col('count') / lit(total_test) * 100))
label_distribution_test.show()

+---------+-------+------------------+
|DEP_DEL15|  count|        percentage|
+---------+-------+------------------+
|        1|1353702|18.648203544351922|
|        0|5905453| 81.35179645564807|
+---------+-------+------------------+



In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics

train_predictions = fitted_classifier.transform(train_data_assembled)
test_predictions = fitted_classifier.transform(test_data_assembled)

#calculate percentage of predicted delays vs. no delays
total_predictions = test_predictions.count()
predictions_group = test_predictions.groupBy('prediction').count()
percentage_predictions = predictions_group.withColumn('percentage', col('count') / total_predictions * 100)
percentage_predictions.show()



+----------+-------+-----------------+
|prediction|  count|       percentage|
+----------+-------+-----------------+
|       0.0|4254799|65.52863235976749|
|       1.0|2238239|34.47136764023251|
+----------+-------+-----------------+



In [0]:
def evaluate_model(train_data_assembled, test_data_assembled, model):
   
    def get_metrics(dataset):
        # Make predictions
        predictions = model.transform(dataset)

        # Select the prediction and label columns and convert to RDD
        predictionAndLabels = predictions.select(
            col('prediction').cast('float'),
            col('DEP_DEL15').cast('float')
        ).rdd

        # Compute metrics using MulticlassMetrics
        metrics = MulticlassMetrics(predictionAndLabels)
        confusion_matrix = metrics.confusionMatrix().toArray()

        # Calculate metrics
        TN = confusion_matrix[0,0]
        FP = confusion_matrix[0,1]
        FN = confusion_matrix[1,0]
        TP = confusion_matrix[1,1]

        recall = TP / (TP + FN) if (TP + FN) > 0 else 0
        precision = TP / (TP + FP) if (TP + FP) > 0 else 0
        f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
        beta = 2
        f2 = ((1 + beta**2) * precision * recall) / ((beta**2 * precision) + recall) if ((beta**2 * precision) + recall) > 0 else 0

        return {
            'TN': TN, 'FP': FP, 'FN': FN, 'TP': TP,
            'Recall': recall, 'Precision': precision, 'F1 Score': f1, 'F2 Score': f2
        }

    # Evaluate on both training and test data
    train_metrics = get_metrics(train_data_assembled)
    test_metrics = get_metrics(test_data_assembled)

    return {
        'Training': train_metrics,
        'Testing': test_metrics
    }




## performance metrics

In [0]:
results = evaluate_model(train_data_assembled, test_data_assembled, fitted_classifier)
print(results)



{'Training': {'TN': 2836924.0, 'FP': 1060460.0, 'FN': 1793831.0, 'TP': 2169300.0, 'Recall': 0.5473702484222702, 'Precision': 0.6716598137322897, 'F1 Score': 0.6031788887110899, 'F2 Score': 0.5684068007791939}, 'Testing': {'TN': 3711275.0, 'FP': 1556072.0, 'FN': 540134.0, 'TP': 685557.0, 'Recall': 0.5593228635928631, 'Precision': 0.3058298228654251, 'F1 Score': 0.39543912877957615, 'F2 Score': 0.4797867362559702}}


In [0]:

# predictionAndLabels = test_predictions.select(
#     col('prediction').cast('float'),
#     col('DEP_DEL15').cast('float')
# ).rdd

# metrics = MulticlassMetrics(predictionAndLabels)
# confusion_matrix = metrics.confusionMatrix().toArray()

# print("Confusion Matrix:\n", confusion_matrix)

# print("details:")
# print("          Predicted: No    Predicted: Yes")
# print(f"Actual: No  TN = {confusion_matrix[0,0]:7.0f}    FP = {confusion_matrix[0,1]:7.0f}")
# print(f"Actual: Yes FN = {confusion_matrix[1,0]:7.0f}    TP = {confusion_matrix[1,1]:7.0f}")


In [0]:
# #calculating recall and precision manually
# TP = confusion_matrix[1, 1]
# FN = confusion_matrix[1, 0]
# FP = confusion_matrix[0, 1]
# recall_for_class_1 = TP / (TP + FN) if (TP + FN) > 0 else 0
# precision_for_class_1 = TP / (TP + FP) if (TP + FP) > 0 else 0
# beta = 2
# f_beta_for_class_1 = ((1 + beta**2) * precision_for_class_1 * recall_for_class_1) / ((beta**2 * precision_for_class_1) + recall_for_class_1) if ((beta**2 * precision_for_class_1) + recall_for_class_1) > 0 else 0

# print("Recall for class 1 (Delays):", recall_for_class_1)
# print("Precision for class 1 (Delays):", precision_for_class_1)
# print("F2 Score for class 1 (Delays):", f_beta_for_class_1)

## grid search


In [0]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


In [0]:
paramGrid = ParamGridBuilder() \
    .addGrid(classifier.regParam, [0.01, 0.1, 0.5]) \
    .addGrid(classifier.elasticNetParam, [0.0, 0.3, 0.5, 1.0]) \
    .addGrid(classifier.maxIter, [10, 50]) \
    .build()
evaluator = MulticlassClassificationEvaluator(labelCol='DEP_DEL15', predictionCol="prediction", metricName="f1", metricLabel=1)


In [0]:
crossval = CrossValidator(estimator=classifier,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)  
cvModel = crossval.fit(train_data_assembled)


Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
#bestModel = cvModel.bestModel

# Evaluate the best model on the training and testing data
results_lr = evaluate_model(train_data_assembled, test_data_assembled, cvModel)

# Print results
print(results_lr)

{'Training': {'TN': 2770768.0, 'FP': 1127033.0, 'FN': 1623872.0, 'TP': 2339259.0, 'Recall': 0.5902552804840415, 'Precision': 0.6748591867044093, 'F1 Score': 0.6297283113372332, 'F2 Score': 0.6054353952126258}, 'Testing': {'TN': 3602490.0, 'FP': 1664857.0, 'FN': 487105.0, 'TP': 738586.0, 'Recall': 0.6025874384326881, 'Precision': 0.30730331445347364, 'F1 Score': 0.40703153975576545, 'F2 Score': 0.5054510500455298}}


In [0]:
results_lr_train = evaluate_model(train_data_assembled, train_data_assembled, cvModel)

# Print results
print(results_lr_train)

{'Training': {'TN': 2770768.0, 'FP': 1127033.0, 'FN': 1623872.0, 'TP': 2339259.0, 'Recall': 0.5902552804840415, 'Precision': 0.6748591867044093, 'F1 Score': 0.6297283113372332, 'F2 Score': 0.6054353952126258}, 'Testing': {'TN': 2770768.0, 'FP': 1127033.0, 'FN': 1623872.0, 'TP': 2339259.0, 'Recall': 0.5902552804840415, 'Precision': 0.6748591867044093, 'F1 Score': 0.6297283113372332, 'F2 Score': 0.6054353952126258}}


In [0]:
logreg_predictions = cvModel.transform(test_data_assembled)
logreg_predictions_path = f"{team_blob_url}/data_OPTW_60MON_LOGREG_V1_predictions"
logreg_predictions.write.format("parquet").mode("overwrite").save(logreg_predictions_path)

In [0]:
best_params = cvModel.bestModel.extractParamMap()
for param, value in best_params.items():
    print(f"{param.name}: {value}")

aggregationDepth: 2
elasticNetParam: 0.0
family: auto
featuresCol: features
fitIntercept: True
labelCol: DEP_DEL15
maxBlockSizeInMB: 0.0
maxIter: 10
predictionCol: prediction
probabilityCol: probability
rawPredictionCol: rawPrediction
regParam: 0.01
standardization: True
threshold: 0.5
tol: 1e-06
