In [0]:
secret_scope = "team_3-4"
secret_key = "key_3_4" # The name of your container created in https://portal.azure.com
blob_container = "team3-4"  # The name of your Storage account created in https://portal.azure.com
storage_account = "daphnelin" 
team_blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
from pyspark.sql.functions import col, count, when, isnan, lit, udf, desc, mean, year, month, date_format, to_date, sum, expr, round, unix_timestamp, from_unixtime, avg, date_add, date_sub, when, to_date
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, DoubleType, StructType, StructField, StringType
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline

from pyspark.sql import DataFrame

from pyspark.ml.classification import LogisticRegression
from pyspark.sql.functions import col, unix_timestamp, avg
from pyspark.sql.window import Window
import time
from pyspark.mllib.evaluation import MulticlassMetrics

In [0]:
lr_gridsearch_predictions_path = f"{team_blob_url}/data_OPTW_60MON_LOGREG_V1_predictions"
lr_predictions = spark.read.format("parquet").load(lr_gridsearch_predictions_path)

In [0]:
nn_gridsearch_predictions_path = f"{team_blob_url}/data_OPTW_60MON_NN_V1_predictions"
nn_predictions = spark.read.format("parquet").load(nn_gridsearch_predictions_path)

In [0]:
xgboost_predictions_path = f"{team_blob_url}/data_OPTW_60MON_XGBOOST_V1_predictions"
xgboost_predictions = spark.read.format("parquet").load(xgboost_predictions_path)

In [0]:
ordered_class = (lr_predictions.orderBy(col('DISTANCE').asc(), col('HourlyDewPointTemperature').asc(), col('HourlyDryBulbTemperature').asc(), col('HourlyRelativeHumidity').asc(), col('HourlyStationPressure').asc()))
correct_class = ordered_class.select('DEP_DEL15').withColumnRenamed('DEP_DEL15', 'correct_class')

In [0]:
ordered_lr = (lr_predictions.orderBy(col('DISTANCE').asc(), col('HourlyDewPointTemperature').asc(), col('HourlyDryBulbTemperature').asc(), col('HourlyRelativeHumidity').asc(), col('HourlyStationPressure').asc()))

ordered_nn = (nn_predictions.orderBy(col('DISTANCE').asc(), col('HourlyDewPointTemperature').asc(), col('HourlyDryBulbTemperature').asc(), col('HourlyRelativeHumidity').asc(), col('HourlyStationPressure').asc()))

ordered_xgboost = (xgboost_predictions.orderBy(col('DISTANCE').asc(), col('HourlyDewPointTemperature').asc(), col('HourlyDryBulbTemperature').asc(), col('HourlyRelativeHumidity').asc(), col('HourlyStationPressure').asc()))

In [0]:
from pyspark.sql.functions import monotonically_increasing_id

nn_pred = ordered_nn.select('prediction').withColumnRenamed('prediction', 'neural_network')
lr_pred = ordered_lr.select('prediction').withColumnRenamed('prediction', 'logistic_regression')
xg_pred = ordered_xgboost.select('prediction').withColumnRenamed('prediction', 'xgboost')

df1 = correct_class.withColumn("id", monotonically_increasing_id())
df2 = nn_pred.withColumn("id", monotonically_increasing_id())
df3 = lr_pred.withColumn("id", monotonically_increasing_id())  
df4 = xg_pred.withColumn("id", monotonically_increasing_id())                       

df_joined1 = df1.join(df2, "id", "inner")
df_joined2 = df_joined1.join(df3, "id", "inner")
df_joined3 = df_joined2.join(df4, "id", "inner")
display(df_joined3.orderBy('id'))

id,correct_class,neural_network,logistic_regression,xgboost
0,1,0.0,0.0,0.0
1,0,0.0,0.0,0.0
2,0,0.0,0.0,0.0
3,0,0.0,0.0,0.0
4,0,0.0,0.0,0.0
5,0,0.0,0.0,0.0
6,0,0.0,0.0,0.0
7,0,0.0,0.0,0.0
8,0,0.0,0.0,0.0
9,0,0.0,0.0,0.0


In [0]:
from pyspark.sql.functions import expr

df_majority = df_joined3.withColumn('majority_prediction', expr('''(neural_network + logistic_regression + xgboost) >= 2''').cast('int'))
#df_majority = df_majority.drop('id')
display(df_majority.orderBy('id'))

id,correct_class,neural_network,logistic_regression,xgboost,majority_prediction
0,1,0.0,0.0,0.0,0
1,0,0.0,0.0,0.0,0
2,0,0.0,0.0,0.0,0
3,0,0.0,0.0,0.0,0
4,0,0.0,0.0,0.0,0
5,0,0.0,0.0,0.0,0
6,0,0.0,0.0,0.0,0
7,0,0.0,0.0,0.0,0
8,0,0.0,0.0,0.0,0
9,0,0.0,0.0,0.0,0


In [0]:
def get_metrics(dataset):
    
    # Select the prediction and label columns and convert to RDD
    predictionAndLabels = dataset.select(
        col('majority_prediction').cast('float'),
        col('correct_class').cast('float')
    ).rdd

    # Compute metrics using MulticlassMetrics
    metrics = MulticlassMetrics(predictionAndLabels)
    confusion_matrix = metrics.confusionMatrix().toArray()

    # Calculate metrics
    TN = confusion_matrix[0,0]
    FP = confusion_matrix[0,1]
    FN = confusion_matrix[1,0]
    TP = confusion_matrix[1,1]

    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    beta = 2
    f2 = ((1 + beta**2) * precision * recall) / ((beta**2 * precision) + recall) if ((beta**2 * precision) + recall) > 0 else 0

    return {
        'TN': TN, 'FP': FP, 'FN': FN, 'TP': TP,
        'Recall': recall, 'Precision': precision, 'F1 Score': f1, 'F2 Score': f2
    }

In [0]:
ensemble_results = get_metrics(df_majority)
print(ensemble_results)



{'TN': 3437492.0, 'FP': 1549177.0, 'FN': 784948.0, 'TP': 374296.0, 'Recall': 0.32287939381182906, 'Precision': 0.19459384145241446, 'F1 Score': 0.24283513536922138, 'F2 Score': 0.2852670602271277}
