# set up 


In [0]:
secret_scope = "team_3-4"
secret_key = "key_3_4" # The name of your container created in https://portal.azure.com
blob_container = "team3-4"  # The name of your Storage account created in https://portal.azure.com
storage_account = "daphnelin" 
team_blob_url = f"wasbs://{blob_container}@{storage_account}.blob.core.windows.net"

spark.conf.set(
  f"fs.azure.sas.{blob_container}.{storage_account}.blob.core.windows.net",
  dbutils.secrets.get(scope = secret_scope, key = secret_key)
)

In [0]:
from pyspark.sql.functions import col, count, when, isnan, lit, udf, desc, mean, year, month, date_format, to_date
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, DoubleType, StructType, StructField, StringType
from pyspark.sql import functions as F

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

from pyspark.ml.feature import StringIndexer, OneHotEncoder,VectorAssembler
from pyspark.ml import Pipeline

from pyspark.sql import DataFrame

from pyspark.ml.classification import LogisticRegression


# feature selection


## import data and define features

In [0]:
data_BASE_DIR = "dbfs:/mnt/mids-w261/"
df_otpw_12m = spark.read.option("header", "true").option("compression", "gzip").csv(f"{data_BASE_DIR}/OTPW_12M/OTPW_12M_2015.csv.gz")


#df_otpw_60m = spark.read.format("csv").option("header", "true").option("inferSchema", "true").option("compression", "gzip").load(f"{data_BASE_DIR}/OTPW_60M/")
#display(df_otpw_60m)


In [0]:

# columns_to_select = [
#     "DEP_DEL15", "CANCELLED", "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
#     "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM","TAIL_NUM",
#     "sched_depart_date_time_UTC", "DISTANCE", 'FL_DATE',

#     "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
#     "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",

#     'HourlyAltimeterSetting',
#     'HourlyDewPointTemperature',
#     'HourlyDryBulbTemperature',
#     'HourlyRelativeHumidity',
#     'HourlyStationPressure',
#     'HourlyVisibility',
#     'HourlyWetBulbTemperature',
#     'HourlyWindDirection',
#     'HourlyWindSpeed',
# ]

#take out TAIL_NUM AND OP_CARRIER_AIRLINE_ID
columns_to_select = [
    "DEP_DEL15", "CANCELLED", "QUARTER", "MONTH", "DAY_OF_MONTH", "DAY_OF_WEEK",
    "sched_depart_date_time_UTC", "DISTANCE", 'FL_DATE', "OP_CARRIER_AIRLINE_ID",
    "ORIGIN",
    "DEST",

    'HourlyAltimeterSetting',
    'HourlyDewPointTemperature',
    'HourlyDryBulbTemperature',
    'HourlyRelativeHumidity',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWetBulbTemperature',
    'HourlyWindDirection',
    'HourlyWindSpeed',
]

# categorical_columns = [
#     "QUARTER", "MONTH","DAY_OF_MONTH","DAY_OF_WEEK",
#     "OP_CARRIER_AIRLINE_ID", "OP_CARRIER_FL_NUM","TAIL_NUM",
#     "ORIGIN", "ORIGIN_CITY_MARKET_ID","ORIGIN_STATE_ABR",
#     "DEST", "DEST_CITY_MARKET_ID","DEST_STATE_ABR",
#     ]

categorical_columns = [
    "QUARTER", "MONTH","DAY_OF_MONTH","DAY_OF_WEEK",
    "OP_CARRIER_AIRLINE_ID", "ORIGIN", "DEST"
    ]

columns_to_cast = [
    'DEP_DEL15',
    'DISTANCE',
    'HourlyAltimeterSetting',
    'HourlyDewPointTemperature',
    'HourlyDryBulbTemperature',
    'HourlyRelativeHumidity',
    'HourlyStationPressure',
    'HourlyVisibility',
    'HourlyWetBulbTemperature',
    'HourlyWindDirection',
    'HourlyWindSpeed',
]

datetime_column = 'sched_depart_date_time_UTC'

columns_to_drop = [datetime_column] + ["CANCELLED","hour","hourIndex"]

## feature engineering

In [0]:
def filter_and_select_columns(df: DataFrame, columns_to_select: list, columns_to_cast: list) -> DataFrame:
    for column in columns_to_cast:
        df = df.withColumn(column, col(column).cast('integer'))
    
    df = df.select(*columns_to_select).filter(col("CANCELLED") != "1.0").cache()
    return df

In [0]:
def encode_categorical_features(df: DataFrame, categorical_columns: list) -> DataFrame:
    stages = []
    for categorical_col in categorical_columns:
        string_indexer = StringIndexer(inputCol=categorical_col, outputCol=categorical_col + "Index")
        encoder = OneHotEncoder(inputCols=[string_indexer.getOutputCol()], outputCols=[categorical_col + "classVec"])
        stages += [string_indexer, encoder]
    
    pipeline = Pipeline(stages=stages)
    pipeline_model = pipeline.fit(df)
    return pipeline_model.transform(df).cache()

In [0]:
def hour_to_bin(hour):
    return (int(hour) // 4) + 1

def bin_and_encode_hour(df: DataFrame, datetime_column: str) -> DataFrame:
    hour_to_bin_udf = udf(hour_to_bin, IntegerType())
    df = df.withColumn("hour", hour_to_bin_udf(df[datetime_column].substr(12, 2)))
    
    string_indexer = StringIndexer(inputCol="hour", outputCol="hourIndex").setHandleInvalid("skip")
    encoder = OneHotEncoder(inputCols=["hourIndex"], outputCols=["hourVec"])
    pipeline = Pipeline(stages=[string_indexer, encoder])
    pipeline_model = pipeline.fit(df)
    
    return pipeline_model.transform(df).cache()

In [0]:
def drop_and_select_columns(df: DataFrame, categorical_columns: list, columns_to_drop: list) -> DataFrame:
    categorical_columns_indexed = [col + "Index" for col in categorical_columns]
    all_columns_to_drop = categorical_columns + categorical_columns_indexed + columns_to_drop
    remaining_columns = [col for col in df.columns if col not in all_columns_to_drop]
    return df.select(*remaining_columns).cache()

In [0]:
# Step 1: Filter and select columns
df_filtered = filter_and_select_columns(df_otpw_12m, columns_to_select, columns_to_cast)

In [0]:
# Step 2: Encode categorical features
df_encoded_categorical = encode_categorical_features(df_filtered, categorical_columns)

Downloading artifacts:   0%|          | 0/100 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Step 3: Bin and encode the hour
df_encoded_hour = bin_and_encode_hour(df_encoded_categorical, datetime_column)

Downloading artifacts:   0%|          | 0/16 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
# Step 4: Drop unnecessary columns and finalize the DataFrame
df_final = drop_and_select_columns(df_encoded_hour, categorical_columns, columns_to_drop)

In [0]:
df_final.columns

['DEP_DEL15',
 'DISTANCE',
 'FL_DATE',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyRelativeHumidity',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'QUARTERclassVec',
 'MONTHclassVec',
 'DAY_OF_MONTHclassVec',
 'DAY_OF_WEEKclassVec',
 'OP_CARRIER_AIRLINE_IDclassVec',
 'ORIGINclassVec',
 'DESTclassVec',
 'hourVec']

In [0]:
display(df_final)

com.databricks.backend.common.rpc.CommandCancelledException
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:103)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$2(SequenceExecutionState.scala:103)
	at com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$2$adapted(SequenceExecutionState.scala:100)
	at scala.collection.immutable.Range.foreach(Range.scala:158)
	at com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:100)
	at com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:714)
	at com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:430)
	at scala.Option.getOrElse(Option.scala:189)
	at com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:430)
	at com.databricks.spark.chauffeur.ChauffeurState.cancelExecutio

In [0]:
df_final.count()

31184701

In [0]:
#persistent store
transformed_data_path = f"{team_blob_url}/data_OPTW_60MON_LR_less_cat"
df_final.write.format("parquet").mode("overwrite").save(transformed_data_path)

# split data and compute class weights 

In [0]:
split_date = "2015-10-01"
train_data = df_final.filter(F.col("FL_DATE") < split_date)
test_data = df_final.filter(F.col("FL_DATE") >= split_date)
train_data = train_data.drop('FL_DATE')
test_data = test_data.drop('FL_DATE')


In [0]:
#extract feature list
feature_list = df_final.columns
feature_list.remove('DEP_DEL15')
feature_list.remove('FL_DATE')
feature_list

['DISTANCE',
 'HourlyAltimeterSetting',
 'HourlyDewPointTemperature',
 'HourlyDryBulbTemperature',
 'HourlyRelativeHumidity',
 'HourlyStationPressure',
 'HourlyVisibility',
 'HourlyWetBulbTemperature',
 'HourlyWindDirection',
 'HourlyWindSpeed',
 'QUARTERclassVec',
 'MONTHclassVec',
 'DAY_OF_MONTHclassVec',
 'DAY_OF_WEEKclassVec',
 'OP_CARRIER_AIRLINE_IDclassVec',
 'ORIGINclassVec',
 'DESTclassVec',
 'hourVec']


## calculate class weights

In [0]:
def calculate_class_weights(df: DataFrame, label_column: str, weight_column_name: str = 'classWeight') -> DataFrame:
    """
    Calculate class weights based on the frequencies of labels in a specified column and add a weight column to the DataFrame.

    Parameters:
    df (DataFrame): The input DataFrame.
    label_column (str): The name of the column to compute class weights for.
    weight_column_name (str): The name of the new column that will contain the computed weights.

    Returns:
    DataFrame: The DataFrame with an additional column containing the weights for each class.
    """
    # Calculate class frequencies
    class_freqs = df.groupBy(label_column).count().collect()

    # Total count of the dataset
    total_count = df.count()

    # Create a dictionary with class weights
    weight_dict = {row[label_column]: float(total_count) / row['count'] for row in class_freqs}

    # Broadcast the dictionary to use in withColumn operation
    broadcast_weights = spark.sparkContext.broadcast(weight_dict)

    # Function to apply the weight based on the class
    def get_weight(class_label):
        return broadcast_weights.value.get(class_label, 0)  # return 0 if the class_label is not found

    # Register UDF
    get_weight_udf = udf(get_weight, DoubleType())

    # Add weight column to DataFrame
    return df.withColumn(weight_column_name, get_weight_udf(col(label_column)))


train_data_with_weights = calculate_class_weights(train_data, 'DEP_DEL15')


In [0]:
class_counts = train_data.groupBy('DEP_DEL15').count().collect()
min_count = min(class_counts, key = lambda x: x['count'])['count']
max_count = max(class_counts, key=lambda x: x['count'])['count']
majority_fraction = min_count / max_count
downsampled_train = train_data.sampleBy('DEP_DEL15', fractions={0: majority_fraction, 1: 1.0}, seed=42)


# model training

In [0]:
assembler = VectorAssembler(inputCols=feature_list, outputCol="features",handleInvalid="skip")
train_data_assembled = assembler.transform(downsampled_train)
test_data_assembled = assembler.transform(test_data)

classifier = LogisticRegression(featuresCol='features', labelCol='DEP_DEL15', 
                                predictionCol='prediction', regParam = 0.1, elasticNetParam = 0.5, maxIter = 20)
    #                             regParam=0.1,          # Regularization parameter
    # elasticNetParam=1.0,   # Elastic Net parameter (L1 norm)
    # maxIter=10)             # Maximum number of iterations

fitted_classifier = classifier.fit(train_data_assembled)
lr_test_predictions = fitted_classifier.transform(test_data_assembled)


Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

# Evaluation

In [0]:
total_train = downsampled_train.count()
label_distribution_train = downsampled_train.groupBy('DEP_DEL15').count()
label_distribution_train = label_distribution_train.withColumn('percentage', (col('count') / lit(total_train) * 100))
label_distribution_train.show()


+---------+------+------------------+
|DEP_DEL15| count|        percentage|
+---------+------+------------------+
|        1|826349|49.973209706915405|
|        0|827235|  50.0267902930846|
+---------+------+------------------+



In [0]:
total_test = test_data.count()
label_distribution_test = test_data.groupBy('DEP_DEL15').count()
label_distribution_test = label_distribution_test.withColumn('percentage', (col('count') / lit(total_test) * 100))
label_distribution_test.show()

+---------+-------+------------------+
|DEP_DEL15|  count|        percentage|
+---------+-------+------------------+
|        1| 227335|16.053099119863518|
|        0|1188809| 83.94690088013648|
+---------+-------+------------------+



In [0]:
from pyspark.mllib.evaluation import MulticlassMetrics

#train_predictions = fitted_classifier.transform(train_data_assembled)
#test_predictions = fitted_classifier.transform(test_data_assembled)

#calculate percentage of predicted delays vs. no delays
total_predictions = lr_test_predictions.count()
predictions_group = lr_test_predictions.groupBy('prediction').count()
percentage_predictions = predictions_group.withColumn('percentage', col('count') / total_predictions * 100)
percentage_predictions.show()



+----------+------+------------------+
|prediction| count|        percentage|
+----------+------+------------------+
|       0.0|378788|29.144443452756498|
|       1.0|920904| 70.85555654724351|
+----------+------+------------------+



In [0]:

predictionAndLabels = lr_test_predictions.select(
    col('prediction').cast('float'),
    col('DEP_DEL15').cast('float')
).rdd

metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

print("Confusion Matrix:\n", confusion_matrix)

print("details:")
print("          Predicted: No    Predicted: Yes")
print(f"Actual: No  TN = {confusion_matrix[0,0]:7.0f}    FP = {confusion_matrix[0,1]:7.0f}")
print(f"Actual: Yes FN = {confusion_matrix[1,0]:7.0f}    TP = {confusion_matrix[1,1]:7.0f}")




Confusion Matrix:
 [[344702. 744182.]
 [ 34086. 176722.]]
details:
          Predicted: No    Predicted: Yes
Actual: No  TN =  344702    FP =  744182
Actual: Yes FN =   34086    TP =  176722


In [0]:
# #calculating recall and precision manually
# TP = confusion_matrix[1, 1]
# FN = confusion_matrix[1, 0]
# FP = confusion_matrix[0, 1]
# recall_for_class_1 = TP / (TP + FN) if (TP + FN) > 0 else 0
# precision_for_class_1 = TP / (TP + FP) if (TP + FP) > 0 else 0
# beta = 2
# f_beta_for_class_1 = ((1 + beta**2) * precision_for_class_1 * recall_for_class_1) / ((beta**2 * precision_for_class_1) + recall_for_class_1) if ((beta**2 * precision_for_class_1) + recall_for_class_1) > 0 else 0

# print("Recall for class 1 (Delays):", recall_for_class_1)
# print("Precision for class 1 (Delays):", precision_for_class_1)
# print("F2 Score for class 1 (Delays):", f_beta_for_class_1)

## grid search


In [0]:
from pyspark.ml.classification import MultilayerPerceptronClassifier
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.classification import LogisticRegression

#lr = LogisticRegression()
lr = LogisticRegression(featuresCol='features', labelCol='DEP_DEL15')

paramGrid = ParamGridBuilder() \
            .addGrid(lr.regParam, [0.01, 0.5, 2.0]) \
            .addGrid(lr.elasticNetParam, [0.0, 0.5, 1.0])  \
            .addGrid(lr.maxIter, [10, 20, 50]) \
            .build()

evaluator = MulticlassClassificationEvaluator(metricName='weightedRecall', labelCol='DEP_DEL15')

crossval = CrossValidator(estimator=lr,
                          estimatorParamMaps=paramGrid,
                          evaluator=evaluator,
                          numFolds=5)

cvModel = crossval.fit(train_data_assembled)

#bestModel = cvModel.bestModel
#print("Best parameters found on grid search:", bestModel._java_obj.parent().getLayers(), bestModel._java_obj.parent().getBlockSize(), bestModel._java_obj.parent().getMaxIter(), bestModel._java_obj.parent().getStepSize())

Downloading artifacts:   0%|          | 0/15 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

Downloading artifacts:   0%|          | 0/9 [00:00<?, ?it/s]

Uploading artifacts:   0%|          | 0/4 [00:00<?, ?it/s]

In [0]:
#bestModel = cvModel.bestModel
lr_gs_predictions = cvModel.transform(test_data_assembled)

In [0]:
total_predictions = lr_gs_predictions.count()
predictions_group = lr_gs_predictions.groupBy('prediction').count()
percentage_predictions = predictions_group.withColumn('percentage', col('count') / total_predictions * 100)
percentage_predictions.show()

+----------+------+----------------+
|prediction| count|      percentage|
+----------+------+----------------+
|       0.0|648466|49.8938209975902|
|       1.0|651226|50.1061790024098|
+----------+------+----------------+



In [0]:
predictionAndLabels = lr_gs_predictions.select(
    col('prediction').cast('float'),
    col('DEP_DEL15').cast('float')
).rdd

metrics = MulticlassMetrics(predictionAndLabels)
confusion_matrix = metrics.confusionMatrix().toArray()

print("Confusion Matrix:\n", confusion_matrix)

print("details:")
print("          Predicted: No    Predicted: Yes")
print(f"Actual: No  TN = {confusion_matrix[0,0]:7.0f}    FP = {confusion_matrix[0,1]:7.0f}")
print(f"Actual: Yes FN = {confusion_matrix[1,0]:7.0f}    TP = {confusion_matrix[1,1]:7.0f}")



Confusion Matrix:
 [[582333. 506551.]
 [ 66133. 144675.]]
details:
          Predicted: No    Predicted: Yes
Actual: No  TN =  582333    FP =  506551
Actual: Yes FN =   66133    TP =  144675


In [0]:
#calculating recall and precision manually
TP = confusion_matrix[1, 1]
FN = confusion_matrix[1, 0]
FP = confusion_matrix[0, 1]
recall_for_class_1 = TP / (TP + FN) if (TP + FN) > 0 else 0
precision_for_class_1 = TP / (TP + FP) if (TP + FP) > 0 else 0
beta = 2
f_beta_for_class_1 = ((1 + beta**2) * precision_for_class_1 * recall_for_class_1) / ((beta**2 * precision_for_class_1) + recall_for_class_1) if ((beta**2 * precision_for_class_1) + recall_for_class_1) > 0 else 0

print("Recall for class 1 (Delays):", recall_for_class_1)
print("Precision for class 1 (Delays):", precision_for_class_1)
print("F2 Score for class 1 (Delays):", f_beta_for_class_1)

Recall for class 1 (Delays): 0.6862879966604682
Precision for class 1 (Delays): 0.2221578991010801
F2 Score for class 1 (Delays): 0.4840383603955413


In [0]:
display(lr_gs_predictions)

DEP_DEL15,DISTANCE,HourlyAltimeterSetting,HourlyDewPointTemperature,HourlyDryBulbTemperature,HourlyRelativeHumidity,HourlyStationPressure,HourlyVisibility,HourlyWetBulbTemperature,HourlyWindDirection,HourlyWindSpeed,QUARTERclassVec,MONTHclassVec,DAY_OF_MONTHclassVec,DAY_OF_WEEKclassVec,OP_CARRIER_AIRLINE_IDclassVec,ORIGINclassVec,DESTclassVec,hourVec,features,rawPrediction,probability,prediction
1,2475,29,32,60,35,29,10,47,330,10,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(7), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(12), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(18), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 20, 36, 55, 62, 77, 410, 713), values -> List(2475.0, 29.0, 32.0, 60.0, 35.0, 29.0, 10.0, 47.0, 330.0, 10.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-0.31035619758828537, 0.31035619758828537))","Map(vectorType -> dense, length -> 2, values -> List(0.4230277974590642, 0.5769722025409358))",1.0
0,3784,30,55,63,75,29,10,58,10,11,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(24), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(32), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 18, 48, 59, 62, 75, 424, 715), values -> List(3784.0, 30.0, 55.0, 63.0, 75.0, 29.0, 10.0, 58.0, 10.0, 11.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.3924732799529611, -0.3924732799529611))","Map(vectorType -> dense, length -> 2, values -> List(0.5968779490412968, 0.40312205095870324))",0.0
0,3711,30,68,73,84,29,10,70,70,15,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(50), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21, 28, 59, 62, 123, 394), values -> List(3711.0, 30.0, 68.0, 73.0, 84.0, 29.0, 10.0, 70.0, 70.0, 15.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.10179763661084928, -0.10179763661084928))","Map(vectorType -> dense, length -> 2, values -> List(0.5254274547291755, 0.4745725452708245))",0.0
1,3711,29,54,64,70,29,10,58,180,11,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(7), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(13), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(51), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 20, 37, 58, 62, 75, 443, 715), values -> List(3711.0, 29.0, 54.0, 64.0, 70.0, 29.0, 10.0, 58.0, 180.0, 11.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.30667618693342114, -0.30667618693342114))","Map(vectorType -> dense, length -> 2, values -> List(0.5760737494292506, 0.4239262505707494))",0.0
0,2586,30,23,49,36,30,10,39,290,23,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(17), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(18), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21, 41, 62, 91, 397, 715), values -> List(2586.0, 30.0, 23.0, 49.0, 36.0, 30.0, 10.0, 39.0, 290.0, 23.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.5194843862360647, -0.5194843862360647))","Map(vectorType -> dense, length -> 2, values -> List(0.6270271906680522, 0.37297280933194776))",0.0
0,2475,29,44,58,60,29,10,51,280,20,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(27), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(18), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(), values -> List())","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21, 51, 58, 62, 77, 410), values -> List(2475.0, 29.0, 44.0, 58.0, 60.0, 29.0, 10.0, 51.0, 280.0, 20.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-0.09753527307139143, 0.09753527307139143))","Map(vectorType -> dense, length -> 2, values -> List(0.475635493895509, 0.524364506104491))",1.0
0,2486,29,68,75,79,29,10,70,60,10,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(16), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(50), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(4), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21, 40, 58, 62, 123, 396, 717), values -> List(2486.0, 29.0, 68.0, 75.0, 79.0, 29.0, 10.0, 70.0, 60.0, 10.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(1.5124795033242622, -1.5124795033242622))","Map(vectorType -> dense, length -> 2, values -> List(0.819428378444872, 0.18057162155512796))",0.0
0,2586,30,48,58,70,29,9,53,230,18,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(7), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(16), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(18), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 20, 40, 54, 62, 78, 410, 713), values -> List(2586.0, 30.0, 48.0, 58.0, 70.0, 29.0, 9.0, 53.0, 230.0, 18.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-0.6660685698675537, 0.6660685698675537))","Map(vectorType -> dense, length -> 2, values -> List(0.3393777119391575, 0.6606222880608426))",1.0
0,2586,30,39,60,46,30,10,50,350,10,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(8), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(1), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(3), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(18), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 21, 25, 57, 62, 91, 397, 713), values -> List(2586.0, 30.0, 39.0, 60.0, 46.0, 30.0, 10.0, 50.0, 350.0, 10.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(-0.5526173024803298, 0.5526173024803298))","Map(vectorType -> dense, length -> 2, values -> List(0.3652573882644934, 0.6347426117355066))",1.0
0,2586,29,46,51,83,29,9,49,290,24,"Map(vectorType -> sparse, length -> 3, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 11, indices -> List(7), values -> List(1.0))","Map(vectorType -> sparse, length -> 30, indices -> List(0), values -> List(1.0))","Map(vectorType -> sparse, length -> 6, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 13, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 319, indices -> List(5), values -> List(1.0))","Map(vectorType -> sparse, length -> 321, indices -> List(18), values -> List(1.0))","Map(vectorType -> sparse, length -> 5, indices -> List(2), values -> List(1.0))","Map(vectorType -> sparse, length -> 718, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 12, 20, 24, 59, 62, 78, 410, 715), values -> List(2586.0, 29.0, 46.0, 51.0, 83.0, 29.0, 9.0, 49.0, 290.0, 24.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))","Map(vectorType -> dense, length -> 2, values -> List(0.18321446053626467, -0.18321446053626467))","Map(vectorType -> dense, length -> 2, values -> List(0.5456759175482463, 0.4543240824517537))",0.0


In [0]:
transformed_data_path = f"{team_blob_url}/data_OPTW_12MON_LR_GS_preds"
lr_gs_predictions.write.format("parquet").mode("overwrite").save(transformed_data_path)

In [0]:
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
# from pyspark.ml.evaluation import MulticlassClassificationEvaluator


[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-2926108717732871>, line 11[0m
[1;32m      5[0m classifier [38;5;241m=[39m LogisticRegression(featuresCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, labelCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mDEP_DEL15[39m[38;5;124m'[39m, 
[1;32m      6[0m                                 predictionCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mprediction[39m[38;5;124m'[39m, weightCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mclassWeight[39m[38;5;124m'[39m)
[1;32m      7[0m     [38;5;66;03m#                             regParam=0.1,          # Regularization parameter[39;00m
[1;32m      8[0m     [38;5;66;03m# elasticNetParam=1.0,   # Elastic Net parameter (L1 norm)[39;00m
[1;32m      9[0m     [38;5;66;03m# maxIter=10)             # Maximum n

In [0]:
# paramGrid = ParamGridBuilder() \
#     .addGrid(classifier.regParam, [0.01, 0.1, 1.0]) \
#     .addGrid(classifier.elasticNetParam, [0.0, 0.5, 1.0]) \
#     .addGrid(classifier.maxIter, [10, 50, 100]) \
#     .build()
# evaluator = MulticlassClassificationEvaluator(labelCol='DEP_DEL15', predictionCol="prediction", metricName="recallByLabel", metricLabel=1)


[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-2926108717732871>, line 11[0m
[1;32m      5[0m classifier [38;5;241m=[39m LogisticRegression(featuresCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, labelCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mDEP_DEL15[39m[38;5;124m'[39m, 
[1;32m      6[0m                                 predictionCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mprediction[39m[38;5;124m'[39m, weightCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mclassWeight[39m[38;5;124m'[39m)
[1;32m      7[0m     [38;5;66;03m#                             regParam=0.1,          # Regularization parameter[39;00m
[1;32m      8[0m     [38;5;66;03m# elasticNetParam=1.0,   # Elastic Net parameter (L1 norm)[39;00m
[1;32m      9[0m     [38;5;66;03m# maxIter=10)             # Maximum n

In [0]:
# crossval = CrossValidator(estimator=classifier,
#                           estimatorParamMaps=paramGrid,
#                           evaluator=evaluator,
#                           numFolds=5)  
# cvModel = crossval.fit(train_data_assembled)


[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-2926108717732871>, line 11[0m
[1;32m      5[0m classifier [38;5;241m=[39m LogisticRegression(featuresCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, labelCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mDEP_DEL15[39m[38;5;124m'[39m, 
[1;32m      6[0m                                 predictionCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mprediction[39m[38;5;124m'[39m, weightCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mclassWeight[39m[38;5;124m'[39m)
[1;32m      7[0m     [38;5;66;03m#                             regParam=0.1,          # Regularization parameter[39;00m
[1;32m      8[0m     [38;5;66;03m# elasticNetParam=1.0,   # Elastic Net parameter (L1 norm)[39;00m
[1;32m      9[0m     [38;5;66;03m# maxIter=10)             # Maximum n

In [0]:
# predictions = cvModel.bestModel.transform(test_data_assembled)
# final_score = evaluator.evaluate(predictions)
# print(f"Best model's score on test data: {final_score}")


[0;31m---------------------------------------------------------------------------[0m
[0;31mPy4JJavaError[0m                             Traceback (most recent call last)
File [0;32m<command-2926108717732871>, line 11[0m
[1;32m      5[0m classifier [38;5;241m=[39m LogisticRegression(featuresCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mfeatures[39m[38;5;124m'[39m, labelCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mDEP_DEL15[39m[38;5;124m'[39m, 
[1;32m      6[0m                                 predictionCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mprediction[39m[38;5;124m'[39m, weightCol[38;5;241m=[39m[38;5;124m'[39m[38;5;124mclassWeight[39m[38;5;124m'[39m)
[1;32m      7[0m     [38;5;66;03m#                             regParam=0.1,          # Regularization parameter[39;00m
[1;32m      8[0m     [38;5;66;03m# elasticNetParam=1.0,   # Elastic Net parameter (L1 norm)[39;00m
[1;32m      9[0m     [38;5;66;03m# maxIter=10)             # Maximum n