In [1]:
# Import useful libraries:

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics


Realitza algun procés d’enginyeria de variables per millorar-ne la predicció

In [2]:
# Several feature engineering techniques have been used in this set of Jupyter notebooks: more specifically,
# null value imputation, feature normalisation and selection, and dummy variable creation.

In [3]:
# Rebuild the modelling dataset the same way as in the previous notebooks.
#
# Every step assigns its result back to `dades` instead of calling
# `dades[col].fillna(..., inplace=True)`. That chained in-place call operates on a
# temporary object: it emits a FutureWarning on recent pandas 2.x and leaves `dades`
# untouched once Copy-on-Write is enabled (the default from pandas 3.0).

# Open dataset:

dades = pd.read_csv("../Sprint 11/DelayedFlights.csv")

# Feature filtering:

dades = dades.drop(columns=["Unnamed: 0", "Year", "FlightNum", "TailNum", "Cancelled", "CancellationCode"])

# Null values imputation:

# ArrDelay: a missing value falls back to the DepDelay of the same row.
dades["ArrDelay"] = dades["ArrDelay"].fillna(dades["DepDelay"])

# Binary target: 0 only when ArrDelay is exactly 0, 1 for every other value.
# NOTE(review): negative ArrDelay values (presumably early arrivals) and any ArrDelay
# that is still NaN are labelled 1 as well - confirm this is the intended definition.
dades["is_delay"] = (dades["ArrDelay"] != 0).astype("int64")

# ArrTime: a missing value falls back to CRSArrTime.
dades["ArrTime"] = dades["ArrTime"].fillna(dades["CRSArrTime"])

# ActualElapsedTime: first try CRSElapsedTime + DepDelay; whatever is still missing
# gets the column mean, computed after that first fill.
delay = dades["CRSElapsedTime"] + dades["DepDelay"]

elapsed_filled = dades["ActualElapsedTime"].fillna(delay)

dades["ActualElapsedTime"] = elapsed_filled.fillna(elapsed_filled.mean())

# TaxiIn and TaxiOut: missing values are replaced by the column mean.
dades["TaxiIn"] = dades["TaxiIn"].fillna(dades["TaxiIn"].mean())

dades["TaxiOut"] = dades["TaxiOut"].fillna(dades["TaxiOut"].mean())


In [4]:
# Variables normalisation:
#
# Only the predictor columns are min-max scaled. The target `is_delay` is copied
# through unchanged, so it stays an integer class label and the fitted scaler is not
# tied to the target column.
#
# NOTE(review): the scaler is fitted on all rows of `dades`, i.e. before the
# train/test split performed further down, so the test rows contribute to the
# min/max. Consider fitting it on the training rows only.

feature_cols = ["ArrTime", "ActualElapsedTime", "Distance", "TaxiIn", "TaxiOut"]

scaler = MinMaxScaler()

scaled_features = pd.DataFrame(
    scaler.fit_transform(dades[feature_cols]),
    columns=feature_cols,
    index=dades.index,  # keep the rows aligned with `dades`
)

# Same column layout as before: target first, then the scaled features.
scaled_df = pd.concat([dades["is_delay"], scaled_features], axis=1)


_________________________________________________________________________________________________________
#########################################################################################################
_________________________________________________________________________________________________________

No utilitzis la variable DepDelay a l’hora de fer prediccions

In [5]:
# Since DepDelay is highly correlated with ArrDelay (in some cases the two are even identical), excluding it is
# expected to worsen our models' performance significantly further.

In [6]:
# The best-performing model so far will be used: logistic regression with balanced class weights.

In [7]:
# Sampling: separate the target from the predictors (DepDelay is not among them)
# and make a stratified train/test split with a fixed seed.

predictor_names = ["ArrTime", "ActualElapsedTime", "Distance", "TaxiIn", "TaxiOut"]

X = scaled_df.loc[:, predictor_names]
y = scaled_df.loc[:, "is_delay"]

# Stratifying on the target keeps the class proportions the same in both splits.
X_train_log, X_test_log, y_train_log, y_test_log = train_test_split(
    X,
    y,
    stratify=y,
    random_state=87,
)

In [8]:
# Logistic regression with balanced class weights: fit on the training split,
# then predict the labels of the test split.

# `fit` returns the estimator itself, so construction and fitting can be chained.
logistic = LogisticRegression(max_iter=600, class_weight="balanced").fit(X_train_log, y_train_log)

y_pred_log = logistic.predict(X_test_log)

In [9]:
# Performance on the test split, printed in this order:
# accuracy score, confusion matrix, F1 score and AUC score.

# The AUC needs scores rather than hard labels: take column 1 of predict_proba,
# i.e. the probability of the class listed second in `logistic.classes_`.
y_pred_prob_log = logistic.predict_proba(X_test_log)[:, 1]

for scorer, predicted in (
    (metrics.accuracy_score, y_pred_log),
    (metrics.confusion_matrix, y_pred_log),
    (metrics.f1_score, y_pred_log),
    (metrics.roc_auc_score, y_pred_prob_log),
):
    print(scorer(y_test_log, predicted))


0.5506123629153844
[[  4839   1921]
 [215668 261762]]
0.706402397475149
0.6761823924248259


In [10]:
# After trial and error, this is the model that provides the most balanced insight into the data we are trying to 
# predict. It predicts most of the minority class instances, which comes at the cost of mislabeling many majority
# class instances and very poor performance metrics.

# In a real-world scenario, besides studying more appropriate models further, the choice of classification
# threshold would have real business implications and would remain a context-dependent decision.