Imports

In [None]:
import pandas as pd
from sklearn import preprocessing, tree
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
import lightgbm as lgb

from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Open and display dataframe

In [None]:
# Read the training set from the CSV file in the working directory and
# preview its first five rows.
train_df = pd.read_csv(filepath_or_buffer='flight_delays_train.csv')
train_df.head(n=5)

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


Map carrier, origin/destination airport codes and delayed to numeric attributes

In [None]:
# Each categorical column is encoded with integer codes assigned in order of
# first appearance in the training data. The label lists and lookup
# dictionaries are kept as notebook variables so the same encoding can be
# reused later.

# UniqueCarrier
uc_labels = train_df.UniqueCarrier.unique().tolist()
label_dict_uc_train = {label: code for code, label in enumerate(uc_labels)}

# Origin
origin_labels = train_df.Origin.unique().tolist()
label_dict_origin_train = {label: code for code, label in enumerate(origin_labels)}

# Dest
dest_labels = train_df.Dest.unique().tolist()
label_dict_dest_train = {label: code for code, label in enumerate(dest_labels)}

# `Series.map` performs one dictionary lookup per value. It is considerably
# faster than `Series.replace` with dictionaries of this size and does not go
# through `replace`'s dtype-downcasting path. Every value is a key of its
# dictionary by construction, so no entry is left unmapped.
train_df['UniqueCarrier'] = train_df.UniqueCarrier.map(label_dict_uc_train)
train_df['Origin'] = train_df.Origin.map(label_dict_origin_train)
train_df['Dest'] = train_df.Dest.map(label_dict_dest_train)

# Map 'delayed' to 1/0 and keep it as a NumPy array so the cross-validation
# cells can index it with positional fold indices.
delayed = train_df['dep_delayed_15min'].map({'Y': 1, 'N': 0}).values

train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,0,0,0,732,N
1,c-4,c-20,c-3,1548,1,1,1,834,N
2,c-9,c-2,c-5,1422,2,2,2,416,N
3,c-11,c-25,c-6,1015,3,3,3,872,N
4,c-10,c-7,c-6,1828,4,4,4,423,Y


Clean attributes related to date and cast them to int

In [None]:
# The date-related columns hold strings such as 'c-8'; split them on '-' so
# that the numeric part ends up in position 1 of each resulting list.
month = train_df['Month'].str.split('-')
day = train_df['DayofMonth'].str.split('-')
dow = train_df['DayOfWeek'].str.split('-')

# Store the integer part of every split value under a new, shorter column name.
for target, parts in (('Mon', month), ('DOM', day), ('DOW', dow)):
    train_df[target] = parts.map(lambda pieces: int(pieces[1]))

# The raw date strings and the label column are not needed as features
# (the label was already extracted into `delayed`).
train_df = train_df.drop(
    columns=['dep_delayed_15min', 'Month', 'DayofMonth', 'DayOfWeek']
)

train_df.head()

Unnamed: 0,DepTime,UniqueCarrier,Origin,Dest,Distance,Mon,DOM,DOW
0,1934,0,0,0,732,8,21,7
1,1548,1,1,1,834,4,20,3
2,1422,2,2,2,416,9,2,5
3,1015,3,3,3,872,11,25,6
4,1828,4,4,4,423,10,7,6


Combine time and date into one single attribute

In [None]:
# Create column year and set all values as 2015 (per the original author, the
# year when data was collected)
train_df['Year'] = 2015

# Rename columns to 'Month' and 'Day' so that, together with 'Year', they can
# be assembled into a date by `pd.to_datetime`
train_df = train_df.rename(columns={'Mon': 'Month', 'DOM': 'Day'})

# Convert year, month and day to 'datetime'
train_df['Date'] = pd.to_datetime(train_df[['Year', 'Month', 'Day']])

# Separate hours and minutes into their respective columns.
# Hours 24 and 25 are folded to 0, as in the original analysis. The result is
# assigned back to the column: calling `replace(..., inplace=True)` on a column
# selection is a chained in-place update, which warns in pandas 2.x and leaves
# the frame unchanged under Copy-on-Write.
train_df['DepHour'] = (train_df['DepTime'] // 100).replace(to_replace=[24, 25], value=0)
train_df['DepMinute'] = train_df['DepTime'] % 100

# Save the time in minutes
train_df['Minutes'] = train_df['DepMinute'] + train_df['DepHour'] * 60

# Convert time to 'timedelta'
train_df['Time'] = pd.to_timedelta(train_df['Minutes'], unit='m')

# Combine both date and time to one single attribute (datetime + timedelta
# already yields a datetime column, so no further conversion is needed)
train_df['DateTime'] = train_df['Date'] + train_df['Time']

# Drop the intermediate columns and the ones now encoded in 'DateTime'
train_df = train_df.drop(columns=['Date', 'DepHour', 'DepMinute', 'Minutes', 'Time',
                                  'Month', 'Day', 'Year', 'DepTime'])

train_df.head()

Unnamed: 0,UniqueCarrier,Origin,Dest,Distance,DOW,DateTime
0,0,0,0,732,7,2015-08-21 19:34:00
1,1,1,1,834,3,2015-04-20 15:48:00
2,2,2,2,416,5,2015-09-02 14:22:00
3,3,3,3,872,6,2015-11-25 10:15:00
4,4,4,4,423,6,2015-10-07 18:28:00


Cast 'datetime' to numeric

In [None]:
# Replace the timestamp column with its integer representation so that it can
# be scaled and fed to the models like any other numeric attribute.
train_df = train_df.assign(DateTime=lambda frame: pd.to_numeric(frame['DateTime']))

train_df.head()

Unnamed: 0,UniqueCarrier,Origin,Dest,Distance,DOW,DateTime
0,0,0,0,732,7,1440185640000000000
1,1,1,1,834,3,1429544880000000000
2,2,2,2,416,5,1441203720000000000
3,3,3,3,872,6,1448446500000000000
4,4,4,4,423,6,1444242480000000000


Normalize values

In [None]:
# Remember the column labels; the scaler returns a bare array without them.
att = train_df.columns.tolist()

# Rescale every column with a MinMaxScaler fitted on the full training frame.
scaler = preprocessing.MinMaxScaler()
train_values = train_df.to_numpy()
values_scaled = scaler.fit(train_values).transform(train_values)

# Wrap the scaled array in a dataframe again, restoring the column labels.
train_scaled_df = pd.DataFrame(data=values_scaled, columns=att)

train_scaled_df.head()

Unnamed: 0,UniqueCarrier,Origin,Dest,Distance,DOW,DateTime
0,0.0,0.0,0.0,0.142336,1.0,0.637641
1,0.047619,0.003472,0.003472,0.163017,0.333333,0.300003
2,0.095238,0.006944,0.006944,0.078264,0.666667,0.669945
3,0.142857,0.010417,0.010417,0.170722,0.833333,0.899763
4,0.190476,0.013889,0.013889,0.079684,0.833333,0.766367


Decision Tree Classifier using stratified K-Fold split

In [None]:
# Save dataframe values to 'train_data' variable
train_data = train_scaled_df.values

# Create 'StratifiedKFold' object
skf = StratifiedKFold(n_splits=5)

# Create decision tree object
model = tree.DecisionTreeClassifier(random_state=271828)

# ROC AUC of every fold, so the final figure summarises the whole
# cross-validation instead of only the last split
fold_auc = []

# Train and test model using Stratified K-Fold
for train_index, test_index in skf.split(train_data, delayed):
  train, test = train_data[train_index], train_data[test_index]
  delay_train, delay_test = delayed[train_index], delayed[test_index]
  model.fit(train, delay_train)
  delay_pred = model.predict(test)
  print(classification_report(delay_test, delay_pred))

  # roc_auc_score expects (y_true, y_score). The score is the predicted
  # probability of the positive class (label 1, second column of
  # predict_proba), not the hard 0/1 prediction.
  delay_score = model.predict_proba(test)[:, 1]
  fold_auc.append(roc_auc_score(delay_test, delay_score))

# Print final results
print("ROC AUC score (mean over {} folds): {}".format(
    len(fold_auc), sum(fold_auc) / len(fold_auc)))

# The two reports below describe the model fitted on the last fold only.
print("Features importance: {}".format(model.feature_importances_))

tn, fp, fn, tp = confusion_matrix(delay_test, delay_pred).ravel()
print("Confusion matrix (TN, FP, FN, TP):\n{}".format((tn, fp, fn, tp)))

              precision    recall  f1-score   support

           0       0.82      0.81      0.81     16192
           1       0.25      0.27      0.26      3808

    accuracy                           0.70     20000
   macro avg       0.54      0.54      0.54     20000
weighted avg       0.71      0.70      0.71     20000

              precision    recall  f1-score   support

           0       0.83      0.81      0.82     16191
           1       0.25      0.27      0.26      3809

    accuracy                           0.71     20000
   macro avg       0.54      0.54      0.54     20000
weighted avg       0.72      0.71      0.71     20000

              precision    recall  f1-score   support

           0       0.82      0.80      0.81     16191
           1       0.25      0.28      0.26      3809

    accuracy                           0.70     20000
   macro avg       0.54      0.54      0.54     20000
weighted avg       0.71      0.70      0.71     20000

              preci

Random tests and experiments (not well documented, and they won't be)

Entropy criterion

In [None]:
# Same stratified 5-fold evaluation as the baseline tree, but splitting on
# information gain (entropy) instead of the default criterion.
train_data = train_scaled_df.values

skf = StratifiedKFold(n_splits=5)

model = tree.DecisionTreeClassifier(criterion='entropy', random_state=271828)

# ROC AUC of every fold
fold_auc = []

for train_index, test_index in skf.split(train_data, delayed):
  train, test = train_data[train_index], train_data[test_index]
  delay_train, delay_test = delayed[train_index], delayed[test_index]
  model.fit(train, delay_train)
  delay_pred = model.predict(test)
  print(classification_report(delay_test, delay_pred))

  # roc_auc_score expects (y_true, y_score); score with the predicted
  # probability of the positive class (label 1).
  delay_score = model.predict_proba(test)[:, 1]
  fold_auc.append(roc_auc_score(delay_test, delay_score))

# Mean ROC AUC over the folds
print(sum(fold_auc) / len(fold_auc))

              precision    recall  f1-score   support

           0       0.82      0.81      0.82     16192
           1       0.24      0.25      0.25      3808

    accuracy                           0.71     20000
   macro avg       0.53      0.53      0.53     20000
weighted avg       0.71      0.71      0.71     20000

              precision    recall  f1-score   support

           0       0.83      0.82      0.82     16191
           1       0.25      0.26      0.26      3809

    accuracy                           0.71     20000
   macro avg       0.54      0.54      0.54     20000
weighted avg       0.72      0.71      0.71     20000

              precision    recall  f1-score   support

           0       0.82      0.81      0.82     16191
           1       0.24      0.26      0.25      3809

    accuracy                           0.70     20000
   macro avg       0.53      0.53      0.53     20000
weighted avg       0.71      0.70      0.71     20000

              preci

Random splitter

In [None]:
# Same stratified 5-fold evaluation as the baseline tree, but with the
# 'random' splitter instead of the default one.
train_data = train_scaled_df.values

skf = StratifiedKFold(n_splits=5)

model = tree.DecisionTreeClassifier(splitter='random', random_state=271828)

# ROC AUC of every fold
fold_auc = []

for train_index, test_index in skf.split(train_data, delayed):
  train, test = train_data[train_index], train_data[test_index]
  delay_train, delay_test = delayed[train_index], delayed[test_index]
  model.fit(train, delay_train)
  delay_pred = model.predict(test)
  print(classification_report(delay_test, delay_pred))

  # roc_auc_score expects (y_true, y_score); score with the predicted
  # probability of the positive class (label 1).
  delay_score = model.predict_proba(test)[:, 1]
  fold_auc.append(roc_auc_score(delay_test, delay_score))

# Mean ROC AUC over the folds
print(sum(fold_auc) / len(fold_auc))

              precision    recall  f1-score   support

           0       0.82      0.80      0.81     16192
           1       0.22      0.24      0.23      3808

    accuracy                           0.69     20000
   macro avg       0.52      0.52      0.52     20000
weighted avg       0.70      0.69      0.70     20000

              precision    recall  f1-score   support

           0       0.82      0.81      0.81     16191
           1       0.23      0.24      0.23      3809

    accuracy                           0.70     20000
   macro avg       0.52      0.52      0.52     20000
weighted avg       0.71      0.70      0.70     20000

              precision    recall  f1-score   support

           0       0.82      0.80      0.81     16191
           1       0.22      0.24      0.23      3809

    accuracy                           0.70     20000
   macro avg       0.52      0.52      0.52     20000
weighted avg       0.70      0.70      0.70     20000

              preci

In [None]:
# Same stratified 5-fold evaluation as the baseline tree, but with
# max_features='log2'.
train_data = train_scaled_df.values

skf = StratifiedKFold(n_splits=5)

model = tree.DecisionTreeClassifier(max_features='log2', random_state=271828)

# ROC AUC of every fold
fold_auc = []

for train_index, test_index in skf.split(train_data, delayed):
  train, test = train_data[train_index], train_data[test_index]
  delay_train, delay_test = delayed[train_index], delayed[test_index]
  model.fit(train, delay_train)
  delay_pred = model.predict(test)
  print(classification_report(delay_test, delay_pred))

  # roc_auc_score expects (y_true, y_score); score with the predicted
  # probability of the positive class (label 1).
  delay_score = model.predict_proba(test)[:, 1]
  fold_auc.append(roc_auc_score(delay_test, delay_score))

# Mean ROC AUC over the folds
print(sum(fold_auc) / len(fold_auc))

              precision    recall  f1-score   support

           0       0.82      0.81      0.82     16192
           1       0.23      0.24      0.24      3808

    accuracy                           0.70     20000
   macro avg       0.53      0.53      0.53     20000
weighted avg       0.71      0.70      0.71     20000

              precision    recall  f1-score   support

           0       0.82      0.81      0.82     16191
           1       0.24      0.25      0.24      3809

    accuracy                           0.71     20000
   macro avg       0.53      0.53      0.53     20000
weighted avg       0.71      0.71      0.71     20000

              precision    recall  f1-score   support

           0       0.82      0.81      0.81     16191
           1       0.22      0.23      0.23      3809

    accuracy                           0.70     20000
   macro avg       0.52      0.52      0.52     20000
weighted avg       0.70      0.70      0.70     20000

              preci

In [None]:
# Same stratified 5-fold evaluation as the baseline tree, combining the three
# variations tried above: entropy criterion, random splitter and
# max_features='log2'.
train_data = train_scaled_df.values

skf = StratifiedKFold(n_splits=5)

model = tree.DecisionTreeClassifier(criterion='entropy', splitter='random',
                                    max_features='log2', random_state=271828)

# ROC AUC of every fold
fold_auc = []

for train_index, test_index in skf.split(train_data, delayed):
  train, test = train_data[train_index], train_data[test_index]
  delay_train, delay_test = delayed[train_index], delayed[test_index]
  model.fit(train, delay_train)
  delay_pred = model.predict(test)
  print(classification_report(delay_test, delay_pred))

  # roc_auc_score expects (y_true, y_score); score with the predicted
  # probability of the positive class (label 1).
  delay_score = model.predict_proba(test)[:, 1]
  fold_auc.append(roc_auc_score(delay_test, delay_score))

# Mean ROC AUC over the folds
print(sum(fold_auc) / len(fold_auc))

              precision    recall  f1-score   support

           0       0.82      0.81      0.81     16192
           1       0.23      0.24      0.23      3808

    accuracy                           0.70     20000
   macro avg       0.52      0.52      0.52     20000
weighted avg       0.71      0.70      0.70     20000

              precision    recall  f1-score   support

           0       0.82      0.82      0.82     16191
           1       0.24      0.24      0.24      3809

    accuracy                           0.71     20000
   macro avg       0.53      0.53      0.53     20000
weighted avg       0.71      0.71      0.71     20000

              precision    recall  f1-score   support

           0       0.82      0.81      0.81     16191
           1       0.22      0.23      0.22      3809

    accuracy                           0.70     20000
   macro avg       0.52      0.52      0.52     20000
weighted avg       0.70      0.70      0.70     20000

              preci

Random split

In [None]:
# Single random hold-out split (train_test_split defaults) as an alternative
# to the stratified K-fold evaluation above.
train_data = train_scaled_df.values

# `delayed` was already turned into a NumPy array when the label was mapped
# to 1/0, so it has no `.values` attribute; use it directly.
train_delayed = delayed

train, test, delay_train, delay_test = train_test_split(train_data,
                                                        train_delayed,
                                                        random_state=3141592)

# Seeded like the other trees in this notebook so that re-running the cell
# reproduces the same model.
model = tree.DecisionTreeClassifier(random_state=271828)
model.fit(train, delay_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [None]:
# Predict the hold-out rows with the current `model` and print the
# classification report for those predictions.
delay_pred = model.predict(X=test)
report = classification_report(y_true=delay_test, y_pred=delay_pred)
print(report)

              precision    recall  f1-score   support

           N       0.83      0.82      0.83     20296
           Y       0.28      0.30      0.29      4704

    accuracy                           0.72     25000
   macro avg       0.56      0.56      0.56     25000
weighted avg       0.73      0.72      0.73     25000



In [None]:
# Summarise the fitted tree.
# `n_features_` was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of `n_features_in_`; fall back to the old name only on releases that
# predate `n_features_in_`.
if hasattr(model, "n_features_in_"):
    n_features = model.n_features_in_
else:
    n_features = model.n_features_

print("Max features: ", model.max_features)
print("Number of classes: ", model.n_classes_)
print("Number of features: ", n_features)
print("Number of outputs: ", model.n_outputs_)

Max features:  None
Number of classes:  2
Number of features:  3
Number of outputs:  1
