In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import os


In [2]:
X = pd.read_csv("transformed_random_data_2004.csv")
df = pd.read_csv("base_data_2k.csv")


In [3]:
# Convert 64 to 16 bit to reduce dataset size
float64_columns = X.select_dtypes(include='float64').columns
X[float64_columns] = X[float64_columns].astype('float16')


In [5]:
# Convert 64 to 16 bit to reduce dataset size
int64_columns = X.select_dtypes(include='int64').columns
X[int64_columns] = X[int64_columns].astype('int16')


In [7]:
# y_bin as binary target (0 or 1) --> where ArrivalDelayGroups = 0 means less than 15min delay
df["BadFlight"] = ((df['ArrivalDelayGroups'] > 0) | (df['Cancelled'] > 0)).astype(int)

y_bin = df["BadFlight"]


In [8]:
# y_cat as multi-categorical target (as strings/categories)
df["ArrivalDelayGroups"]=df["ArrivalDelayGroups"].fillna("Cancelled")

buckets = {
    '_0':[-1.0, -2.0, 0],
    '_1_Less30': [1.0, 2.0],
    '_2_Betw30-105': [3.0, 4.0, 5.0, 6.0, 7.0],
    '_3_Over120/Canc': [8.0, 9.0, 10.0, 11.0, 12.0, 'Cancelled']
}

def map_to_bucket(x):
    for bucket_name,bucket_values in buckets.items():
        if x in bucket_values:
            return bucket_name

# Map original classes to new buckets using the function
df['ArrivalDelay_Groups_y'] = df['ArrivalDelayGroups'].apply(map_to_bucket)

y_cat = df['ArrivalDelay_Groups_y']


In [9]:
# y_num as multi-class target (as classes)
df["ArrivalDelayGroups"]=df["ArrivalDelayGroups"].fillna("Cancelled")

buckets2 = {
    0:[-1.0, -2.0, 0],
    1: [1.0, 2.0],
    2: [3.0, 4.0, 5.0, 6.0, 7.0],
    3: [8.0, 9.0, 10.0, 11.0, 12.0, 'Cancelled']
}

def map_to_bucket(i):
    for bucket_name,bucket_values in buckets2.items():
        if i in bucket_values:
            return bucket_name


# Map original classes to new buckets using the function
df['ArrivalDelay_Groups_y_num'] = df['ArrivalDelayGroups'].apply(map_to_bucket)

y_num = df['ArrivalDelay_Groups_y_num']
# y_num.sample(20)


In [10]:
from sklearn.model_selection import train_test_split

# train-test split on y_bin
(X_train_bin, X_test_bin, y_train_bin, y_test_bin) = train_test_split(X, y_bin, test_size = .3)
print(X_train_bin.shape)
print(y_train_bin.shape)


(1402, 427)
(1402,)


In [11]:
from sklearn.model_selection import train_test_split

# train-test split on y_num
(X_train_num, X_test_num, y_train_num, y_test_num) = train_test_split(X, y_num, test_size = .3)
print(X_train_num.shape)
print(y_train_num.shape)


(1402, 427)
(1402,)


In [12]:
from sklearn.model_selection import train_test_split

# train-test split on y_cat
(X_train_cat, X_test_cat, y_train_cat, y_test_cat) = train_test_split(X, y_cat, test_size = .3)
print(X_train_cat.shape)
print(y_train_cat.shape)


(1402, 427)
(1402,)


In [43]:
# Building a Regular Tree as Baseline
# Created Bagged Trees
# Ran a Random Forest with no Class Weighting (ran a feature importance plot as a QC tool)
# Random Forest with Bootstrat Class Weighting
# Ran a AdaBoost with and without the DEP_DELAY
# RAN Gradient Boosted Trees with and without the DEP_DELAY
# RAN XGBoost with and without the DEP_DELAY


# add recall etc measures
# take out zero data rows and validate/cross validate - data balancing (taking out the zeros) based on larger dataset
# cross validation look at sample, train with loop 20% removing columns where delay = 0 //


# Target Classification

### 1. DecisionTreeClassifier

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import make_scorer, precision_score, recall_score, accuracy_score, f1_score

# Instantiate model
tree_clf = DecisionTreeClassifier()

# Hyperparameter Grid
grid_params = {'criterion': ['gini', 'entropy'],
               'max_depth': [9,10,11],
               'min_samples_split': [2,3,4],
               'min_samples_leaf': [7,8,9],
               'random_state': [42]
              }

scorers = {
  'precision_score': make_scorer(precision_score),
  'recall_score': make_scorer(recall_score),
  'accuracy_score': make_scorer(accuracy_score),
  'f1_score': make_scorer(f1_score)
}

# Instantiate Grid Search
search_tree = GridSearchCV(
    tree_clf,
    grid_params,
    scoring = 'recall',
    cv = 5,
    n_jobs=-1 # parallelize computation
)

# Fit data to Grid Search
search_tree.fit(X_train_bin, y_train_bin)
print(search_tree.best_estimator_)
print(search_tree.best_score_)
print(search_tree.best_params_)
print(search_tree.scorer_)


# tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42, scoring='recall', cv = 5)
# tree_clf.fit(X_train_bin,y_train_bin)
# print(tree_clf.score(X_test_bin, y_test_bin))


DecisionTreeClassifier(max_depth=11, min_samples_leaf=9, random_state=42)
0.181924882629108
{'criterion': 'gini', 'max_depth': 11, 'min_samples_leaf': 9, 'min_samples_split': 2, 'random_state': 42}
make_scorer(recall_score, average=binary)


In [27]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

tree_clf = DecisionTreeClassifier(criterion='gini', max_depth=11, min_samples_leaf=9, min_samples_split=2, random_state=42,)
tree_clf.fit(X_train_bin,y_train_bin)
cv_tree = cross_val_score(tree_clf, X_train_bin, y_train_bin, cv=5)
cv_tree.mean()


0.7004219623792578

### 2. RandomForestClassifier

In [43]:
from sklearn.ensemble import RandomForestClassifier


# Instantiate model
rf_clf = RandomForestClassifier()

# Hyperparameter Grid
grid_params = {'max_depth': [9,10,11],
               'random_state': [42]
              }

scorers = {
  'precision_score': make_scorer(precision_score),
  'recall_score': make_scorer(recall_score),
  'accuracy_score': make_scorer(accuracy_score),
  'f1_score': make_scorer(f1_score)
}

# Instantiate Grid Search
search_rf = GridSearchCV(
    rf_clf,
    grid_params,
    scoring = 'accuracy',
    cv = 5,
    n_jobs=-1 # parallelize computation
)

# Fit data to Grid Search
search_rf.fit(X_train_bin, y_train_bin)
print(search_rf.best_estimator_)
print(search_rf.best_score_)
print(search_rf.best_params_)
print(search_rf.scorer_)


RandomForestClassifier(max_depth=9, random_state=42)
0.7446517539400103
{'max_depth': 9, 'random_state': 42}
make_scorer(accuracy_score)


In [44]:
rf_clf = RandomForestClassifier(max_depth=9, random_state=42)
rf_clf.fit(X_train_bin,y_train_bin)
cv_rf = cross_val_score(rf_clf, X_train_bin, y_train_bin, cv=5)
cv_rf.mean()


0.7446517539400103

### 3. GradientBoostingClassifier

In [61]:
from sklearn.ensemble import GradientBoostingClassifier

# Instantiate model
gb_clf = GradientBoostingClassifier()

# Hyperparameter Grid
grid_params = {'max_depth': [1,5,10],
               'n_estimators': [100,150,200],
               'learning_rate': [0.001, 0.01, 0.1],
               'random_state': [42]
              }

scorers = {
  'precision_score': make_scorer(precision_score),
  'recall_score': make_scorer(recall_score),
  'accuracy_score': make_scorer(accuracy_score),
  'f1_score': make_scorer(f1_score)
}

# Instantiate Grid Search
search_gb = GridSearchCV(
    gb_clf,
    grid_params,
    scoring = 'accuracy',
    cv = 5,
    n_jobs=-1 # parallelize computation
)

# Fit data to Grid Search
search_gb.fit(X_train_bin, y_train_bin)
print(search_gb.best_estimator_)
print(search_gb.best_score_)
print(search_gb.best_params_)
print(search_gb.scorer_)


GradientBoostingClassifier(max_depth=1, random_state=42)
0.7475038129130656
{'learning_rate': 0.1, 'max_depth': 1, 'n_estimators': 100, 'random_state': 42}
make_scorer(accuracy_score)


In [202]:
gb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=42)
gb_clf.fit(X_train_bin,y_train_bin)
cv_gb = cross_val_score(gb_clf, X_train_bin, y_train_bin, cv=5)
cv_gb.mean()


0.7458471760797342


### 4. XGBoost Classifier

In [63]:
from xgboost import XGBClassifier


# Instantiate model
xgb_clf = XGBClassifier()

# Hyperparameter Grid
grid_params = {'max_depth': [1,5,10],
               'n_estimators': [50,100,150,200],
               'learning_rate': [0.001, 0.01, 0.1],
               'random_state': [42],
               'subsample': [0.999],
                'verbosity': [0]
              }

scorers = {
  'precision_score': make_scorer(precision_score),
  'recall_score': make_scorer(recall_score),
  'accuracy_score': make_scorer(accuracy_score),
  'f1_score': make_scorer(f1_score)
}

# Instantiate Grid Search
search_xgb = GridSearchCV(
    xgb_clf,
    grid_params,
    scoring = 'recall',
    cv = 5,
    n_jobs=-1 # parallelize computation
)

# Fit data to Grid Search
search_xgb.fit(X_train_bin, y_train_bin)
print(search_xgb.best_estimator_)
print(search_xgb.best_score_)
print(search_xgb.best_params_)
print(search_xgb.scorer_)


XGBClassifier(base_score=None, booster=None, callbacks=None,
              colsample_bylevel=None, colsample_bynode=None,
              colsample_bytree=None, device=None, early_stopping_rounds=None,
              enable_categorical=False, eval_metric=None, feature_types=None,
              gamma=None, grow_policy=None, importance_type=None,
              interaction_constraints=None, learning_rate=0.1, max_bin=None,
              max_cat_threshold=None, max_cat_to_onehot=None,
              max_delta_step=None, max_depth=10, max_leaves=None,
              min_child_weight=None, missing=nan, monotone_constraints=None,
              multi_strategy=None, n_estimators=200, n_jobs=None,
              num_parallel_tree=None, random_state=42, ...)
0.19295774647887323
{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200, 'random_state': 42, 'subsample': 0.999, 'verbosity': 0}
make_scorer(recall_score, average=binary)


In [64]:
from sklearn.metrics import confusion_matrix

xgb_clf = GradientBoostingClassifier(n_estimators=200, learning_rate=0.1, max_depth=10, subsample= 0.999, random_state=42)
xgb_clf.fit(X_train_bin, y_train_bin)
y_preds_bin = xgb_clf.predict(X_test_bin)
print('accuracy: %.3f' % accuracy_score(y_preds_bin,y_test_bin))
print('f1_score: %.3f' % f1_score(y_test_bin, y_preds_bin, average='binary'))
print('recall: %.3f' % recall_score(y_test_bin, y_preds_bin,average='binary'))
print('precision: %.3f' % precision_score(y_test_bin, y_preds_bin,average='binary'))


print("Confusion Matrix: ", confusion_matrix(y_test_bin,y_preds_bin))


accuracy: 0.714
f1_score: 0.157
recall: 0.128
precision: 0.203
Confusion Matrix:  [[414  63]
 [109  16]]


In [65]:
xgb_clf = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1, random_state=42)
xgb_clf.fit(X_train_bin,y_train_bin)
cv_xgb = cross_val_score(xgb_clf, X_train_bin, y_train_bin, cv=5)
cv_xgb.mean()


0.7475038129130656

### 5. AdaBoostClassifier

In [69]:
from sklearn.ensemble import AdaBoostClassifier

# Instantiate model
ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())


# Hyperparameter Grid
grid_params = {'base_estimator__max_depth':[i for i in range(2,11,2)],
              'base_estimator__min_samples_leaf':[5,10],
              'n_estimators':[10,50,100],
              'learning_rate':[0.01,0.1]
              }

scorers = {
  'precision_score': make_scorer(precision_score),
  'recall_score': make_scorer(recall_score),
  'accuracy_score': make_scorer(accuracy_score),
  'f1_score': make_scorer(f1_score)
}

# Instantiate Grid Search
search_ada = GridSearchCV(
    ada_clf,
    grid_params,
    scoring = 'f1',
    cv = 5,
    n_jobs=-1, # parallelize computation
    verbose = 0
)

# Fit data to Grid Search
search_ada.fit(X_train_bin, y_train_bin)
print(search_ada.best_estimator_)
print(search_ada.best_score_)
print(search_ada.best_params_)
print(search_ada.scorer_)




AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=6,
                                                         min_samples_leaf=10),
                   learning_rate=0.1, n_estimators=100)
0.28562893332761546
{'base_estimator__max_depth': 6, 'base_estimator__min_samples_leaf': 10, 'learning_rate': 0.1, 'n_estimators': 100}
make_scorer(f1_score, average=binary)


In [76]:
ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=6, min_samples_leaf=10), learning_rate=0.1, random_state=42)
ada_clf.fit(X_train_bin,y_train_bin)
cv_ada = cross_val_score(ada_clf, X_train_bin, y_train_bin, cv=5)
cv_ada.mean()




0.7025876970005084

In [87]:

from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
scorer = make_scorer(accuracy_score)
scores = cross_val_score(
    estimator=xgb_clf,
    X=X_train_bin,
    y=y_train_bin,
    scoring=scorer,
    cv=5
)
print('CV  scores:',  scores)
print('CV accuracy:', (np.mean(scores), np.std(scores)))


CV  scores: [0.75088968 0.74377224 0.75357143 0.74285714 0.74642857]
CV accuracy: (0.7475038129130656, 0.004121950076627391)


In [None]:
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier

# Instantiate model
weak_learner = KNeighborsClassifier(n_neighbors=3)
bagged_clf = BaggingClassifier(weak_learner, n_estimators=40)


ada_clf = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())


# Hyperparameter Grid
grid_params = {'base_estimator__max_depth':[i for i in range(2,11,2)],
              'base_estimator__min_samples_leaf':[5,10],
              'n_estimators':[10,50,100],
              'learning_rate':[0.01,0.1]
              }

scorers = {
  'precision_score': make_scorer(precision_score),
  'recall_score': make_scorer(recall_score),
  'accuracy_score': make_scorer(accuracy_score),
  'f1_score': make_scorer(f1_score)
}

# Instantiate Grid Search
search_bag = GridSearchCV(
    bagged_clf,
    grid_params,
    scoring = 'f1',
    cv = 5,
    n_jobs=-1, # parallelize computation
    verbose = 0
)

# Fit data to Grid Search
search_ada.fit(X_train_bin, y_train_bin)
print(search_ada.best_estimator_)
print(search_ada.best_score_)
print(search_ada.best_params_)
print(search_ada.scorer_)


In [None]:

# y_pred = rf.predict(X_test)
# print('recall: ', recall_score(y_test, y_pred, average='macro'))
# print('precision: ', precision_score(y_test, y_pred,  average='macro'))
# print('f1_score: ', f1_score(y_test, y_pred, average='macro'))
# idx = np.random.choice(np.arange(len(X_train)), 2000, replace=False)
# x_sample = X_train[idx]
# y_sample = y_train[idx]
# sfs = SequentialFeatureSelector(
#         estimator=xgb,
#         n_features_to_select=20,
#         direction='forward'
#     )

# sfs.fit(x_sample, y_sample)
# sfs.get_support()
# df.columns
# x_tr=sfs.transform(X_train)
# x_tes=sfs.transform(X_test)


## 2. Regressors for delay prediction in minutes

### Decision Tree Regressor

In [48]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import cross_validate

# tree = DecisionTreeRegressor(min_samples_split=9, max_depth=5, min_samples_leaf=5)

# cv_results = cross_validate(tree, X, y, scoring = "r2", cv=5)


# def plot_histogram_cv_results(cv_results):
#     # Calculating the std and the mean
#     std = cv_results['test_score'].std()
#     mean = cv_results['test_score'].mean()

#     # Getting the number of folds
#     n_cv = len(cv_results['test_score'])

#     # Building plot
#     plt.hist(cv_results['test_score'], bins=n_cv)

#     # Creating red lines
#     plt.vlines(mean, 0, 3, color='red', label=f'mean = {mean:.2f}')
#     plt.hlines(
#         3, mean - 1/2 * std, mean + 1/2 * std,
#         color='red', label=f'std = {std:.2f}', ls='dotted'
#     )

#     # Setting the title
#     plt.title('Histogram of R2 Scores During Cross-Validation')

#     # Setting the labels and xlim
#     plt.xlim((-1, 1))

#     plt.xlabel('r2')
#     plt.ylabel('number of folds')

#     # Showing the legend
#     plt.legend(loc='upper left')

#     plt.show()



# # Custom method
# plot_histogram_cv_results(cv_results)


In [49]:
# from sklearn.tree import DecisionTreeRegressor
# from sklearn.model_selection import cross_validate

# tree = DecisionTreeRegressor(min_samples_split=9, max_depth=5, min_samples_leaf=5)

# cv_results = cross_validate(tree, X, y, scoring = "r2", cv=5)

# # Custom method
# plot_histogram_cv_results(cv_results)


In [52]:
# cv_results['test_score'].mean()


### RandomForestRegressor

In [None]:
from sklearn.ensemble import RandomForestRegressor

forest = RandomForestRegressor(n_estimators=500)

cv_results = cross_validate(forest, X, y, scoring = "r2", cv=5)

plot_histogram_cv_results(cv_results)


### GradientBoostingRegressor

In [91]:
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3
)


### XGB Regressor

In [51]:
# from xgboost import XGBRegressor

# xgb_reg = XGBRegressor(max_depth=10, n_estimators=50, learning_rate=0.1)

# xgb_reg.fit(X_train, y_train,
#     # stop iterating when eval loss increases 5 times in a row
#     early_stopping_rounds=5
# )

# y_pred = xgb_reg.predict(X_test)

#     # evaluate loss at each iteration
#     # eval_set=[(X_train, y_train), (X_val, y_val)]


### ADABoostRegressor

In [50]:
# from sklearn.ensemble import AdaBoostRegressor

# adaboost = AdaBoostRegressor(
#     DecisionTreeRegressor(max_depth=3),
#     n_estimators=50)

# cv_results = cross_validate(adaboost, X, y, scoring = "r2", cv=5)

# plot_histogram_cv_results(cv_results)
