# IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd

from date_time_preprocessor import *

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import log_loss, classification_report, roc_auc_score
import pickle

# IMPORT DATA

In [2]:
path = '/Users/mitchell.carmen/Documents/FullStack_DS/'
df_raw = pd.read_csv(path + 'data/airline_delay_train.csv')
print(df_raw.shape)

(406045, 8)


In [3]:
# Applying date_time_preprocessor.py
training = feat_eng_datetime(df_raw)
print(training.shape)

Running feature engineering
(406045, 11)


In [4]:
# Balance Proportion
print(training.dep_delayed_15min.sum() / training.shape[0])

0.19120787104877537


# SETTING UP PARTITIONS FOR TRAINING / VALIDATION

In [5]:
X = training.drop(["dep_delayed_15min"], axis=1)
y = training["dep_delayed_15min"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=21,
    test_size=0.1,
    stratify=y)

print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(365440, 10)
(365440,)
(40605, 10)
(40605,)


In [6]:
# Stratification Check
print(y_train.sum()/X_train.shape[0])
print(y_test.sum()/X_test.shape[0])

0.19120785901926446
0.1912079793128925


In [7]:
X_train.head(20)

Unnamed: 0,UniqueCarrier,Origin,Dest,Distance,Day_of_Week,Year,Month,Day,Hour,Minutes
186812,MQ,IND,MIA,1021,Tuesday,2010,1,12,15,56
216851,XE,OKC,IAH,395,Monday,2010,1,4,13,24
115164,YV,PHX,ELP,347,Wednesday,2010,1,27,12,54
212366,WN,DEN,SAT,794,Wednesday,2010,1,20,17,0
113600,AA,LAS,DFW,1055,Friday,2010,1,22,11,25
269534,OO,LAX,ELP,714,Wednesday,2010,1,6,13,14
403202,US,ORD,PHX,1440,Sunday,2010,1,3,3,5
337266,YV,CLT,IAD,321,Sunday,2010,1,3,16,20
297801,AA,MSP,ORD,334,Tuesday,2010,1,5,20,56
265646,WN,LAS,PHX,256,Thursday,2010,1,28,21,8


In [8]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 365440 entries, 186812 to 293692
Data columns (total 10 columns):
 #   Column         Non-Null Count   Dtype   
---  ------         --------------   -----   
 0   UniqueCarrier  365440 non-null  object  
 1   Origin         365440 non-null  object  
 2   Dest           365440 non-null  object  
 3   Distance       365440 non-null  int64   
 4   Day_of_Week    365440 non-null  object  
 5   Year           365440 non-null  category
 6   Month          365440 non-null  category
 7   Day            365440 non-null  category
 8   Hour           365440 non-null  category
 9   Minutes        365440 non-null  category
dtypes: category(5), int64(1), object(4)
memory usage: 18.5+ MB


# CREATE VAR TYPES

In [9]:
numerical_cols = list(X_train.select_dtypes(include=['int64']).columns)
categorical_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)
# datetime_cols = list(X_train.select_dtypes(include=['datetime64[ns]']).columns)

# PRE PROCESSING PIPELINE

In [10]:
# Numerical Features
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler())])

# Categorical Features
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing', add_indicator=True)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Date Time Features


# Assemble Column Transformer for Pipeline
preprocessor_pipe = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [None]:
# Fitting Preprocessor to Training Data -- Do not ever include Test Data here!
#preprocessor_pipe.fit(X_train) # Don't need to fit the preprocessor here as it will be done below

# MODEL BUILD PROTOTYPE

In [None]:
gbtmodel = GradientBoostingClassifier(n_estimators=35, 
                                      learning_rate=0.01,
                                      max_depth=20)

# Append the gbt() to the preprocessor
full_pipe = Pipeline(steps=[('preprocessor', preprocessor_pipe),
                     ('classifier', gbtmodel)])

################################################################################
################################################################################

# WARNING!!!! -------- LONG TIME TRAINING

# hyper_params_grid = {
#     'classifier__n_estimators': [400],
#     'max_depth': [5,10],
#     'learning_rate': [0.001, 0.01, 0.1]
# }

# print('Running Model')
# CV = GridSearchCV(full_pipe, hyper_params_grid, n_jobs= -1,scoring='roc_auc')
# CV.fit(X_train, y_train)  

# print(CV.get_params())
# print(CV.best_params_)

################################################################################
################################################################################

# NOTE!!!! -------- SHORT TIME TRAINING (For quick testing)

# Fit the full pipeline-- preprocessor and model-- on training data
full_pipe.fit(X_train, y_train)

# VALIDATING MODEL PROTOTYPE

In [12]:
# target_names = y_test.unique().astype(str)

# # Cross-Valiation
# y_preds = CV.predict(X_test)

# print(classification_report(y_test, y_preds, target_names=target_names))
# print("{}{}".format("Cross - Validation: ", CV.best_score_))
# print("{}{}".format("Validation: ", CV.score(X_test,y_test)))

# Non Cross Validation
y_preds = full_pipe.predict(X_test)

In [None]:
X_test.head()

In [None]:
full_pipe.predict_proba(X_test)

In [None]:
full_pipe.predict_proba(X_train)

In [13]:
# print(log_loss(y_test, y_preds))
# print(classification_report(y_test, y_preds))
print(roc_auc_score(y_test, y_preds))

0.5956046972987313


# SAVE MODEL PIPELINE

In [None]:
full_pipe.feature_names = list(X_train.columns.values)
pickle.dump(full_pipe, open(path + 'RF_flights_model.sav', 'wb'))