# IMPORT LIBRARIES

In [1]:
import numpy as np
import pandas as pd

from date_time_preprocessor import *

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, classification_report, roc_auc_score
import category_encoders as ce
import lightgbm as lgb

import pickle
import os

In [2]:
!pip list

Package                            Version
---------------------------------- ----------------------
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.12
anaconda-project                   0.8.3
applaunchservices                  0.2.1
appnope                            0.1.0
appscript                          1.1.1
argcomplete                        1.11.1
argh                               0.26.2
asn1crypto                         1.3.0
astroid                            2.4.2
astropy                            4.0.1.post1
atomicwrites                       1.4.0
attrs                              19.3.0
autopep8                           1.5.3
Babel                              2.8.0
backcall                           0.2.0
backports.functools-lru-cache      1.6.1
backports.shutil-get-terminal-size 1.0.0
backports.tempfile                 1.0
backports.weakref                  1.0.post1
beautifulsoup4           

# IMPORT DATA

In [3]:
path = '/Users/mitchell.carmen/Documents/FullStack_DS/'
df_raw = pd.read_csv(path + 'data/airline_delay_train.csv')
print(df_raw.shape)

(406045, 8)


In [4]:
# Applying date_time_preprocessor.py
training = feat_eng_datetime(df_raw)
print(training.shape)

Running feature engineering
(406045, 13)


In [5]:
training.head()

Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week,Year,Month,Day,Hour,Minutes
0,2010-01-17,17:05:00,MQ,CVG,DFW,812,1,Sunday,2010,1,17,17,5
1,2010-01-29,17:03:00,MQ,OMA,ORD,416,0,Friday,2010,1,29,17,3
2,2010-01-31,18:03:00,US,SJC,PHX,622,0,Sunday,2010,1,31,18,3
3,2010-01-26,16:42:00,YV,MTJ,DEN,197,0,Tuesday,2010,1,26,16,42
4,2010-01-06,17:53:00,US,PHL,ORD,678,0,Wednesday,2010,1,6,17,53


In [6]:
X = training.drop(["dep_delayed_15min"], axis=1)
y = training["dep_delayed_15min"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    random_state=21,
    test_size=0.1,
    stratify=y)

# print(X_train.shape)
# print(y_train.shape)
# print(X_test.shape)
# print(y_test.shape)

# CREATE VAR TYPES

In [7]:
numerical_cols = list(X_train.select_dtypes(include=['int64']).columns)
categorical_cols = list(X_train.select_dtypes(include=['object', 'category']).columns)

# PRE PROCESSING PIPELINE

In [8]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])

categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value= -9999)),
    ('catencoder', ce.ordinal.OrdinalEncoder())])
#     ('targetencoder', ce.target_encoder.TargetEncoder(min_samples_leaf = 1, smoothing = 1))])
#     ('countencoder', ce.count.CountEncoder(min_group_size = 10))])
#     ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

# MODEL BUILD PROTOTYPE

In [9]:
## hyperparameter tuning for Light GBM

LGBM = lgb.LGBMClassifier(boosting_type='gbdt',random_state = 99,
                          class_weight='balanced', objective= 'binary')

param_grid = { 
    'lgbmclassifier__n_estimators': [150,250],
    'lgbmclassifier__feature_fraction': ['auto', 'sqrt', 0.7],
    'lgbmclassifier__max_depth' : [6,7,8],
    'lgbmclassifier__learning_rate' : [0.1, 0.01],
    'lgbmclassifier__num_leaves' : [70,80],
    'lgbmclassifier__min_data_in_leaf' : [20, 50, 100]
}

lgbm = Pipeline(steps=[('preprocessor', preprocessor),
                      ('lgbmclassifier', LGBM)])

In [10]:
grid_lgbm = GridSearchCV(lgbm, cv= 3, n_jobs= -1, param_grid= param_grid, scoring='roc_auc')
grid_lgbm.fit(X_train, y_train)



GridSearchCV(cv=3,
             estimator=Pipeline(steps=[('preprocessor',
                                        ColumnTransformer(transformers=[('num',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(strategy='median')),
                                                                                         ('scaler',
                                                                                          StandardScaler())]),
                                                                         ['Distance']),
                                                                        ('cat',
                                                                         Pipeline(steps=[('imputer',
                                                                                          SimpleImputer(fill_value=-9999,
                 

In [11]:
grid_lgbm.best_score_

0.7522433997148169

In [12]:
grid_lgbm.best_params_

{'lgbmclassifier__feature_fraction': 0.7,
 'lgbmclassifier__learning_rate': 0.1,
 'lgbmclassifier__max_depth': 8,
 'lgbmclassifier__min_data_in_leaf': 50,
 'lgbmclassifier__n_estimators': 250,
 'lgbmclassifier__num_leaves': 80}

In [13]:
y_pred = grid_lgbm.predict(X_test)

roc_auc_score(y_test,y_pred)

0.6924264371631438

# SAVE PICKLE DUMP

In [14]:
grid_lgbm.feature_names = list(X_train.columns.values)
pickle.dump(grid_lgbm, open(path + 'Prod_Assets/LGBM_gridCV_flights_model.pkl', 'wb'))

In [15]:
############################################
############################################

# OPEN PICKLE DUMP

In [16]:
grid_lgbm = pickle.load(open(path + 'Prod_Assets/LGBM_gridCV_flights_model.pkl', 'rb'))

In [17]:
# Holdout Test

In [18]:
# Import HO data
test_df = pd.read_csv(path + 'data/airline_delay_test.csv')
# Preprocess DateTime features
test_df = feat_eng_datetime(test_df)
test_df_y = test_df.dep_delayed_15min
test_df_x = test_df.drop(['dep_delayed_15min'], axis=1)

Running feature engineering


In [19]:
test_df_x.head()

Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance,Day_of_Week,Year,Month,Day,Hour,Minutes
0,2010-01-11,19:45:00,OH,DTW,MDW,229,Monday,2010,1,11,19,45
1,2010-01-12,16:59:00,WN,SAN,PHX,304,Tuesday,2010,1,12,16,59
2,2010-01-21,18:52:00,YV,GJT,DEN,212,Thursday,2010,1,21,18,52
3,2010-01-08,11:02:00,WN,ONT,PHX,325,Friday,2010,1,8,11,2
4,2010-01-22,23:03:00,US,PHL,MSY,1088,Friday,2010,1,22,23,3


In [20]:
# Test on HO data
test_pred = grid_lgbm.predict(test_df_x)

roc_auc_score(test_df_y,test_pred)

0.6890719199601335