# IMPORT LIBRARIES

In [None]:
import numpy as np
import pandas as pd

from date_time_preprocessor import *

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import log_loss, classification_report, roc_auc_score
import category_encoders as ce
import lightgbm as lgb

import pickle

# IMPORT DATA

In [None]:
# Project root; kept as a string with a trailing slash because later cells
# build file locations with `path + ...`.
path = '/Users/mitchell.carmen/Documents/FullStack_DS/'

# Read the raw training CSV and report its (rows, columns).
train_csv = path + 'data/airline_delay_train.csv'
df_raw = pd.read_csv(train_csv)
print(df_raw.shape)

In [None]:
# Applying date_time_preprocessor.py
# `feat_eng_datetime` is not defined in this notebook; it is expected to come
# from the star import of `date_time_preprocessor` (TODO confirm what columns
# it adds or removes). The shape is printed so the effect on the column count
# can be compared with the raw frame's shape printed earlier.
training = feat_eng_datetime(df_raw)
print(training.shape)

In [None]:
# Display the first rows of the engineered training frame as a quick sanity check.
training.head()

In [None]:
# Target is the `dep_delayed_15min` column; every other column is a feature.
y = training["dep_delayed_15min"]
X = training.drop(columns=["dep_delayed_15min"])

# Hold out 10% of the rows for evaluation. The split is stratified on the
# target so both parts keep the same class proportions, and seeded so it is
# reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=21,
)

# To inspect the split, print the .shape of X_train, y_train, X_test and y_test.

# CREATE VAR TYPES

In [None]:
# Column groups consumed by the ColumnTransformer below.
#
# Numeric columns are selected with 'number' rather than only 'int64': any
# column not listed in one of these two groups is silently dropped by
# ColumnTransformer (its default is remainder='drop'). An integer column that
# contains missing values is stored by pandas as float64, so an int64-only
# filter would also keep the median imputer from ever seeing a NaN.
numerical_cols = X_train.select_dtypes(include=['number']).columns.tolist()

# String-like and pandas categorical columns are treated as categorical.
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# PRE PROCESSING PIPELINE

In [None]:
# Numeric branch: fill gaps with the column median, then standardise.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
numeric_transformer = Pipeline(steps=numeric_steps)

# Categorical branch: fill gaps with the sentinel -9999, then map each
# category to an integer code.
# Alternative encoders that were tried in place of 'catencoder':
#   ('targetencoder', ce.target_encoder.TargetEncoder(min_samples_leaf = 1, smoothing = 1))
#   ('countencoder', ce.count.CountEncoder(min_group_size = 10))
#   ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))
categorical_steps = [
    ('imputer', SimpleImputer(strategy='constant', fill_value=-9999)),
    ('catencoder', ce.ordinal.OrdinalEncoder()),
]
categorical_transformer = Pipeline(steps=categorical_steps)

# Route each column group through its own branch.
column_branches = [
    ('num', numeric_transformer, numerical_cols),
    ('cat', categorical_transformer, categorical_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

# MODEL BUILD PROTOTYPE

In [None]:
## hyperparameter tuning for Light GBM

# Base estimator: gradient-boosted trees, binary objective, with class weights
# balanced to compensate for the skewed target.
LGBM = lgb.LGBMClassifier(boosting_type='gbdt', random_state=99,
                          class_weight='balanced', objective='binary')

# Search space. Keys are prefixed with the pipeline step name
# ('lgbmclassifier__') so GridSearchCV routes them to the LightGBM step.
param_grid = {
    'lgbmclassifier__n_estimators': [150, 250],
    # feature_fraction must be a float in (0, 1]. The scikit-learn style
    # strings 'auto' and 'sqrt' are not valid for LightGBM and make every fit
    # that uses them fail, so only numeric fractions are searched.
    'lgbmclassifier__feature_fraction': [0.5, 0.7, 1.0],
    'lgbmclassifier__max_depth': [6, 7, 8],
    'lgbmclassifier__learning_rate': [0.1, 0.01],
    'lgbmclassifier__num_leaves': [70, 80],
    'lgbmclassifier__min_data_in_leaf': [20, 50, 100],
}

# Full model: preprocessing followed by the classifier, so the transformers
# are re-fitted inside every cross-validation fold.
lgbm = Pipeline(steps=[('preprocessor', preprocessor),
                       ('lgbmclassifier', LGBM)])

In [None]:
# Exhaustive search over `param_grid` with 3-fold cross-validation, ranked by
# ROC AUC and run on all available cores.
grid_lgbm = GridSearchCV(
    estimator=lgbm,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=3,
    n_jobs=-1,
)
grid_lgbm.fit(X_train, y_train)

In [None]:
# Mean cross-validated score of the best parameter combination
# (ROC AUC, per the `scoring` passed to GridSearchCV).
grid_lgbm.best_score_

In [None]:
# Parameter combination that achieved the best cross-validated score.
grid_lgbm.best_params_

In [None]:
# Hard class predictions, kept for label-based metrics.
y_pred = grid_lgbm.predict(X_test)

# ROC AUC has to be computed from scores, not hard labels: with labels the
# ROC curve collapses to a single operating point and the result is not
# comparable with the cross-validated 'roc_auc' used during the grid search.
# Column 1 of predict_proba is the probability of the second (positive) class.
y_score = grid_lgbm.predict_proba(X_test)[:, 1]

roc_auc_score(y_test, y_score)

# SAVE PICKLE DUMP

In [None]:
# Record the training feature names on the fitted search object so they are
# saved alongside the model.
grid_lgbm.feature_names = list(X_train.columns.values)

# Use a context manager so the file is flushed and closed even if pickling
# raises; a bare open() leaves the handle open.
with open(path + 'LGBM_flights_model.sav', 'wb') as model_file:
    pickle.dump(grid_lgbm, model_file)

In [None]:
############################################
############################################

# OPEN PICKLE DUMP

In [None]:
# Reload the saved grid-search object. The context manager closes the file
# handle once loading finishes.
# NOTE: unpickling executes arbitrary code; only load files you produced yourself.
with open(path + 'LGBM_flights_model.sav', 'rb') as model_file:
    grid_lgbm = pickle.load(model_file)

In [None]:
# Holdout Test

In [None]:
# Read the holdout CSV.
holdout_csv = path + 'data/airline_delay_test.csv'
test_df = pd.read_csv(holdout_csv)

# Apply the same date/time feature engineering that was used on the training data.
test_df = feat_eng_datetime(test_df)

# Split into target and features.
test_df_y = test_df['dep_delayed_15min']
test_df_x = test_df.drop(columns=['dep_delayed_15min'])

In [None]:
# Test on HO data
# Hard class predictions, kept for label-based metrics.
test_pred = grid_lgbm.predict(test_df_x)

# ROC AUC needs scores rather than hard labels; with labels the curve
# collapses to one operating point and the figure is not comparable with the
# cross-validated 'roc_auc'. Column 1 of predict_proba is the probability of
# the second (positive) class.
test_score = grid_lgbm.predict_proba(test_df_x)[:, 1]

roc_auc_score(test_df_y, test_score)