In [1]:
# IMPORT LIBRARIES

import pickle

import numpy as np
import pandas as pd

# The star import stays ahead of the scikit-learn imports so that the explicit
# names below take precedence over anything the helper module exports.
from date_time_preprocessor import *

from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss
from sklearn.metrics import classification_report

In [17]:
# LOAD RAW TRAINING DATA

# NOTE(review): the location is tied to one user's home directory; a
# configurable data directory would make the notebook portable.
raw_csv_path = '~/Documents/FullStack_DS/data/airline_delay_train.csv'
df_raw = pd.read_csv(raw_csv_path)
print(df_raw.shape)

(406045, 8)


In [18]:
# FEATURE ENGINEERING
# Runs the raw frame through feat_eng_datetime (presumably supplied by the
# star import of date_time_preprocessor -- confirm). Per the recorded outputs
# the frame goes from 8 to 13 columns with the row count unchanged.

training = feat_eng_datetime(df_raw)
print(training.shape)

Running feature engineering
(406045, 13)


In [19]:
# Separate the target column from the feature columns.
y_train = training['dep_delayed_15min']
X_train = training.drop(columns=['dep_delayed_15min'])

In [22]:
# GROUP FEATURE COLUMNS BY DTYPE
# NOTE(review): only int64 counts as numerical here, so a float column would
# land in none of the three lists -- confirm that is intended.

numerical_cols = X_train.select_dtypes(include=['int64']).columns.tolist()
categorical_cols = X_train.select_dtypes(include=['object', 'category']).columns.tolist()
datetime_cols = X_train.select_dtypes(include=['datetime64[ns]']).columns.tolist()

In [None]:
# PRE PROCESSING PIPELINE

# Numerical features: fill gaps with the column median (add_indicator=True
# appends a missing-value flag column where gaps were seen), then standardise.
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median', add_indicator=True)),
    ('scaler', StandardScaler())])

# Categorical features: replace gaps with the literal 'missing', then one-hot
# encode. handle_unknown='ignore' keeps categories unseen at fit time from
# raising at predict time.
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing', add_indicator=True)),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))])

# Date time features: no transformer is defined yet, so any column not named
# below is dropped (ColumnTransformer's default remainder='drop').

# Use the column lists built in the "CREATE VAR TYPES" cell; the previous
# names (numeric_feats / cat_feats) were never defined in this notebook.
# NOTE(review): those lists were derived before the later split cell drops
# FlightDate and DepTime -- confirm they do not name dropped columns.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)])

In [None]:
# MODEL BUILD PROTOTYPE

# Random forest on top of the shared preprocessor. The seed matches the one
# used for train_test_split so that re-running the notebook grows the same
# forest and reports the same scores.
rf = Pipeline(steps=[('preprocessor', preprocessor),
                     ('classifier', RandomForestClassifier(random_state=21))])

# Single-point grid: the search is used for its cross-validated ROC AUC
# rather than for tuning.
param_grid = {'classifier__n_estimators': [400]}

print('Running Model')
CV = GridSearchCV(rf, param_grid, n_jobs=-1, scoring='roc_auc')
# After fitting, CV.best_params_ and CV.best_score_ hold the search results.
CV.fit(X_train, y_train)

In [None]:
# VALIDATING MODEL PROTOTYPE
# NOTE(review): X_validation, y_validation and target_names are not defined
# in any visible cell -- confirm where they are meant to come from.

y_pred = CV.predict(X_validation)
report = classification_report(y_validation, y_pred, target_names=target_names)
print(report)

# Mean cross-validated score of the best grid point.
cv_score = CV.best_score_
print("{}{}".format("Cross - Validation: ", cv_score))

# Same scorer evaluated on the validation data.
validation_score = CV.score(X_validation, y_validation)
print("{}{}".format("Validation: ", validation_score))

In [None]:
# TRAIN / VALIDATION SPLIT
# Split the engineered frame (`training`; the previous name `train` was never
# defined) into an 80/20 train/test partition with a fixed seed.
# Note: this rebinds X_train / y_train, which until now held the full
# training set.
X = training.drop(columns=["dep_delayed_15min", "FlightDate", "DepTime"])
y = training["dep_delayed_15min"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=21)

In [None]:
# Sanity check: one shape per line, in the order X_train, y_train, X_test, y_test.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep="\n")

In [None]:
# Fit the preprocessing ColumnTransformer on X_train; kept as the bare last
# expression so the notebook displays the fitted transformer.
# NOTE(review): the column lists handed to `preprocessor` must only name
# columns still present in X_train (the split cell drops FlightDate and
# DepTime) -- verify.
preprocessor.fit(X_train)

In [None]:
# End-to-end pipeline: shared preprocessor followed by a random forest.
# The forest is seeded (same value as the train/test split) so the log-loss
# figures computed from this model are reproducible across runs.
pipe_works = Pipeline(steps=[('preprocessor', preprocessor),
                             ('model', RandomForestClassifier(random_state=21))])
pipe_works.fit(X_train, y_train)

In [None]:
# Class-probability predictions for the test split (predict_proba rather than
# hard labels, because log loss is computed from them next).
X_test_preds = pipe_works.predict_proba(X_test)

In [None]:
# Log loss of the predicted probabilities on the test split; left as the
# cell's last expression so the notebook displays the value.
log_loss(y_test, X_test_preds)

In [None]:
# Holdout: score the fitted pipeline on a separate holdout set.
# NOTE(review): ho_X and ho_y are not defined in any visible cell -- confirm
# where the holdout data is loaded.
ho_X_preds = pipe_works.predict_proba(ho_X)
log_loss(ho_y, ho_X_preds)

In [None]:
# CROSS-VALIDATED LOGISTIC REGRESSION
# Grid search over the numeric imputation strategy and the regularisation
# strength C, using the shared preprocessor.

clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression())])

# Keys follow scikit-learn's step__substep__param convention:
# 'preprocessor' -> ColumnTransformer entry 'num' -> pipeline step 'imputer'.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

grid_search = GridSearchCV(clf, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# No `scoring` is given, so score() falls back to the estimator's own default.
print("best logistic regression from grid search: %.3f"
      % grid_search.score(X_test, y_test))