In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

In [9]:
pip list

Package                            Version
---------------------------------- ----------------------
alabaster                          0.7.12
anaconda-client                    1.7.2
anaconda-navigator                 1.9.12
anaconda-project                   0.8.3
applaunchservices                  0.2.1
appnope                            0.1.0
appscript                          1.1.1
argcomplete                        1.11.1
argh                               0.26.2
asn1crypto                         1.3.0
astroid                            2.4.2
astropy                            4.0.1.post1
atomicwrites                       1.4.0
attrs                              19.3.0
autopep8                           1.5.3
Babel                              2.8.0
backcall                           0.2.0
backports.functools-lru-cache      1.6.1
backports.shutil-get-terminal-size 1.0.0
backports.tempfile                 1.0
backports.weakref                  1.0.post1
beautifulsoup4           

In [2]:
# Load the two flight-delay CSV files; the paths are relative to the
# notebook's working directory. `holdout` is read from the "test" file.
train = pd.read_csv('data/airline_delay_train.csv')
holdout = pd.read_csv('data/airline_delay_test.csv')

In [3]:
# Pre-checks: shapes, dtypes and sample rows of the raw data

In [4]:
# (rows, columns) of the training frame, then of the holdout frame, one per line.
print(train.shape, holdout.shape, sep='\n')

(406045, 8)
(101512, 8)


In [4]:
# Column dtypes, non-null counts and memory footprint of the training frame.
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 406045 entries, 0 to 406044
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   FlightDate         406045 non-null  object
 1   DepTime            406045 non-null  object
 2   UniqueCarrier      406045 non-null  object
 3   Origin             406045 non-null  object
 4   Dest               406045 non-null  object
 5   Distance           406045 non-null  int64 
 6   dep_delayed_15min  406045 non-null  int64 
 7   Day_of_Week        406045 non-null  object
dtypes: int64(2), object(6)
memory usage: 24.8+ MB


In [5]:
# Same summary for the holdout frame, to compare its schema with the training frame.
holdout.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101512 entries, 0 to 101511
Data columns (total 8 columns):
 #   Column             Non-Null Count   Dtype 
---  ------             --------------   ----- 
 0   FlightDate         101512 non-null  object
 1   DepTime            101512 non-null  object
 2   UniqueCarrier      101512 non-null  object
 3   Origin             101512 non-null  object
 4   Dest               101512 non-null  object
 5   Distance           101512 non-null  int64 
 6   dep_delayed_15min  101512 non-null  int64 
 7   Day_of_Week        101512 non-null  object
dtypes: int64(2), object(6)
memory usage: 6.2+ MB


In [5]:
# Preview the first ten raw training rows (dates and times are still strings here).
train.head(10)

Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week
0,1/17/10,17:05,MQ,CVG,DFW,812,1,Sunday
1,1/29/10,17:03,MQ,OMA,ORD,416,0,Friday
2,1/31/10,18:03,US,SJC,PHX,622,0,Sunday
3,1/26/10,16:42,YV,MTJ,DEN,197,0,Tuesday
4,1/6/10,17:53,US,PHL,ORD,678,0,Wednesday
5,1/6/10,16:37,WN,SJC,PHX,622,0,Wednesday
6,1/6/10,19:30,US,RDU,CLT,130,0,Wednesday
7,1/28/10,21:10,MQ,MIA,CVG,948,0,Thursday
8,1/19/10,11:04,MQ,DCA,BNA,562,0,Tuesday
9,1/11/10,21:08,DL,MSP,SNA,1522,0,Monday


In [6]:
# Date / Time Transformations - TRAIN
# Raw dates look like '1/17/10' (month/day/two-digit year). Giving the format
# explicitly removes any day-first ambiguity and skips per-value inference.
train['FlightDate'] = pd.to_datetime(train['FlightDate'], format='%m/%d/%y')

# Calendar parts of the flight date.
train['FlightDate_year'] = train['FlightDate'].dt.year
train['FlightDate_month'] = train['FlightDate'].dt.month
train['FlightDate_week'] = train['FlightDate'].dt.isocalendar().week
train['FlightDate_day'] = train['FlightDate'].dt.day

# Parse the 'HH:MM' departure strings once and reuse the result for both parts
# (the raw DepTime column itself is left untouched).
train_dep_time = pd.to_datetime(train['DepTime'], format='%H:%M')
train['DepTime_hour'] = train_dep_time.dt.hour
train['DepTime_min'] = train_dep_time.dt.minute

In [7]:
# Date / Time Transformations - HOLDOUT
# Same derivations as for the training frame, so both frames end up with an
# identical set of feature columns.
# Raw dates are month/day/two-digit year; the explicit format removes any
# day-first ambiguity and skips per-value inference.
holdout['FlightDate'] = pd.to_datetime(holdout['FlightDate'], format='%m/%d/%y')

# Calendar parts of the flight date.
holdout['FlightDate_year'] = holdout['FlightDate'].dt.year
holdout['FlightDate_month'] = holdout['FlightDate'].dt.month
holdout['FlightDate_week'] = holdout['FlightDate'].dt.isocalendar().week
holdout['FlightDate_day'] = holdout['FlightDate'].dt.day

# Parse the 'HH:MM' departure strings once and reuse the result for both parts
# (the raw DepTime column itself is left untouched).
holdout_dep_time = pd.to_datetime(holdout['DepTime'], format='%H:%M')
holdout['DepTime_hour'] = holdout_dep_time.dt.hour
holdout['DepTime_min'] = holdout_dep_time.dt.minute

In [8]:
# Spot-check that the derived date/time columns were added to the holdout frame.
holdout.head()

Unnamed: 0,FlightDate,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,Day_of_Week,FlightDate_year,FlightDate_month,FlightDate_week,FlightDate_day,DepTime_hour,DepTime_min
0,2010-01-11,19:45,OH,DTW,MDW,229,1,Monday,2010,1,2,11,19,45
1,2010-01-12,16:59,WN,SAN,PHX,304,0,Tuesday,2010,1,2,12,16,59
2,2010-01-21,18:52,YV,GJT,DEN,212,0,Thursday,2010,1,3,21,18,52
3,2010-01-08,11:02,WN,ONT,PHX,325,0,Friday,2010,1,1,8,11,2
4,2010-01-22,23:03,US,PHL,MSY,1088,0,Friday,2010,1,3,22,23,3


In [None]:
# EDA (exploratory data analysis)

In [None]:
# Number of training rows per carrier code, most frequent first.
train.UniqueCarrier.value_counts()

In [None]:
# Number of training rows per weekday name, most frequent first.
train.Day_of_Week.value_counts()

In [None]:
# Model build (prototype)

In [12]:
# Break Training into Train & Validation
# Separate the label from the features. The raw FlightDate / DepTime columns
# are dropped as well; the columns derived from them in the earlier cells stay.
X = train.drop(columns=["dep_delayed_15min", "FlightDate", "DepTime"])
y = train.loc[:, "dep_delayed_15min"]

# Identical feature/label layout for the holdout frame.
ho_X = holdout.drop(columns=["dep_delayed_15min", "FlightDate", "DepTime"])
ho_y = holdout.loc[:, "dep_delayed_15min"]

In [13]:
# Keep 20% of the training rows aside for validation; the fixed seed makes the
# split repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21)

In [14]:
# Shapes of the four split pieces, one per line: train X / y, then validation X / y.
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape, sep='\n')

(324836, 11)
(324836,)
(81209, 11)
(81209,)


In [15]:
# PreProcessing Pipeline

# Numericals: columns picked by integer dtype. 'UInt32' is listed next to
# 'int64' presumably so that FlightDate_week (built from .dt.isocalendar())
# is selected too - confirm with X.dtypes.
numeric_feats = X.select_dtypes(include=['int64', 'UInt32']).columns

# Median-impute (with missing-value indicator columns), then standardise.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(add_indicator=True, strategy='median')),
    ('scaler', StandardScaler()),
])

# Categoricals: every object-dtype column.
cat_feats = X.select_dtypes(include=['object']).columns

# Fill gaps with the literal 'missing', then one-hot encode. Categories that
# were not seen during fit are ignored at transform time instead of raising.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(add_indicator=True, strategy='constant', fill_value='missing')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# TODO: concat date and time into one datetime feature group
# (date_time_feats is not defined yet).

# Route each column group through its own transformer.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_feats),
    ('cat', categorical_transformer, cat_feats),
])

In [16]:
# Fit the preprocessing transformers on the training split only.
# NOTE(review): the pipeline cell below fits `preprocessor` again as the first
# step of pipe_works, so this standalone fit looks like a sanity check only.
preprocessor.fit(X_train)

ColumnTransformer(transformers=[('num',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                strategy='median')),
                                                 ('scaler', StandardScaler())]),
                                 Index(['Distance', 'FlightDate_year', 'FlightDate_month', 'FlightDate_week',
       'FlightDate_day', 'DepTime_hour', 'DepTime_min'],
      dtype='object')),
                                ('cat',
                                 Pipeline(steps=[('imputer',
                                                  SimpleImputer(add_indicator=True,
                                                                fill_value='missing',
                                                                strategy='constant')),
                                                 ('onehot',
                                        

In [18]:
# Baseline model: preprocessing followed by a random forest, fitted end-to-end
# on the training split.
# random_state pins the forest's random sampling so the log-loss values
# reported below are reproducible on a rerun (same seed as the train/validation
# split); n_jobs=-1 builds the trees on all available cores.
pipe_works = Pipeline(steps=[('preprocessor', preprocessor),
                            ('model', RandomForestClassifier(random_state=21, n_jobs=-1))])
pipe_works.fit(X_train, y_train)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('num',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('scaler',
                                                                   StandardScaler())]),
                                                  Index(['Distance', 'FlightDate_year', 'FlightDate_month', 'FlightDate_week',
       'FlightDate_day', 'DepTime_hour', 'DepTime_min'],
      dtype='object')),
                                                 ('cat',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(add_indicator=True,
                    

In [19]:
# Predicted class probabilities for the validation split (despite the name,
# these are predictions, not features).
X_test_preds = pipe_works.predict_proba(X_test)

In [21]:
# Log loss of the predicted probabilities against the validation labels (lower is better).
log_loss(y_test, X_test_preds)

0.4146484514510861

In [22]:
# Holdout
# Score the already-fitted pipeline on the holdout frame, which took no part
# in the train/validation split above.
ho_X_preds = pipe_works.predict_proba(ho_X)
log_loss(ho_y, ho_X_preds)

0.4041826985070262

In [None]:
# Cross Validation Pipeline
# Logistic-regression alternative to the random forest, tuned with 5-fold
# cross-validation on the training split.

# max_iter is raised above the default of 100 to reduce the chance of the
# solver stopping before convergence on the wide one-hot encoded feature matrix.
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', LogisticRegression(max_iter=1000))])

# Keys follow the nested step names: pipeline step -> ColumnTransformer entry
# -> inner pipeline step -> parameter.
param_grid = {
    'preprocessor__num__imputer__strategy': ['mean', 'median'],
    'classifier__C': [0.1, 1.0, 10, 100],
}

# Select on log loss (scikit-learn maximises scores, hence the "neg_" scorer)
# so model selection uses the same metric as the random-forest cells above.
grid_search = GridSearchCV(clf, param_grid, cv=5, scoring='neg_log_loss')
grid_search.fit(X_train, y_train)

# Report validation log loss of the best estimator, directly comparable with
# the random-forest log-loss values (lower is better).
print(("best logistic regression from grid search: %.3f"
       % log_loss(y_test, grid_search.predict_proba(X_test))))