# [Flight delays](https://www.kaggle.com/c/flight-delays-spring-2018/overview)
Kaggle InClass competiton by [mlcourse.ai](mlcourse.ai). The task is to predict whether a flight will be delayed for more than 15 minutes

## Initialization

### Perform dataset preparing

In [7]:
# pip install nbformat
# execute `output.show()` to show output figures and text (if they are)
%%capture output  
%run ./2018-12-12-armavox-prepare-dataset.ipynb

### Init required  modules

__Notebook environment__

In [18]:
# pip install watermark
%load_ext watermark
%watermark -v -m -r -b -g -p numpy,pandas,sklearn,matplotlib,statsmodels,xgboost,catboost

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 3.7.1
IPython 7.2.0

numpy 1.15.4
pandas 0.23.4
sklearn 0.20.1
matplotlib 3.0.2
statsmodels 0.9.0
xgboost 0.81
catboost 0.11.2

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 16.7.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit
Git hash   : 2c95db525d0633c49aa9b273528585fd06366b96
Git repo   : https://github.com/armavox/flight-delays-spring-2018.git
Git branch : master


In [96]:
import numpy as np
import pandas as pd
import itertools
import statsmodels.stats.weightstats as wsts
import scipy.stats as stats
from scipy.sparse import hstack, csr_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['patch.force_edgecolor'] = True

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import (train_test_split, StratifiedKFold, 
                                     cross_val_score, GridSearchCV)
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

## Data preparation

### IMPORT DATA

In [14]:
train = pd.read_csv('../data/train.csv', index_col='idx')
train.head(3)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,8,21,7,AA,ATL,DFW,732.0,19,34,1
1,4,20,3,US,PIT,MCO,834.0,15,48,0
2,9,2,5,XE,RDU,CLE,416.0,14,22,0


In [15]:
test = pd.read_csv('../data/test.csv', index_col='idx')
test.head(3)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,7,25,3,YV,MRY,PHX,598.0,6,15,0
1,4,17,2,WN,LAS,HOU,1235.0,7,39,0
2,12,2,7,MQ,GSP,ORD,577.0,6,51,1


In [17]:
target = pd.read_csv('../data/target.csv', index_col='idx')
target.head(3)

Unnamed: 0_level_0,dep_delayed_15min
idx,Unnamed: 1_level_1
0,0
1,0
2,0


__Origin_Dest interaction: ``Route`` feature__

In [38]:
train['Route'] = train['Origin'] + '_' + train['Dest']
test['Route'] = test['Origin'] + '_' + test['Dest']

### FEATURES CONVERSION

In [68]:
train.head(1)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend,Route
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,8,21,7,AA,ATL,DFW,732.0,19,34,1,ATL_DFW


In [44]:
ohe = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')

In [45]:
X_month_train = ohe.fit_transform(train.Month.values.reshape(-1, 1))
X_month_test = ohe.transform(test.Month.values.reshape(-1, 1))

In [46]:
X_dom_train = ohe.fit_transform(train.DayofMonth.values.reshape(-1, 1))
X_dom_test = ohe.transform(test.DayofMonth.values.reshape(-1, 1))

In [47]:
X_dow_train = ohe.fit_transform(train.DayOfWeek.values.reshape(-1, 1))
X_dow_test = ohe.transform(test.DayOfWeek.values.reshape(-1, 1))

In [62]:
X_hour_train = ohe.fit_transform(train.Hour.values.reshape(-1, 1))
X_hour_test = ohe.transform(test.Hour.values.reshape(-1, 1))

In [63]:
X_minute_train = ohe.fit_transform(train.Minute.values.reshape(-1, 1))
X_minute_test = ohe.transform(test.Minute.values.reshape(-1, 1))

In [64]:
X_isweekend_train = train.IsWeekend.values.reshape(-1, 1)
X_isweekend_test = test.IsWeekend.values.reshape(-1, 1)

In [48]:
X_carrier_train = ohe.fit_transform(train.UniqueCarrier.values.reshape(-1, 1))
X_carrier_test = ohe.transform(test.UniqueCarrier.values.reshape(-1, 1))

In [49]:
X_origin_train = ohe.fit_transform(train.Origin.values.reshape(-1, 1))
X_origin_test = ohe.transform(test.Origin.values.reshape(-1, 1))

In [50]:
X_dest_train = ohe.fit_transform(train.Dest.values.reshape(-1, 1))
X_dest_test = ohe.transform(test.Dest.values.reshape(-1, 1))

In [51]:
X_route_train = ohe.fit_transform(train.Route.values.reshape(-1, 1))
X_route_test = ohe.fit_transform(test.Route.values.reshape(-1, 1))

### SELECT FEATURES

In [106]:
def simple_xgb_cv(X, y, n_estimators=27, max_depth=5, seed=42,
                   train_size=0.7):
    """Get ROC-AUC score for simple XGBoost classifier
    """
    skf = StratifiedKFold(n_splits=5, random_state=seed)
    
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=train_size, random_state=seed)

    xgb = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                        random_state=seed, n_jobs=-1).fit(X_train, y_train)
    
    roc_auc = roc_auc_score(y_valid, xgb.predict_proba(X_valid)[:, 1],
                            average='samples')
    
    cv_score = cross_val_score(xgb, X_train, y_train, scoring='roc_auc', 
                               cv=skf, n_jobs=-1)
    return round(roc_auc, 5), round(cv_score.mean(), 5)

In [None]:
%%time
features = {'X_month_train': X_month_train, 
            'X_dom_train': X_dom_train, 
            'X_dow_train': X_dow_train, 
            'X_hour_train': X_hour_train, 
            'X_minute_train': X_minute_train,
            'X_isweekend_train': X_isweekend_train,
            'X_carrier_train': X_carrier_train,
            'X_origin_train': X_origin_train,
            'X_dest_train': X_dest_train,
            'X_route_train': X_route_train}

def select_best_feature_comb(features_dict):
    from scipy.sparse import hstack, csr_matrix, isspmatrix_csr
    
    results_dict = {}
    for l in range(1, len(features)+1):

        list_of_keys = [list(comb) for comb in 
                        itertools.combinations(features.keys(), l)]
        list_of_combs = [list(comb) for comb in 
                         itertools.combinations(features.values(), l)]

        for keys, comb in zip(list_of_keys, list_of_combs):
            print('Train on:', keys)
            
            is_csr = [isspmatrix_csr(x) for x in comb]
            if any(is_csr):
                X_con = hstack(comb, format='csr')
            else:
                comb[0] = csr_matrix(comb[0])
                X_con = hstack(comb, format='csr')
                
            result = simple_xgb_cv(X_con, y)
            results_dict[', '.join(keys)] = result
            print(f'CV: {result[1]}, OOF: {result[0]}', '\n')
            
    return results_dict

select_best_feature_comb(features)

Train on: ['X_month_train']
CV: 0.55243, OOF: 0.5521 

Train on: ['X_dom_train']
CV: 0.52353, OOF: 0.52167 

Train on: ['X_dow_train']
CV: 0.53438, OOF: 0.53991 

Train on: ['X_hour_train']
CV: 0.68467, OOF: 0.68235 

Train on: ['X_minute_train']
CV: 0.52264, OOF: 0.52692 

Train on: ['X_isweekend_train']
CV: 0.50765, OOF: 0.51224 

Train on: ['X_carrier_train']
CV: 0.54553, OOF: 0.54288 

Train on: ['X_origin_train']
CV: 0.54804, OOF: 0.55324 

Train on: ['X_dest_train']
CV: 0.54306, OOF: 0.54792 

Train on: ['X_route_train']
CV: 0.5097, OOF: 0.50741 

Train on: ['X_month_train', 'X_dom_train']
CV: 0.57066, OOF: 0.5735 

Train on: ['X_month_train', 'X_dow_train']
CV: 0.56612, OOF: 0.57273 

Train on: ['X_month_train', 'X_hour_train']
CV: 0.68989, OOF: 0.68917 

Train on: ['X_month_train', 'X_minute_train']
CV: 0.5537, OOF: 0.55604 

Train on: ['X_month_train', 'X_isweekend_train']
CV: 0.55462, OOF: 0.55599 

Train on: ['X_month_train', 'X_carrier_train']
CV: 0.5734, OOF: 0.56957 

Tra

CV: 0.68427, OOF: 0.68263 

Train on: ['X_dom_train', 'X_minute_train', 'X_isweekend_train']
CV: 0.53193, OOF: 0.53579 

Train on: ['X_dom_train', 'X_minute_train', 'X_carrier_train']
CV: 0.55224, OOF: 0.55165 

Train on: ['X_dom_train', 'X_minute_train', 'X_origin_train']
CV: 0.55583, OOF: 0.55908 

Train on: ['X_dom_train', 'X_minute_train', 'X_dest_train']
CV: 0.54802, OOF: 0.55253 

Train on: ['X_dom_train', 'X_minute_train', 'X_route_train']
CV: 0.52819, OOF: 0.53182 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.54898, OOF: 0.55237 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.55625, OOF: 0.55808 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.55016, OOF: 0.55029 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_route_train']
CV: 0.5305, OOF: 0.53136 

Train on: ['X_dom_train', 'X_carrier_train', 'X_origin_train']
CV: 0.56726, OOF: 0.57053 

Train on: ['X_dom_train', 'X_carrier_train', 'X_dest_train']


CV: 0.5769, OOF: 0.58115 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_route_train']
CV: 0.56683, OOF: 0.56899 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.57824, OOF: 0.57834 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.57902, OOF: 0.58144 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.57678, OOF: 0.58175 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_route_train']
CV: 0.567, OOF: 0.56867 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_origin_train']
CV: 0.588, OOF: 0.59089 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_dest_train']
CV: 0.58601, OOF: 0.58416 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_route_train']
CV: 0.57693, OOF: 0.57264 

Train on: ['X_month_train', 'X_dom_train', 'X_origin_train', 'X_dest_train']
CV: 0.58677, OOF: 0.59226 

Train o

CV: 0.56252, OOF: 0.57023 

Train on: ['X_dom_train', 'X_dow_train', 'X_minute_train', 'X_route_train']
CV: 0.55069, OOF: 0.55729 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.56592, OOF: 0.5663 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.56766, OOF: 0.57229 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.56267, OOF: 0.57191 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_route_train']
CV: 0.55029, OOF: 0.55947 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_origin_train']
CV: 0.57667, OOF: 0.58286 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_dest_train']
CV: 0.5765, OOF: 0.57552 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_route_train']
CV: 0.56479, OOF: 0.56651 

Train on: ['X_dom_train', 'X_dow_train', 'X_origin_train', 'X_dest_train']
CV: 0.57713, OOF: 0.58531 

Train on: ['X_dom_trai

CV: 0.5595, OOF: 0.56325 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train']
CV: 0.58491, OOF: 0.58941 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.57266, OOF: 0.57813 

Train on: ['X_dow_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.57376, OOF: 0.57365 

Train on: ['X_dow_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.57442, OOF: 0.58031 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.68819, OOF: 0.68618 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.68684, OOF: 0.68481 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.68597, OOF: 0.6848 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.68442, OOF: 0.68239 

Train on: ['X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69015, OOF: 0.

CV: 0.6955, OOF: 0.69451 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_carrier_train', 'X_route_train']
CV: 0.69422, OOF: 0.69247 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_origin_train', 'X_dest_train']
CV: 0.6933, OOF: 0.6921 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_origin_train', 'X_route_train']
CV: 0.69293, OOF: 0.69178 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_dest_train', 'X_route_train']
CV: 0.69214, OOF: 0.69205 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.58088, OOF: 0.58152 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.57857, OOF: 0.58149 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.57671, OOF: 0.5824 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.56

CV: 0.69338, OOF: 0.69231 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.69251, OOF: 0.69156 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.69227, OOF: 0.69083 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.6911, OOF: 0.69097 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69416, OOF: 0.69448 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train']
CV: 0.69514, OOF: 0.69386 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_route_train']
CV: 0.69384, OOF: 0.69256 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train']
CV: 0.69306, OOF: 0.6914 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_origin_train', 'X_ro

CV: 0.68627, OOF: 0.68483 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.68425, OOF: 0.68267 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.6893, OOF: 0.68912 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train']
CV: 0.68901, OOF: 0.68827 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_route_train']
CV: 0.68809, OOF: 0.68604 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.68755, OOF: 0.68521 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.68702, OOF: 0.68458 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.68619, OOF: 0.68451 

Train on: ['X_dom_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.688

CV: 0.57438, OOF: 0.57981 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.58501, OOF: 0.58992 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.68985, OOF: 0.68842 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train']
CV: 0.68912, OOF: 0.68782 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_route_train']
CV: 0.68789, OOF: 0.68595 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train']
CV: 0.68753, OOF: 0.68528 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_route_train']
CV: 0.68692, OOF: 0.68444 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train', 'X_route_train']
CV: 0.68586, OOF: 0.68459 

Train on: ['X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_

CV: 0.69006, OOF: 0.69016 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69448, OOF: 0.69408 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train']
CV: 0.6947, OOF: 0.69476 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_route_train']
CV: 0.69432, OOF: 0.69047 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.69356, OOF: 0.69213 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.69311, OOF: 0.69249 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.6911, OOF: 0.69211 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69444, OOF:

### CONCATENATE DATA

In [60]:
y = target.dep_delayed_15min.values
print("Classes in dataset:", np.unique(y)) 
print('Size:', y.shape)
print("positive objects:", y.sum())

Classes in dataset: [0 1]
Size: (99994,)
positive objects: 19038


In [None]:
X_month_train', 'X_dom_train', 'X_dow_train', 'X_hour_train', 'X_carrier_train

In [None]:
X = hstack([X_month_train, X_dom_train,
            X_dow_train, X_] , format='csr')

X_test = hstack([X_tfidf_test, X_hour_test, 
                 X_dow_test, X_daytime_test, 
                 X_timespan_test, X_unique_test,
                 X_intop10_test, X_socnet_test], format='csr')

X.shape, X_test.shape, y.shape

## XGBoost TRAINING

### Simple XGBoost