# [Flight delays](https://www.kaggle.com/c/flight-delays-spring-2018/overview)
Kaggle InClass competiton by [mlcourse.ai](mlcourse.ai). The task is to predict whether a flight will be delayed for more than 15 minutes

## Initialization

### Perform dataset preparing

In [2]:
%%capture output  
# pip install nbformat
# execute `output.show()` to show output figures and text (if they are)
%run ./2018-12-12-armavox-prepare-dataset.ipynb

### Init required  modules

__Notebook environment__

In [3]:
# pip install watermark
%load_ext watermark
%watermark -v -m -r -b -g -p numpy,pandas,sklearn,matplotlib,statsmodels,xgboost,catboost

The watermark extension is already loaded. To reload it, use:
  %reload_ext watermark
CPython 3.7.1
IPython 7.2.0

numpy 1.15.4
pandas 0.23.4
sklearn 0.20.1
matplotlib 3.0.2
statsmodels 0.9.0
xgboost 0.81
catboost 0.11.2

compiler   : Clang 4.0.1 (tags/RELEASE_401/final)
system     : Darwin
release    : 16.7.0
machine    : x86_64
processor  : i386
CPU cores  : 8
interpreter: 64bit
Git hash   : 0ffb443328315699e8f94d7f4131f1ea46f19d58
Git repo   : https://github.com/armavox/flight-delays-fall-2018.git
Git branch : master


In [5]:
import numpy as np
import pandas as pd
import itertools
import statsmodels.stats.weightstats as wsts
import scipy.stats as stats
from scipy.sparse import hstack, csr_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['patch.force_edgecolor'] = True

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import (train_test_split, StratifiedKFold, 
                                     cross_val_score, GridSearchCV)
from sklearn.metrics import roc_auc_score

from xgboost import XGBClassifier

import warnings
warnings.filterwarnings('ignore')

## Data preparation

### IMPORT DATA

In [6]:
train = pd.read_csv('../data/train.csv', index_col='idx')
train.head(3)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,8,21,7,AA,ATL,DFW,732.0,19,34,1
1,4,20,3,US,PIT,MCO,834.0,15,48,0
2,9,2,5,XE,RDU,CLE,416.0,14,22,0


In [7]:
test = pd.read_csv('../data/test.csv', index_col='idx')
test.head(3)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,7,25,3,YV,MRY,PHX,598.0,6,15,0
1,4,17,2,WN,LAS,HOU,1235.0,7,39,0
2,12,2,7,MQ,GSP,ORD,577.0,6,51,1


In [8]:
target = pd.read_csv('../data/target.csv', index_col='idx')
target.head(3)

Unnamed: 0_level_0,dep_delayed_15min
idx,Unnamed: 1_level_1
0,0
1,0
2,0


__Origin_Dest interaction: ``Route`` feature__

In [9]:
train['Route'] = train['Origin'] + '_' + train['Dest']
test['Route'] = test['Origin'] + '_' + test['Dest']

### FEATURES CONVERSION

In [10]:
train.head(1)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend,Route
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,8,21,7,AA,ATL,DFW,732.0,19,34,1,ATL_DFW


In [11]:
ohe = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')

In [12]:
X_month_train = ohe.fit_transform(train.Month.values.reshape(-1, 1))
X_month_test = ohe.transform(test.Month.values.reshape(-1, 1))

In [13]:
X_dom_train = ohe.fit_transform(train.DayofMonth.values.reshape(-1, 1))
X_dom_test = ohe.transform(test.DayofMonth.values.reshape(-1, 1))

In [14]:
X_dow_train = ohe.fit_transform(train.DayOfWeek.values.reshape(-1, 1))
X_dow_test = ohe.transform(test.DayOfWeek.values.reshape(-1, 1))

In [15]:
X_hour_train = ohe.fit_transform(train.Hour.values.reshape(-1, 1))
X_hour_test = ohe.transform(test.Hour.values.reshape(-1, 1))

In [16]:
X_minute_train = ohe.fit_transform(train.Minute.values.reshape(-1, 1))
X_minute_test = ohe.transform(test.Minute.values.reshape(-1, 1))

In [17]:
X_isweekend_train = train.IsWeekend.values.reshape(-1, 1)
X_isweekend_test = test.IsWeekend.values.reshape(-1, 1)

In [18]:
X_carrier_train = ohe.fit_transform(train.UniqueCarrier.values.reshape(-1, 1))
X_carrier_test = ohe.transform(test.UniqueCarrier.values.reshape(-1, 1))

In [19]:
X_origin_train = ohe.fit_transform(train.Origin.values.reshape(-1, 1))
X_origin_test = ohe.transform(test.Origin.values.reshape(-1, 1))

In [20]:
X_dest_train = ohe.fit_transform(train.Dest.values.reshape(-1, 1))
X_dest_test = ohe.transform(test.Dest.values.reshape(-1, 1))

In [21]:
X_route_train = ohe.fit_transform(train.Route.values.reshape(-1, 1))
X_route_test = ohe.fit_transform(test.Route.values.reshape(-1, 1))

### SELECT FEATURES

In [22]:
def simple_xgb_cv(X, y, n_estimators=27, max_depth=5, seed=42,
                   train_size=0.7):
    """Get ROC-AUC score for simple XGBoost classifier
    """
    skf = StratifiedKFold(n_splits=5, random_state=seed)
    
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=train_size, random_state=seed)

    xgb = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                        random_state=seed, n_jobs=-1).fit(X_train, y_train)
    
    roc_auc = roc_auc_score(y_valid, xgb.predict_proba(X_valid)[:, 1],
                            average='samples')
    
    cv_score = cross_val_score(xgb, X_train, y_train, scoring='roc_auc', 
                               cv=skf, n_jobs=-1)
    return round(roc_auc, 5), round(cv_score.mean(), 5)

In [107]:
%%time
features = {'X_month_train': X_month_train, 
            'X_dom_train': X_dom_train, 
            'X_dow_train': X_dow_train, 
            'X_hour_train': X_hour_train, 
            'X_minute_train': X_minute_train,
            'X_isweekend_train': X_isweekend_train,
            'X_carrier_train': X_carrier_train,
            'X_origin_train': X_origin_train,
            'X_dest_train': X_dest_train,
            'X_route_train': X_route_train}

def select_best_feature_comb(features_dict):
    from scipy.sparse import hstack, csr_matrix, isspmatrix_csr
    
    results_dict = {}
    for l in range(1, len(features)+1):

        list_of_keys = [list(comb) for comb in 
                        itertools.combinations(features.keys(), l)]
        list_of_combs = [list(comb) for comb in 
                         itertools.combinations(features.values(), l)]

        for keys, comb in zip(list_of_keys, list_of_combs):
            print('Train on:', keys)
            
            is_csr = [isspmatrix_csr(x) for x in comb]
            if any(is_csr):
                X_con = hstack(comb, format='csr')
            else:
                comb[0] = csr_matrix(comb[0])
                X_con = hstack(comb, format='csr')
                
            result = simple_xgb_cv(X_con, y)
            results_dict[', '.join(keys)] = result
            print(f'CV: {result[1]}, OOF: {result[0]}', '\n')
            
    return results_dict

select_best_feature_comb(features)

Train on: ['X_month_train']
CV: 0.55243, OOF: 0.5521 

Train on: ['X_dom_train']
CV: 0.52353, OOF: 0.52167 

Train on: ['X_dow_train']
CV: 0.53438, OOF: 0.53991 

Train on: ['X_hour_train']
CV: 0.68467, OOF: 0.68235 

Train on: ['X_minute_train']
CV: 0.52264, OOF: 0.52692 

Train on: ['X_isweekend_train']
CV: 0.50765, OOF: 0.51224 

Train on: ['X_carrier_train']
CV: 0.54553, OOF: 0.54288 

Train on: ['X_origin_train']
CV: 0.54804, OOF: 0.55324 

Train on: ['X_dest_train']
CV: 0.54306, OOF: 0.54792 

Train on: ['X_route_train']
CV: 0.5097, OOF: 0.50741 

Train on: ['X_month_train', 'X_dom_train']
CV: 0.57066, OOF: 0.5735 

Train on: ['X_month_train', 'X_dow_train']
CV: 0.56612, OOF: 0.57273 

Train on: ['X_month_train', 'X_hour_train']
CV: 0.68989, OOF: 0.68917 

Train on: ['X_month_train', 'X_minute_train']
CV: 0.5537, OOF: 0.55604 

Train on: ['X_month_train', 'X_isweekend_train']
CV: 0.55462, OOF: 0.55599 

Train on: ['X_month_train', 'X_carrier_train']
CV: 0.5734, OOF: 0.56957 

Tra

CV: 0.68427, OOF: 0.68263 

Train on: ['X_dom_train', 'X_minute_train', 'X_isweekend_train']
CV: 0.53193, OOF: 0.53579 

Train on: ['X_dom_train', 'X_minute_train', 'X_carrier_train']
CV: 0.55224, OOF: 0.55165 

Train on: ['X_dom_train', 'X_minute_train', 'X_origin_train']
CV: 0.55583, OOF: 0.55908 

Train on: ['X_dom_train', 'X_minute_train', 'X_dest_train']
CV: 0.54802, OOF: 0.55253 

Train on: ['X_dom_train', 'X_minute_train', 'X_route_train']
CV: 0.52819, OOF: 0.53182 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.54898, OOF: 0.55237 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.55625, OOF: 0.55808 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.55016, OOF: 0.55029 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_route_train']
CV: 0.5305, OOF: 0.53136 

Train on: ['X_dom_train', 'X_carrier_train', 'X_origin_train']
CV: 0.56726, OOF: 0.57053 

Train on: ['X_dom_train', 'X_carrier_train', 'X_dest_train']


CV: 0.5769, OOF: 0.58115 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_route_train']
CV: 0.56683, OOF: 0.56899 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.57824, OOF: 0.57834 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.57902, OOF: 0.58144 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.57678, OOF: 0.58175 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_route_train']
CV: 0.567, OOF: 0.56867 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_origin_train']
CV: 0.588, OOF: 0.59089 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_dest_train']
CV: 0.58601, OOF: 0.58416 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_route_train']
CV: 0.57693, OOF: 0.57264 

Train on: ['X_month_train', 'X_dom_train', 'X_origin_train', 'X_dest_train']
CV: 0.58677, OOF: 0.59226 

Train o

CV: 0.56252, OOF: 0.57023 

Train on: ['X_dom_train', 'X_dow_train', 'X_minute_train', 'X_route_train']
CV: 0.55069, OOF: 0.55729 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.56592, OOF: 0.5663 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.56766, OOF: 0.57229 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.56267, OOF: 0.57191 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_route_train']
CV: 0.55029, OOF: 0.55947 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_origin_train']
CV: 0.57667, OOF: 0.58286 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_dest_train']
CV: 0.5765, OOF: 0.57552 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_route_train']
CV: 0.56479, OOF: 0.56651 

Train on: ['X_dom_train', 'X_dow_train', 'X_origin_train', 'X_dest_train']
CV: 0.57713, OOF: 0.58531 

Train on: ['X_dom_trai

CV: 0.5595, OOF: 0.56325 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train']
CV: 0.58491, OOF: 0.58941 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.57266, OOF: 0.57813 

Train on: ['X_dow_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.57376, OOF: 0.57365 

Train on: ['X_dow_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.57442, OOF: 0.58031 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.68819, OOF: 0.68618 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.68684, OOF: 0.68481 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.68597, OOF: 0.6848 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.68442, OOF: 0.68239 

Train on: ['X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69015, OOF: 0.

CV: 0.6955, OOF: 0.69451 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_carrier_train', 'X_route_train']
CV: 0.69422, OOF: 0.69247 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_origin_train', 'X_dest_train']
CV: 0.6933, OOF: 0.6921 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_origin_train', 'X_route_train']
CV: 0.69293, OOF: 0.69178 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_dest_train', 'X_route_train']
CV: 0.69214, OOF: 0.69205 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.58088, OOF: 0.58152 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.57857, OOF: 0.58149 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.57671, OOF: 0.5824 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.56

CV: 0.69338, OOF: 0.69231 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.69251, OOF: 0.69156 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.69227, OOF: 0.69083 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.6911, OOF: 0.69097 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69416, OOF: 0.69448 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train']
CV: 0.69514, OOF: 0.69386 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_route_train']
CV: 0.69384, OOF: 0.69256 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train']
CV: 0.69306, OOF: 0.6914 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_origin_train', 'X_ro

CV: 0.68627, OOF: 0.68483 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.68425, OOF: 0.68267 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.6893, OOF: 0.68912 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train']
CV: 0.68901, OOF: 0.68827 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_route_train']
CV: 0.68809, OOF: 0.68604 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.68755, OOF: 0.68521 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.68702, OOF: 0.68458 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.68619, OOF: 0.68451 

Train on: ['X_dom_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.688

CV: 0.57438, OOF: 0.57981 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.58501, OOF: 0.58992 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.68985, OOF: 0.68842 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train']
CV: 0.68912, OOF: 0.68782 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_route_train']
CV: 0.68789, OOF: 0.68595 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train']
CV: 0.68753, OOF: 0.68528 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_route_train']
CV: 0.68692, OOF: 0.68444 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train', 'X_route_train']
CV: 0.68586, OOF: 0.68459 

Train on: ['X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_

CV: 0.69006, OOF: 0.69016 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69448, OOF: 0.69408 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train']
CV: 0.6947, OOF: 0.69476 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_route_train']
CV: 0.69432, OOF: 0.69047 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.69356, OOF: 0.69213 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.69311, OOF: 0.69249 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.6911, OOF: 0.69211 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69444, OOF:

CV: 0.59977, OOF: 0.60511 

Train on: ['X_month_train', 'X_dow_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.59382, OOF: 0.5981 

Train on: ['X_month_train', 'X_dow_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.59288, OOF: 0.59386 

Train on: ['X_month_train', 'X_dow_train', 'X_minute_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.59102, OOF: 0.59483 

Train on: ['X_month_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train']
CV: 0.59951, OOF: 0.60337 

Train on: ['X_month_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.59359, OOF: 0.59795 

Train on: ['X_month_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.59226, OOF: 0.59242 

Train on: ['X_month_train', 'X_dow_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train', 'X_route_train']

CV: 0.5782, OOF: 0.57903 

Train on: ['X_dom_train', 'X_dow_train', 'X_minute_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.57616, OOF: 0.5848 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train']
CV: 0.58672, OOF: 0.59054 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.57588, OOF: 0.58262 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.57695, OOF: 0.57566 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.57776, OOF: 0.58468 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.58736, OOF: 0.59256 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.689

CV: 0.69369, OOF: 0.69318 

Train on: ['X_month_train', 'X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.69241, OOF: 0.69354 

Train on: ['X_month_train', 'X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.69037, OOF: 0.68998 

Train on: ['X_month_train', 'X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69569, OOF: 0.69451 

Train on: ['X_month_train', 'X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train']
CV: 0.69512, OOF: 0.69637 

Train on: ['X_month_train', 'X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_route_train']
CV: 0.69429, OOF: 0.69257 

Train on: ['X_month_train', 'X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.69383, OOF: 0.69228 

Train on: ['X_month_train', 'X_dom_

CV: 0.58744, OOF: 0.59288 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.5961, OOF: 0.59898 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.59505, OOF: 0.59971 

Train on: ['X_month_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69481, OOF: 0.69394 

Train on: ['X_month_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train']
CV: 0.69578, OOF: 0.69385 

Train on: ['X_month_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_route_train']
CV: 0.69375, OOF: 0.69262 

Train on: ['X_month_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train']
CV: 0.69371, OOF: 0.6916 

Train o

CV: 0.69078, OOF: 0.68832 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.68938, OOF: 0.68797 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.68871, OOF: 0.68867 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.68755, OOF: 0.68529 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.69045, OOF: 0.68831 

Train on: ['X_dom_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.69034, OOF: 0.68823 

Train on: ['X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.5825, OOF: 0.58633 

Train o

CV: 0.69574, OOF: 0.69457 

Train on: ['X_month_train', 'X_dow_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.59932, OOF: 0.60609 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.69456, OOF: 0.69408 

Train on: ['X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train']
CV: 0.69423, OOF: 0.6952 

Train on: ['X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.69259, OOF: 0.69059 

Train on: ['X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.69162, OOF: 0.69342 

Train on: ['X_dom_train', 'X_dow_train', 'X_hour_train', 'X_minute_train', 'X_isw

### CONCATENATE DATA

In [36]:
y = target.dep_delayed_15min.values
print("Classes in dataset:", np.unique(y)) 
print('Size:', y.shape)
print("positive objects:", y.sum())
balance_coef = np.sum(y==0) /  np.sum(y==1)

Classes in dataset: [0 1]
Size: (99994,)
positive objects: 19038


In [41]:
# Best feature combination

X = np.hstack([
    X_month_train,
    X_dom_train,
    X_dow_train,
    X_hour_train,
    X_minute_train,
    X_isweekend_train,
    X_carrier_train,
#     X_origin_train
    X_dest_train,
#     X_route_train
])

X_test = np.hstack([
    X_month_test,
    X_dom_test,
    X_dow_test,
    X_hour_test,
    X_minute_test,
    X_isweekend_test,
    X_carrier_test,
#     X_origin_test,
    X_dest_test,
#     X_route_test
])

X.shape, X_test.shape, y.shape

((99994, 447), (100000, 447), (99994,))

## XGBoost TUNING

### Simple XGBoost

In [88]:
X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.7, random_state=42)
skf = StratifiedKFold(n_splits=5, random_state=42)

In [89]:
xgb = XGBClassifier(n_estimators=300, max_depth=3, random_state=42, n_jobs=-1)

In [90]:
%%time
xgb.fit(X_train, y_train)
print(roc_auc_score(y_valid, xgb.predict_proba(X_valid)[:, 1]))

0.7219444017255943
CPU times: user 6min 6s, sys: 1.72 s, total: 6min 8s
Wall time: 6min 10s


In [52]:
_cv_score = cross_val_score(xgb, X, y, scoring='roc_auc', cv=skf, n_jobs=-1)
_cv_score.mean(), _cv_score.std()

### XGB CV

In [63]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

#### Iteration #1. Model complexity

In [51]:
_xgb_grid_params_iteration1 = {
    'colsample_bytree': np.linspace(0.4, 1, 5),
    'gamma': np.linspace(0.5, 1, 5),
    'max_depth': np.arange(1, 11)
    'min_child_weight': np.arange(1,11),
    'reg_alpha': np.logspace(-2, 2, 8),
    'reg_lambda': np.logspace(-2, 2, 8),
    'subsample': np.linspace(0.5, 1, 8)
}

In [68]:
%%time
xgb = XGBClassifier(n_estimators=30, 
                    scale_pos_weight=balance_coef,
                    random_state=42, n_jobs=-1)

xgb_search1 = RandomizedSearchCV(xgb, _xgb_grid_params_iteration1, 
                                 n_iter=1000, cv=skf, scoring='roc_auc', 
                                 random_state=42, n_jobs=-1, verbose=1)
xgb_search1.fit(X_train, y_train)

Fitting 5 folds for each of 500 candidates, totalling 2500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


KeyboardInterrupt: 

In [57]:
xgbest1 = xgb_search1.best_estimator_
xgb_best_complexity = xgb_search1.best_params_
xgb_search1.best_score_, xgb_search1.best_params_

AttributeError: 'RandomizedSearchCV' object has no attribute 'best_estimator_'

In [87]:
print(f"""ROC-AUC on the validation data: 
{roc_auc_score(y_valid, xgbest1.predict_proba(X_valid)[:, 1]):.5f}""")

ROC-AUC on the validation data: 
0.58696


#### Iteration #2. Model optimization

In [74]:
_xgb_grid_params_iteration2 = {
    'n_estimators': np.linspace(100, 1000, 10, dtype='int'),
    'learning_rate': np.arange(0.005, 0.1, 0.005)
}

In [78]:
%%time

xgb_search2 = GridSearchCV(xgbest1, _xgb_grid_params_iteration2,
                                cv=skf, scoring='roc_auc', 
                                n_jobs=-1, verbose=1)
xgb_search2.fit(X_train, y_train)

NameError: name 'xgb_best_complexity' is not defined

In [76]:
xgbest2 = xgb_search2.best_estimator_
xgb_best_complexity = xgb_search2.best_params_
xgb_search2.best_score_, xgb_search2.best_params_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=99,
       n_jobs=-1, nthread=None, objective='binary:logistic',
       random_state=42, reg_alpha=0, reg_lambda=1,
       scale_pos_weight=4.252337430402354, seed=None, silent=True,
       subsample=1)

In [None]:
print(f"""ROC-AUC on the validation data: 
{roc_auc_score(y_valid, xgbest2.predict_proba(X_valid)[:, 1]):.5f}""")

## SUBMIT

### Last check

In [92]:
X.shape, X_test.shape, y.shape

((99994, 447), (100000, 447), (99994,))

In [None]:
final_estimator = xgbest2
final_estimator

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_set=0.9, 
                                                      random_state=42)

_cv_score = cross_val_score(final_estimator, X, y, scoring='roc_auc', cv=skf,
                            n_jobs=-1)

final_estimator.fit(X_train, y_train)
_roc_auc = roc_auc_score(y_valid, final_estimator.predic_proba(X_valid)[:, 1])

print(f'CV: {_cv_score:.5f} \n ROC-AUC: {_roc_auc}')

### Train on the full dataset

In [None]:
%%time
final_estimator.fit(X, y)
final_pred = final_estimator.predict_proba(X_test)[: 1]

### Write submission

In [91]:
# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file, 
                             target='dep_delayed_15min', index_label="id"):
    
    predicted_df = pd.DataFrame(
        predicted_labels,
        index = np.arange(0, predicted_labels.shape[0]),
        columns=[target])
    
    predicted_df.to_csv(out_file, index_label=index_label)

In [None]:
from datetime import datetime as dt
import subprocess
now = dt.now().strftime("%Y-%m-%d_%H-%M-%S")
label = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")

### WRITE SUBMISSION
write_to_submission_file(final_pred, f'../submissions/catboost_submission_at_{now}__githash_{label}.csv')