# [Flight delays](https://www.kaggle.com/c/flight-delays-spring-2018/overview)
Kaggle InClass competiton by [mlcourse.ai](mlcourse.ai). The task is to predict whether a flight will be delayed for more than 15 minutes

## Initialization

### Perform dataset preparing

In [7]:
# pip install nbformat
# execute `output.show()` to show output figures and text (if they are)
%%capture output  
%run ./2018-12-12-armavox-prepare-dataset.ipynb

### Init required  modules

__Notebook environment__

In [1]:
# pip install watermark
%load_ext watermark
%watermark -v -m -r -b -g -p numpy,pandas,sklearn,matplotlib,statsmodels,xgboost,catboost

CPython 3.7.1
IPython 7.2.0

numpy 1.15.4
pandas 0.23.4
sklearn 0.20.1
matplotlib 2.2.2
statsmodels 0.9.0
xgboost 0.81
catboost 0.11.2

compiler   : GCC 7.3.0
system     : Linux
release    : 4.14.77-70.59.amzn1.x86_64
machine    : x86_64
processor  : x86_64
CPU cores  : 72
interpreter: 64bit
Git hash   : 2a0916569c3397dfd16e42762fdfc89e2ace2fb4
Git repo   : https://github.com/armavox/flight-delays-spring-2018.git
Git branch : master


In [22]:
import numpy as np
import pandas as pd
import itertools
import statsmodels.stats.weightstats as wsts
import scipy.stats as stats
from scipy.sparse import hstack, csr_matrix, issparse

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['patch.force_edgecolor'] = True

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import (train_test_split, StratifiedKFold, 
                                     cross_val_score, GridSearchCV)
from sklearn.metrics import roc_auc_score

from sklearn.ensemble import RandomForestClassifier

import warnings
warnings.filterwarnings('ignore')

## Data preparation

### IMPORT DATA

In [3]:
train = pd.read_csv('../data/train.csv', index_col='idx')
train.head(3)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,8,21,7,AA,ATL,DFW,732.0,19,34,1
1,4,20,3,US,PIT,MCO,834.0,15,48,0
2,9,2,5,XE,RDU,CLE,416.0,14,22,0


In [4]:
test = pd.read_csv('../data/test.csv', index_col='idx')
test.head(3)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,7,25,3,YV,MRY,PHX,598.0,6,15,0
1,4,17,2,WN,LAS,HOU,1235.0,7,39,0
2,12,2,7,MQ,GSP,ORD,577.0,6,51,1


In [5]:
target = pd.read_csv('../data/target.csv', index_col='idx')
target.head(3)

Unnamed: 0_level_0,dep_delayed_15min
idx,Unnamed: 1_level_1
0,0
1,0
2,0


__Origin_Dest interaction: ``Route`` feature__

In [6]:
train['Route'] = train['Origin'] + '_' + train['Dest']
test['Route'] = test['Origin'] + '_' + test['Dest']

### FEATURES CONVERSION

In [7]:
train.head(1)

Unnamed: 0_level_0,Month,DayofMonth,DayOfWeek,UniqueCarrier,Origin,Dest,Distance,Hour,Minute,IsWeekend,Route
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,8,21,7,AA,ATL,DFW,732.0,19,34,1,ATL_DFW


In [8]:
ohe = OneHotEncoder(categories='auto', sparse=False, handle_unknown='ignore')

In [9]:
X_month_train = ohe.fit_transform(train.Month.values.reshape(-1, 1))
X_month_test = ohe.transform(test.Month.values.reshape(-1, 1))

In [10]:
X_dom_train = ohe.fit_transform(train.DayofMonth.values.reshape(-1, 1))
X_dom_test = ohe.transform(test.DayofMonth.values.reshape(-1, 1))

In [11]:
X_dow_train = ohe.fit_transform(train.DayOfWeek.values.reshape(-1, 1))
X_dow_test = ohe.transform(test.DayOfWeek.values.reshape(-1, 1))

In [12]:
X_hour_train = ohe.fit_transform(train.Hour.values.reshape(-1, 1))
X_hour_test = ohe.transform(test.Hour.values.reshape(-1, 1))

In [13]:
X_minute_train = ohe.fit_transform(train.Minute.values.reshape(-1, 1))
X_minute_test = ohe.transform(test.Minute.values.reshape(-1, 1))

In [14]:
X_isweekend_train = train.IsWeekend.values.reshape(-1, 1)
X_isweekend_test = test.IsWeekend.values.reshape(-1, 1)

In [15]:
X_carrier_train = ohe.fit_transform(train.UniqueCarrier.values.reshape(-1, 1))
X_carrier_test = ohe.transform(test.UniqueCarrier.values.reshape(-1, 1))

In [16]:
X_origin_train = ohe.fit_transform(train.Origin.values.reshape(-1, 1))
X_origin_test = ohe.transform(test.Origin.values.reshape(-1, 1))

In [17]:
X_dest_train = ohe.fit_transform(train.Dest.values.reshape(-1, 1))
X_dest_test = ohe.transform(test.Dest.values.reshape(-1, 1))

In [18]:
X_route_train = ohe.fit_transform(train.Route.values.reshape(-1, 1))
X_route_test = ohe.fit_transform(test.Route.values.reshape(-1, 1))

### SELECT FEATURES

In [106]:
def simple_xgb_cv(X, y, n_estimators=27, max_depth=5, seed=42,
                   train_size=0.7):
    """Get ROC-AUC score for simple XGBoost classifier
    """
    skf = StratifiedKFold(n_splits=5, random_state=seed)
    
    X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=train_size, random_state=seed)

    xgb = XGBClassifier(n_estimators=n_estimators, max_depth=max_depth,
                        random_state=seed, n_jobs=-1).fit(X_train, y_train)
    
    roc_auc = roc_auc_score(y_valid, xgb.predict_proba(X_valid)[:, 1],
                            average='samples')
    
    cv_score = cross_val_score(xgb, X_train, y_train, scoring='roc_auc', 
                               cv=skf, n_jobs=-1)
    return round(roc_auc, 5), round(cv_score.mean(), 5)

In [None]:
%%time
features = {'X_month_train': X_month_train, 
            'X_dom_train': X_dom_train, 
            'X_dow_train': X_dow_train, 
            'X_hour_train': X_hour_train, 
            'X_minute_train': X_minute_train,
            'X_isweekend_train': X_isweekend_train,
            'X_carrier_train': X_carrier_train,
            'X_origin_train': X_origin_train,
            'X_dest_train': X_dest_train,
            'X_route_train': X_route_train}

def select_best_feature_comb(features_dict):
    from scipy.sparse import hstack, csr_matrix, isspmatrix_csr
    
    results_dict = {}
    for l in range(1, len(features)+1):

        list_of_keys = [list(comb) for comb in 
                        itertools.combinations(features.keys(), l)]
        list_of_combs = [list(comb) for comb in 
                         itertools.combinations(features.values(), l)]

        for keys, comb in zip(list_of_keys, list_of_combs):
            print('Train on:', keys)
            
            is_csr = [isspmatrix_csr(x) for x in comb]
            if any(is_csr):
                X_con = hstack(comb, format='csr')
            else:
                comb[0] = csr_matrix(comb[0])
                X_con = hstack(comb, format='csr')
                
            result = simple_xgb_cv(X_con, y)
            results_dict[', '.join(keys)] = result
            print(f'CV: {result[1]}, OOF: {result[0]}', '\n')
            
    return results_dict

select_best_feature_comb(features)

Train on: ['X_month_train']
CV: 0.55243, OOF: 0.5521 

Train on: ['X_dom_train']
CV: 0.52353, OOF: 0.52167 

Train on: ['X_dow_train']
CV: 0.53438, OOF: 0.53991 

Train on: ['X_hour_train']
CV: 0.68467, OOF: 0.68235 

Train on: ['X_minute_train']
CV: 0.52264, OOF: 0.52692 

Train on: ['X_isweekend_train']
CV: 0.50765, OOF: 0.51224 

Train on: ['X_carrier_train']
CV: 0.54553, OOF: 0.54288 

Train on: ['X_origin_train']
CV: 0.54804, OOF: 0.55324 

Train on: ['X_dest_train']
CV: 0.54306, OOF: 0.54792 

Train on: ['X_route_train']
CV: 0.5097, OOF: 0.50741 

Train on: ['X_month_train', 'X_dom_train']
CV: 0.57066, OOF: 0.5735 

Train on: ['X_month_train', 'X_dow_train']
CV: 0.56612, OOF: 0.57273 

Train on: ['X_month_train', 'X_hour_train']
CV: 0.68989, OOF: 0.68917 

Train on: ['X_month_train', 'X_minute_train']
CV: 0.5537, OOF: 0.55604 

Train on: ['X_month_train', 'X_isweekend_train']
CV: 0.55462, OOF: 0.55599 

Train on: ['X_month_train', 'X_carrier_train']
CV: 0.5734, OOF: 0.56957 

Tra

CV: 0.68427, OOF: 0.68263 

Train on: ['X_dom_train', 'X_minute_train', 'X_isweekend_train']
CV: 0.53193, OOF: 0.53579 

Train on: ['X_dom_train', 'X_minute_train', 'X_carrier_train']
CV: 0.55224, OOF: 0.55165 

Train on: ['X_dom_train', 'X_minute_train', 'X_origin_train']
CV: 0.55583, OOF: 0.55908 

Train on: ['X_dom_train', 'X_minute_train', 'X_dest_train']
CV: 0.54802, OOF: 0.55253 

Train on: ['X_dom_train', 'X_minute_train', 'X_route_train']
CV: 0.52819, OOF: 0.53182 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.54898, OOF: 0.55237 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.55625, OOF: 0.55808 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.55016, OOF: 0.55029 

Train on: ['X_dom_train', 'X_isweekend_train', 'X_route_train']
CV: 0.5305, OOF: 0.53136 

Train on: ['X_dom_train', 'X_carrier_train', 'X_origin_train']
CV: 0.56726, OOF: 0.57053 

Train on: ['X_dom_train', 'X_carrier_train', 'X_dest_train']


CV: 0.5769, OOF: 0.58115 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_route_train']
CV: 0.56683, OOF: 0.56899 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.57824, OOF: 0.57834 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.57902, OOF: 0.58144 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.57678, OOF: 0.58175 

Train on: ['X_month_train', 'X_dom_train', 'X_isweekend_train', 'X_route_train']
CV: 0.567, OOF: 0.56867 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_origin_train']
CV: 0.588, OOF: 0.59089 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_dest_train']
CV: 0.58601, OOF: 0.58416 

Train on: ['X_month_train', 'X_dom_train', 'X_carrier_train', 'X_route_train']
CV: 0.57693, OOF: 0.57264 

Train on: ['X_month_train', 'X_dom_train', 'X_origin_train', 'X_dest_train']
CV: 0.58677, OOF: 0.59226 

Train o

CV: 0.56252, OOF: 0.57023 

Train on: ['X_dom_train', 'X_dow_train', 'X_minute_train', 'X_route_train']
CV: 0.55069, OOF: 0.55729 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.56592, OOF: 0.5663 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.56766, OOF: 0.57229 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.56267, OOF: 0.57191 

Train on: ['X_dom_train', 'X_dow_train', 'X_isweekend_train', 'X_route_train']
CV: 0.55029, OOF: 0.55947 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_origin_train']
CV: 0.57667, OOF: 0.58286 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_dest_train']
CV: 0.5765, OOF: 0.57552 

Train on: ['X_dom_train', 'X_dow_train', 'X_carrier_train', 'X_route_train']
CV: 0.56479, OOF: 0.56651 

Train on: ['X_dom_train', 'X_dow_train', 'X_origin_train', 'X_dest_train']
CV: 0.57713, OOF: 0.58531 

Train on: ['X_dom_trai

CV: 0.5595, OOF: 0.56325 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train']
CV: 0.58491, OOF: 0.58941 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_route_train']
CV: 0.57266, OOF: 0.57813 

Train on: ['X_dow_train', 'X_carrier_train', 'X_dest_train', 'X_route_train']
CV: 0.57376, OOF: 0.57365 

Train on: ['X_dow_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.57442, OOF: 0.58031 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.68819, OOF: 0.68618 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.68684, OOF: 0.68481 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.68597, OOF: 0.6848 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.68442, OOF: 0.68239 

Train on: ['X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69015, OOF: 0.

CV: 0.6955, OOF: 0.69451 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_carrier_train', 'X_route_train']
CV: 0.69422, OOF: 0.69247 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_origin_train', 'X_dest_train']
CV: 0.6933, OOF: 0.6921 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_origin_train', 'X_route_train']
CV: 0.69293, OOF: 0.69178 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_dest_train', 'X_route_train']
CV: 0.69214, OOF: 0.69205 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train']
CV: 0.58088, OOF: 0.58152 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train']
CV: 0.57857, OOF: 0.58149 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train']
CV: 0.57671, OOF: 0.5824 

Train on: ['X_month_train', 'X_dom_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.56

CV: 0.69338, OOF: 0.69231 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.69251, OOF: 0.69156 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.69227, OOF: 0.69083 

Train on: ['X_month_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.6911, OOF: 0.69097 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69416, OOF: 0.69448 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train']
CV: 0.69514, OOF: 0.69386 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_route_train']
CV: 0.69384, OOF: 0.69256 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train']
CV: 0.69306, OOF: 0.6914 

Train on: ['X_month_train', 'X_hour_train', 'X_isweekend_train', 'X_origin_train', 'X_ro

CV: 0.68627, OOF: 0.68483 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_route_train']
CV: 0.68425, OOF: 0.68267 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.6893, OOF: 0.68912 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train']
CV: 0.68901, OOF: 0.68827 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_route_train']
CV: 0.68809, OOF: 0.68604 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.68755, OOF: 0.68521 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.68702, OOF: 0.68458 

Train on: ['X_dom_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.68619, OOF: 0.68451 

Train on: ['X_dom_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.688

CV: 0.57438, OOF: 0.57981 

Train on: ['X_dow_train', 'X_carrier_train', 'X_origin_train', 'X_dest_train', 'X_route_train']
CV: 0.58501, OOF: 0.58992 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.68985, OOF: 0.68842 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_dest_train']
CV: 0.68912, OOF: 0.68782 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_carrier_train', 'X_route_train']
CV: 0.68789, OOF: 0.68595 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_dest_train']
CV: 0.68753, OOF: 0.68528 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_origin_train', 'X_route_train']
CV: 0.68692, OOF: 0.68444 

Train on: ['X_hour_train', 'X_minute_train', 'X_isweekend_train', 'X_dest_train', 'X_route_train']
CV: 0.68586, OOF: 0.68459 

Train on: ['X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_

CV: 0.69006, OOF: 0.69016 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69448, OOF: 0.69408 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_dest_train']
CV: 0.6947, OOF: 0.69476 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_carrier_train', 'X_route_train']
CV: 0.69432, OOF: 0.69047 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_dest_train']
CV: 0.69356, OOF: 0.69213 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_origin_train', 'X_route_train']
CV: 0.69311, OOF: 0.69249 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_minute_train', 'X_dest_train', 'X_route_train']
CV: 0.6911, OOF: 0.69211 

Train on: ['X_month_train', 'X_dom_train', 'X_hour_train', 'X_isweekend_train', 'X_carrier_train', 'X_origin_train']
CV: 0.69444, OOF:

### CONCATENATE DATA

In [21]:
y = target.dep_delayed_15min.values
print("Classes in dataset:", np.unique(y)) 
print('Size:', y.shape)
print("positive objects:", y.sum())

Classes in dataset: [0 1]
Size: (99994,)
positive objects: 19038


In [23]:
X = hstack([csr_matrix(X_month_train), 
            X_dom_train, 
            X_dow_train, 
            X_hour_train, 
            X_minute_train,
            X_isweekend_train,
            X_carrier_train,
            X_origin_train,
            X_dest_train,
            X_route_train] , format='csr')

X_test = hstack([csr_matrix(X_month_test), 
            X_dom_test, 
            X_dow_test, 
            X_hour_test, 
            X_minute_test,
            X_isweekend_test,
            X_carrier_test,
            X_origin_test,
            X_dest_test,
            X_route_test], format='csr')

X.shape, X_test.shape, y.shape

((99994, 5165), (100000, 5405), (99994,))

## RandomForestClassifier TRAINING

### Simple Random forest

In [28]:
X_train, X_valid, y_train, y_valid = train_test_split(
        X, y, train_size=0.7, random_state=42)

rf = RandomForestClassifier(n_estimators=1000, n_jobs=-1, oob_score=True)

In [29]:
%%time
rf.fit(X_train, y_train)

CPU times: user 26min 49s, sys: 2.54 s, total: 26min 52s
Wall time: 39.3 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=-1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [30]:
rf.oob_score_

0.8145438959925709

In [31]:
roc_auc_score(y_valid, rf.predict_proba(X_valid)[:,1])

0.741062778635836

### Simple catboost

In [34]:
from catboost import CatBoostClassifier

In [44]:
cat_features_idx = np.where((train.dtypes == 'object') | (train.dtypes == 'int64'))[0].tolist()

In [45]:
cat_features_idx

[0, 1, 2, 3, 4, 5, 7, 8, 9, 10]

In [47]:
X_train, X_valid, y_train, y_valid = train_test_split(
        train, target, train_size=0.7, random_state=42)

In [48]:
cat = CatBoostClassifier(random_state=42, thread_count=72)

In [49]:
%time cat.fit(X_train, y_train, cat_features=cat_features_idx)

0:	learn: 0.6646956	total: 117ms	remaining: 1m 56s
1:	learn: 0.6370554	total: 224ms	remaining: 1m 51s
2:	learn: 0.6157410	total: 311ms	remaining: 1m 43s
3:	learn: 0.5944024	total: 405ms	remaining: 1m 40s
4:	learn: 0.5768418	total: 498ms	remaining: 1m 39s
5:	learn: 0.5606891	total: 596ms	remaining: 1m 38s
6:	learn: 0.5470106	total: 681ms	remaining: 1m 36s
7:	learn: 0.5350466	total: 776ms	remaining: 1m 36s
8:	learn: 0.5259466	total: 823ms	remaining: 1m 30s
9:	learn: 0.5185023	total: 884ms	remaining: 1m 27s
10:	learn: 0.5099633	total: 966ms	remaining: 1m 26s
11:	learn: 0.5036732	total: 1.04s	remaining: 1m 25s
12:	learn: 0.4962711	total: 1.12s	remaining: 1m 24s
13:	learn: 0.4901022	total: 1.21s	remaining: 1m 25s
14:	learn: 0.4847032	total: 1.29s	remaining: 1m 25s
15:	learn: 0.4801683	total: 1.39s	remaining: 1m 25s
16:	learn: 0.4763091	total: 1.48s	remaining: 1m 25s
17:	learn: 0.4720401	total: 1.57s	remaining: 1m 25s
18:	learn: 0.4682977	total: 1.68s	remaining: 1m 26s
19:	learn: 0.4654739	t

159:	learn: 0.4047146	total: 14.4s	remaining: 1m 15s
160:	learn: 0.4046012	total: 14.5s	remaining: 1m 15s
161:	learn: 0.4045328	total: 14.6s	remaining: 1m 15s
162:	learn: 0.4044631	total: 14.7s	remaining: 1m 15s
163:	learn: 0.4043407	total: 14.8s	remaining: 1m 15s
164:	learn: 0.4042296	total: 14.9s	remaining: 1m 15s
165:	learn: 0.4041584	total: 14.9s	remaining: 1m 15s
166:	learn: 0.4040677	total: 15s	remaining: 1m 14s
167:	learn: 0.4040058	total: 15.1s	remaining: 1m 14s
168:	learn: 0.4039674	total: 15.2s	remaining: 1m 14s
169:	learn: 0.4038775	total: 15.3s	remaining: 1m 14s
170:	learn: 0.4038718	total: 15.3s	remaining: 1m 14s
171:	learn: 0.4037775	total: 15.4s	remaining: 1m 14s
172:	learn: 0.4036917	total: 15.5s	remaining: 1m 14s
173:	learn: 0.4036246	total: 15.6s	remaining: 1m 14s
174:	learn: 0.4035345	total: 15.7s	remaining: 1m 14s
175:	learn: 0.4034969	total: 15.8s	remaining: 1m 13s
176:	learn: 0.4033504	total: 15.9s	remaining: 1m 13s
177:	learn: 0.4033222	total: 16s	remaining: 1m 1

318:	learn: 0.3949007	total: 28.7s	remaining: 1m 1s
319:	learn: 0.3948247	total: 28.8s	remaining: 1m 1s
320:	learn: 0.3947746	total: 28.9s	remaining: 1m 1s
321:	learn: 0.3946894	total: 29s	remaining: 1m
322:	learn: 0.3946503	total: 29.1s	remaining: 1m
323:	learn: 0.3945953	total: 29.1s	remaining: 1m
324:	learn: 0.3945206	total: 29.2s	remaining: 1m
325:	learn: 0.3944840	total: 29.3s	remaining: 1m
326:	learn: 0.3944462	total: 29.4s	remaining: 1m
327:	learn: 0.3943763	total: 29.5s	remaining: 1m
328:	learn: 0.3943289	total: 29.6s	remaining: 1m
329:	learn: 0.3942935	total: 29.7s	remaining: 1m
330:	learn: 0.3942689	total: 29.8s	remaining: 1m
331:	learn: 0.3942104	total: 29.9s	remaining: 1m
332:	learn: 0.3941284	total: 30s	remaining: 1m
333:	learn: 0.3940937	total: 30.1s	remaining: 60s
334:	learn: 0.3940326	total: 30.2s	remaining: 59.9s
335:	learn: 0.3940059	total: 30.3s	remaining: 59.8s
336:	learn: 0.3939751	total: 30.4s	remaining: 59.7s
337:	learn: 0.3939381	total: 30.4s	remaining: 59.6s
33

480:	learn: 0.3878518	total: 44.1s	remaining: 47.5s
481:	learn: 0.3878145	total: 44.2s	remaining: 47.5s
482:	learn: 0.3877616	total: 44.3s	remaining: 47.4s
483:	learn: 0.3877198	total: 44.4s	remaining: 47.3s
484:	learn: 0.3876797	total: 44.5s	remaining: 47.2s
485:	learn: 0.3876391	total: 44.5s	remaining: 47.1s
486:	learn: 0.3875789	total: 44.7s	remaining: 47s
487:	learn: 0.3875144	total: 44.8s	remaining: 47s
488:	learn: 0.3874899	total: 44.8s	remaining: 46.9s
489:	learn: 0.3874618	total: 45s	remaining: 46.8s
490:	learn: 0.3874091	total: 45s	remaining: 46.7s
491:	learn: 0.3873594	total: 45.2s	remaining: 46.6s
492:	learn: 0.3873212	total: 45.3s	remaining: 46.5s
493:	learn: 0.3872645	total: 45.3s	remaining: 46.4s
494:	learn: 0.3872384	total: 45.4s	remaining: 46.4s
495:	learn: 0.3872105	total: 45.6s	remaining: 46.3s
496:	learn: 0.3871532	total: 45.7s	remaining: 46.2s
497:	learn: 0.3870749	total: 45.8s	remaining: 46.1s
498:	learn: 0.3870496	total: 45.9s	remaining: 46s
499:	learn: 0.3870176	

639:	learn: 0.3821983	total: 59.5s	remaining: 33.4s
640:	learn: 0.3821676	total: 59.6s	remaining: 33.4s
641:	learn: 0.3821419	total: 59.7s	remaining: 33.3s
642:	learn: 0.3821068	total: 59.8s	remaining: 33.2s
643:	learn: 0.3820722	total: 59.9s	remaining: 33.1s
644:	learn: 0.3820325	total: 60s	remaining: 33s
645:	learn: 0.3819854	total: 1m	remaining: 32.9s
646:	learn: 0.3819696	total: 1m	remaining: 32.8s
647:	learn: 0.3819489	total: 1m	remaining: 32.7s
648:	learn: 0.3819042	total: 1m	remaining: 32.7s
649:	learn: 0.3818602	total: 1m	remaining: 32.6s
650:	learn: 0.3818291	total: 1m	remaining: 32.5s
651:	learn: 0.3817789	total: 1m	remaining: 32.4s
652:	learn: 0.3817403	total: 1m	remaining: 32.3s
653:	learn: 0.3817161	total: 1m	remaining: 32.2s
654:	learn: 0.3816451	total: 1m	remaining: 32.1s
655:	learn: 0.3815980	total: 1m 1s	remaining: 32s
656:	learn: 0.3815691	total: 1m 1s	remaining: 31.9s
657:	learn: 0.3815075	total: 1m 1s	remaining: 31.8s
658:	learn: 0.3814645	total: 1m 1s	remaining: 31

799:	learn: 0.3761189	total: 1m 15s	remaining: 18.8s
800:	learn: 0.3760898	total: 1m 15s	remaining: 18.7s
801:	learn: 0.3760618	total: 1m 15s	remaining: 18.6s
802:	learn: 0.3760293	total: 1m 15s	remaining: 18.5s
803:	learn: 0.3760004	total: 1m 15s	remaining: 18.4s
804:	learn: 0.3759652	total: 1m 15s	remaining: 18.3s
805:	learn: 0.3759112	total: 1m 15s	remaining: 18.2s
806:	learn: 0.3758669	total: 1m 15s	remaining: 18.1s
807:	learn: 0.3758370	total: 1m 15s	remaining: 18s
808:	learn: 0.3758199	total: 1m 15s	remaining: 17.9s
809:	learn: 0.3758101	total: 1m 16s	remaining: 17.8s
810:	learn: 0.3757997	total: 1m 16s	remaining: 17.7s
811:	learn: 0.3757787	total: 1m 16s	remaining: 17.7s
812:	learn: 0.3757475	total: 1m 16s	remaining: 17.6s
813:	learn: 0.3757228	total: 1m 16s	remaining: 17.5s
814:	learn: 0.3757064	total: 1m 16s	remaining: 17.4s
815:	learn: 0.3756915	total: 1m 16s	remaining: 17.3s
816:	learn: 0.3756327	total: 1m 16s	remaining: 17.2s
817:	learn: 0.3755839	total: 1m 16s	remaining: 1

956:	learn: 0.3711848	total: 1m 29s	remaining: 4.04s
957:	learn: 0.3711498	total: 1m 30s	remaining: 3.95s
958:	learn: 0.3711323	total: 1m 30s	remaining: 3.85s
959:	learn: 0.3711026	total: 1m 30s	remaining: 3.76s
960:	learn: 0.3710742	total: 1m 30s	remaining: 3.67s
961:	learn: 0.3710578	total: 1m 30s	remaining: 3.57s
962:	learn: 0.3710261	total: 1m 30s	remaining: 3.48s
963:	learn: 0.3710037	total: 1m 30s	remaining: 3.38s
964:	learn: 0.3709685	total: 1m 30s	remaining: 3.29s
965:	learn: 0.3709210	total: 1m 30s	remaining: 3.19s
966:	learn: 0.3709041	total: 1m 30s	remaining: 3.1s
967:	learn: 0.3708730	total: 1m 31s	remaining: 3.01s
968:	learn: 0.3708277	total: 1m 31s	remaining: 2.91s
969:	learn: 0.3707728	total: 1m 31s	remaining: 2.82s
970:	learn: 0.3707444	total: 1m 31s	remaining: 2.73s
971:	learn: 0.3707032	total: 1m 31s	remaining: 2.63s
972:	learn: 0.3706900	total: 1m 31s	remaining: 2.54s
973:	learn: 0.3706548	total: 1m 31s	remaining: 2.44s
974:	learn: 0.3706290	total: 1m 31s	remaining: 

<catboost.core.CatBoostClassifier at 0x7f3691167320>

In [50]:
roc_auc_score(y_valid, cat.predict_proba(X_valid)[:,1])

0.783345262611359

In [53]:
final_pred = predicts = cat.predict_proba(test)[:, 1]

## Submit

In [63]:
# Function for writing predictions to a file
def write_to_submission_file(predicted_labels, out_file, 
                             target='dep_delayed_15min', index_label="id"):
    
    predicted_df = pd.DataFrame(
        predicted_labels,
        index = np.arange(0, predicted_labels.shape[0]),
        columns=[target])
    
    predicted_df.to_csv(out_file, index_label=index_label)

In [64]:
from datetime import datetime as dt
import subprocess
now = dt.now().strftime("%Y-%m-%d_%H-%M-%S")
label = subprocess.check_output(["git", "describe", "--always"]).strip().decode("utf-8")

### WRITE SUBMISSION
write_to_submission_file(final_pred, f'../submissions/catboost_submission_at_{now}__githash_{label}.csv')

pd.Series(final_pred, 
          name='dep_delayed_15min').to_csv('xgb_2feat.csv', 
                                           index_label='id', header=True)

In [58]:
pd.Series(final_pred, 
          name='dep_delayed_15min').to_csv('xgb_2feat.csv', 
                                           index_label='id', header=True)