In [59]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV

import xgboost as xgb
from catboost import CatBoostClassifier

In [7]:
def label_encode(train_data, test_data, columns):
    'Returns a DataFrame with encoded columns'
    encoded_cols = []
    for col in columns:
        factorised = pd.factorize(train_data[col])[1]
        labels = pd.Series(range(len(factorised)), index=factorised)
        encoded_col_train = train_data[col].map(labels) 
        encoded_col_test = test_data[col].map(labels)
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = -1
        encoded_cols.append(pd.DataFrame({'label_'+col:encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    return (all_encoded.loc[train_data.index,:], 
            all_encoded.loc[test_data.index,:])

def freq_encode(train_data, test_data, columns):
    '''Returns a DataFrame with encoded columns'''
    encoded_cols = []
    nsamples = train_data.shape[0]
    for col in columns:    
        freqs_cat = train_data.groupby(col)[col].count()/nsamples
        encoded_col_train = train_data[col].map(freqs_cat)
        encoded_col_test = test_data[col].map(freqs_cat)
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = 0
        encoded_cols.append(pd.DataFrame({'freq_'+col:encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    return (all_encoded.loc[train_data.index,:], 
            all_encoded.loc[test_data.index,:])

def mean_encode(train_data, test_data, columns, target_col, reg_method=None,
                alpha=0, add_random=False, rmean=0, rstd=0.1, folds=1):
    '''Returns a DataFrame with encoded columns'''
    encoded_cols = []
    target_mean_global = train_data[target_col].mean()
    for col in columns:
        # Getting means for test data
        nrows_cat = train_data.groupby(col)[target_col].count()
        target_means_cats = train_data.groupby(col)[target_col].mean()
        target_means_cats_adj = (target_means_cats*nrows_cat + 
                                 target_mean_global*alpha)/(nrows_cat+alpha)
        # Mapping means to test data
        encoded_col_test = test_data[col].map(target_means_cats_adj)
        # Getting a train encodings
        if reg_method == 'expanding_mean':
            train_data_shuffled = train_data.sample(frac=1, random_state=1)
            cumsum = train_data_shuffled.groupby(col)[target_col].cumsum() - train_data_shuffled[target_col]
            cumcnt = train_data_shuffled.groupby(col).cumcount()
            encoded_col_train = cumsum/(cumcnt)
            encoded_col_train.fillna(target_mean_global, inplace=True)
            if add_random:
                encoded_col_train = encoded_col_train + normal(loc=rmean, scale=rstd, 
                                                               size=(encoded_col_train.shape[0]))
        elif (reg_method == 'k_fold') and (folds > 1):
            kfold = StratifiedKFold(folds, shuffle=True, random_state=1)
            parts = []
            for tr_in, val_ind in kfold.split(train_data[columns].values, train_data[target_col].values):
                # divide data
                df_for_estimation, df_estimated = train_data.iloc[tr_in], train_data.iloc[val_ind]
                # getting means on data for estimation (all folds except estimated)
                nrows_cat = df_for_estimation.groupby(col)[target_col].count()
                target_means_cats = df_for_estimation.groupby(col)[target_col].mean()
                target_means_cats_adj = (target_means_cats*nrows_cat + 
                                         target_mean_global*alpha)/(nrows_cat+alpha)
                # Mapping means to estimated fold
                encoded_col_train_part = df_estimated[col].map(target_means_cats_adj)
                if add_random:
                    encoded_col_train_part = encoded_col_train_part + normal(loc=rmean, scale=rstd, 
                                                                             size=(encoded_col_train_part.shape[0]))
                # Saving estimated encodings for a fold
                parts.append(encoded_col_train_part)
            encoded_col_train = pd.concat(parts, axis=0)
            encoded_col_train.fillna(target_mean_global, inplace=True)
        else:
            encoded_col_train = train_data[col].map(target_means_cats_adj)
            if add_random:
                encoded_col_train = encoded_col_train + normal(loc=rmean, scale=rstd, 
                                                               size=(encoded_col_train.shape[0]))

        # Saving the column with means
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = target_mean_global
        encoded_cols.append(pd.DataFrame({'mean_'+target_col+'_'+col:encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    return (all_encoded.loc[train_data.index,:], 
            all_encoded.loc[test_data.index,:])

In [42]:
PATH_TO_DATA = Path('../input/flight-delays-fall-2018/')

train_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_train.csv')
test_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_test.csv')
test_df.index = range(100000, 200000)

air = pd.read_csv(PATH_TO_DATA / 'airports.csv')
air = air.set_index('IATA_CODE')

train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [43]:
air_city = air['CITY'].to_dict()
air_state = air['STATE'].to_dict()
air_country = air['COUNTRY'].to_dict()
air_lat = air['LATITUDE'].to_dict()
air_long = air['LONGITUDE'].to_dict()

## Preproc

In [44]:
def upd_time(val):
    hour = val // 100
    minutes = val % 100
    if hour >= 24:
        hour = hour - 24
    return hour * 60 + minutes

def preproc(df, test=False):
    df['Month'] = df['Month'].apply(lambda x: int(x[2:]))
    df['DayofMonth'] = df['DayofMonth'].apply(lambda x: int(x[2:]))
    df['DayOfWeek'] = df['DayOfWeek'].apply(lambda x: int(x[2:]))
    
    df['DepTime'] = df['DepTime'].apply(upd_time)
    
    minutes_in_day = 24 * 60
    df['sin_time'] = np.sin(2 * np.pi * df['DepTime'] / minutes_in_day)
    df['cos_time'] = np.cos(2 * np.pi * df['DepTime'] / minutes_in_day)
    df['hour'] = df['DepTime'] // 60
    df['minutes'] = df['DepTime'] % 60
    
    df['Distance_orig'] = df['Distance'].values
    df['Distance'] = df['Distance'].apply(np.log)

    df.loc[~df['Origin'].isin(or_dest), 'Origin'] = 'other'
    df.loc[~df['Dest'].isin(or_dest), 'Dest'] = 'other'
    df.loc[~df['UniqueCarrier'].isin(unique_carrier), 'UniqueCarrier'] = 'other'
    
    if not test:
        le_carrier.fit(df['UniqueCarrier'])
        le_or_dest.fit(pd.concat([df['Origin'], df['Dest']]))
        del df['dep_delayed_15min']
    
    df['UniqueCarrierLabel'] = le_carrier.transform(df['UniqueCarrier'])
    df['OriginLabel'] = le_or_dest.transform(df['Origin'])
    df['DestLabel'] = le_or_dest.transform(df['Dest'])
    
    return df

le_carrier = LabelEncoder()
le_or_dest = LabelEncoder()
le_target = LabelEncoder()

In [45]:
unique_carrier = list(set(train_df['UniqueCarrier']) & set(test_df['UniqueCarrier']))
or_dest = list(set(train_df['Origin']) & set(test_df['Origin']) & set(train_df['Dest']) & set(test_df['Dest']))

In [46]:
for df in [train_df, test_df]:
    for col in ['Origin', 'Dest']:
        df[col+'CITY'] = df[col].map(air_city).fillna('other')
        df[col+'STATE'] = df[col].map(air_state).fillna('other')
        df[col+'COUNTRY'] = df[col].map(air_country).fillna('other')

        df[col+'LATITUDE'] = df[col].map(air_lat)
        df[col+'LONGITUDE'] = df[col].map(air_long)

        df[col+'x_coord'] = np.cos(df[col+'LATITUDE']) * np.cos(df[col+'LONGITUDE'])
        df[col+'y_coord'] = np.cos(df[col+'LATITUDE']) * np.sin(df[col+'LONGITUDE'])
        df[col+'z_coord'] = np.sin(df[col+'LATITUDE'])
        
    df['x_coord'] = df['Originx_coord'] - df['Destx_coord']
    df['y_coord'] = df['Originy_coord'] - df['Desty_coord']
    df['z_coord'] = df['Originz_coord'] - df['Destz_coord']
    df['LATITUDE'] = df['OriginLATITUDE'] - df['DestLATITUDE']
    df['LONGITUDE'] = df['OriginLONGITUDE'] - df['DestLONGITUDE']
    
    df['flight'] = df['Origin'] + '-->' + df['Dest']
    df['flight_city'] = df['OriginCITY'] + '-->' + df['DestCITY']
    df['flight_state'] = df['OriginSTATE'] + '-->' + df['DestSTATE']

In [47]:
train_df['dep_delayed_15min'] = le_target.fit_transform(train_df['dep_delayed_15min'])

categ_cols = ['UniqueCarrier', 'Origin', 'Dest', 
              'OriginCITY', 'OriginSTATE', 'OriginCOUNTRY', 
              'DestCITY', 'DestSTATE', 'DestCOUNTRY']
train_df_fe, test_df_fe = freq_encode(train_df, test_df, columns=categ_cols)

train_df_mekf, test_df_mekf = mean_encode(train_df, test_df, columns=categ_cols, target_col='dep_delayed_15min', 
                                                        reg_method='k_fold', alpha=5, folds=5)

train_df_meem, test_df_meem = mean_encode(train_df, test_df, columns=categ_cols, target_col='dep_delayed_15min', 
                                                        reg_method='expanding_mean', alpha=5)

In [48]:
y = train_df['dep_delayed_15min']
X = preproc(train_df)
X_test = preproc(test_df, test=True)

X.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,OriginCITY,OriginSTATE,...,flight_city,flight_state,sin_time,cos_time,hour,minutes,Distance_orig,UniqueCarrierLabel,OriginLabel,DestLabel
0,8,21,7,1174,AA,ATL,DFW,6.595781,Atlanta,GA,...,Atlanta-->Dallas-Fort Worth,GA-->TX,-0.91706,0.398749,19,34,732,0,17,77
1,4,20,3,948,US,PIT,MCO,6.726233,Pittsburgh,PA,...,Pittsburgh-->Orlando,PA-->FL,-0.838671,-0.544639,15,48,834,15,209,166
2,9,2,5,862,XE,RDU,CLE,6.030685,Raleigh,NC,...,Raleigh-->Cleveland,NC-->OH,-0.580703,-0.814116,14,22,416,17,220,58
3,11,25,6,615,OO,DEN,MEM,6.770789,Denver,CO,...,Denver-->Memphis,CO-->TN,0.442289,-0.896873,10,15,872,13,76,170
4,10,7,6,1108,WN,MDW,OMA,6.047372,Chicago,IL,...,Chicago-->Omaha,IL-->NE,-0.992546,0.121869,18,28,423,16,168,194


## CV

In [15]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=38)

In [29]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE']
cols_fe = ['freq_UniqueCarrier', 'freq_Origin', 'freq_Dest']

X_train = pd.concat([X[cols_to_use], train_df_fe[cols_fe]], axis=1)

regr = xgb.XGBRegressor(random_state=42, n_jobs=-1, max_depth=9, n_estimators=250)
cv_score = cross_val_score(regr, X_train, y, cv=skf, scoring='roc_auc')
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75569+-0.00347


## CatBoost

In [31]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=38)

In [30]:
X.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
       'Origin', 'Dest', 'Distance', 'OriginCITY', 'OriginSTATE',
       'OriginCOUNTRY', 'OriginLATITUDE', 'OriginLONGITUDE', 'Originx_coord',
       'Originy_coord', 'Originz_coord', 'DestCITY', 'DestSTATE',
       'DestCOUNTRY', 'DestLATITUDE', 'DestLONGITUDE', 'Destx_coord',
       'Desty_coord', 'Destz_coord', 'x_coord', 'y_coord', 'z_coord',
       'LATITUDE', 'LONGITUDE', 'flight', 'sin_time', 'cos_time', 'hour',
       'minutes', 'Distance_orig', 'UniqueCarrierLabel', 'OriginLabel',
       'DestLabel'],
      dtype='object')

In [32]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE']

cols_fe = ['freq_UniqueCarrier', 'freq_Origin', 'freq_Dest']

X_train = pd.concat([X[cols_to_use], train_df_fe[cols_fe]], axis=1)
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74760+-0.00303


In [34]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74816+-0.00231


In [52]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74803+-0.00303


In [53]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
              'UniqueCarrier', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74291+-0.00192


In [54]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
              'UniqueCarrier', 'Origin', 'Dest']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74620+-0.00272


In [50]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
              'UniqueCarrier', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74774+-0.00223


In [51]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74803+-0.00303


In [38]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'Originx_coord', 'Originy_coord', 'Originz_coord', 
               'Destx_coord', 'Desty_coord', 'Destz_coord',
               'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74559+-0.00325


In [49]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
               'UniqueCarrier', 'Origin', 'Dest', 'flight', 
               'OriginCITY', 'OriginSTATE', 'DestCITY', 'DestSTATE']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

ctb = CatBoostClassifier(random_state=42, silent=True)
cv_score = cross_val_score(ctb, X_train, y, cv=skf, scoring='roc_auc', fit_params=fit_params)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74685+-0.00286


### GSCV

In [62]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

params = {'loss_function': ['Logloss', 'CrossEntropy']}

regr = CatBoostClassifier(random_state=42, silent=True)

gscv = GridSearchCV(regr, params, cv=skf, scoring='roc_auc', fit_params=fit_params)

gscv.fit(X_train, y)

results = pd.concat([pd.Series(gscv.cv_results_['params'], name='params'), 
                pd.Series(gscv.cv_results_['mean_test_score'], name='mean_test_score'), 
                pd.Series(gscv.cv_results_['std_test_score'], name='std_test_score')], axis=1)

results.sort_values('mean_test_score', ascending=False)

<catboost.core.CatBoostClassifier at 0x1971d9a9860>

In [77]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

params = {'depth': [3, 4, 5, 6, 7, 8, 9, 10],
          'iterations': [100, 250, 500, 1000],
          'learning_rate': [0.001, 0.01, 0.03, 0.1, 0.2, 0.3], 
          'border_count': [5, 10, 25, 50, 100, 200],
          'ctr_border_count': [5, 10, 20, 50, 100, 200],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [5, 10, 20]}

regr = CatBoostClassifier(random_state=42, silent=True)

gscv = RandomizedSearchCV(regr, params, cv=skf, scoring='roc_auc', fit_params=fit_params, n_iter=20)

gscv.fit(X_train, y)

results = pd.concat([pd.Series(gscv.cv_results_['params'], name='params'), 
                     pd.Series(gscv.cv_results_['mean_test_score'], name='mean_test_score'), 
                     pd.Series(gscv.cv_results_['std_test_score'], name='std_test_score'),
                     pd.Series(gscv.cv_results_['mean_fit_time'], name='mean_fit_time'),], axis=1)

results.sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score,std_test_score
12,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.741365,0.002557
11,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.739957,0.001925
0,"{'learning_rate': 0.3, 'leaf_estimation_iterat...",0.733084,0.001459
1,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.732812,0.00338
19,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.731193,0.003852
6,"{'learning_rate': 0.2, 'leaf_estimation_iterat...",0.730591,0.001385
5,"{'learning_rate': 0.01, 'leaf_estimation_itera...",0.725661,0.001937
14,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.725384,0.002947
8,"{'learning_rate': 0.01, 'leaf_estimation_itera...",0.716932,0.004338
18,"{'learning_rate': 0.01, 'leaf_estimation_itera...",0.711878,0.00285


In [87]:
results_1 = gscv.cv_results_

In [79]:
for i in results.sort_values('mean_test_score', ascending=False)['params']:
    print(i)

{'learning_rate': 0.1, 'leaf_estimation_iterations': 20, 'l2_leaf_reg': 1e-20, 'iterations': 1000, 'depth': 3, 'ctr_border_count': 100, 'border_count': 100}
{'learning_rate': 0.03, 'leaf_estimation_iterations': 20, 'l2_leaf_reg': 1e-20, 'iterations': 1000, 'depth': 7, 'ctr_border_count': 20, 'border_count': 200}
{'learning_rate': 0.3, 'leaf_estimation_iterations': 10, 'l2_leaf_reg': 1e-19, 'iterations': 250, 'depth': 5, 'ctr_border_count': 5, 'border_count': 25}
{'learning_rate': 0.1, 'leaf_estimation_iterations': 5, 'l2_leaf_reg': 3.162277660168379e-20, 'iterations': 100, 'depth': 7, 'ctr_border_count': 20, 'border_count': 200}
{'learning_rate': 0.1, 'leaf_estimation_iterations': 5, 'l2_leaf_reg': 3.162277660168379e-20, 'iterations': 100, 'depth': 6, 'ctr_border_count': 20, 'border_count': 200}
{'learning_rate': 0.2, 'leaf_estimation_iterations': 10, 'l2_leaf_reg': 3.162277660168379e-20, 'iterations': 250, 'depth': 3, 'ctr_border_count': 200, 'border_count': 25}
{'learning_rate': 0.01

In [88]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

params = {'depth': [3, 4, 5, 6, 7, 8],
          'iterations': [250, 500, 1000, 2000, 3000],
          'learning_rate': [0.03, 0.1, 0.3], 
          'border_count': [128, 254],
          'l2_leaf_reg': np.logspace(-22, 5, 100),
          'leaf_estimation_iterations': np.arange(3, 50, 2)}

regr = CatBoostClassifier(random_state=42, silent=True)

gscv = RandomizedSearchCV(regr, params, cv=skf, scoring='roc_auc', fit_params=fit_params, n_iter=20)

gscv.fit(X_train, y)

results = pd.concat([pd.Series(gscv.cv_results_['params'], name='params'), 
                pd.Series(gscv.cv_results_['mean_test_score'], name='mean_test_score'), 
                pd.Series(gscv.cv_results_['std_test_score'], name='std_test_score')], axis=1)

results.sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score,std_test_score
6,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.745121,0.003048
8,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.743415,0.002534
9,"{'learning_rate': 0.3, 'leaf_estimation_iterat...",0.741845,0.004099
5,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.739315,0.00296
3,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.738847,0.002265
4,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.738562,0.003167
7,"{'learning_rate': 0.3, 'leaf_estimation_iterat...",0.737951,0.003004
11,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.73658,0.002392
14,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.735684,0.002093
0,"{'learning_rate': 0.03, 'leaf_estimation_itera...",0.731582,0.003801


In [89]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]
categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

params = {'depth': [3, 4, 5, 6, 7, 8],
          'iterations': [250, 500, 1000, 2000, 3000],
          'learning_rate': [0.03, 0.06, 0.1, 0.2, 0.3, 0.6], 
          'border_count': [128, 254],
          'l2_leaf_reg': np.logspace(-22, 5, 100),
          'leaf_estimation_iterations': np.arange(3, 50, 2)}

regr = CatBoostClassifier(random_state=42, silent=True)

gscv2 = RandomizedSearchCV(regr, params, cv=skf, scoring='roc_auc', fit_params=fit_params, n_iter=75)

gscv2.fit(X_train, y)

results2 = pd.concat([pd.Series(gscv2.cv_results_['params'], name='params'), 
                pd.Series(gscv2.cv_results_['mean_test_score'], name='mean_test_score'), 
                pd.Series(gscv2.cv_results_['std_test_score'], name='std_test_score')], axis=1)

results2.sort_values('mean_test_score', ascending=False)

Unnamed: 0,params,mean_test_score,std_test_score
12,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.752246,0.001531
9,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.749043,0.001960
59,"{'learning_rate': 0.3, 'leaf_estimation_iterat...",0.748060,0.001631
18,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.747842,0.001975
36,"{'learning_rate': 0.06, 'leaf_estimation_itera...",0.747595,0.002276
33,"{'learning_rate': 0.06, 'leaf_estimation_itera...",0.747456,0.002316
2,"{'learning_rate': 0.06, 'leaf_estimation_itera...",0.746183,0.001571
38,"{'learning_rate': 0.1, 'leaf_estimation_iterat...",0.746056,0.002237
52,"{'learning_rate': 0.3, 'leaf_estimation_iterat...",0.746021,0.001735
27,"{'learning_rate': 0.06, 'leaf_estimation_itera...",0.745851,0.002684


In [120]:
results_sorted2 = results2.sort_values('mean_test_score', ascending=False)
params2 = results_sorted2['params']
r2 = {x: list() for x in results2.iloc[0, 0].keys()}
for row in params2:
    for key in row.keys():
        r2[key].append(row[key]) 
        
r2 = pd.concat([pd.DataFrame(r2), results_sorted2[['mean_test_score', 'std_test_score']].reset_index(drop=True)], axis=1)
r2.head(10)

Unnamed: 0,learning_rate,leaf_estimation_iterations,l2_leaf_reg,iterations,depth,border_count,mean_test_score,std_test_score
0,0.1,31,2310.13,2000,6,254,0.752246,0.001531
1,0.1,29,8.111308,3000,3,254,0.749043,0.00196
2,0.3,9,8111.308,3000,3,254,0.74806,0.001631
3,0.1,13,0.0006579332,3000,3,128,0.747842,0.001975
4,0.06,49,2.848036e-08,3000,4,128,0.747595,0.002276
5,0.06,25,4.328761e-06,3000,5,254,0.747456,0.002316
6,0.06,41,1.519911e-08,2000,5,254,0.746183,0.001571
7,0.1,45,8.111308,500,6,254,0.746056,0.002237
8,0.3,33,1.232847,2000,4,128,0.746021,0.001735
9,0.06,29,4328.761,3000,3,128,0.745851,0.002684


In [121]:
results_sorted2.iloc[0, 0]

{'learning_rate': 0.1,
 'leaf_estimation_iterations': 31,
 'l2_leaf_reg': 2310.1297000831582,
 'iterations': 2000,
 'depth': 6,
 'border_count': 254}

## Train

In [123]:
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'Distance', 'DepTime', 
               'OriginLATITUDE', 'OriginLONGITUDE', 'DestLATITUDE', 'DestLONGITUDE',
              'UniqueCarrier', 'Origin', 'Dest', 'flight']

X_train = X[cols_to_use]

X_test_ = X_test[cols_to_use]

categ_feat_idx = np.where(X_train.dtypes == 'object')[0]
fit_params = {'cat_features': categ_feat_idx}

params = {'learning_rate': 0.1,
         'leaf_estimation_iterations': 31,
         'l2_leaf_reg': 2310.1297000831582,
         'iterations': 2000,
         'depth': 6,
         'border_count': 254}

regr = CatBoostClassifier(random_state=42, silent=True)

regr.fit(X_train, y, cat_features=categ_feat_idx)

test_pred = regr.predict_proba(X_test_)[:, 1]

CatBoostError: Bad value for num_feature[0,4]="YV": Cannot convert 'b'YV'' to float

In [125]:
sample_sub = pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', index_col='id')
sample_sub['dep_delayed_15min'] = test_pred
sample_sub.to_csv(PATH_TO_DATA / 'cat_pred.csv')