# Main

In [14]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV

from catboost import CatBoostClassifier

In [15]:
def label_encode(train_data, test_data, columns):
    'Returns a DataFrame with encoded columns'
    encoded_cols = []
    for col in columns:
        factorised = pd.factorize(train_data[col])[1]
        labels = pd.Series(range(len(factorised)), index=factorised)
        encoded_col_train = train_data[col].map(labels) 
        encoded_col_test = test_data[col].map(labels)
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = -1
        encoded_cols.append(pd.DataFrame({'label_'+col:encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    return (all_encoded.loc[train_data.index,:], 
            all_encoded.loc[test_data.index,:])

def freq_encode(train_data, test_data, columns):
    '''Returns a DataFrame with encoded columns'''
    encoded_cols = []
    nsamples = train_data.shape[0]
    for col in columns:    
        freqs_cat = train_data.groupby(col)[col].count()/nsamples
        encoded_col_train = train_data[col].map(freqs_cat)
        encoded_col_test = test_data[col].map(freqs_cat)
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = 0
        encoded_cols.append(pd.DataFrame({'freq_'+col:encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    return (all_encoded.loc[train_data.index,:], 
            all_encoded.loc[test_data.index,:])

def mean_encode(train_data, test_data, columns, target_col, reg_method=None,
                alpha=0, add_random=False, rmean=0, rstd=0.1, folds=1):
    '''Returns a DataFrame with encoded columns'''
    encoded_cols = []
    target_mean_global = train_data[target_col].mean()
    for col in columns:
        # Getting means for test data
        nrows_cat = train_data.groupby(col)[target_col].count()
        target_means_cats = train_data.groupby(col)[target_col].mean()
        target_means_cats_adj = (target_means_cats*nrows_cat + 
                                 target_mean_global*alpha)/(nrows_cat+alpha)
        # Mapping means to test data
        encoded_col_test = test_data[col].map(target_means_cats_adj)
        # Getting a train encodings
        if reg_method == 'expanding_mean':
            train_data_shuffled = train_data.sample(frac=1, random_state=1)
            cumsum = train_data_shuffled.groupby(col)[target_col].cumsum() - train_data_shuffled[target_col]
            cumcnt = train_data_shuffled.groupby(col).cumcount()
            encoded_col_train = cumsum/(cumcnt)
            encoded_col_train.fillna(target_mean_global, inplace=True)
            if add_random:
                encoded_col_train = encoded_col_train + normal(loc=rmean, scale=rstd, 
                                                               size=(encoded_col_train.shape[0]))
        elif (reg_method == 'k_fold') and (folds > 1):
            kfold = StratifiedKFold(folds, shuffle=True, random_state=1)
            parts = []
            for tr_in, val_ind in kfold.split(train_data[columns].values, train_data[target_col].values):
                # divide data
                df_for_estimation, df_estimated = train_data.iloc[tr_in], train_data.iloc[val_ind]
                # getting means on data for estimation (all folds except estimated)
                nrows_cat = df_for_estimation.groupby(col)[target_col].count()
                target_means_cats = df_for_estimation.groupby(col)[target_col].mean()
                target_means_cats_adj = (target_means_cats*nrows_cat + 
                                         target_mean_global*alpha)/(nrows_cat+alpha)
                # Mapping means to estimated fold
                encoded_col_train_part = df_estimated[col].map(target_means_cats_adj)
                if add_random:
                    encoded_col_train_part = encoded_col_train_part + normal(loc=rmean, scale=rstd, 
                                                                             size=(encoded_col_train_part.shape[0]))
                # Saving estimated encodings for a fold
                parts.append(encoded_col_train_part)
            encoded_col_train = pd.concat(parts, axis=0)
            encoded_col_train.fillna(target_mean_global, inplace=True)
        else:
            encoded_col_train = train_data[col].map(target_means_cats_adj)
            if add_random:
                encoded_col_train = encoded_col_train + normal(loc=rmean, scale=rstd, 
                                                               size=(encoded_col_train.shape[0]))

        # Saving the column with means
        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col[encoded_col.isnull()] = target_mean_global
        encoded_cols.append(pd.DataFrame({'mean_'+target_col+'_'+col:encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    return (all_encoded.loc[train_data.index,:], 
            all_encoded.loc[test_data.index,:])

In [101]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16) 
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64) 
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [70]:
PATH_TO_DATA = Path('../input/flight-delays-fall-2018/')

train_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_train.csv')
test_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_test.csv')
test_df.index = range(100000, 200000)

air = pd.read_csv(PATH_TO_DATA / 'airports.csv')
air = air.set_index('IATA_CODE')

train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [71]:
air_city = air['CITY'].to_dict()
air_state = air['STATE'].to_dict()
air_country = air['COUNTRY'].to_dict()
air_lat = air['LATITUDE'].to_dict()
air_long = air['LONGITUDE'].to_dict()

# Preproc

In [72]:
for df in [train_df, test_df]:
    for col in ['Origin', 'Dest']:
        df[col+'CITY'] = df[col].map(air_city).fillna('other')
        df[col+'STATE'] = df[col].map(air_state).fillna('other')
    df['flight'] = df['Origin'] + '-->' + df['Dest']
    df['flight_city'] = df['OriginCITY'] + '-->' + df['DestCITY']
    df['flight_state'] = df['OriginSTATE'] + '-->' + df['DestSTATE']

unique_carrier = list(set(train_df['UniqueCarrier']) & set(test_df['UniqueCarrier']))
or_dest = list(set(train_df['Origin']) & set(test_df['Origin']) & set(train_df['Dest']) & set(test_df['Dest']))
flight_set = list(set(train_df['flight']) & set(test_df['flight']))
flight_city = list(set(train_df['flight_city']) & set(test_df['flight_city']))
flight_state = list(set(train_df['flight_state']) & set(test_df['flight_state']))

In [73]:
minutes_in_day = 24 * 60

def upd_time(val):
    hour = val // 100
    minutes = val % 100
    if hour >= 24:
        hour = hour - 24
    return hour * 60 + minutes

def preproc(df, test=False):
    df['Month'] = df['Month'].apply(lambda x: int(x[2:]))
    df['DayofMonth'] = df['DayofMonth'].apply(lambda x: int(x[2:]))
    df['DayOfWeek'] = df['DayOfWeek'].apply(lambda x: int(x[2:]))
    
    df['DepTime'] = df['DepTime'].apply(upd_time)
    df['sin_time'] = np.sin(2 * np.pi * df['DepTime'] / minutes_in_day)
    df['cos_time'] = np.cos(2 * np.pi * df['DepTime'] / minutes_in_day)
    df['hour'] = df['DepTime'] // 60
    df['minutes'] = df['DepTime'] % 60
    
    df['morning'] = ((df['hour'] >= 6) & (df['hour'] < 12)).astype('int')
    df['day'] = ((df['hour'] >= 12) & (df['hour'] < 18)).astype('int')
    df['evening'] = ((df['hour'] >= 18) & (df['hour'] < 24)).astype('int')
    df['night'] = ((df['hour'] >= 0) & (df['hour'] < 6)).astype('int')

    df['low_delay'] = ((df['hour'] >= 4) & (df['hour'] < 9)).astype('int')
    df['other_time'] = ((df['hour'] < 9) & (df['hour'] >=5)).astype('int')
    df['delay_time'] = ((df['hour'] >= 13) & (df['hour'] < 24) | (df['hour'] < 5)).astype('int')
    df['middle_time'] = ((df['hour'] >= 9) & (df['hour'] < 13)).astype('int')

    df['winter'] = ((df['Month'].isin([12, 1, 2]))).astype('int')
    df['spring'] = ((df['Month'].isin([3, 4, 5]))).astype('int')
    df['summer'] = ((df['Month'].isin([6, 7, 8]))).astype('int')
    df['autumn'] = ((df['Month'].isin([9, 10, 11]))).astype('int')

    df['Distance_log'] = df['Distance'].apply(np.log)
    
    for col in ['Origin', 'Dest']:
        df[col+'CITY'] = df[col].map(air_city).fillna('other')
        df[col+'STATE'] = df[col].map(air_state).fillna('other')
        df[col+'COUNTRY'] = df[col].map(air_country).fillna('other')

        df[col+'LATITUDE'] = df[col].map(air_lat)
        df[col+'LONGITUDE'] = df[col].map(air_long)

        df[col+'x_coord'] = np.cos(df[col+'LATITUDE']) * np.cos(df[col+'LONGITUDE'])
        df[col+'y_coord'] = np.cos(df[col+'LATITUDE']) * np.sin(df[col+'LONGITUDE'])
        df[col+'z_coord'] = np.sin(df[col+'LATITUDE'])
        
    df['x_coord'] = df['Originx_coord'] - df['Destx_coord']
    df['y_coord'] = df['Originy_coord'] - df['Desty_coord']
    df['z_coord'] = df['Originz_coord'] - df['Destz_coord']
    df['LATITUDE'] = df['OriginLATITUDE'] - df['DestLATITUDE']
    df['LONGITUDE'] = df['OriginLONGITUDE'] - df['DestLONGITUDE']
    
    df['flight'] = df['Origin'] + '-->' + df['Dest']
    df['flight_city'] = df['OriginCITY'] + '-->' + df['DestCITY']
    df['flight_state'] = df['OriginSTATE'] + '-->' + df['DestSTATE']
    
    df.loc[~df['Origin'].isin(or_dest), 'Origin'] = 'other'
    df.loc[~df['Dest'].isin(or_dest), 'Dest'] = 'other'
    df.loc[~df['UniqueCarrier'].isin(unique_carrier), 'UniqueCarrier'] = 'other'
    df.loc[~df['flight'].isin(flight_set), 'flight'] = 'other'
    df.loc[~df['flight_city'].isin(flight_city), 'flight_city'] = 'other'
    df.loc[~df['flight_state'].isin(flight_state), 'flight_state'] = 'other'
    
    if not test:
        le_carrier.fit(df['UniqueCarrier'])
        le_or_dest.fit(pd.concat([df['Origin'], df['Dest']]))
        le_flight.fit(df['flight'])
        le_flight_city.fit(df['flight_city'])
        le_flight_state.fit(df['flight_state'])
    
    df['UniqueCarrierLabel'] = le_carrier.transform(df['UniqueCarrier'])
    df['OriginLabel'] = le_or_dest.transform(df['Origin'])
    df['DestLabel'] = le_or_dest.transform(df['Dest'])
    df['flightLabel'] = le_flight.transform(df['flight'])
    df['flight_cityLabel'] = le_flight_city.transform(df['flight_city'])
    df['flight_stateLabel'] = le_flight_state.transform(df['flight_state'])

    for col in df.columns:
        if df[col].isna().sum() > 0:
            if df[col].dtype != 'object':
                df[col] = df[col].fillna(df[col].median())
            else:
                common_val = df[col].value_counts().index[0]
                df[col] = df[col].fillna(common_val)
    
    return df

le_carrier = LabelEncoder()
le_or_dest = LabelEncoder()
le_target = LabelEncoder()
le_flight = LabelEncoder()
le_flight_city = LabelEncoder()
le_flight_state = LabelEncoder()

In [74]:
train_df.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
       'Origin', 'Dest', 'Distance', 'dep_delayed_15min', 'OriginCITY',
       'OriginSTATE', 'DestCITY', 'DestSTATE', 'flight', 'flight_city',
       'flight_state'],
      dtype='object')

In [110]:
train_df['dep_delayed_15min'] = le_target.fit_transform(train_df['dep_delayed_15min'])
categ_cols = ['UniqueCarrier', 'Origin', 'Dest', 'flight']

train_df_fe, test_df_fe = freq_encode(train_df, test_df, columns=categ_cols)

train_df_mekf, test_df_mekf = mean_encode(train_df, test_df, columns=categ_cols, target_col='dep_delayed_15min', 
                                                        reg_method='k_fold', alpha=5, folds=5)

train_df_meem, test_df_meem = mean_encode(train_df, test_df, columns=categ_cols, target_col='dep_delayed_15min', 
                                                        reg_method='expanding_mean', alpha=5)

train_df_meem.columns = [col + 'expanding_mean' for col in categ_cols]
test_df_meem.columns = [col + 'expanding_mean' for col in categ_cols]

In [111]:
y_train = train_df['dep_delayed_15min']

X_train_full = preproc(train_df.copy())
X_test_full = preproc(test_df.copy(), test=True)

del X_train_full['dep_delayed_15min']

X_train_full = pd.concat([X_train_full, train_df_fe, train_df_mekf, train_df_meem], axis=1)
X_test_full = pd.concat([X_test_full, test_df_fe, test_df_mekf, test_df_meem], axis=1)

X_train_full.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,OriginCITY,OriginSTATE,...,freq_Dest,freq_flight,mean_dep_delayed_15min_UniqueCarrier,mean_dep_delayed_15min_Origin,mean_dep_delayed_15min_Dest,mean_dep_delayed_15min_flight,UniqueCarrierexpanding_mean,Originexpanding_mean,Destexpanding_mean,flightexpanding_mean
0,8,21,7,1174,AA,ATL,DFW,732,Atlanta,GA,...,0.04337,0.00136,0.185031,0.255794,0.153744,0.254111,0.184438,0.255592,0.153869,0.236364
1,4,20,3,948,US,PIT,MCO,834,Pittsburgh,PA,...,0.01728,0.00054,0.171243,0.168373,0.188049,0.16229,0.174815,0.125,0.193103,0.0
2,9,2,5,862,XE,RDU,CLE,416,Raleigh,NC,...,0.01218,0.00022,0.17289,0.19185,0.183729,0.113546,0.170219,0.176938,0.180328,0.2
3,11,25,6,615,OO,DEN,MEM,872,Denver,CO,...,0.00629,0.00014,0.170996,0.195163,0.147418,0.122012,0.171821,0.186235,0.162791,0.333333
4,10,7,6,1108,WN,MDW,OMA,423,Chicago,IL,...,0.00311,0.0003,0.214693,0.249266,0.241468,0.412145,0.207071,0.206897,0.333333,1.0


# Feature Selection

In [119]:
# from sklearn.feature_selection import RFECV
from eli5 import show_weights
from eli5.sklearn import PermutationImportance
# from boruta import BorutaPy

In [116]:
# cols_to_use = [col for col in X_train_full.columns if 'Label' not in col]
# X_train = X_train_full[cols_to_use]
# X_test = X_test_full[cols_to_use]

X_train = X_train_full.loc[:, X_train_full.dtypes!='object']
X_test = X_test_full.loc[:, X_train_full.dtypes!='object']

# X_train = reduce_mem_usage(X_train.copy())

In [83]:
params = {'learning_rate': 0.1,
         'leaf_estimation_iterations': 31,
         'l2_leaf_reg': 2310.1297000831582,
         'iterations': 2000,
         'depth': 6,
         'border_count': 254}

categ_feat_idx = np.where(X_train.dtypes == 'object')[0]

In [85]:
categ_feat_idx = [i for i, col in enumerate(X_train.columns) if 'Label' in col]

In [117]:
clf = CatBoostClassifier(random_state=42, silent=True)
# clf = CBCforRFE()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=38)
perm = PermutationImportance(clf, cv=skf, n_iter=15, scoring='roc_auc', random_state=42)
perm.fit(X_train.values, y_train, cat_features=categ_feat_idx)

PermutationImportance(cv=StratifiedKFold(n_splits=5, random_state=38, shuffle=True),
           estimator=<catboost.core.CatBoostClassifier object at 0x0000017623D5C0F0>,
           n_iter=15, random_state=42, refit=True, scoring='roc_auc')

In [121]:
show_weights(perm, feature_names = X_train.columns.tolist(), top=100)

Weight,Feature
0.1643  ± 0.0549,flightLabel
0.0437  ± 0.0143,mean_dep_delayed_15min_flight
0.0264  ± 0.0083,flightexpanding_mean
0.0234  ± 0.0076,cos_time
0.0185  ± 0.0242,flight_cityLabel
0.0139  ± 0.0066,sin_time
0.0068  ± 0.0021,Month
0.0064  ± 0.0016,DayofMonth
0.0055  ± 0.0035,DepTime
0.0051  ± 0.0017,DayOfWeek


#### check feature importances

In [127]:
for thresh in [0.01, 0.005, 0.003, 0.002, 0.001, 0.0006, 0.0002]:
    print("Thresh = {}".format(thresh))

    X_train_opt = X_train.loc[:, perm.feature_importances_ >= thresh]
    print("N cols: {}".format(X_train_opt.shape[-1]))
    categ_feat_idx = [i for i, col in enumerate(X_train_opt.columns) if 'Label' in col]

    clf = CatBoostClassifier(random_state=42, silent=True)
    cv_score = cross_val_score(clf, X_train_opt, y_train, cv=skf, scoring='roc_auc')
    print(f'Score: {cv_score.mean():.5f}+-{cv_score.std():.5f}')
    print('-' * 40)

Thresh = 0.01
N cols: 6
Score: 0.71107+-0.00436
----------------------------------------
Thresh = 0.005
N cols: 10
Score: 0.72801+-0.00312
----------------------------------------
Thresh = 0.003
N cols: 14
Score: 0.73123+-0.00239
----------------------------------------
Thresh = 0.002
N cols: 17
Score: 0.74689+-0.00314
----------------------------------------
Thresh = 0.001
N cols: 20
Score: 0.74946+-0.00287
----------------------------------------
Thresh = 0.0006
N cols: 27
Score: 0.75132+-0.00327
----------------------------------------
Thresh = 0.0002
N cols: 41
Score: 0.75103+-0.00283
----------------------------------------


In [147]:
params = {'learning_rate': 0.1,
 'leaf_estimation_iterations': 49,
 'l2_leaf_reg': 259.5024211399737,
 'iterations': 3186,
 'depth': 8,
 'border_count': 254}

for thresh in [0.01, 0.005, 0.003, 0.002, 0.001, 0.0006, 0.0002]:
    print("Thresh = {}".format(thresh))

    X_train_opt = X_train.loc[:, perm.feature_importances_ >= thresh]
    print("N cols: {}".format(X_train_opt.shape[-1]))
    categ_feat_idx = [i for i, col in enumerate(X_train_opt.columns) if 'Label' in col]

    clf = CatBoostClassifier(random_state=42, silent=True, **params)
    cv_score = cross_val_score(clf, X_train_opt, y_train, cv=skf, scoring='roc_auc')
    print(f'Score: {cv_score.mean():.5f}+-{cv_score.std():.5f}')
    print('-' * 40)

Thresh = 0.01
N cols: 6
Score: 0.70085+-0.00479
----------------------------------------
Thresh = 0.005
N cols: 10
Score: 0.70560+-0.00304
----------------------------------------
Thresh = 0.003
N cols: 14
Score: 0.71105+-0.00299
----------------------------------------
Thresh = 0.002
N cols: 17
Score: 0.74211+-0.00332
----------------------------------------
Thresh = 0.001
N cols: 20
Score: 0.74882+-0.00386
----------------------------------------
Thresh = 0.0006
N cols: 27
Score: 0.75419+-0.00434
----------------------------------------
Thresh = 0.0002
N cols: 41
Score: 0.75845+-0.00179
----------------------------------------


#### GSCV

In [138]:
X_train_opt = X_train.loc[:, perm.feature_importances_ >= 0.0006]
categ_feat_idx = [i for i, col in enumerate(X_train_opt.columns) if 'Label' in col]

In [140]:
fit_params = {'cat_features': categ_feat_idx}

params = {'depth': [4, 5, 6, 7, 8],
          'iterations': np.linspace(500, 4000, 100, dtype=int),
          'learning_rate': [0.03, 0.06, 0.1, 0.2, 0.3, 0.6], 
          'border_count': [128, 254],
          'l2_leaf_reg': np.logspace(1, 5, 100),
          'leaf_estimation_iterations': np.arange(15, 50, 2)}

clf = CatBoostClassifier(random_state=42, silent=True)

gscv = RandomizedSearchCV(clf, params, cv=skf, scoring='roc_auc', fit_params=fit_params, n_iter=30)

gscv.fit(X_train, y_train)

results = pd.concat([pd.Series(gscv.cv_results_['params'], name='params'), 
                pd.Series(gscv.cv_results_['mean_test_score'], name='mean_test_score'), 
                pd.Series(gscv.cv_results_['std_test_score'], name='std_test_score')], axis=1)

# results.sort_values('mean_test_score', ascending=False)

In [142]:
results_sorted = results.sort_values('mean_test_score', ascending=False)
params = results_sorted['params']
r2 = {x: list() for x in results.iloc[0, 0].keys()}
for row in params:
    for key in row.keys():
        r2[key].append(row[key]) 
        
r2 = pd.concat([pd.DataFrame(r2), results_sorted[['mean_test_score', 'std_test_score']].reset_index(drop=True)], axis=1)
r2.head(10)

Unnamed: 0,learning_rate,leaf_estimation_iterations,l2_leaf_reg,iterations,depth,border_count,mean_test_score,std_test_score
0,0.1,49,259.502421,3186,8,254,0.757447,0.002565
1,0.03,35,1149.756995,2585,8,254,0.755873,0.003523
2,0.1,49,112.332403,888,7,128,0.754085,0.002677
3,0.2,43,2420.128265,2939,6,128,0.753615,0.003702
4,0.2,17,312.571585,3964,6,254,0.753154,0.000768
5,0.03,19,2656.087783,3893,7,254,0.752113,0.002791
6,0.06,43,413.20124,1242,6,128,0.752091,0.002955
7,0.3,23,4229.242874,2126,7,254,0.751348,0.002178
8,0.2,27,36.783798,3045,5,254,0.751199,0.00307
9,0.6,25,8111.308308,1772,7,128,0.749248,0.002789


In [143]:
results_sorted.iloc[0, 0]

{'learning_rate': 0.1,
 'leaf_estimation_iterations': 49,
 'l2_leaf_reg': 259.5024211399737,
 'iterations': 3186,
 'depth': 8,
 'border_count': 254}

# Train

In [145]:
thresh = 0.0006
X_train_opt = X_train.loc[:, perm.feature_importances_ >= thresh]
X_test_opt = X_test.loc[:, perm.feature_importances_ >= thresh]

categ_feat_idx = [i for i, col in enumerate(X_train_opt.columns) if 'Label' in col]

params = {'learning_rate': 0.1,
         'leaf_estimation_iterations': 49,
         'l2_leaf_reg': 259.5024211399737,
         'iterations': 3186,
         'depth': 8,
         'border_count': 254}

regr = CatBoostClassifier(random_state=42, silent=True, **params)

regr.fit(X_train_opt, y_train, cat_features=categ_feat_idx)

test_pred = regr.predict_proba(X_test_opt)[:, 1]

In [146]:
sample_sub = pd.read_csv(PATH_TO_DATA / 'sample_submission.csv', index_col='id')
sample_sub['dep_delayed_15min'] = test_pred
sample_sub.to_csv(PATH_TO_DATA / 'cat_fs_pred_gscv.csv')