# Main

In [14]:
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from pathlib import Path

%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score, RandomizedSearchCV

from catboost import CatBoostClassifier

In [15]:
def label_encode(train_data, test_data, columns):
    """Integer-encode categorical columns using labels learned from the train set.

    Each distinct train value gets its position in order of first appearance.
    Values that are missing, or that occur only in the test set, become -1.

    Returns a ``(train_encoded, test_encoded)`` pair of DataFrames whose
    columns are named ``label_<col>`` and whose rows follow the indices of
    the corresponding inputs. Train and test indices must not overlap,
    because both parts are stacked and then split again by index label.
    """
    frames = []
    for name in columns:
        _, uniques = pd.factorize(train_data[name])
        code_of = pd.Series(range(len(uniques)), index=uniques)
        stacked = pd.concat(
            [train_data[name].map(code_of), test_data[name].map(code_of)],
            axis=0,
        )
        # Anything the lookup could not resolve is marked with -1
        stacked = stacked.fillna(-1)
        frames.append(pd.DataFrame({'label_' + name: stacked}))
    combined = pd.concat(frames, axis=1)
    train_part = combined.loc[train_data.index, :]
    test_part = combined.loc[test_data.index, :]
    return (train_part, test_part)

def freq_encode(train_data, test_data, columns):
    """Replace each category by its relative frequency in the train set.

    The frequency is the category's row count in ``train_data`` divided by the
    total number of train rows. Values without a train frequency (missing, or
    seen only in the test set) are encoded as 0.

    Returns a ``(train_encoded, test_encoded)`` pair of DataFrames whose
    columns are named ``freq_<col>``. Train and test indices must not overlap,
    because both parts are stacked and then split again by index label.
    """
    total_rows = train_data.shape[0]
    frames = []
    for name in columns:
        share = train_data.groupby(name)[name].count().div(total_rows)
        stacked = pd.concat(
            [train_data[name].map(share), test_data[name].map(share)],
            axis=0,
        )
        stacked = stacked.fillna(0)
        frames.append(pd.DataFrame({'freq_' + name: stacked}))
    combined = pd.concat(frames, axis=1)
    train_part = combined.loc[train_data.index, :]
    test_part = combined.loc[test_data.index, :]
    return (train_part, test_part)

def mean_encode(train_data, test_data, columns, target_col, reg_method=None,
                alpha=0, add_random=False, rmean=0, rstd=0.1, folds=1):
    '''Target (mean) encoding of categorical columns.

    Test rows always receive the smoothed per-category target mean computed on
    the whole train set:

        (category_mean * category_count + global_mean * alpha) / (category_count + alpha)

    Train rows are encoded according to ``reg_method``:

    * ``'expanding_mean'`` - rows are shuffled (fixed seed) and each row gets
      the mean target of the *previous* rows of the same category; the first
      occurrence of a category gets the global mean. ``alpha`` is not used here.
    * ``'k_fold'`` (only when ``folds > 1``) - every row is encoded with the
      smoothed means estimated on the other stratified folds.
    * anything else - the same smoothed means that are used for the test set.

    Parameters
    ----------
    train_data, test_data : pandas.DataFrame
        Must have non-overlapping indices: both parts are stacked and then
        split again by index label.
    columns : list of str
        Categorical columns to encode.
    target_col : str
        Name of the target column in ``train_data``.
    reg_method : {None, 'expanding_mean', 'k_fold'}
    alpha : float
        Smoothing strength towards the global target mean.
    add_random : bool
        Add Gaussian noise N(rmean, rstd) to the train encodings. The noise
        comes from NumPy's global random state, so seed it for reproducibility.
    rmean, rstd : float
        Mean and standard deviation of that noise.
    folds : int
        Number of folds for ``reg_method='k_fold'``.

    Returns
    -------
    (train_encoded, test_encoded) : tuple of pandas.DataFrame
        Columns are named ``mean_<target_col>_<col>``; rows follow the indices
        of the corresponding inputs.
    '''
    target_mean_global = train_data[target_col].mean()

    def smoothed_means(frame, col):
        # Per-category target mean shrunk towards the global mean
        grouped = frame.groupby(col)[target_col]
        counts = grouped.count()
        return (grouped.mean() * counts + target_mean_global * alpha) / (counts + alpha)

    def jitter(series):
        # The original called an undefined name `normal` here, which raised
        # NameError whenever add_random=True.
        if not add_random:
            return series
        return series + np.random.normal(loc=rmean, scale=rstd, size=series.shape[0])

    encoded_cols = []
    for col in columns:
        # Test rows: smoothed means estimated on the complete train set
        encoded_col_test = test_data[col].map(smoothed_means(train_data, col))

        if reg_method == 'expanding_mean':
            shuffled = train_data.sample(frac=1, random_state=1)
            # Sum and count of the target over earlier rows of the same category
            prior_sum = shuffled.groupby(col)[target_col].cumsum() - shuffled[target_col]
            prior_cnt = shuffled.groupby(col).cumcount()
            # 0/0 at a category's first occurrence -> NaN -> global mean
            encoded_col_train = jitter((prior_sum / prior_cnt).fillna(target_mean_global))
        elif (reg_method == 'k_fold') and (folds > 1):
            kfold = StratifiedKFold(folds, shuffle=True, random_state=1)
            parts = []
            for tr_in, val_ind in kfold.split(train_data[columns].values,
                                              train_data[target_col].values):
                df_for_estimation = train_data.iloc[tr_in]
                df_estimated = train_data.iloc[val_ind]
                # Means come from the other folds only
                fold_means = smoothed_means(df_for_estimation, col)
                parts.append(jitter(df_estimated[col].map(fold_means)))
            # Categories unseen in the estimation folds fall back to the global mean
            encoded_col_train = pd.concat(parts, axis=0).fillna(target_mean_global)
        else:
            encoded_col_train = jitter(train_data[col].map(smoothed_means(train_data, col)))

        encoded_col = pd.concat([encoded_col_train, encoded_col_test], axis=0)
        encoded_col = encoded_col.fillna(target_mean_global)
        encoded_cols.append(pd.DataFrame({'mean_' + target_col + '_' + col: encoded_col}))
    all_encoded = pd.concat(encoded_cols, axis=1)
    return (all_encoded.loc[train_data.index, :],
            all_encoded.loc[test_data.index, :])

In [101]:
def reduce_mem_usage(df, verbose=True):
    """Downcast numeric columns of ``df`` to the narrowest dtype that fits their range.

    Only columns whose dtype is int16/32/64 or float16/32/64 are touched.
    Integer columns go to the first of int8/int16/int32/int64 whose range
    contains the column's min and max. Float columns go to float16 or float32
    when the range allows, otherwise float64; the choice looks at range only,
    so float16/float32 can lose precision.

    The frame is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
    verbose : bool
        Print the resulting size and the relative reduction.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Bounds are inclusive: a column spanning exactly [-128, 127] fits int8
            for candidate in (np.int8, np.int16, np.int32, np.int64):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, or min/max is NaN/inf: keep full precision
                df[col] = df[col].astype(np.float64)
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard against a zero-sized frame
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [173]:
PATH_TO_DATA = Path('../input/flight-delays-fall-2018/')

train_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_train.csv')
test_df = pd.read_csv(PATH_TO_DATA / 'flight_delays_test.csv')
# The encoders stack train and test and split them again by index label, so the
# test index must not collide with the train index. Derive the offset from the
# actual frame sizes instead of hard-coding 100000 rows for each file.
test_df.index = range(len(train_df), len(train_df) + len(test_df))

# Airport metadata keyed by IATA code (city, state, country, coordinates)
air = pd.read_csv(PATH_TO_DATA / 'airports.csv')
air = air.set_index('IATA_CODE')

train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y


In [174]:
# IATA code -> attribute lookups, used to enrich Origin / Dest columns via .map()
air_city, air_state, air_country, air_lat, air_long = (
    air[field].to_dict()
    for field in ('CITY', 'STATE', 'COUNTRY', 'LATITUDE', 'LONGITUDE')
)

# Preproc

In [175]:
# Add city / state for both endpoints and the derived route keys to both frames.
for df in (train_df, test_df):
    for col in ('Origin', 'Dest'):
        airport_codes = df[col]
        # Airports missing from the lookup are grouped under 'other'
        df[col+'CITY'] = airport_codes.map(air_city).fillna('other')
        df[col+'STATE'] = airport_codes.map(air_state).fillna('other')
    df['flight'] = df['Origin'] + '-->' + df['Dest']
    df['flight_city'] = df['OriginCITY'] + '-->' + df['DestCITY']
    df['flight_state'] = df['OriginSTATE'] + '-->' + df['DestSTATE']

# Category values present in BOTH train and test; preproc() maps everything else to 'other'.
unique_carrier = list(set(train_df['UniqueCarrier']) & set(test_df['UniqueCarrier']))
# An airport code must appear as origin and as destination in both frames
or_dest = list(
    set(train_df['Origin']) & set(test_df['Origin'])
    & set(train_df['Dest']) & set(test_df['Dest'])
)
flight_set = list(set(train_df['flight']) & set(test_df['flight']))
flight_city = list(set(train_df['flight_city']) & set(test_df['flight_city']))
flight_state = list(set(train_df['flight_state']) & set(test_df['flight_state']))

In [176]:
# Preview train_df, which now carries the city, state and route columns.
train_df.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,dep_delayed_15min,OriginCITY,OriginSTATE,DestCITY,DestSTATE,flight,flight_city,flight_state
0,c-8,c-21,c-7,1934,AA,ATL,DFW,732,N,Atlanta,GA,Dallas-Fort Worth,TX,ATL-->DFW,Atlanta-->Dallas-Fort Worth,GA-->TX
1,c-4,c-20,c-3,1548,US,PIT,MCO,834,N,Pittsburgh,PA,Orlando,FL,PIT-->MCO,Pittsburgh-->Orlando,PA-->FL
2,c-9,c-2,c-5,1422,XE,RDU,CLE,416,N,Raleigh,NC,Cleveland,OH,RDU-->CLE,Raleigh-->Cleveland,NC-->OH
3,c-11,c-25,c-6,1015,OO,DEN,MEM,872,N,Denver,CO,Memphis,TN,DEN-->MEM,Denver-->Memphis,CO-->TN
4,c-10,c-7,c-6,1828,WN,MDW,OMA,423,Y,Chicago,IL,Omaha,NE,MDW-->OMA,Chicago-->Omaha,IL-->NE


In [182]:
minutes_in_day = 24 * 60

def upd_time(val):
    """Convert a clock value written as hours*100 + minutes into minutes.

    An hour part of 24 or more has 24 subtracted once (e.g. 2435 -> 35).
    """
    hour, minutes = divmod(val, 100)
    if hour >= 24:
        hour -= 24
    return hour * 60 + minutes

def preproc(df, test=False):
    """Add calendar, time-of-day, geography, route and label features to ``df``.

    The frame is modified in place and also returned, so pass a copy if the raw
    data must be kept. The function is not re-runnable on its own output: the
    'c-<n>' strings in Month / DayofMonth / DayOfWeek are parsed to ints on the
    first call and cannot be sliced again.

    Relies on notebook-level names: ``upd_time``, ``minutes_in_day``, the
    ``air_*`` lookup dicts, the shared category lists (``or_dest``,
    ``unique_carrier``, ``flight_set``, ``flight_city``, ``flight_state``) and
    the ``le_*`` LabelEncoder instances.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with Month, DayofMonth, DayOfWeek, DepTime, UniqueCarrier,
        Origin, Dest and Distance columns.
    test : bool, default False
        When False the label encoders are fitted on ``df`` before being
        applied; when True they are only applied. The train frame therefore
        has to be processed before the test frame.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # 'c-8' -> 8: drop the two-character prefix
    df['Month'] = df['Month'].apply(lambda x: int(x[2:]))
    df['DayofMonth'] = df['DayofMonth'].apply(lambda x: int(x[2:]))
    df['DayOfWeek'] = df['DayOfWeek'].apply(lambda x: int(x[2:]))
    # Despite the name this flags days 6 and 7 of the week, not public holidays
    df['Holiday'] = (df['DayOfWeek'] > 5).astype('int')
    
    # DepTime becomes minutes; sin/cos give a cyclic encoding over the day
    df['DepTime'] = df['DepTime'].apply(upd_time)
    df['sin_time'] = np.sin(2 * np.pi * df['DepTime'] / minutes_in_day)
    df['cos_time'] = np.cos(2 * np.pi * df['DepTime'] / minutes_in_day)
    df['hour'] = df['DepTime'] // 60
    df['minutes'] = df['DepTime'] % 60
    
    # Part-of-day indicators and a combined 1..4 label
    df['morning'] = ((df['hour'] >= 6) & (df['hour'] < 12)).astype('int')
    df['day'] = ((df['hour'] >= 12) & (df['hour'] < 18)).astype('int')
    df['evening'] = ((df['hour'] >= 18) & (df['hour'] < 24)).astype('int')
    df['night'] = ((df['hour'] >= 0) & (df['hour'] < 6)).astype('int')
    df['dtimeLabel'] = df['morning'] + df['day'] * 2 + df['evening'] * 3 + df['night'] * 4
    
    # Hand-picked hour windows
    df['low_delay'] = ((df['hour'] >= 4) & (df['hour'] < 9)).astype('int')
    df['other_time'] = ((df['hour'] < 9) & (df['hour'] >=5)).astype('int')
    df['delay_time'] = ((df['hour'] >= 13) & (df['hour'] < 24) | (df['hour'] < 5)).astype('int')
    df['middle_time'] = ((df['hour'] >= 9) & (df['hour'] < 13)).astype('int')
    
    df['hour_bins'] = pd.cut(df['hour'], [-1, 5, 8, 10, 12, 50], right=False)

    # Season indicators and a combined 1..4 label
    df['winter'] = ((df['Month'].isin([12, 1, 2]))).astype('int')
    df['spring'] = ((df['Month'].isin([3, 4, 5]))).astype('int')
    df['summer'] = ((df['Month'].isin([6, 7, 8]))).astype('int')
    df['autumn'] = ((df['Month'].isin([9, 10, 11]))).astype('int')
    df['4seasonsLabel'] = df['winter'] + df['spring'] * 2 + df['summer'] * 3 + df['autumn'] * 4

    df['Distance_log'] = df['Distance'].apply(np.log)
    
    for col in ['Origin', 'Dest']:
        df[col+'CITY'] = df[col].map(air_city).fillna('other')
        df[col+'STATE'] = df[col].map(air_state).fillna('other')
        df[col+'COUNTRY'] = df[col].map(air_country).fillna('other')

        df[col+'LATITUDE'] = df[col].map(air_lat)
        df[col+'LONGITUDE'] = df[col].map(air_long)

        # np.cos / np.sin expect radians. The original passed the raw values
        # straight in, which scrambles the unit-sphere coordinates.
        # NOTE(review): assumes airports.csv stores decimal degrees - confirm.
        lat_rad = np.radians(df[col+'LATITUDE'])
        lon_rad = np.radians(df[col+'LONGITUDE'])
        df[col+'x_coord'] = np.cos(lat_rad) * np.cos(lon_rad)
        df[col+'y_coord'] = np.cos(lat_rad) * np.sin(lon_rad)
        df[col+'z_coord'] = np.sin(lat_rad)
        
    # Origin-minus-destination differences
    df['x_coord'] = df['Originx_coord'] - df['Destx_coord']
    df['y_coord'] = df['Originy_coord'] - df['Desty_coord']
    df['z_coord'] = df['Originz_coord'] - df['Destz_coord']
    df['LATITUDE'] = df['OriginLATITUDE'] - df['DestLATITUDE']
    df['LONGITUDE'] = df['OriginLONGITUDE'] - df['DestLONGITUDE']
    
    df['flight'] = df['Origin'] + '-->' + df['Dest']
    df['flight_city'] = df['OriginCITY'] + '-->' + df['DestCITY']
    df['flight_state'] = df['OriginSTATE'] + '-->' + df['DestSTATE']
    
    # Collapse categories that are not shared by train and test into 'other'
    df.loc[~df['Origin'].isin(or_dest), 'Origin'] = 'other'
    df.loc[~df['Dest'].isin(or_dest), 'Dest'] = 'other'
    df.loc[~df['UniqueCarrier'].isin(unique_carrier), 'UniqueCarrier'] = 'other'
    df.loc[~df['flight'].isin(flight_set), 'flight'] = 'other'
    df.loc[~df['flight_city'].isin(flight_city), 'flight_city'] = 'other'
    df.loc[~df['flight_state'].isin(flight_state), 'flight_state'] = 'other'
    
    if not test:
        le_carrier.fit(df['UniqueCarrier'])
        # One encoder for both endpoints so an airport has the same code in either role
        le_or_dest.fit(pd.concat([df['Origin'], df['Dest']]))
        le_flight.fit(df['flight'])
        le_flight_city.fit(df['flight_city'])
        le_flight_state.fit(df['flight_state'])
        le_hour.fit(df['hour_bins'])
    
    df['UniqueCarrierLabel'] = le_carrier.transform(df['UniqueCarrier'])
    df['OriginLabel'] = le_or_dest.transform(df['Origin'])
    df['DestLabel'] = le_or_dest.transform(df['Dest'])
    df['flightLabel'] = le_flight.transform(df['flight'])
    df['flight_cityLabel'] = le_flight_city.transform(df['flight_city'])
    df['flight_stateLabel'] = le_flight_state.transform(df['flight_state'])
    df['hour_binsLabel'] = le_hour.transform(df['hour_bins'])

    # Impute what is still missing: median for non-object columns, most frequent value otherwise
    for col in df.columns:
        if df[col].isna().sum() > 0:
            if df[col].dtype != 'object':
                df[col] = df[col].fillna(df[col].median())
            else:
                common_val = df[col].value_counts().index[0]
                df[col] = df[col].fillna(common_val)
    
    return df

# One independent LabelEncoder per encoded feature, plus one for the target.
(le_carrier,
 le_or_dest,
 le_target,
 le_flight,
 le_flight_city,
 le_flight_state,
 le_hour) = (LabelEncoder() for _ in range(7))

In [183]:
categ_cols = ['UniqueCarrier', 'Origin', 'Dest', 'flight']
# Turn the target labels into integers before any target statistics are computed
train_df['dep_delayed_15min'] = le_target.fit_transform(train_df['dep_delayed_15min'])

# Frequency encoding
train_df_fe, test_df_fe = freq_encode(train_df, test_df, columns=categ_cols)

# Mean encoding regularised with 5 stratified folds
train_df_mekf, test_df_mekf = mean_encode(
    train_df, test_df,
    columns=categ_cols,
    target_col='dep_delayed_15min',
    reg_method='k_fold',
    alpha=5,
    folds=5,
)

# Mean encoding regularised with an expanding mean
train_df_meem, test_df_meem = mean_encode(
    train_df, test_df,
    columns=categ_cols,
    target_col='dep_delayed_15min',
    reg_method='expanding_mean',
    alpha=5,
)

# Both mean encoders emit identical column names; rename the expanding-mean ones
train_df_meem.columns = [col + 'expanding_mean' for col in categ_cols]
test_df_meem.columns = [col + 'expanding_mean' for col in categ_cols]

In [184]:
y_train = train_df['dep_delayed_15min']

# Train goes first: preproc fits the shared label encoders when test is False
X_train_full = preproc(train_df.copy(), test=False).drop(columns=['dep_delayed_15min'])
X_test_full = preproc(test_df.copy(), test=True)

# Attach the frequency and mean encodings column-wise (aligned on the index)
X_train_full = pd.concat(
    [X_train_full, train_df_fe, train_df_mekf, train_df_meem], axis='columns'
)
X_test_full = pd.concat(
    [X_test_full, test_df_fe, test_df_mekf, test_df_meem], axis='columns'
)

X_train_full.head()

Unnamed: 0,Month,DayofMonth,DayOfWeek,DepTime,UniqueCarrier,Origin,Dest,Distance,OriginCITY,OriginSTATE,...,freq_Dest,freq_flight,mean_dep_delayed_15min_UniqueCarrier,mean_dep_delayed_15min_Origin,mean_dep_delayed_15min_Dest,mean_dep_delayed_15min_flight,UniqueCarrierexpanding_mean,Originexpanding_mean,Destexpanding_mean,flightexpanding_mean
0,8,21,7,1174,AA,ATL,DFW,732,Atlanta,GA,...,0.04337,0.00136,0.185031,0.255794,0.153744,0.254111,0.184438,0.255592,0.153869,0.236364
1,4,20,3,948,US,PIT,MCO,834,Pittsburgh,PA,...,0.01728,0.00054,0.171243,0.168373,0.188049,0.16229,0.174815,0.125,0.193103,0.0
2,9,2,5,862,XE,RDU,CLE,416,Raleigh,NC,...,0.01218,0.00022,0.17289,0.19185,0.183729,0.113546,0.170219,0.176938,0.180328,0.2
3,11,25,6,615,OO,DEN,MEM,872,Denver,CO,...,0.00629,0.00014,0.170996,0.195163,0.147418,0.122012,0.171821,0.186235,0.162791,0.333333
4,10,7,6,1108,WN,MDW,OMA,423,Chicago,IL,...,0.00311,0.0003,0.214693,0.249266,0.241468,0.412145,0.207071,0.206897,0.333333,1.0


# CV

In [187]:
# Hold out 25% of the training rows as a validation set (fixed seed).
X_tr, X_valid, y_tr, y_valid = train_test_split(X_train_full, y_train, test_size=0.25, random_state=174)

In [191]:
# Shuffled, stratified 4-fold splitter with a fixed seed; passed as cv= to cross_val_score.
skf = StratifiedKFold(n_splits=4, shuffle=True, random_state=11138)

In [192]:
# Baseline: the eight original columns, with carrier and airport codes as categorical.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance']
cat_cols = ['UniqueCarrier', 'Origin', 'Dest']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74169+-0.00689


In [193]:
# Same columns as the baseline; Month is additionally treated as categorical.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance']
cat_cols = ['Month', 'UniqueCarrier', 'Origin', 'Dest']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74150+-0.00715


In [194]:
# Original columns; Month and DayofMonth are treated as categorical as well.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance']
cat_cols = ['Month', 'DayofMonth', 'UniqueCarrier', 'Origin', 'Dest']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74615+-0.00699


In [196]:
# Original columns; all three calendar fields are treated as categorical.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance']
cat_cols = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75272+-0.00655


In [197]:
# Distance is replaced by its logarithm; the categorical set is unchanged.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log']
cat_cols = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75272+-0.00655


In [198]:
# Adds the season label as a categorical feature; Month stays in the data but is
# no longer listed as categorical.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75308+-0.00738


In [204]:
# Month is dropped entirely; the season label is the only month-derived feature.
cols_to_use = ['DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75252+-0.00679


In [205]:
# Adds the part-of-day label. cat_cols may name columns that are not selected
# (here 'hour_binsLabel'); such names simply contribute no index.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'dtimeLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75217+-0.00719


In [207]:
# Adds the departure hour as a numeric feature ('hour' is not in cat_cols).
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'hour']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75340+-0.00677


In [208]:
# Departure hour is now treated as categorical ('hour' added to cat_cols).
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'hour']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel', 'hour']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.77118+-0.00761


In [209]:
# Control run: departure hour as a numeric feature again ('hour' not in cat_cols).
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'hour']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75340+-0.00677


In [206]:
# Uses the encoded hour bins (categorical) instead of the raw hour.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75383+-0.00712


In [199]:
# Season label, part-of-day label and hour bins together, all categorical.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'dtimeLabel', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75230+-0.00766


In [201]:
# Adds the origin-->destination route ('flight') as one more categorical feature.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'dtimeLabel', 'hour_binsLabel', 'flight']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 'flight', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.75370+-0.00730


In [202]:
# Ablation: Origin removed, Dest kept.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Dest', 'Distance_log',
              '4seasonsLabel', 'dtimeLabel', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Dest', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74437+-0.00722


In [203]:
# Ablation: Dest removed, Origin kept.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Distance_log',
              '4seasonsLabel', 'dtimeLabel', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74953+-0.00634


In [200]:
# Ablation: the combined route ('flight') replaces the separate Origin and Dest columns.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Distance_log', 'flight',
              '4seasonsLabel', 'dtimeLabel', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'flight',
            'dtimeLabel', 'hour_binsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True)
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.74676+-0.00690


In [210]:
# List every column available in the training split.
X_tr.columns

Index(['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier',
       'Origin', 'Dest', 'Distance', 'OriginCITY', 'OriginSTATE', 'DestCITY',
       'DestSTATE', 'flight', 'flight_city', 'flight_state', 'Holiday',
       'sin_time', 'cos_time', 'hour', 'minutes', 'morning', 'day', 'evening',
       'night', 'dtimeLabel', 'low_delay', 'other_time', 'delay_time',
       'middle_time', 'hour_bins', 'winter', 'spring', 'summer', 'autumn',
       '4seasonsLabel', 'Distance_log', 'OriginCOUNTRY', 'OriginLATITUDE',
       'OriginLONGITUDE', 'Originx_coord', 'Originy_coord', 'Originz_coord',
       'DestCOUNTRY', 'DestLATITUDE', 'DestLONGITUDE', 'Destx_coord',
       'Desty_coord', 'Destz_coord', 'x_coord', 'y_coord', 'z_coord',
       'LATITUDE', 'LONGITUDE', 'UniqueCarrierLabel', 'OriginLabel',
       'DestLabel', 'flightLabel', 'flight_cityLabel', 'flight_stateLabel',
       'hour_binsLabel', 'freq_UniqueCarrier', 'freq_Origin', 'freq_Dest',
       'freq_flight', 'mean_dep_delayed_15

In [211]:
# Categorical hour plus hour bins, trained with task_type='GPU'.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'hour', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel', 'hour']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True, task_type='GPU')
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.77827+-0.00528


In [212]:
# Ablation on GPU: DepTime removed, leaving hour and hour bins as the time features.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'hour', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel', 'hour']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True, task_type='GPU')
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.77388+-0.00506


In [214]:
# Same as the no-DepTime run, with label-encoded airports instead of the raw codes.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'OriginLabel', 'DestLabel', 'Distance_log',
              '4seasonsLabel', 'hour', 'hour_binsLabel']
cat_cols = ['4seasonsLabel', 'DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'OriginLabel', 'DestLabel', 
            'dtimeLabel', 'hour_binsLabel', 'hour']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use, forwarded to fit() via fit_params
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True, task_type='GPU')
cv_score = cross_val_score(clf, X_train, y_tr,
                           scoring='roc_auc', cv=skf, fit_params=cat_features)
print(f'{cv_score.mean():.5f}+-{cv_score.std():.5f}')

0.77388+-0.00512


Check the selected feature set on the hold-out validation split.

In [216]:
# Final feature set: fit on the training split and score ROC AUC on the hold-out split.
# cols_to_use and categ_feat_idx defined here are reused by the final fit further down.
cols_to_use = ['Month', 'DayofMonth', 'DayOfWeek', 'DepTime', 'UniqueCarrier', 'Origin', 'Dest', 'Distance_log',
              '4seasonsLabel', 'hour', 'minutes', 'hour_binsLabel']
cat_cols = ['DayofMonth', 'DayOfWeek', 'UniqueCarrier', 'Origin', 'Dest', 
            'dtimeLabel', 'hour_binsLabel', 'hour', 'minutes', '4seasonsLabel']

X_train = X_tr[cols_to_use]
# Positions of the categorical columns inside cols_to_use
categ_feat_idx = [pos for pos, name in enumerate(cols_to_use) if name in cat_cols]
cat_features = dict(cat_features=categ_feat_idx)

clf = CatBoostClassifier(random_state=177, silent=True, task_type='GPU')
clf.fit(X_train, y_tr, cat_features=categ_feat_idx)

# Probability of the positive class for the validation rows, same column selection
y_valid_pred = clf.predict_proba(X_valid[cols_to_use])[:, 1]
score = roc_auc_score(y_valid, y_valid_pred)
print(score)

CatBoostError: Bad value for num_feature[0,12]="ORD-->LAX": Cannot convert 'b'ORD-->LAX'' to float

# Train

In [218]:
# Refit on the complete training set. cols_to_use and categ_feat_idx are whatever
# the most recently executed experiment cell left in the kernel.
clf = CatBoostClassifier(random_state=177, silent=True, task_type='GPU')
clf.fit(X_train_full[cols_to_use], y_train, cat_features=categ_feat_idx)

<catboost.core.CatBoostClassifier at 0x176663cc358>

In [221]:
# Positive-class probabilities for the test rows, using the same columns as the fit
test_features = X_test_full[cols_to_use]
ctb_test_pred = clf.predict_proba(test_features)[:, 1]

with warnings.catch_warnings():
    warnings.simplefilter("ignore")

    # Put the predictions into the sample submission and save it next to the input data
    sample_sub = pd.read_csv(
        PATH_TO_DATA / 'sample_submission.csv', index_col='id'
    ).assign(dep_delayed_15min=ctb_test_pred)
    sample_sub.to_csv(PATH_TO_DATA / 'catboost_more_cats.csv')