In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
df = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')

In [4]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


# Preporcess data

In [5]:
from sklearn.model_selection import StratifiedKFold
import category_encoders as ce
import numpy as np
import datetime

mapping_ord1 = {'Unknown': 0, 'Novice': 1, 'Expert': 2, 'Contributor': 3, 'Master': 4, 'Grandmaster': 5}
mapping_ord2 = {'Unknown': 0, 'Freezing': 1, 'Cold': 2, 'Warm': 3, 'Hot': 4, 'Boiling Hot': 5, 'Lava Hot': 6}
mapping_ord3 = dict([(v, i) for i, v in enumerate(sorted(set(df['ord_3'].fillna("0"))))])
mapping_ord4 = dict([(v, i) for i, v in enumerate(sorted(set(df['ord_4'].fillna("0"))))])
mapping_ord5 = dict([(v, i) for i, v in enumerate(sorted(set(df['ord_5'].fillna("0"))))])

def preprocess_data(df):
    df['ord_0'] = df['ord_0'].fillna(0)
    df['ord_1'] =  df['ord_1'].fillna('Unknown').map(mapping_ord1)
    df['ord_2'] =  df['ord_2'].fillna('Unknown').map(mapping_ord2)
    df['ord_3'] =  df['ord_3'].fillna('0').map(mapping_ord3)
    df['ord_4'] =  df['ord_4'].fillna('0').map(mapping_ord4)
    df['ord_5'] =  df['ord_5'].fillna('0').map(mapping_ord5)
    df['bin_3'] = df['bin_3'].fillna('U').map({"T": 1, "F": 0, "U": np.nan})
    df['bin_4'] = df['bin_4'].fillna('U').map({"Y": 1, "N": 0, "U": np.nan})
    return df

def encode(df, df_test, cols):
    cols_enc = list(map(lambda x: x + "_enc", cols))
    for c in cols_enc:
        df[c] = np.nan
        df_test[c] = np.nan
        
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    
    for train_idx, oof_idx in skf.split(df, df["target"]):
        enc = ce.TargetEncoder(cols = cols, smoothing=0.3)
        enc.fit(df.loc[train_idx, cols], df.loc[train_idx, "target"])
        df.loc[oof_idx, cols_enc] = enc.transform(df.loc[oof_idx, cols]).values
    
    enc = ce.TargetEncoder(cols = cols, smoothing=0.3)
    enc.fit(df[cols], df["target"])
    df_test[cols_enc] = enc.transform(df_test[cols])
    return df, df_test, cols_enc
    
df = preprocess_data(df)
df_test = preprocess_data(df_test)

binary = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
ordinal  = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
low_card = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
high_card = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
date = ['day', 'month']

df, df_test, cat_enc =  encode(df, df_test, high_card + low_card + date)
features = high_card + binary + ordinal + cat_enc + low_card + date

In [6]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_7_enc,nom_8_enc,nom_9_enc,nom_0_enc,nom_1_enc,nom_2_enc,nom_3_enc,nom_4_enc,day_enc,month_enc
0,0,0.0,0.0,0.0,0.0,0.0,Red,Trapezoid,Hamster,Russia,...,0.098296,0.129236,0.177914,0.183158,0.225096,0.167919,0.219491,0.207147,0.200424,0.146475
1,1,1.0,1.0,0.0,0.0,1.0,Red,Star,Axolotl,,...,0.128414,0.191517,0.155502,0.183176,0.130233,0.200552,0.185771,0.179593,0.212892,0.208686
2,2,0.0,1.0,0.0,0.0,0.0,Red,,Hamster,Canada,...,0.266392,0.169176,0.188439,0.183158,0.1819,0.167919,0.159677,0.207147,0.164081,0.213645
3,3,,0.0,0.0,0.0,0.0,Red,Circle,Hamster,Finland,...,0.139367,0.222915,0.271186,0.182968,0.179723,0.168632,0.178692,0.179306,0.163514,0.14627
4,4,0.0,,0.0,1.0,0.0,Red,Triangle,Hamster,Costa Rica,...,0.195893,0.191633,0.261851,0.183176,0.157501,0.167905,0.202839,0.188604,0.163069,0.224929


In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)

In [8]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
import category_encoders as ce
 
params = {
        'clf__objective': 'binary',
        'clf__boosting_type': 'gbdt', 
        'clf__metric': 'auc',
        'clf__learning_rate': 0.01,
        "clf__bagging_freq": 5,
        "clf__bagging_fraction": 0.8,
        "clf__min_data_in_leaf": 30,
        "clf__min_sum_hessian_in_leaf": 5,
        "clf__use_two_round_loading": False,
        "clf__feature_fraction": 0.8,
        'clf__verbose': 1,
        "clf__lambda_l1": 0.1,
        "clf__n_estimators": 5000,
        "clf__max_depth": 3,
        "clf__reg_alpha": 1,
        "clf__reg_lambda": 1,
        "enc__a": 0.2
    }

                        
pipeline = Pipeline(steps=[('sel', ColumnSelector(features)), 
                           ('enc', ce.CatBoostEncoder(cols=high_card + low_card + date)),
                           ('clf', LGBMClassifier())]).set_params(**params)

In [None]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt.pyll.base import scope
from hyperopt import space_eval
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


def make_objective(pipeline, df):
    
    def objective(params):
                
        pipeline.set_params(**params)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        scores = []
        for train_idx, val_idx in tqdm(skf.split(df, df["target"])):
            
            df_train, df_val = df.loc[train_idx, :], df.loc[val_idx, :]
            transformers = Pipeline(pipeline.steps[:-1]).fit(df_train, df_train['target'])
            X_val, y_val = transformers.transform(df_val), df_val['target']
            
            fit_args = {
                "clf__early_stopping_rounds": 1000, 
                "clf__eval_set": (X_val, y_val),
                "clf__verbose": 500,
                "clf__eval_metric": 'auc',
            }
            
            pipeline.fit(df_train, df_train['target'], **fit_args)
            prob = pipeline.predict_proba(df_val)[:, 1]
            scores.append(roc_auc_score(y_val, prob))
            
        score = np.mean(scores)
        print("AUC {:.5f}+-{:.5f}.\n".format(np.mean(scores), np.std(scores)) + 
              "Params: \n" + 
              "\n".join(["{}: {:.3f}".format(k, params[k]) for k in params]))
        return 1 - score
    
    return objective

space = {
        "clf__learning_rate": hp.loguniform("clf__learning_rate", np.log(0.005), np.log(0.5)),
        "clf__num_leaves": hp.choice("clf__num_leaves", np.linspace(8, 128, 50, dtype=int)),
        "clf__feature_fraction": hp.quniform("clf__feature_fraction", 0.5, 1.0, 0.1),
        "clf__bagging_fraction": hp.quniform("clf__bagging_fraction", 0.5, 1.0, 0.1),
        "clf__min_child_weight": hp.uniform('clf__min_child_weight', 0.5, 10),
        'clf__lambda_l1': hp.loguniform('clf__lambda_l1', np.log(0.005), np.log(0.5)),
        'clf__colsample_bytree': hp.uniform('clf__colsample_bytree', 0.3, 1.0),
    }

best = fmin(fn=make_objective(pipeline, df),
            space=space,
            algo=tpe.suggest,
            max_evals=10)

best_params = space_eval(space, best)

In [11]:
print("Best params: \n" + "\n".join(["{}: {:.3f}".format(k, best_params[k]) for k in best_params]))

Best params: 
clf__bagging_fraction: 0.800
clf__colsample_bytree: 0.360
clf__feature_fraction: 0.500
clf__lambda_l1: 0.114
clf__learning_rate: 0.032
clf__min_child_weight: 6.257
clf__num_leaves: 115.000


In [12]:
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

folds = 20
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
pipeline.set_params(**best_params)
df_test['target'] = 0

for train_idx, val_idx in tqdm(skf.split(df, df['target'])):
    df_train = df.loc[train_idx, :]
    df_val = df.loc[val_idx, :]
    X_val, y_val = Pipeline(pipeline.steps[:-1]).fit(df_train, df_train['target'])\
                                                    .transform(df_val), df_val['target']
    fit_args = {
        "clf__early_stopping_rounds": 1000, 
        "clf__eval_set": (X_val, y_val),
        "clf__verbose": 500,
        "clf__eval_metric": 'auc'
    }
        
    pipeline.fit(df_train, df_train['target'], **fit_args)
    prob = pipeline.predict_proba(df_val)[:, 1]
    score = roc_auc_score(y_val, prob)
    
    df_test['target'] += pipeline.predict_proba(df_test)[:, 1]


0it [00:00, ?it/s]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.787208
[1000]	valid_0's auc: 0.78949
[1500]	valid_0's auc: 0.789686
[2000]	valid_0's auc: 0.789571
Early stopping, best iteration is:
[1466]	valid_0's auc: 0.789719


1it [00:49, 49.85s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.78049
[1000]	valid_0's auc: 0.783273
[1500]	valid_0's auc: 0.783661
[2000]	valid_0's auc: 0.783725
[2500]	valid_0's auc: 0.783653
Early stopping, best iteration is:
[1786]	valid_0's auc: 0.783779


2it [01:47, 52.28s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.786196
[1000]	valid_0's auc: 0.789378
[1500]	valid_0's auc: 0.789867
[2000]	valid_0's auc: 0.789761
[2500]	valid_0's auc: 0.789635
Early stopping, best iteration is:
[1504]	valid_0's auc: 0.789875


3it [02:38, 51.80s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.78019
[1000]	valid_0's auc: 0.783809
[1500]	valid_0's auc: 0.784184
[2000]	valid_0's auc: 0.784061
Early stopping, best iteration is:
[1489]	valid_0's auc: 0.784186


4it [03:33, 52.63s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.79043
[1000]	valid_0's auc: 0.792844
[1500]	valid_0's auc: 0.79292
[2000]	valid_0's auc: 0.792794
Early stopping, best iteration is:
[1149]	valid_0's auc: 0.792969


5it [04:16, 49.91s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.783371
[1000]	valid_0's auc: 0.786605
[1500]	valid_0's auc: 0.787
[2000]	valid_0's auc: 0.787074
[2500]	valid_0's auc: 0.787018
Early stopping, best iteration is:
[1950]	valid_0's auc: 0.787091


6it [05:08, 50.64s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.786153
[1000]	valid_0's auc: 0.788859
[1500]	valid_0's auc: 0.789253
[2000]	valid_0's auc: 0.789253
[2500]	valid_0's auc: 0.789173
[3000]	valid_0's auc: 0.789057
Early stopping, best iteration is:
[2185]	valid_0's auc: 0.789305


7it [06:04, 51.98s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.783541
[1000]	valid_0's auc: 0.787276
[1500]	valid_0's auc: 0.787719
[2000]	valid_0's auc: 0.787777
[2500]	valid_0's auc: 0.787816
[3000]	valid_0's auc: 0.787764
[3500]	valid_0's auc: 0.787713
Early stopping, best iteration is:
[2708]	valid_0's auc: 0.787891


8it [07:05, 54.90s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.782134
[1000]	valid_0's auc: 0.785299
[1500]	valid_0's auc: 0.785754
[2000]	valid_0's auc: 0.78575
[2500]	valid_0's auc: 0.785794
[3000]	valid_0's auc: 0.785792
[3500]	valid_0's auc: 0.785811
Early stopping, best iteration is:
[2771]	valid_0's auc: 0.785844


9it [08:08, 57.25s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.785192
[1000]	valid_0's auc: 0.788119
[1500]	valid_0's auc: 0.788424
[2000]	valid_0's auc: 0.788489
[2500]	valid_0's auc: 0.788432
Early stopping, best iteration is:
[1936]	valid_0's auc: 0.78853


10it [09:01, 56.09s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.789861
[1000]	valid_0's auc: 0.792735
[1500]	valid_0's auc: 0.792807
[2000]	valid_0's auc: 0.79272
Early stopping, best iteration is:
[1335]	valid_0's auc: 0.792913


11it [09:47, 52.85s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.78792
[1000]	valid_0's auc: 0.790735
[1500]	valid_0's auc: 0.790719
[2000]	valid_0's auc: 0.790601
Early stopping, best iteration is:
[1234]	valid_0's auc: 0.790825


12it [10:31, 50.33s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.779974
[1000]	valid_0's auc: 0.782488
[1500]	valid_0's auc: 0.782567
[2000]	valid_0's auc: 0.782477
Early stopping, best iteration is:
[1217]	valid_0's auc: 0.782648


13it [11:15, 48.52s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.780734
[1000]	valid_0's auc: 0.78417
[1500]	valid_0's auc: 0.784597
[2000]	valid_0's auc: 0.7845
Early stopping, best iteration is:
[1329]	valid_0's auc: 0.784622


14it [12:01, 47.59s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.789442
[1000]	valid_0's auc: 0.792009
[1500]	valid_0's auc: 0.79222
[2000]	valid_0's auc: 0.792107
Early stopping, best iteration is:
[1411]	valid_0's auc: 0.792235


15it [12:47, 47.16s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.784488
[1000]	valid_0's auc: 0.786929
[1500]	valid_0's auc: 0.787013
[2000]	valid_0's auc: 0.786982
[2500]	valid_0's auc: 0.786897
Early stopping, best iteration is:
[1553]	valid_0's auc: 0.787043


16it [13:35, 47.54s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.783846
[1000]	valid_0's auc: 0.787149
[1500]	valid_0's auc: 0.787475
[2000]	valid_0's auc: 0.787428
Early stopping, best iteration is:
[1404]	valid_0's auc: 0.787516


17it [14:22, 47.10s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.782541
[1000]	valid_0's auc: 0.786009
[1500]	valid_0's auc: 0.786229
[2000]	valid_0's auc: 0.78619
[2500]	valid_0's auc: 0.786124
Early stopping, best iteration is:
[1690]	valid_0's auc: 0.786272


18it [15:11, 47.83s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.780215
[1000]	valid_0's auc: 0.782948
[1500]	valid_0's auc: 0.78329
[2000]	valid_0's auc: 0.783182
Early stopping, best iteration is:
[1428]	valid_0's auc: 0.783316


19it [16:06, 49.97s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.785332
[1000]	valid_0's auc: 0.788821
[1500]	valid_0's auc: 0.789235
[2000]	valid_0's auc: 0.789183
[2500]	valid_0's auc: 0.789093
Early stopping, best iteration is:
[1827]	valid_0's auc: 0.789314


20it [16:57, 50.89s/it]


In [18]:
df_test['target'] /= folds
df_test[['id', 'target']].to_csv('../data/submission.csv', index=False)
!head -n 5 ../data/submission.csv

id,target
600000,0.11854787835141165
600001,0.2686114339297638
600002,0.17125694474430989
600003,0.11825015887986522
