In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd

In [3]:
%%time
df = pd.read_csv('../data/train.csv')
df_test = pd.read_csv('../data/test.csv')
df.head()

CPU times: user 2.27 s, sys: 184 ms, total: 2.45 s
Wall time: 2.46 s


Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_9,ord_0,ord_1,ord_2,ord_3,ord_4,ord_5,day,month,target
0,0,0.0,0.0,0.0,F,N,Red,Trapezoid,Hamster,Russia,...,02e7c8990,3.0,Contributor,Hot,c,U,Pw,6.0,3.0,0
1,1,1.0,1.0,0.0,F,Y,Red,Star,Axolotl,,...,f37df64af,3.0,Grandmaster,Warm,e,X,pE,7.0,7.0,0
2,2,0.0,1.0,0.0,F,N,Red,,Hamster,Canada,...,,3.0,,Freezing,n,P,eN,5.0,9.0,0
3,3,,0.0,0.0,F,N,Red,Circle,Hamster,Finland,...,f9d456e57,1.0,Novice,Lava Hot,a,C,,3.0,3.0,0
4,4,0.0,,0.0,T,N,Red,Triangle,Hamster,Costa Rica,...,c5361037c,3.0,Grandmaster,Cold,h,C,OZ,5.0,12.0,0


# Preporcess data

In [4]:
from sklearn.model_selection import StratifiedKFold
import category_encoders as ce
from tqdm import tqdm
import numpy as np
import datetime
import itertools

mapping_ord1 = {'Unknown': 0, 'Novice': 1, 'Expert': 2, 'Contributor': 3, 'Master': 4, 'Grandmaster': 5}
mapping_ord2 = {'Unknown': 0, 'Freezing': 1, 'Cold': 2, 'Warm': 3, 'Hot': 4, 'Boiling Hot': 5, 'Lava Hot': 6}
mapping_ord3 = dict([(v, i) for i, v in enumerate(sorted(set(df['ord_3'].fillna("0"))))])
mapping_ord4 = dict([(v, i) for i, v in enumerate(sorted(set(df['ord_4'].fillna("0"))))])
mapping_ord5 = dict([(v, i) for i, v in enumerate(sorted(set(df['ord_5'].fillna("0"))))])


def combine_cols(df, cols):
    comb_cols = []
    for i, j in tqdm(itertools.combinations(cols, 2)):
        c = i + "_" + j
        df[c] = (df[i].fillna("Unknown").astype(str) + "_" + df[j].fillna("Unknown").astype(str)).values
        comb_cols.append(c)
    return df, comb_cols

def preprocess_data(df):
    df['ord_0'] = df['ord_0'].fillna(0)
    df['ord_1'] =  df['ord_1'].fillna('Unknown').map(mapping_ord1)
    df['ord_2'] =  df['ord_2'].fillna('Unknown').map(mapping_ord2)
    df['ord_3'] =  df['ord_3'].fillna('0').map(mapping_ord3)
    df['ord_4'] =  df['ord_4'].fillna('0').map(mapping_ord4)
    df['ord_5'] =  df['ord_5'].fillna('0').map(mapping_ord5)
    df['bin_3'] = df['bin_3'].fillna('U').map({"T": 1, "F": 0, "U": np.nan})
    df['bin_4'] = df['bin_4'].fillna('U').map({"Y": 1, "N": 0, "U": np.nan})
    return df

def encode(df, df_test, cols):
    cols_enc = list(map(lambda x: x + "_enc", cols))
    for c in cols_enc:
        df[c] = np.nan
        df_test[c] = np.nan
        
    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    
    for train_idx, oof_idx in tqdm(skf.split(df, df["target"])):
        enc = ce.TargetEncoder(cols = cols, smoothing=0.3)
        enc.fit(df.loc[train_idx, cols], df.loc[train_idx, "target"])
        df.loc[oof_idx, cols_enc] = enc.transform(df.loc[oof_idx, cols]).values
    
    enc = ce.TargetEncoder(cols = cols, smoothing=0.3)
    enc.fit(df[cols], df["target"])
    df_test[cols_enc] = enc.transform(df_test[cols])
    return df, df_test, cols_enc
    
df = preprocess_data(df)
df_test = preprocess_data(df_test)

binary = ['bin_0', 'bin_1', 'bin_2', 'bin_3', 'bin_4']
ordinal  = ['ord_0', 'ord_1', 'ord_2', 'ord_3', 'ord_4', 'ord_5']
low_card = ['nom_0', 'nom_1', 'nom_2', 'nom_3', 'nom_4']
high_card = ['nom_5', 'nom_6', 'nom_7', 'nom_8', 'nom_9']
date = ['day', 'month']

df, df_test, cat_enc =  encode(df, df_test, high_card + low_card + date)
features = binary + ordinal + cat_enc

5it [00:15,  3.01s/it]


In [5]:
df.head()

Unnamed: 0,id,bin_0,bin_1,bin_2,bin_3,bin_4,nom_0,nom_1,nom_2,nom_3,...,nom_7_enc,nom_8_enc,nom_9_enc,nom_0_enc,nom_1_enc,nom_2_enc,nom_3_enc,nom_4_enc,day_enc,month_enc
0,0,0.0,0.0,0.0,0.0,0.0,Red,Trapezoid,Hamster,Russia,...,0.098296,0.129236,0.177914,0.183158,0.225096,0.167919,0.219491,0.207147,0.200424,0.146475
1,1,1.0,1.0,0.0,0.0,1.0,Red,Star,Axolotl,,...,0.128414,0.191517,0.155502,0.183176,0.130233,0.200552,0.185771,0.179593,0.212892,0.208686
2,2,0.0,1.0,0.0,0.0,0.0,Red,,Hamster,Canada,...,0.266392,0.169176,0.188439,0.183158,0.1819,0.167919,0.159677,0.207147,0.164081,0.213645
3,3,,0.0,0.0,0.0,0.0,Red,Circle,Hamster,Finland,...,0.139367,0.222915,0.271186,0.182968,0.179723,0.168632,0.178692,0.179306,0.163514,0.14627
4,4,0.0,,0.0,1.0,0.0,Red,Triangle,Hamster,Costa Rica,...,0.195893,0.191633,0.261851,0.183176,0.157501,0.167905,0.202839,0.188604,0.163069,0.224929


In [6]:
df.shape

(600000, 37)

# Learn model

In [7]:
from sklearn.base import BaseEstimator, TransformerMixin

class ColumnSelector(BaseEstimator, TransformerMixin):
    
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert isinstance(X, pd.DataFrame)

        try:
            return X[self.columns]
        except KeyError:
            cols_error = list(set(self.columns) - set(X.columns))
            raise KeyError("The DataFrame does not include the columns: %s" % cols_error)

In [8]:
from lightgbm import LGBMClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer
import category_encoders as ce
 
params = {
        'clf__objective': 'binary',
        'clf__boosting_type': 'gbdt', 
        'clf__metric': 'auc',
        'clf__learning_rate': 0.01,
        "clf__bagging_freq": 5,
        "clf__bagging_fraction": 0.8,
        "clf__min_data_in_leaf": 30,
        "clf__min_sum_hessian_in_leaf": 5,
        "clf__use_two_round_loading": False,
        "clf__feature_fraction": 0.8,
        'clf__verbose': 1,
        "clf__lambda_l1": 0.1,
        "clf__n_estimators": 5000,
        "clf__max_depth": 3,
    }

                        
pipeline = Pipeline(steps=[('sel', ColumnSelector(features)), 
                           ('clf', LGBMClassifier())]).set_params(**params)

In [9]:
from hyperopt import hp, tpe
from hyperopt.fmin import fmin
from hyperopt.pyll.base import scope
from hyperopt import space_eval
import numpy as np
from tqdm import tqdm
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold


def make_objective(pipeline, df):
    
    def objective(params):
                
        pipeline.set_params(**params)
        skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
        
        scores = []
        for train_idx, val_idx in tqdm(skf.split(df, df["target"])):
            
            df_train, df_val = df.loc[train_idx, :], df.loc[val_idx, :]
            transformers = Pipeline(pipeline.steps[:-1]).fit(df_train, df_train['target'])
            X_val, y_val = transformers.transform(df_val), df_val['target']
            
            fit_args = {
                "clf__early_stopping_rounds": 1000, 
                "clf__eval_set": (X_val, y_val),
                "clf__verbose": 500,
                "clf__eval_metric": 'auc',
            }
            
            pipeline.fit(df_train, df_train['target'], **fit_args)
            prob = pipeline.predict_proba(df_val)[:, 1]
            scores.append(roc_auc_score(y_val, prob))
            
        score = np.mean(scores)
        print("AUC {:.5f}+-{:.5f}.\n".format(np.mean(scores), np.std(scores)) + 
              "Params: \n" + 
              "\n".join(["{}: {:.3f}".format(k, params[k]) for k in params]))
        return 1 - score
    
    return objective

space = {
        "clf__learning_rate": hp.loguniform("clf__learning_rate", np.log(0.005), np.log(0.5)),
        'clf__lambda_l1': hp.loguniform('clf__lambda_l1', np.log(0.005), np.log(0.5)),
        "clf__num_leaves": hp.choice("clf__num_leaves", np.linspace(8, 128, 50, dtype=int)),
        "clf__feature_fraction": hp.quniform("clf__feature_fraction", 0.5, 1.0, 0.1),
        "clf__bagging_fraction": hp.quniform("clf__bagging_fraction", 0.5, 1.0, 0.1),
        "clf__min_child_weight": hp.uniform('clf__min_child_weight', 0.5, 10),
        'clf__colsample_bytree': hp.uniform('clf__colsample_bytree', 0.3, 1.0),
    }

best = fmin(fn=make_objective(pipeline, df),
            space=space,
            algo=tpe.suggest,
            max_evals=10)

best_params = space_eval(space, best)

  0%|          | 0/10 [00:00<?, ?trial/s, best loss=?]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.775119                         
[1000]	valid_0's auc: 0.783153                        
[1500]	valid_0's auc: 0.785433                        
[2000]	valid_0's auc: 0.786188                        
[2500]	valid_0's auc: 0.786422                        
[3000]	valid_0's auc: 0.78645                         
[3500]	valid_0's auc: 0.786444                        
Early stopping, best iteration is:                    
[2853]	valid_0's auc: 0.78648
  0%|          | 0/10 [00:47<?, ?trial/s, best loss=?]

1it [00:49, 49.55s/it]
[A


Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.777409                         
[1000]	valid_0's auc: 0.785385                        
[1500]	valid_0's auc: 0.787817                        
[2000]	valid_0's auc: 0.788578                        
[2500]	valid_0's auc: 0.788816                        
[3000]	valid_0's auc: 0.788879                        
[3500]	valid_0's auc: 0.788864                        
[4000]	valid_0's auc: 0.788847                        
[4500]	valid_0's auc: 0.788772                        
Early stopping, best iteration is:                    
[3590]	valid_0's auc: 0.788897
  0%|          | 0/10 [01:48<?, ?trial/s, best loss=?]

2it [01:50, 52.90s/it]
[A


Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.778128                         
[1000]	valid_0's auc: 0.786227                        
[1500]	valid_0's auc: 0.78842                         
[2000]	valid_0's auc: 0.789015                        
[2500]	valid_0's auc: 0.789156                        
[3000]	valid_0's auc: 0.789139                        
[3500]	valid_0's auc: 0.789091                        
Early stopping, best iteration is:                    
[2755]	valid_0's auc: 0.789192
  0%|          | 0/10 [02:38<?, ?trial/s, best loss=?]

3it [02:40, 52.09s/it]
[A


Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.775846                         
[1000]	valid_0's auc: 0.783484                        
[1500]	valid_0's auc: 0.785708                        
[2000]	valid_0's auc: 0.786332                        
[2500]	valid_0's auc: 0.786478                        
[3000]	valid_0's auc: 0.7865                          
[3500]	valid_0's auc: 0.786507                        
[4000]	valid_0's auc: 0.786411                        
Early stopping, best iteration is:                    
[3430]	valid_0's auc: 0.786513
  0%|          | 0/10 [03:35<?, ?trial/s, best loss=?]

4it [03:37, 53.55s/it]
[A


Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.774243                         
[1000]	valid_0's auc: 0.782845                        
[1500]	valid_0's auc: 0.785173                        
[2000]	valid_0's auc: 0.786012                        
[2500]	valid_0's auc: 0.786281                        
[3000]	valid_0's auc: 0.786313                        
[3500]	valid_0's auc: 0.786284                        
Early stopping, best iteration is:                    
[2975]	valid_0's auc: 0.786316
  0%|          | 0/10 [04:31<?, ?trial/s, best loss=?]

5it [04:32, 54.08s/it]
[A
5it [04:32, 54.55s/it]


AUC 0.78748+-0.00128.                                 
Params: 
clf__bagging_fraction: 0.700
clf__colsample_bytree: 0.567
clf__feature_fraction: 0.900
clf__lambda_l1: 0.016
clf__learning_rate: 0.016
clf__min_child_weight: 3.762
clf__num_leaves: 34.000
 10%|█         | 1/10 [04:32<40:55, 272.79s/trial, best loss: 0.21252020843685515]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.777072                                                     
[1000]	valid_0's auc: 0.783946                                                    
[1500]	valid_0's auc: 0.785733                                                    
[2000]	valid_0's auc: 0.786346                                                    
[2500]	valid_0's auc: 0.786461                                                    
[3000]	valid_0's auc: 0.786426                                                    
[3500]	valid_0's auc: 0.786389                                                    
Early stopping, best iteration is:                                                
[2500]	valid_0's auc: 0.786461
 10%|█         | 1/10 [05:23<40:55, 272.79s/trial, best loss: 0.21252020843685515]

1it [00:52, 52.05s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.779                                                        
[1000]	valid_0's auc: 0.786244                                                    
[1500]	valid_0's auc: 0.788116                                                    
[2000]	valid_0's auc: 0.78873                                                     
[2500]	valid_0's auc: 0.78886                                                     
[3000]	valid_0's auc: 0.788901                                                    
[3500]	valid_0's auc: 0.788904                                                    
[4000]	valid_0's auc: 0.788864                                                    
Early stopping, best iteration is:                                                
[3075]	valid_0's auc: 0.788907
 10%|█         | 1/10 [06:21<40:55, 272.79s/trial, best loss: 0.21252020843685515]

2it [01:50, 53.99s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.780057                                                     
[1000]	valid_0's auc: 0.786824                                                    
[1500]	valid_0's auc: 0.788529                                                    
[2000]	valid_0's auc: 0.788954                                                    
[2500]	valid_0's auc: 0.789048                                                    
[3000]	valid_0's auc: 0.789062                                                    
[3500]	valid_0's auc: 0.789006                                                    
Early stopping, best iteration is:                                                
[2720]	valid_0's auc: 0.789088
 10%|█         | 1/10 [07:13<40:55, 272.79s/trial, best loss: 0.21252020843685515]

3it [02:42, 53.34s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.77752                                                      
[1000]	valid_0's auc: 0.784147                                                    
[1500]	valid_0's auc: 0.785779                                                    
[2000]	valid_0's auc: 0.78622                                                     
[2500]	valid_0's auc: 0.78634                                                     
[3000]	valid_0's auc: 0.786326                                                    
[3500]	valid_0's auc: 0.786264                                                    
Early stopping, best iteration is:                                                
[2660]	valid_0's auc: 0.786353
 10%|█         | 1/10 [08:06<40:55, 272.79s/trial, best loss: 0.21252020843685515]

4it [03:35, 53.18s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.776289                                                     
[1000]	valid_0's auc: 0.783701                                                    
[1500]	valid_0's auc: 0.785665                                                    
[2000]	valid_0's auc: 0.786174                                                    
[2500]	valid_0's auc: 0.786277                                                    
[3000]	valid_0's auc: 0.786323                                                    
[3500]	valid_0's auc: 0.786324                                                    
Early stopping, best iteration is:                                                
[2935]	valid_0's auc: 0.786331
 10%|█         | 1/10 [09:02<40:55, 272.79s/trial, best loss: 0.21252020843685515]

5it [04:31, 54.13s/it]
[A
5it [04:31, 54.31s/it]


AUC 0.78743+-0.00128.                                                             
Params: 
clf__bagging_fraction: 0.900
clf__colsample_bytree: 0.719
clf__feature_fraction: 0.800
clf__lambda_l1: 0.447
clf__learning_rate: 0.018
clf__min_child_weight: 8.150
clf__num_leaves: 98.000
 20%|██        | 2/10 [09:04<36:19, 272.43s/trial, best loss: 0.21252020843685515]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.784183                                                     
[1000]	valid_0's auc: 0.782551                                                    
Early stopping, best iteration is:                                                
[194]	valid_0's auc: 0.78498
 20%|██        | 2/10 [09:20<36:19, 272.43s/trial, best loss: 0.21252020843685515]

1it [00:16, 16.21s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.786547                                                     
[1000]	valid_0's auc: 0.785686                                                    
Early stopping, best iteration is:                                                
[199]	valid_0's auc: 0.787303
 20%|██        | 2/10 [09:35<36:19, 272.43s/trial, best loss: 0.21252020843685515]

2it [00:31, 16.04s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.785978                                                     
[1000]	valid_0's auc: 0.784279                                                    
Early stopping, best iteration is:                                                
[173]	valid_0's auc: 0.787258
 20%|██        | 2/10 [09:52<36:19, 272.43s/trial, best loss: 0.21252020843685515]

3it [00:48, 16.15s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.784015                                                     
[1000]	valid_0's auc: 0.78203                                                     
Early stopping, best iteration is:                                                
[194]	valid_0's auc: 0.784935
 20%|██        | 2/10 [10:08<36:19, 272.43s/trial, best loss: 0.21252020843685515]

4it [01:04, 16.12s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.783858                                                     
[1000]	valid_0's auc: 0.782416                                                    
Early stopping, best iteration is:                                                
[230]	valid_0's auc: 0.784779
 20%|██        | 2/10 [10:24<36:19, 272.43s/trial, best loss: 0.21252020843685515]

5it [01:20, 16.01s/it]
[A
5it [01:20, 16.01s/it]


AUC 0.78585+-0.00117.                                                             
Params: 
clf__bagging_fraction: 0.700
clf__colsample_bytree: 0.492
clf__feature_fraction: 0.800
clf__lambda_l1: 0.124
clf__learning_rate: 0.229
clf__min_child_weight: 1.800
clf__num_leaves: 39.000
 30%|███       | 3/10 [10:24<25:03, 214.73s/trial, best loss: 0.21252020843685515]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.784738                                                     
[1000]	valid_0's auc: 0.783731                                                    
Early stopping, best iteration is:                                                
[337]	valid_0's auc: 0.785298
 30%|███       | 3/10 [10:42<25:03, 214.73s/trial, best loss: 0.21252020843685515]

1it [00:18, 18.57s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.787563                                                     
[1000]	valid_0's auc: 0.786634                                                    
Early stopping, best iteration is:                                                
[282]	valid_0's auc: 0.788026
 30%|███       | 3/10 [11:01<25:03, 214.73s/trial, best loss: 0.21252020843685515]

2it [00:36, 18.48s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.787448                                                     
[1000]	valid_0's auc: 0.78594                                                     
Early stopping, best iteration is:                                                
[272]	valid_0's auc: 0.787827
 30%|███       | 3/10 [11:17<25:03, 214.73s/trial, best loss: 0.21252020843685515]

3it [00:53, 17.99s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.784866                                                     
[1000]	valid_0's auc: 0.783215                                                    
Early stopping, best iteration is:                                                
[222]	valid_0's auc: 0.785212
 30%|███       | 3/10 [11:33<25:03, 214.73s/trial, best loss: 0.21252020843685515]

4it [01:09, 17.36s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.784368                                                     
[1000]	valid_0's auc: 0.783082                                                    
Early stopping, best iteration is:                                                
[295]	valid_0's auc: 0.784728
 30%|███       | 3/10 [11:50<25:03, 214.73s/trial, best loss: 0.21252020843685515]

5it [01:26, 17.26s/it]
[A
5it [01:26, 17.32s/it]


AUC 0.78622+-0.00141.                                                             
Params: 
clf__bagging_fraction: 0.700
clf__colsample_bytree: 0.520
clf__feature_fraction: 0.900
clf__lambda_l1: 0.051
clf__learning_rate: 0.175
clf__min_child_weight: 8.802
clf__num_leaves: 61.000
 40%|████      | 4/10 [11:51<17:37, 176.31s/trial, best loss: 0.21252020843685515]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.781459                                                     
[1000]	valid_0's auc: 0.778258                                                    
Early stopping, best iteration is:                                                
[156]	valid_0's auc: 0.783517
 40%|████      | 4/10 [12:05<17:37, 176.31s/trial, best loss: 0.21252020843685515]

1it [00:14, 14.60s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.783363                                                     
[1000]	valid_0's auc: 0.780969                                                    
Early stopping, best iteration is:                                                
[122]	valid_0's auc: 0.785602
 40%|████      | 4/10 [12:19<17:37, 176.31s/trial, best loss: 0.21252020843685515]

2it [00:28, 14.44s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.78361                                                      
[1000]	valid_0's auc: 0.780092                                                    
Early stopping, best iteration is:                                                
[112]	valid_0's auc: 0.785976
 40%|████      | 4/10 [12:32<17:37, 176.31s/trial, best loss: 0.21252020843685515]

3it [00:41, 13.98s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.781004                                                     
[1000]	valid_0's auc: 0.776989                                                    
Early stopping, best iteration is:                                                
[140]	valid_0's auc: 0.783929
 40%|████      | 4/10 [12:45<17:37, 176.31s/trial, best loss: 0.21252020843685515]

4it [00:54, 13.72s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.780718                                                     
[1000]	valid_0's auc: 0.778022                                                    
Early stopping, best iteration is:                                                
[167]	valid_0's auc: 0.782849
 40%|████      | 4/10 [12:59<17:37, 176.31s/trial, best loss: 0.21252020843685515]

5it [01:08, 13.70s/it]
[A
5it [01:08, 13.67s/it]


AUC 0.78437+-0.00121.                                                             
Params: 
clf__bagging_fraction: 0.600
clf__colsample_bytree: 0.379
clf__feature_fraction: 0.800
clf__lambda_l1: 0.046
clf__learning_rate: 0.357
clf__min_child_weight: 8.449
clf__num_leaves: 12.000
 50%|█████     | 5/10 [12:59<11:59, 143.92s/trial, best loss: 0.21252020843685515]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.781838                                                     
[1000]	valid_0's auc: 0.78605                                                     
[1500]	valid_0's auc: 0.786625                                                    
[2000]	valid_0's auc: 0.786648                                                    
[2500]	valid_0's auc: 0.786535                                                    
Early stopping, best iteration is:                                                
[1666]	valid_0's auc: 0.786663
 50%|█████     | 5/10 [13:29<11:59, 143.92s/trial, best loss: 0.21252020843685515]

1it [00:31, 31.22s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.784045                                                     
[1000]	valid_0's auc: 0.788398                                                    
[1500]	valid_0's auc: 0.789042                                                    
[2000]	valid_0's auc: 0.789128                                                    
[2500]	valid_0's auc: 0.789079                                                    
[3000]	valid_0's auc: 0.78898                                                     
Early stopping, best iteration is:                                                
[2016]	valid_0's auc: 0.789137
 50%|█████     | 5/10 [14:04<11:59, 143.92s/trial, best loss: 0.21252020843685515]

2it [01:05, 32.22s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.784698                                                     
[1000]	valid_0's auc: 0.788688                                                    
[1500]	valid_0's auc: 0.789128                                                    
[2000]	valid_0's auc: 0.789037                                                    
[2500]	valid_0's auc: 0.788906                                                    
Early stopping, best iteration is:                                                
[1613]	valid_0's auc: 0.789161
 50%|█████     | 5/10 [14:34<11:59, 143.92s/trial, best loss: 0.21252020843685515]

3it [01:36, 31.63s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.782048                                                     
[1000]	valid_0's auc: 0.785884                                                    
[1500]	valid_0's auc: 0.786396                                                    
[2000]	valid_0's auc: 0.786412                                                    
[2500]	valid_0's auc: 0.786339                                                    
Early stopping, best iteration is:                                                
[1791]	valid_0's auc: 0.786445
 50%|█████     | 5/10 [15:07<11:59, 143.92s/trial, best loss: 0.21252020843685515]

4it [02:09, 32.14s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.781364                                                     
[1000]	valid_0's auc: 0.785807                                                    
[1500]	valid_0's auc: 0.786412                                                    
[2000]	valid_0's auc: 0.78643                                                     
[2500]	valid_0's auc: 0.78641                                                     
Early stopping, best iteration is:                                                
[1756]	valid_0's auc: 0.786478
 50%|█████     | 5/10 [15:40<11:59, 143.92s/trial, best loss: 0.21252020843685515]

5it [02:42, 32.40s/it]
[A
5it [02:42, 32.47s/it]


AUC 0.78758+-0.00129.                                                             
Params: 
clf__bagging_fraction: 0.600
clf__colsample_bytree: 0.813
clf__feature_fraction: 0.600
clf__lambda_l1: 0.017
clf__learning_rate: 0.026
clf__min_child_weight: 9.744
clf__num_leaves: 74.000
 60%|██████    | 6/10 [15:41<09:57, 149.46s/trial, best loss: 0.21242300080278187]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.785851                                                     
[1000]	valid_0's auc: 0.785417                                                    
Early stopping, best iteration is:                                                
[458]	valid_0's auc: 0.785888
 60%|██████    | 6/10 [15:58<09:57, 149.46s/trial, best loss: 0.21242300080278187]

1it [00:17, 17.18s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.788363                                                     
[1000]	valid_0's auc: 0.787855                                                    
Early stopping, best iteration is:                                                
[471]	valid_0's auc: 0.788461
 60%|██████    | 6/10 [16:15<09:57, 149.46s/trial, best loss: 0.21242300080278187]

2it [00:34, 17.16s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.78829                                                      
[1000]	valid_0's auc: 0.787655                                                    
Early stopping, best iteration is:                                                
[355]	valid_0's auc: 0.788462
 60%|██████    | 6/10 [16:31<09:57, 149.46s/trial, best loss: 0.21242300080278187]

3it [00:49, 16.66s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.786008                                                     
[1000]	valid_0's auc: 0.78539                                                     
Early stopping, best iteration is:                                                
[443]	valid_0's auc: 0.786064
 60%|██████    | 6/10 [16:47<09:57, 149.46s/trial, best loss: 0.21242300080278187]

4it [01:06, 16.62s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.785755                                                     
[1000]	valid_0's auc: 0.785349                                                    
[1500]	valid_0's auc: 0.78461                                                     
Early stopping, best iteration is:                                                
[538]	valid_0's auc: 0.785789
 60%|██████    | 6/10 [17:05<09:57, 149.46s/trial, best loss: 0.21242300080278187]

5it [01:24, 17.02s/it]
[A
5it [01:24, 16.85s/it]


AUC 0.78693+-0.00125.                                                             
Params: 
clf__bagging_fraction: 0.500
clf__colsample_bytree: 0.314
clf__feature_fraction: 0.900
clf__lambda_l1: 0.245
clf__learning_rate: 0.092
clf__min_child_weight: 5.506
clf__num_leaves: 52.000
 70%|███████   | 7/10 [17:06<06:29, 129.92s/trial, best loss: 0.21242300080278187]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.763444                                                     
[1000]	valid_0's auc: 0.777578                                                    
[1500]	valid_0's auc: 0.782176                                                    
[2000]	valid_0's auc: 0.784353                                                    
[2500]	valid_0's auc: 0.785476                                                    
[3000]	valid_0's auc: 0.786085                                                    
[3500]	valid_0's auc: 0.786385                                                    
[4000]	valid_0's auc: 0.786541                                                    
[4500]	valid_0's auc: 0.78659                                                     
[5000]	valid_0's auc: 0.786578                                                    
Did not meet early stopping. Best iteration is:                                   
[464

1it [01:03, 63.88s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.765766                                                     
[1000]	valid_0's auc: 0.779393                                                    
[1500]	valid_0's auc: 0.784389                                                    
[2000]	valid_0's auc: 0.786697                                                    
[2500]	valid_0's auc: 0.787944                                                    
[3000]	valid_0's auc: 0.788484                                                    
[3500]	valid_0's auc: 0.788771                                                    
[4000]	valid_0's auc: 0.788915                                                    
[4500]	valid_0's auc: 0.789019                                                    
[5000]	valid_0's auc: 0.789048                                                    
Did not meet early stopping. Best iteration is:                                   
[498

2it [02:08, 64.10s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.766118                                                     
[1000]	valid_0's auc: 0.78045                                                     
[1500]	valid_0's auc: 0.785079                                                    
[2000]	valid_0's auc: 0.787262                                                    
[2500]	valid_0's auc: 0.788326                                                    
[3000]	valid_0's auc: 0.788823                                                    
[3500]	valid_0's auc: 0.789109                                                    
[4000]	valid_0's auc: 0.789226                                                    
[4500]	valid_0's auc: 0.789266                                                    
[5000]	valid_0's auc: 0.789244                                                    
Did not meet early stopping. Best iteration is:                                   
[430

3it [03:20, 66.37s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.764825                                                     
[1000]	valid_0's auc: 0.778062                                                    
[1500]	valid_0's auc: 0.782507                                                    
[2000]	valid_0's auc: 0.784631                                                    
[2500]	valid_0's auc: 0.785717                                                    
[3000]	valid_0's auc: 0.786186                                                    
[3500]	valid_0's auc: 0.786446                                                    
[4000]	valid_0's auc: 0.78656                                                     
[4500]	valid_0's auc: 0.786621                                                    
[5000]	valid_0's auc: 0.786625                                                    
Did not meet early stopping. Best iteration is:                                   
[467

4it [04:32, 68.28s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.762881                                                     
[1000]	valid_0's auc: 0.77702                                                     
[1500]	valid_0's auc: 0.781843                                                    
[2000]	valid_0's auc: 0.784202                                                    
[2500]	valid_0's auc: 0.78543                                                     
[3000]	valid_0's auc: 0.786058                                                    
[3500]	valid_0's auc: 0.786356                                                    
[4000]	valid_0's auc: 0.786497                                                    
[4500]	valid_0's auc: 0.78656                                                     
[5000]	valid_0's auc: 0.786558                                                    
Did not meet early stopping. Best iteration is:                                   
[479

5it [05:47, 70.10s/it]
[A
5it [05:47, 69.45s/it]


AUC 0.78763+-0.00126.                                                             
Params: 
clf__bagging_fraction: 0.600
clf__colsample_bytree: 0.418
clf__feature_fraction: 0.900
clf__lambda_l1: 0.016
clf__learning_rate: 0.009
clf__min_child_weight: 7.577
clf__num_leaves: 79.000
 80%|████████  | 8/10 [22:53<06:30, 195.13s/trial, best loss: 0.21236755207045555]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.786057                                                     
[1000]	valid_0's auc: 0.785662                                                    
[1500]	valid_0's auc: 0.785231                                                    
Early stopping, best iteration is:                                                
[619]	valid_0's auc: 0.786114
 80%|████████  | 8/10 [23:19<06:30, 195.13s/trial, best loss: 0.21236755207045555]

1it [00:26, 26.61s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.788473                                                     
[1000]	valid_0's auc: 0.788337                                                    
[1500]	valid_0's auc: 0.787814                                                    
Early stopping, best iteration is:                                                
[599]	valid_0's auc: 0.788513
 80%|████████  | 8/10 [23:44<06:30, 195.13s/trial, best loss: 0.21236755207045555]

2it [00:51, 26.00s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.788526                                                     
[1000]	valid_0's auc: 0.788254                                                    
[1500]	valid_0's auc: 0.787703                                                    
Early stopping, best iteration is:                                                
[523]	valid_0's auc: 0.788589
 80%|████████  | 8/10 [24:06<06:30, 195.13s/trial, best loss: 0.21236755207045555]

3it [01:13, 25.02s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.785999                                                     
[1000]	valid_0's auc: 0.785578                                                    
Early stopping, best iteration is:                                                
[495]	valid_0's auc: 0.786023
 80%|████████  | 8/10 [24:28<06:30, 195.13s/trial, best loss: 0.21236755207045555]

4it [01:35, 23.89s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.785899                                                     
[1000]	valid_0's auc: 0.785818                                                    
[1500]	valid_0's auc: 0.785281                                                    
Early stopping, best iteration is:                                                
[612]	valid_0's auc: 0.785936
 80%|████████  | 8/10 [24:50<06:30, 195.13s/trial, best loss: 0.21236755207045555]

5it [01:57, 23.52s/it]
[A
5it [01:57, 23.56s/it]


AUC 0.78703+-0.00124.                                                             
Params: 
clf__bagging_fraction: 0.800
clf__colsample_bytree: 0.514
clf__feature_fraction: 1.000
clf__lambda_l1: 0.170
clf__learning_rate: 0.083
clf__min_child_weight: 6.214
clf__num_leaves: 81.000
 90%|█████████ | 9/10 [24:51<02:51, 171.95s/trial, best loss: 0.21236755207045555]

0it [00:00, ?it/s]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.783664                                                     
[1000]	valid_0's auc: 0.781924                                                    
Early stopping, best iteration is:                                                
[164]	valid_0's auc: 0.784725
 90%|█████████ | 9/10 [25:06<02:51, 171.95s/trial, best loss: 0.21236755207045555]

1it [00:15, 15.78s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.785879                                                     
[1000]	valid_0's auc: 0.784237                                                    
Early stopping, best iteration is:                                                
[180]	valid_0's auc: 0.786669
 90%|█████████ | 9/10 [25:22<02:51, 171.95s/trial, best loss: 0.21236755207045555]

2it [00:31, 15.80s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.785361                                                     
[1000]	valid_0's auc: 0.783066                                                    
Early stopping, best iteration is:                                                
[192]	valid_0's auc: 0.786997
 90%|█████████ | 9/10 [25:38<02:51, 171.95s/trial, best loss: 0.21236755207045555]

3it [00:47, 15.89s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.782431                                                     
[1000]	valid_0's auc: 0.780208                                                    
Early stopping, best iteration is:                                                
[126]	valid_0's auc: 0.783822
 90%|█████████ | 9/10 [25:54<02:51, 171.95s/trial, best loss: 0.21236755207045555]

4it [01:03, 15.75s/it]
[A


Training until validation scores don't improve for 1000 rounds                    
[500]	valid_0's auc: 0.783432                                                     
[1000]	valid_0's auc: 0.781335                                                    
Early stopping, best iteration is:                                                
[185]	valid_0's auc: 0.784669
 90%|█████████ | 9/10 [26:10<02:51, 171.95s/trial, best loss: 0.21236755207045555]

5it [01:19, 15.82s/it]
[A
5it [01:19, 15.83s/it]


AUC 0.78538+-0.00124.                                                             
Params: 
clf__bagging_fraction: 0.800
clf__colsample_bytree: 0.971
clf__feature_fraction: 0.500
clf__lambda_l1: 0.360
clf__learning_rate: 0.296
clf__min_child_weight: 7.139
clf__num_leaves: 49.000
100%|██████████| 10/10 [26:10<00:00, 157.05s/trial, best loss: 0.21236755207045555]


In [10]:
print("Best params: \n" + "\n".join(["{}: {:.3f}".format(k, best_params[k]) for k in best_params]))

Best params: 
clf__bagging_fraction: 0.600
clf__colsample_bytree: 0.418
clf__feature_fraction: 0.900
clf__lambda_l1: 0.016
clf__learning_rate: 0.009
clf__min_child_weight: 7.577
clf__num_leaves: 79.000


In [11]:
from sklearn.model_selection import StratifiedKFold
from tqdm import tqdm

folds = 5
skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=42)
pipeline.set_params(**best_params)
df_test['target'] = 0

for train_idx, val_idx in tqdm(skf.split(df, df['target'])):
    df_train = df.loc[train_idx, :]
    df_val = df.loc[val_idx, :]
    X_val, y_val = Pipeline(pipeline.steps[:-1]).fit(df_train, df_train['target'])\
                                                    .transform(df_val), df_val['target']
    fit_args = {
        "clf__early_stopping_rounds": 1000, 
        "clf__eval_set": (X_val, y_val),
        "clf__verbose": 500,
        "clf__eval_metric": 'auc'
    }
        
    pipeline.fit(df_train, df_train['target'], **fit_args)
    prob = pipeline.predict_proba(df_val)[:, 1]
    score = roc_auc_score(y_val, prob)
    
    df_test['target'] += pipeline.predict_proba(df_test)[:, 1]


0it [00:00, ?it/s]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.763444
[1000]	valid_0's auc: 0.777578
[1500]	valid_0's auc: 0.782176
[2000]	valid_0's auc: 0.784353
[2500]	valid_0's auc: 0.785476
[3000]	valid_0's auc: 0.786085
[3500]	valid_0's auc: 0.786385
[4000]	valid_0's auc: 0.786541
[4500]	valid_0's auc: 0.78659
[5000]	valid_0's auc: 0.786578
Did not meet early stopping. Best iteration is:
[4644]	valid_0's auc: 0.7866


1it [01:13, 73.65s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.765766
[1000]	valid_0's auc: 0.779393
[1500]	valid_0's auc: 0.784389
[2000]	valid_0's auc: 0.786697
[2500]	valid_0's auc: 0.787944
[3000]	valid_0's auc: 0.788484
[3500]	valid_0's auc: 0.788771
[4000]	valid_0's auc: 0.788915
[4500]	valid_0's auc: 0.789019
[5000]	valid_0's auc: 0.789048
Did not meet early stopping. Best iteration is:
[4984]	valid_0's auc: 0.789056


2it [02:31, 74.78s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.766118
[1000]	valid_0's auc: 0.78045
[1500]	valid_0's auc: 0.785079
[2000]	valid_0's auc: 0.787262
[2500]	valid_0's auc: 0.788326
[3000]	valid_0's auc: 0.788823
[3500]	valid_0's auc: 0.789109
[4000]	valid_0's auc: 0.789226
[4500]	valid_0's auc: 0.789266
[5000]	valid_0's auc: 0.789244
Did not meet early stopping. Best iteration is:
[4305]	valid_0's auc: 0.789284


3it [04:08, 81.67s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.764825
[1000]	valid_0's auc: 0.778062
[1500]	valid_0's auc: 0.782507
[2000]	valid_0's auc: 0.784631
[2500]	valid_0's auc: 0.785717
[3000]	valid_0's auc: 0.786186
[3500]	valid_0's auc: 0.786446
[4000]	valid_0's auc: 0.78656
[4500]	valid_0's auc: 0.786621
[5000]	valid_0's auc: 0.786625
Did not meet early stopping. Best iteration is:
[4670]	valid_0's auc: 0.786632


4it [05:47, 86.74s/it]

Training until validation scores don't improve for 1000 rounds
[500]	valid_0's auc: 0.762881
[1000]	valid_0's auc: 0.77702
[1500]	valid_0's auc: 0.781843
[2000]	valid_0's auc: 0.784202
[2500]	valid_0's auc: 0.78543
[3000]	valid_0's auc: 0.786058
[3500]	valid_0's auc: 0.786356
[4000]	valid_0's auc: 0.786497
[4500]	valid_0's auc: 0.78656
[5000]	valid_0's auc: 0.786558
Did not meet early stopping. Best iteration is:
[4790]	valid_0's auc: 0.78659


5it [07:31, 90.38s/it]


In [12]:
df_test['target'] /= folds
df_test[['id', 'target']].to_csv('../data/submission_lgbm.csv', index=False)
!head -n 5 ../data/submission_lgbm.csv

id,target
600000,0.10387206060714602
600001,0.2675943182020537
600002,0.1818472226517094
600003,0.10811305099184843
