In [1]:
import numpy as np
import pandas as pd
import datetime
import gc
import matplotlib.pyplot as plt
import seaborn as sns
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss
import warnings
import time
warnings.filterwarnings('ignore')
np.random.seed(4590)

In [2]:
def reduce_mem_usage(df, verbose=True, allow_float16=True):
    """Downcast the numeric columns of ``df`` to the narrowest dtype that fits.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched. Integer columns are
    narrowed to the smallest of int8/int16/int32/int64 whose range contains the
    column's min and max (bounds inclusive). Float columns are narrowed to
    float16 or float32 when their range fits, otherwise stored as float64.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink. Its columns are reassigned, so the caller's object
        is modified.
    verbose : bool, default True
        Print the final memory footprint and the relative reduction.
    allow_float16 : bool, default True
        When False, float columns are never narrowed below float32. The range
        check only looks at min/max, so float16 can silently round values
        (it carries roughly three significant decimal digits).

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    # Candidates are ordered narrowest first; the first one that fits wins.
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type not in numerics:
            continue
        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            # Inclusive bounds: a column spanning exactly -128..127 fits int8.
            # If min/max are NaN (empty column) nothing matches and the
            # column keeps its dtype.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of range for the narrow types, or min/max are NaN/inf.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero-sized starting footprint cannot divide by zero.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [3]:
# Read every input CSV and shrink its numeric dtypes straight away. The
# generator is consumed in order, so the files are read one after another
# and one memory report is printed per file.
df_train, df_test, df_hist_trans, df_new_merchant_trans, df_merchant = (
    reduce_mem_usage(pd.read_csv(csv_path))
    for csv_path in (
        '../input/train.csv',
        '../input/test.csv',
        '../input/historical_transactions.csv',
        '../input/new_merchant_transactions.csv',
        '../input/merchants.csv',
    )
)

Mem. usage decreased to  4.04 Mb (56.2% reduction)
Mem. usage decreased to  2.24 Mb (52.5% reduction)
Mem. usage decreased to 1749.11 Mb (43.7% reduction)
Mem. usage decreased to 114.20 Mb (45.5% reduction)
Mem. usage decreased to 30.32 Mb (46.0% reduction)


## Data Cleaning

In [4]:
# Replace missing values in both transaction tables with fixed defaults.
# The result is assigned back to the column rather than calling
# fillna(..., inplace=True) on a column selection: that chained form only
# touches a temporary object once pandas Copy-on-Write is active, which
# would leave the NaNs in place without any error.
# NOTE(review): the defaults (1.0, 'A', 'M_ID_00a6ca8a8a') are hard-coded;
# presumably they are the most frequent values — verify against the data.
for df in [df_hist_trans, df_new_merchant_trans]:
    df['category_2'] = df['category_2'].fillna(1.0)
    df['category_3'] = df['category_3'].fillna('A')
    df['merchant_id'] = df['merchant_id'].fillna('M_ID_00a6ca8a8a')

In [5]:
def get_new_columns(name,aggs):
    """Build flat column names of the form ``<name>_<column>_<aggregation>``.

    ``aggs`` maps a column name to the list of aggregation names applied to
    it. Names are produced in the mapping's iteration order, and for each
    column in the order of its aggregation list.
    """
    new_columns = []
    for column, funcs in aggs.items():
        for func in funcs:
            new_columns.append('_'.join([name, column, func]))
    return new_columns

In [6]:
# Derive calendar features, binary flags and a recency feature on both
# transaction tables.
# "Now" is sampled once so both tables are measured against the same instant.
# NOTE(review): month_diff depends on the day the notebook is run, so the
# feature values are not reproducible across runs.
reference_now = datetime.datetime.today()
for df in [df_hist_trans,df_new_merchant_trans]:
    df['purchase_date'] = pd.to_datetime(df['purchase_date'])
    purchase_dt = df['purchase_date'].dt
    df['year'] = purchase_dt.year
    # Series.dt.weekofyear no longer exists in pandas 2.x; isocalendar().week
    # is the replacement. It returns a nullable integer, so it is cast back to
    # a plain dtype (float only when a missing date forces NaN). The old
    # accessor is kept as a fallback for pandas versions without isocalendar().
    if hasattr(purchase_dt, 'isocalendar'):
        iso_week = purchase_dt.isocalendar().week
        df['weekofyear'] = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')
    else:
        df['weekofyear'] = purchase_dt.weekofyear
    df['month'] = purchase_dt.month
    df['dayofweek'] = purchase_dt.dayofweek
    df['weekend'] = (purchase_dt.weekday >=5).astype(int)
    df['hour'] = purchase_dt.hour
    # Encode the Y/N flags as 1/0; any other value becomes NaN.
    df['authorized_flag'] = df['authorized_flag'].map({'Y':1, 'N':0})
    df['category_1'] = df['category_1'].map({'Y':1, 'N':0}) 
    #https://www.kaggle.com/c/elo-merchant-category-recommendation/discussion/73244
    # Whole 30-day periods elapsed since the purchase, shifted by month_lag.
    df['month_diff'] = ((reference_now - df['purchase_date']).dt.days)//30
    df['month_diff'] += df['month_lag']

In [7]:
def trans_agg(name, aggs):
    """Aggregate one transaction table per ``card_id`` and add date-span features.

    ``name`` selects the source table: ``'hist'`` uses ``df_hist_trans``, any
    other value uses ``df_new_merchant_trans``. It is also the prefix of every
    output column. ``aggs`` is the column -> aggregation-list mapping passed to
    ``groupby(...).agg``; it must contain ``purchase_date`` with ``max`` and
    ``min`` and ``card_id`` with ``size``, because the derived columns below
    read those aggregates.

    Returns a frame with ``card_id`` as a regular column, the flattened
    aggregate columns and three derived columns: ``<name>_purchase_date_diff``
    (days between first and last purchase), ``<name>_purchase_date_average``
    (that span divided by the transaction count) and
    ``<name>_purchase_date_uptonow`` (days from the last purchase to the
    moment the function runs).
    """
    flat_names = get_new_columns(name, aggs)
    if name == 'hist':
        source = df_hist_trans
    else:
        source = df_new_merchant_trans
    grouped = source.groupby('card_id').agg(aggs)
    # The aggregate columns come back in the same order as the flat names.
    grouped.columns = flat_names
    grouped = grouped.reset_index(drop=False)

    last_purchase = grouped[name + '_purchase_date_max']
    first_purchase = grouped[name + '_purchase_date_min']
    span_days = (last_purchase - first_purchase).dt.days
    grouped[name + '_purchase_date_diff'] = span_days
    grouped[name + '_purchase_date_average'] = grouped[name + '_purchase_date_diff']/grouped[name + '_card_id_size']
    grouped[name + '_purchase_date_uptonow'] = (datetime.datetime.today() - last_purchase).dt.days
    return grouped

In [8]:
# Aggregation plan for the historical transactions. Key order matters: the
# flattened column names are generated in this order.
aggs = {
    distinct_col: ['nunique']
    for distinct_col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']
}
aggs.update({
    'purchase_amount': ['sum','max','min','mean','var'],
    'installments': ['sum','max','min','mean','var'],
    'purchase_date': ['max','min'],
    'month_lag': ['max','min','mean','var'],
    'month_diff': ['mean'],
    'authorized_flag': ['sum', 'mean'],
    'weekend': ['sum', 'mean'],
    'category_1': ['sum', 'mean'],
    'card_id': ['size'],
})

# Attach to every transaction the mean purchase amount of its category value,
# then average that per card.
for col in ['category_2','category_3']:
    category_mean = df_hist_trans.groupby([col])['purchase_amount'].transform('mean')
    df_hist_trans[col+'_mean'] = category_mean
    aggs[col+'_mean'] = ['mean']

# Join the per-card aggregates onto train and test, then release the temporary.
df_trans_group = trans_agg('hist', aggs)
df_train = pd.merge(df_train, df_trans_group, on='card_id', how='left')
df_test = pd.merge(df_test, df_trans_group, on='card_id', how='left')
del df_trans_group
gc.collect()

42

In [9]:
# Aggregation plan for the new-merchant transactions. It matches the
# historical plan except that authorized_flag is not aggregated. Key order
# matters: the flattened column names are generated in this order.
aggs = {
    distinct_col: ['nunique']
    for distinct_col in ['month','hour','weekofyear','dayofweek','year','subsector_id','merchant_id','merchant_category_id']
}
aggs.update({
    'purchase_amount': ['sum','max','min','mean','var'],
    'installments': ['sum','max','min','mean','var'],
    'purchase_date': ['max','min'],
    'month_lag': ['max','min','mean','var'],
    'month_diff': ['mean'],
    'weekend': ['sum', 'mean'],
    'category_1': ['sum', 'mean'],
    'card_id': ['size'],
})

# Attach to every transaction the mean purchase amount of its category value,
# then average that per card.
for col in ['category_2','category_3']:
    category_mean = df_new_merchant_trans.groupby([col])['purchase_amount'].transform('mean')
    df_new_merchant_trans[col+'_mean'] = category_mean
    aggs[col+'_mean'] = ['mean']

# Join the per-card aggregates onto train and test, then release the temporary.
df_trans_group = trans_agg('new_hist', aggs)
df_train = pd.merge(df_train, df_trans_group, on='card_id', how='left')
df_test = pd.merge(df_test, df_trans_group, on='card_id', how='left')
del df_trans_group
gc.collect()

28

In [10]:
# The raw transaction tables are no longer needed once aggregated; drop them
# and collect after each deletion to return the memory.
del df_hist_trans
gc.collect()
del df_new_merchant_trans
gc.collect()
df_train.head(5)

Unnamed: 0,first_active_month,card_id,feature_1,feature_2,feature_3,target,hist_month_nunique,hist_hour_nunique,hist_weekofyear_nunique,hist_dayofweek_nunique,hist_year_nunique,hist_subsector_id_nunique,hist_merchant_id_nunique,hist_merchant_category_id_nunique,hist_purchase_amount_sum,hist_purchase_amount_max,hist_purchase_amount_min,hist_purchase_amount_mean,hist_purchase_amount_var,hist_installments_sum,hist_installments_max,hist_installments_min,hist_installments_mean,hist_installments_var,hist_purchase_date_max,hist_purchase_date_min,hist_month_lag_max,hist_month_lag_min,hist_month_lag_mean,hist_month_lag_var,hist_month_diff_mean,hist_authorized_flag_sum,hist_authorized_flag_mean,hist_weekend_sum,hist_weekend_mean,hist_category_1_sum,hist_category_1_mean,hist_card_id_size,hist_category_2_mean_mean,hist_category_3_mean_mean,hist_purchase_date_diff,hist_purchase_date_average,hist_purchase_date_uptonow,new_hist_month_nunique,new_hist_hour_nunique,new_hist_weekofyear_nunique,new_hist_dayofweek_nunique,new_hist_year_nunique,new_hist_subsector_id_nunique,new_hist_merchant_id_nunique,new_hist_merchant_category_id_nunique,new_hist_purchase_amount_sum,new_hist_purchase_amount_max,new_hist_purchase_amount_min,new_hist_purchase_amount_mean,new_hist_purchase_amount_var,new_hist_installments_sum,new_hist_installments_max,new_hist_installments_min,new_hist_installments_mean,new_hist_installments_var,new_hist_purchase_date_max,new_hist_purchase_date_min,new_hist_month_lag_max,new_hist_month_lag_min,new_hist_month_lag_mean,new_hist_month_lag_var,new_hist_month_diff_mean,new_hist_weekend_sum,new_hist_weekend_mean,new_hist_category_1_sum,new_hist_category_1_mean,new_hist_card_id_size,new_hist_category_2_mean_mean,new_hist_category_3_mean_mean,new_hist_purchase_date_diff,new_hist_purchase_date_average,new_hist_purchase_date_uptonow
0,2017-06,C_ID_92a2005557,5,2,1,-0.820312,9,23,35,7,2,21,94,41,-165.968735,2.258394,-0.739395,-0.638341,0.045003,4,1,0,0.015385,0.015206,2018-02-25 09:31:15,2017-06-27 14:18:08,0,-8,-3.911538,5.748901,21.407692,247,0.95,90,0.346154,0,0.0,260,0.072502,0.346719,242,0.930769,644,2.0,8.0,7.0,7.0,1.0,10.0,23.0,14.0,-13.242188,-0.296143,-0.724609,-0.575684,0.018433,0.0,0.0,0.0,0.0,0.0,2018-04-29 11:23:05,2018-03-05 14:04:36,2.0,1.0,1.478261,0.26087,21.391304,6.0,0.26087,0.0,0.0,23.0,-0.550293,-0.592773,54.0,2.347826,581.0
1,2017-01,C_ID_3d0044924f,4,1,0,0.392822,12,24,50,7,2,24,142,57,-210.006332,4.6303,-0.7424,-0.600018,0.1482,543,10,-1,1.551429,2.282448,2018-01-31 22:31:09,2017-01-06 16:29:42,0,-12,-5.031429,14.477519,22.348571,339,0.968571,132,0.377143,31,0.088571,350,0.074568,-0.295163,390,1.114286,668,2.0,5.0,4.0,4.0,1.0,4.0,6.0,5.0,-4.355469,-0.70166,-0.739258,-0.726074,0.000207,6.0,1.0,1.0,1.0,0.0,2018-03-30 06:48:26,2018-02-01 17:07:54,2.0,1.0,1.5,0.3,22.5,0.0,0.0,0.0,0.0,6.0,-0.550293,-0.606445,56.0,9.333333,611.0
2,2016-08,C_ID_d639edf6cd,2,2,0,0.687988,10,14,22,7,2,7,13,8,-29.167391,-0.145847,-0.730138,-0.678311,0.007635,0,0,0,0.0,0.0,2018-02-27 19:08:25,2017-01-11 08:21:22,0,-13,-8.604651,14.768549,21.255814,41,0.953488,11,0.255814,0,0.0,43,-0.087803,0.358458,412,9.581395,642,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,-0.700195,-0.700195,-0.700195,-0.700195,,0.0,0.0,0.0,0.0,,2018-04-28 17:43:11,2018-04-28 17:43:11,2.0,2.0,2.0,,21.0,1.0,1.0,0.0,0.0,1.0,-0.548828,-0.592773,0.0,0.0,582.0
3,2017-09,C_ID_186d6a6901,4,3,0,0.142456,6,16,20,7,2,13,50,25,-49.491364,1.445596,-0.740897,-0.642745,0.068447,84,3,-1,1.090909,0.34689,2018-02-28 11:44:40,2017-09-26 16:22:21,0,-5,-2.831169,3.247437,21.311688,77,1.0,11,0.142857,12,0.155844,77,-0.086166,-0.338321,154,2.0,641,2.0,5.0,5.0,4.0,1.0,5.0,7.0,6.0,-4.65625,-0.566895,-0.734375,-0.665039,0.004345,5.0,1.0,-1.0,0.714286,0.571429,2018-04-18 11:00:11,2018-03-07 11:55:06,2.0,1.0,1.714286,0.238095,21.428571,3.0,0.428571,1.0,0.142857,7.0,-0.556641,-0.604492,41.0,5.857143,592.0
4,2017-11,C_ID_cdbd2c0db2,1,3,0,-0.15979,4,22,17,7,2,17,66,26,-48.687656,7.193041,-0.746156,-0.366073,1.828159,182,12,1,1.368421,3.598086,2018-02-28 20:40:41,2017-11-12 00:00:00,0,-3,-1.285714,1.054113,21.300752,128,0.962406,42,0.315789,15,0.112782,133,-0.114647,-0.377684,108,0.81203,640,2.0,14.0,8.0,7.0,1.0,10.0,36.0,17.0,-19.921875,0.450928,-0.739258,-0.553711,0.05011,35.0,2.0,-1.0,0.972222,0.142063,2018-04-28 18:50:25,2018-03-02 11:55:43,2.0,1.0,1.555556,0.253968,21.305556,12.0,0.333333,2.0,0.055556,36.0,-0.555664,-0.588379,57.0,1.583333,582.0


In [11]:
# Flag rows whose target lies below -30 as outliers (1), everything else as 0.
df_train['outliers'] = (df_train['target'] < -30).astype('int64')
df_train['outliers'].value_counts()

0    199710
1      2207
Name: outliers, dtype: int64

In [12]:
# Card-level features derived from first_active_month and the aggregated
# purchase dates, computed the same way for train and test.
# "Now" is sampled once so both frames are measured against the same instant.
# NOTE(review): elapsed_time depends on the day the notebook is run.
reference_now = datetime.datetime.today()
for df in [df_train,df_test]:
    df['first_active_month'] = pd.to_datetime(df['first_active_month'])
    active_dt = df['first_active_month'].dt
    df['dayofweek'] = active_dt.dayofweek
    # Series.dt.weekofyear no longer exists in pandas 2.x; isocalendar().week
    # is the replacement. Its nullable result is cast back to a plain dtype,
    # using float only when a missing date forces NaN (as the old accessor did).
    if hasattr(active_dt, 'isocalendar'):
        iso_week = active_dt.isocalendar().week
        df['weekofyear'] = iso_week.astype('float64') if iso_week.isna().any() else iso_week.astype('int64')
    else:
        df['weekofyear'] = active_dt.weekofyear
    df['month'] = active_dt.month
    df['elapsed_time'] = (reference_now - df['first_active_month']).dt.days
    # Days between the card's activation month and its first purchase in each table.
    df['hist_first_buy'] = (df['hist_purchase_date_min'] - df['first_active_month']).dt.days
    df['new_hist_first_buy'] = (df['new_hist_purchase_date_min'] - df['first_active_month']).dt.days
    for f in ['hist_purchase_date_max','hist_purchase_date_min','new_hist_purchase_date_max',\
                     'new_hist_purchase_date_min']:
        # Timestamps -> seconds since the epoch as floats. The cast goes
        # through NumPy at nanosecond resolution so that missing dates (NaT)
        # map to the int64 minimum on every pandas version instead of
        # depending on how Series.astype treats NaT.
        df[f] = df[f].values.astype('datetime64[ns]').astype(np.int64) * 1e-9
    df['card_id_total'] = df['new_hist_card_id_size']+df['hist_card_id_size']
    df['purchase_amount_total'] = df['new_hist_purchase_amount_sum']+df['hist_purchase_amount_sum']

# for f in ['feature_1','feature_2','feature_3']:
#     order_label = df_train.groupby([f])['outliers'].mean()
#     df_train[f] = df_train[f].map(order_label)
#     df_test[f] = df_test[f].map(order_label)


## Keep a copy of the training set that still contains the outliers

In [13]:
# Independent snapshot of the training frame, taken before later cells filter
# rows and drop columns from df_train.
df_train_with_outlier = df_train.copy(deep=True)

## Training

### Normal train (all rows, outliers included)

In [14]:
## Normal Train
# Every column is a model input except identifiers, the raw activation date,
# the label and the outlier flag.
non_feature_columns = ('card_id', 'first_active_month', 'target', 'outliers')
df_train_columns = [c for c in df_train.columns if c not in non_feature_columns]
target = df_train['target']
# Columns whose name contains 'feature_' are passed to LightGBM as categorical.
categorical_feats = [c for c in df_train_columns if c.find('feature_') != -1]

In [15]:
# Regression on the full training set (outliers included). Out-of-fold and
# averaged test predictions come from a 5-fold CV stratified on the outlier flag.
#
# The original dict also set "min_child_samples": 20. LightGBM documents that
# key as an alias of 'min_data_in_leaf', so the two conflicting values (30 and
# 20) could never both apply; only the primary name is kept.
# NOTE(review): confirm that 30 is the intended minimum leaf size.
# NOTE(review): boosting is "dart" while early stopping is requested below —
# confirm the installed LightGBM honours early stopping / best_iteration in
# dart mode.
param = {'num_leaves': 2500,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.1,
         "boosting": "dart",
         "feature_fraction": 0.7,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 8,
         "random_state": 4590}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=4590)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
# Per-fold importance tables are collected here and concatenated once after
# the loop instead of growing a DataFrame inside it.
fold_importances = []
num_round = 10000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][df_train_columns], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][df_train_columns], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 100)
    # Out-of-fold prediction for the held-out rows of this fold.
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][df_train_columns], num_iteration=clf.best_iteration)
    
    fold_importances.append(pd.DataFrame({
        "Feature": df_train_columns,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))
    
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(df_test[df_train_columns], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

# Out-of-fold RMSE over the whole training set.
np.sqrt(mean_squared_error(oof, target))

fold 0
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.53768	valid_1's rmse: 3.68564
Early stopping, best iteration is:
[77]	training's rmse: 2.69974	valid_1's rmse: 3.68301
fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.5361	valid_1's rmse: 3.68577
Early stopping, best iteration is:
[67]	training's rmse: 2.75356	valid_1's rmse: 3.6807
fold 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.53554	valid_1's rmse: 3.7006
Early stopping, best iteration is:
[45]	training's rmse: 2.90392	valid_1's rmse: 3.69103
fold 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.53046	valid_1's rmse: 3.69271
Early stopping, best iteration is:
[90]	training's rmse: 2.59483	valid_1's rmse: 3.68923
fold 4
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 2.52629	valid_1's rmse: 3.67851
Early stopping, best i

3.7027337738019654

In [16]:
# Test predictions of the model trained on all rows, keyed by card_id.
best_submission = pd.DataFrame({
    "card_id": df_test["card_id"].values,
    "target": predictions,
})

### Train on non-outliers only

In [17]:
# Only select non-outlier
df_train = df_train.loc[df_train['outliers'] == 0]
# pop() returns the label column and removes it from the frame in one step.
target = df_train.pop('target')
non_feature_columns = ('card_id', 'first_active_month', 'outliers')
features = [c for c in df_train.columns if c not in non_feature_columns]
# Columns whose name contains 'feature_' are passed to LightGBM as categorical.
categorical_feats = [c for c in features if c.find('feature_') != -1]

In [18]:
# Regression on the non-outlier rows only. Out-of-fold and averaged test
# predictions come from a 5-fold CV.
#
# The original dict also set "min_child_samples": 20. LightGBM documents that
# key as an alias of 'min_data_in_leaf', so the two conflicting values (30 and
# 20) could never both apply; only the primary name is kept.
# NOTE(review): confirm that 30 is the intended minimum leaf size.
# NOTE(review): boosting is "dart" while early stopping is requested below —
# confirm the installed LightGBM honours early stopping / best_iteration in
# dart mode.
param = {'num_leaves': 2500,
         'min_data_in_leaf': 30, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.1,
         "boosting": "dart",
         "feature_fraction": 0.7,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 8,
         "random_state": 4590}
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2333)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
# Per-fold importance tables are collected here and concatenated once after
# the loop instead of growing a DataFrame inside it.
fold_importances = []
num_round = 10000

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 100)
    # Out-of-fold prediction for the held-out rows of this fold.
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importances.append(pd.DataFrame({
        "Feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))
    
    # Each fold contributes an equal share to the averaged test prediction.
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold 0
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.02694	valid_1's rmse: 1.57678
[200]	training's rmse: 0.745605	valid_1's rmse: 1.58188
Early stopping, best iteration is:
[101]	training's rmse: 1.01482	valid_1's rmse: 1.57672
fold 1
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.03058	valid_1's rmse: 1.55808
Early stopping, best iteration is:
[89]	training's rmse: 1.07164	valid_1's rmse: 1.55755
fold 2
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.02581	valid_1's rmse: 1.58565
Early stopping, best iteration is:
[90]	training's rmse: 1.05406	valid_1's rmse: 1.58523
fold 3
Training until validation scores don't improve for 100 rounds.
[100]	training's rmse: 1.02662	valid_1's rmse: 1.56853
[200]	training's rmse: 0.745318	valid_1's rmse: 1.57583
Early stopping, best iteration is:
[100]	training's rmse: 1.02662	valid_1's rmse: 1.56853
fold 4
Training until validati

In [19]:
# Test predictions of the model trained on non-outlier rows, keyed by card_id.
model_without_outliers = pd.DataFrame({
    "card_id": df_test["card_id"].values,
    "target": predictions,
})

### Train a classifier to detect outliers

In [20]:
# Switch back to the snapshot that still holds the outlier rows. df_train is
# the same object as df_train_with_outlier from here on, so the column
# removals below change the snapshot as well.
df_train = df_train_with_outlier
# The outlier flag becomes the classification label; the regression target is dropped.
isOutlier = df_train.pop('outliers')
df_train.pop('target')
non_feature_columns = ('card_id', 'first_active_month')
features = [c for c in df_train.columns if c not in non_feature_columns]
# Columns whose name contains 'feature_' are passed to LightGBM as categorical.
categorical_feats = [c for c in features if c.find('feature_') != -1]

In [21]:
# Binary classifier (LightGBM random-forest mode) that estimates, for each
# card, the probability of being an outlier. Out-of-fold and averaged test
# probabilities come from a 5-fold shuffled KFold.
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(df_train))
outlier_predictions = np.zeros(len(df_test))
# Per-fold importance tables are collected here and concatenated once after
# the loop instead of growing a DataFrame inside it.
fold_importances = []
num_round = 10000

# KFold only needs the number of rows, so the frame is passed directly.
# Calling .values first would build an object ndarray of the whole
# mixed-dtype frame for nothing. The generated indices are the same.
for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=isOutlier.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=isOutlier.iloc[val_idx], categorical_feature=categorical_feats)

    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    # Out-of-fold probability for the held-out rows of this fold.
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importances.append(pd.DataFrame({
        "feature": features,
        "importance": clf.feature_importance(),
        "fold": fold_ + 1,
    }))
    
    # Each fold contributes an equal share to the averaged test probability.
    outlier_predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

feature_importance_df = pd.concat(fold_importances, axis=0)

print("CV score: {:<8.5f}".format(log_loss(isOutlier, oof)))

fold n°0
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0444851	valid_1's binary_logloss: 0.0471206
[200]	training's binary_logloss: 0.0444412	valid_1's binary_logloss: 0.0470667
[300]	training's binary_logloss: 0.0444516	valid_1's binary_logloss: 0.0470878
Early stopping, best iteration is:
[181]	training's binary_logloss: 0.0444298	valid_1's binary_logloss: 0.0470589
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0444866	valid_1's binary_logloss: 0.0454288
[200]	training's binary_logloss: 0.0444308	valid_1's binary_logloss: 0.0453418
Early stopping, best iteration is:
[5]	training's binary_logloss: 0.0446916	valid_1's binary_logloss: 0.0451183
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0446572	valid_1's binary_logloss: 0.0441425
[200]	training's binary_logloss: 0.0446421	valid_1's binary_logloss: 0.0440969
Early

In [22]:
# Predicted outlier probability per test card; the probability is stored in
# the 'target' column.
df_outlier_prob = pd.DataFrame({
    "card_id": df_test["card_id"].values,
    "target": outlier_predictions,
})
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.085141
1,C_ID_130fd0cbdd,0.001808
2,C_ID_b709037bc5,0.010753
3,C_ID_d27d835a9f,0.001808
4,C_ID_2b5e3df5c2,0.001808


In [23]:
# 123623*0.0106 = 1310.4038
# NOTE(review): the arithmetic above suggests roughly 1310 expected outliers,
# yet the 25000 highest-probability cards are kept below — confirm the cutoff.
ranked_by_outlier_prob = df_outlier_prob.sort_values(by='target',ascending = False)
outlier_id = ranked_by_outlier_prob.head(25000)[['card_id']]
# For the selected cards, look up the prediction of the model that was
# trained with the outliers included.
most_likely_liers = pd.merge(best_submission, outlier_id, how='right', on='card_id')
most_likely_liers.head()


Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-1.697952
1,C_ID_b709037bc5,-0.451921
2,C_ID_f7cada36d3,0.731079
3,C_ID_6d8dba8475,-0.608415
4,C_ID_7f1041e8e1,-2.816065


In [24]:
# For the cards selected as likely outliers, replace the non-outlier model's
# prediction with the prediction stored in most_likely_liers.
# This is a single vectorised lookup. The previous per-card Python loop
# scanned both frames once per card, which is quadratic in the number of cards.
replacement_target = (
    most_likely_liers
    .drop_duplicates(subset='card_id', keep='last')  # map() needs a unique index
    .set_index('card_id')['target']
)
is_likely_outlier = model_without_outliers['card_id'].isin(replacement_target.index)
model_without_outliers.loc[is_likely_outlier, 'target'] = (
    model_without_outliers.loc[is_likely_outlier, 'card_id'].map(replacement_target)
)

## Submit

In [25]:
# Write the blended predictions (card_id, target) without the index column.
model_without_outliers.to_csv(path_or_buf="submission.csv", index=False)