In [1]:
import numpy as np
import pandas as pd
import time
import lightgbm as lgb
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import log_loss

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [3]:
%%time
df_train = pd.read_csv('elo_train.csv')
df_test = pd.read_csv('elo_test.csv')

CPU times: user 8.6 s, sys: 581 ms, total: 9.18 s
Wall time: 9.2 s


In [4]:
df_train = df_train[df_train['outliers'] == 0]
target = df_train['target']
del df_train['target']
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month','outliers']]
categorical_feats = [c for c in features if 'feature_' in c]


In [5]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 32, 
         'objective':'regression',
         'max_depth': -1,
         'learning_rate': 0.005,
         "min_child_samples": 20,
         "boosting": "gbdt",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'rmse',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "nthread": 4,
         "random_state": 4590}

In [6]:
%%time
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=2333)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train,df_train['outliers'].values)):
    print("fold {}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx])#, categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx])#, categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval= 100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["Feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(mean_squared_error(oof, target)**0.5))

fold 0
Training until validation scores don't improve for 200 rounds.
[100]	training's rmse: 1.64216	valid_1's rmse: 1.65112
[200]	training's rmse: 1.60609	valid_1's rmse: 1.61719
[300]	training's rmse: 1.58643	valid_1's rmse: 1.5997
[400]	training's rmse: 1.57426	valid_1's rmse: 1.58941
[500]	training's rmse: 1.56554	valid_1's rmse: 1.58265
[600]	training's rmse: 1.55884	valid_1's rmse: 1.5781
[700]	training's rmse: 1.55328	valid_1's rmse: 1.57467
[800]	training's rmse: 1.5486	valid_1's rmse: 1.57212
[900]	training's rmse: 1.54451	valid_1's rmse: 1.57008
[1000]	training's rmse: 1.54077	valid_1's rmse: 1.56853
[1100]	training's rmse: 1.53736	valid_1's rmse: 1.56722
[1200]	training's rmse: 1.53426	valid_1's rmse: 1.56621
[1300]	training's rmse: 1.5313	valid_1's rmse: 1.56538
[1400]	training's rmse: 1.52848	valid_1's rmse: 1.56466
[1500]	training's rmse: 1.52585	valid_1's rmse: 1.56414
[1600]	training's rmse: 1.52333	valid_1's rmse: 1.56373
[1700]	training's rmse: 1.52092	valid_1's rmse:

[2300]	training's rmse: 1.50695	valid_1's rmse: 1.56861
[2400]	training's rmse: 1.50495	valid_1's rmse: 1.56849
[2500]	training's rmse: 1.50302	valid_1's rmse: 1.56841
[2600]	training's rmse: 1.50111	valid_1's rmse: 1.56827
[2700]	training's rmse: 1.49912	valid_1's rmse: 1.56816
[2800]	training's rmse: 1.49722	valid_1's rmse: 1.56802
[2900]	training's rmse: 1.49538	valid_1's rmse: 1.56791
[3000]	training's rmse: 1.49352	valid_1's rmse: 1.56784
[3100]	training's rmse: 1.4917	valid_1's rmse: 1.5678
[3200]	training's rmse: 1.48987	valid_1's rmse: 1.56774
[3300]	training's rmse: 1.48805	valid_1's rmse: 1.56767
[3400]	training's rmse: 1.48623	valid_1's rmse: 1.5676
[3500]	training's rmse: 1.48442	valid_1's rmse: 1.56751
[3600]	training's rmse: 1.48259	valid_1's rmse: 1.56742
[3700]	training's rmse: 1.48079	valid_1's rmse: 1.56735
[3800]	training's rmse: 1.47901	valid_1's rmse: 1.56729
[3900]	training's rmse: 1.47722	valid_1's rmse: 1.5672
[4000]	training's rmse: 1.47547	valid_1's rmse: 1.56

[4900]	training's rmse: 1.46096	valid_1's rmse: 1.56102
Early stopping, best iteration is:
[4769]	training's rmse: 1.46316	valid_1's rmse: 1.56096
CV score: 1.55592 
CPU times: user 1h 5min 57s, sys: 25.2 s, total: 1h 6min 22s
Wall time: 16min 44s


In [7]:
model_without_outliers = pd.DataFrame({"card_id":df_test["card_id"].values})
model_without_outliers["target"] = predictions

In [10]:
sub_df = pd.DataFrame({"card_id":df_test["card_id"].values})
sub_df["target"] = predictions
sub_df.to_csv("submission_outlier_style_original.csv", index=False)

## Part 2 Training Model For Outliers Classification

In [11]:
%%time
df_train = pd.read_csv('elo_train.csv')
df_test = pd.read_csv('elo_test.csv')

CPU times: user 8.31 s, sys: 559 ms, total: 8.87 s
Wall time: 8.88 s


In [12]:
target = df_train['outliers']
del df_train['outliers']
del df_train['target']

In [13]:
features = [c for c in df_train.columns if c not in ['card_id', 'first_active_month']]
categorical_feats = [c for c in features if 'feature_' in c]

In [14]:
param = {'num_leaves': 31,
         'min_data_in_leaf': 30, 
         'objective':'binary',
         'max_depth': 6,
         'learning_rate': 0.01,
         "boosting": "rf",
         "feature_fraction": 0.9,
         "bagging_freq": 1,
         "bagging_fraction": 0.9 ,
         "bagging_seed": 11,
         "metric": 'binary_logloss',
         "lambda_l1": 0.1,
         "verbosity": -1,
         "random_state": 2333}

In [15]:
%%time
folds = KFold(n_splits=5, shuffle=True, random_state=15)
oof = np.zeros(len(df_train))
predictions = np.zeros(len(df_test))
feature_importance_df = pd.DataFrame()

start = time.time()


for fold_, (trn_idx, val_idx) in enumerate(folds.split(df_train.values, target.values)):
    print("fold n°{}".format(fold_))
    trn_data = lgb.Dataset(df_train.iloc[trn_idx][features], label=target.iloc[trn_idx], categorical_feature=categorical_feats)
    val_data = lgb.Dataset(df_train.iloc[val_idx][features], label=target.iloc[val_idx], categorical_feature=categorical_feats)

    num_round = 10000
    clf = lgb.train(param, trn_data, num_round, valid_sets = [trn_data, val_data], verbose_eval=100, early_stopping_rounds = 200)
    oof[val_idx] = clf.predict(df_train.iloc[val_idx][features], num_iteration=clf.best_iteration)
    
    fold_importance_df = pd.DataFrame()
    fold_importance_df["feature"] = features
    fold_importance_df["importance"] = clf.feature_importance()
    fold_importance_df["fold"] = fold_ + 1
    feature_importance_df = pd.concat([feature_importance_df, fold_importance_df], axis=0)
    
    predictions += clf.predict(df_test[features], num_iteration=clf.best_iteration) / folds.n_splits

print("CV score: {:<8.5f}".format(log_loss(target, oof)))

fold n°0




Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.0444693	valid_1's binary_logloss: 0.0472216
[200]	training's binary_logloss: 0.0444428	valid_1's binary_logloss: 0.0472017
Early stopping, best iteration is:
[18]	training's binary_logloss: 0.0444122	valid_1's binary_logloss: 0.0471541
fold n°1
Training until validation scores don't improve for 200 rounds.
[100]	training's binary_logloss: 0.044465	valid_1's binary_logloss: 0.0454994
[200]	training's binary_logloss: 0.044422	valid_1's binary_logloss: 0.0454754
[300]	training's binary_logloss: 0.0443965	valid_1's binary_logloss: 0.0454315
[400]	training's binary_logloss: 0.0444075	valid_1's binary_logloss: 0.0454529
[500]	training's binary_logloss: 0.0444048	valid_1's binary_logloss: 0.0454686
Early stopping, best iteration is:
[309]	training's binary_logloss: 0.0443886	valid_1's binary_logloss: 0.0454221
fold n°2
Training until validation scores don't improve for 200 rounds.
[100]	training'

In [16]:
df_outlier_prob = pd.DataFrame({"card_id":df_test["card_id"].values})
df_outlier_prob["target"] = predictions
df_outlier_prob.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,0.095957
1,C_ID_130fd0cbdd,0.001884
2,C_ID_b709037bc5,0.008661
3,C_ID_d27d835a9f,0.001884
4,C_ID_2b5e3df5c2,0.001884


## Part 3 Combining Submission:

In [43]:
outlier_id = pd.DataFrame(df_outlier_prob.sort_values(by='target',ascending = False).head(26000)['card_id'])


In [44]:
best_submission = pd.read_csv('submission_remove_few_features.csv')

In [45]:
most_likely_liers = best_submission.merge(outlier_id,how='right')
most_likely_liers.head()

Unnamed: 0,card_id,target
0,C_ID_0ab67a22ab,-2.874152
1,C_ID_f7cada36d3,0.504292
2,C_ID_6d8dba8475,-1.133839
3,C_ID_7f1041e8e1,-5.320867
4,C_ID_22e4a47c72,0.355567


In [46]:
%%time
for card_id in most_likely_liers['card_id']:
    model_without_outliers.loc[model_without_outliers['card_id']==card_id,'target']\
    = most_likely_liers.loc[most_likely_liers['card_id']==card_id,'target'].values

CPU times: user 5min 54s, sys: 878 ms, total: 5min 55s
Wall time: 5min 56s


In [47]:
model_without_outliers.to_csv("combining_submission_26000.csv", index=False)