In [25]:
import pandas as pd
import numpy as np
import lightgbm as lgb

from os import listdir

from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

intput_path = "../input/"
output_path = "../output/"

In [2]:
files = [f for f in sorted(listdir(output_path)) if f.endswith("_oof.npy") and f[0]!="!"]
files

['cat_0.91617_oof.npy',
 'lgb_0.91145_oof.npy',
 'nn_0.9163_oof.npy',
 'xgb_0.91221_oof.npy']

In [3]:
scaler = StandardScaler()

dt = pd.DataFrame()

df = pd.read_csv(intput_path+'train.csv.zip')
df = df[["target"]]

cols = []
for file in files:
    train = np.load(output_path+file).reshape(-1, 1)
    col = file[:-4]
    cols.append(col)
    df[col] = scaler.fit_transform(train)
    
    test = np.load(output_path+file.replace("_oof.npy","_test.npy")).reshape(-1, 1)
    dt[col] = [x[0] for x in scaler.transform(test)]
    
df["avg"] = df[cols].mean(axis=1)
dt["avg"] = dt[cols].mean(axis=1)

dt.head()

Unnamed: 0,cat_0.91617_oof,lgb_0.91145_oof,nn_0.9163_oof,xgb_0.91221_oof,avg
0,-0.028851,-0.045147,0.329803,-0.058851,0.049238
1,0.630567,0.880821,1.106001,0.659944,0.819333
2,1.006787,0.910918,0.878276,0.883434,0.919854
3,0.844511,0.394093,0.953653,0.427158,0.654854
4,-0.234038,-0.149319,0.088908,-0.114493,-0.102236


In [4]:
df[cols].corr()

Unnamed: 0,cat_0.91617_oof,lgb_0.91145_oof,nn_0.9163_oof,xgb_0.91221_oof
cat_0.91617_oof,1.0,0.973739,0.811705,0.973639
lgb_0.91145_oof,0.973739,1.0,0.800824,0.984808
nn_0.9163_oof,0.811705,0.800824,1.0,0.797771
xgb_0.91221_oof,0.973639,0.984808,0.797771,1.0


## AVG

In [5]:
roc_auc_score(df.target, df.avg)

0.9183011099886771

## LogReg

In [12]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

df["clf"] = 0
dt["clf"] = 0

i = -1
for train_index, valid_index in skf.split(df[cols], df.target):
    i+=1
    
    X_train = df.loc[train_index, cols]
    X_valid = df.loc[valid_index, cols]

    y_train = df.loc[train_index, "target"]
    y_valid = df.loc[valid_index, "target"]
    
    clf = LogisticRegression(C=1,
                             solver="newton-cg",
                             penalty="l2", 
                             n_jobs=-1, 
                             max_iter=100).fit(X_train, y_train) 
    
    y_pred = clf.predict_proba(X_valid)[:,1] 
    df.loc[valid_index, "clf"] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 4))
    
    dt["clf"] += clf.predict_proba(dt[cols])[:,1] / skf.n_splits
    
print("\nROC AUC:", round(roc_auc_score(df.target, df["clf"]), 4))

0 ROC AUC: 0.9184
1 ROC AUC: 0.9148
2 ROC AUC: 0.9228
3 ROC AUC: 0.9189
4 ROC AUC: 0.9171

ROC AUC: 0.9182
CPU times: user 3.32 s, sys: 11.2 s, total: 14.5 s
Wall time: 6.96 s


## SVM

In [None]:
# %%time

# skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# df["svc"] = 0
# dt["svc"] = 0

# i = -1
# for train_index, valid_index in skf.split(df[cols], df.target):
#     i+=1
    
#     X_train = df.loc[train_index, cols]
#     X_valid = df.loc[valid_index, cols]

#     y_train = df.loc[train_index, "target"]
#     y_valid = df.loc[valid_index, "target"]
    
#     svc = SVC(C=10, probability=True).fit(X_train, y_train)  
    
#     y_pred = svc.predict_proba(X_valid)[:,1] 
#     df.loc[valid_index, "svc"] = y_pred
#     print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 4))
    
#     dt["svc"] += svc.predict_proba(dt[cols].values) / skf.n_splits
    
# print("\nROC AUC:", round(roc_auc_score(df.target, df["svc"]), 4))

## NN

In [22]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

df["nei"] = 0
dt["nei"] = 0

i = -1
for train_index, valid_index in skf.split(df[cols], df.target):
    i+=1
    
    X_train = df.loc[train_index, cols]
    X_valid = df.loc[valid_index, cols]

    y_train = df.loc[train_index, "target"]
    y_valid = df.loc[valid_index, "target"]
    
    nei = KNeighborsClassifier(n_neighbors=1000, p=1, n_jobs=-1).fit(X_train, y_train) 
    
    y_pred = nei.predict_proba(X_valid)[:,1] 
    df.loc[valid_index, "nei"] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 4))
    
    dt["nei"] += nei.predict_proba(dt[cols])[:,1] / skf.n_splits
    
print("\nROC AUC:", round(roc_auc_score(df.target, df["nei"]), 4))

0 ROC AUC: 0.919
1 ROC AUC: 0.9159
2 ROC AUC: 0.9228
3 ROC AUC: 0.9187
4 ROC AUC: 0.917

ROC AUC: 0.9185
CPU times: user 1h 24min 28s, sys: 1min 28s, total: 1h 25min 56s
Wall time: 5min 4s


## LGBM

In [23]:
rounds = 1000
early_stop_rounds = 50

params = {'lambda_l1': 0, 
          'lambda_l2': 1,
          'feature_fraction':0.9,
          'learning_rate': 0.03, 
          'max_depth': 4,
          'boosting_type': 'gbrt', 
          'objective': 'binary', 
          'metric': 'auc',
          'weight': [1, 1],
          'max_bin': 1024,
          'n_jobs':-1
         }

In [27]:
%%time

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

df["lgb"] = 0
dt["lgb"] = 0

i = -1
for train_index, valid_index in skf.split(df[cols], df.target):
    i+=1
    
    X_train = df.loc[train_index, cols]
    X_valid = df.loc[valid_index, cols]

    y_train = df.loc[train_index, "target"]
    y_valid = df.loc[valid_index, "target"]
    
    d_train = lgb.Dataset(X_train, y_train, feature_name=cols)
    d_valid = lgb.Dataset(X_valid, y_valid, feature_name=cols)    
    
    model = lgb.train(params,
                      d_train,
                      num_boost_round=rounds,
                      valid_sets=[d_train, d_valid],
                      valid_names=['train','valid'],
                      early_stopping_rounds=early_stop_rounds,
                      verbose_eval=50) 

    y_pred = model.predict(X_valid)
    df.loc[valid_index, "lgb"] = y_pred
    print(i, "ROC AUC:", round(roc_auc_score(y_valid, y_pred), 4))
    
    dt["lgb"] += model.predict(dt[cols]) / skf.n_splits
    
print("\nROC AUC:", round(roc_auc_score(df.target, df["lgb"]), 4))

Please use weight argument of the Dataset constructor to pass this parameter.
  .format(key))


Training until validation scores don't improve for 50 rounds.
[50]	train's auc: 0.920537	valid's auc: 0.919915
[100]	train's auc: 0.920871	valid's auc: 0.920003
[150]	train's auc: 0.921214	valid's auc: 0.919904
Early stopping, best iteration is:
[101]	train's auc: 0.920874	valid's auc: 0.920008
0 ROC AUC: 0.92
Training until validation scores don't improve for 50 rounds.


Please use weight argument of the Dataset constructor to pass this parameter.
  .format(key))


[50]	train's auc: 0.921276	valid's auc: 0.916769
[100]	train's auc: 0.92159	valid's auc: 0.916915
Early stopping, best iteration is:
[79]	train's auc: 0.921414	valid's auc: 0.916953
1 ROC AUC: 0.917
Training until validation scores don't improve for 50 rounds.


Please use weight argument of the Dataset constructor to pass this parameter.
  .format(key))


[50]	train's auc: 0.919568	valid's auc: 0.923711
[100]	train's auc: 0.919849	valid's auc: 0.923688
Early stopping, best iteration is:
[72]	train's auc: 0.919707	valid's auc: 0.923748
2 ROC AUC: 0.9237
Training until validation scores don't improve for 50 rounds.


Please use weight argument of the Dataset constructor to pass this parameter.
  .format(key))


[50]	train's auc: 0.920729	valid's auc: 0.918907
[100]	train's auc: 0.92108	valid's auc: 0.919147
[150]	train's auc: 0.921447	valid's auc: 0.919201
Early stopping, best iteration is:
[136]	train's auc: 0.921328	valid's auc: 0.919214
3 ROC AUC: 0.9192
Training until validation scores don't improve for 50 rounds.


Please use weight argument of the Dataset constructor to pass this parameter.
  .format(key))


[50]	train's auc: 0.92102	valid's auc: 0.91764
[100]	train's auc: 0.921339	valid's auc: 0.917778
[150]	train's auc: 0.9217	valid's auc: 0.917802
[200]	train's auc: 0.922082	valid's auc: 0.91777
Early stopping, best iteration is:
[165]	train's auc: 0.921777	valid's auc: 0.91783
4 ROC AUC: 0.9178

ROC AUC: 0.918
CPU times: user 4min 58s, sys: 751 ms, total: 4min 59s
Wall time: 10.9 s


In [39]:
w = [1,1,1]

df["blend"] = (w[0]*df["lgb"]+w[1]*df["nei"]+w[2]*df["clf"])/sum(w)
dt["blend"] = (w[0]*dt["lgb"]+w[1]*dt["nei"]+w[2]*dt["clf"])/sum(w)

print("\nROC AUC:", round(roc_auc_score(df.target, df["blend"]), 4))


ROC AUC: 0.9191


In [40]:
sub = pd.read_csv(intput_path+'sample_submission.csv.zip')
sub["target"] = dt["blend"]
sub.head()

Unnamed: 0,ID_code,target
0,test_0,0.063179
1,test_1,0.224664
2,test_2,0.205822
3,test_3,0.203759
4,test_4,0.0404


In [41]:
sub.to_csv(output_path + "best_blend.csv", index=False)

In [42]:
!ls ../output/

 best_auc_nn.pkl	  cat_0.91617_oof.npy	 nn_0.9163_oof.npy
 best_blend.csv		  cat_0.91617_test.npy	 nn_0.9163_test.npy
'!cat_0.91558_oof.npy'	  lgb_0.91145_oof.npy	 xgb_0.91221_oof.npy
'!cat_0.91558_test.npy'   lgb_0.91145_test.npy	 xgb_0.91221_test.npy


In [43]:
!kaggle competitions submit -c santander-customer-transaction-prediction -f ../output/best_blend.csv -m "0.9191 lgb,nei,clf"

100%|██████████████████████████████████████| 6.07M/6.07M [00:05<00:00, 1.26MB/s]
Successfully submitted to Santander Customer Transaction Prediction