In [27]:
import pandas as pd
import pandas_profiling
import os
import pickle
import gc
import numpy as np
from matplotlib import pyplot as plt
%matplotlib inline
import seaborn as sns


import warnings
warnings.filterwarnings('ignore')

from sklearn.linear_model import Lasso
from sklearn.metrics import accuracy_score, roc_auc_score, confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
#評価用

train = pd.read_csv("data/train_df.csv")
test = pd.read_csv("data/test_df.csv")


#データ読み込み
with open('ensamble/lgb_test.pickle', mode='br') as fi:
    test_p1 = pickle.load(fi)
    
with open('ensamble/tabnet_test.pickle', mode='br') as fi:
    test_p2 = pickle.load(fi)

    
    
with open('ensamble/lgb_train.pickle', mode='br') as fi:
    train_p1 = pickle.load(fi)
    
with open('ensamble/tabnet_train.pickle', mode='br') as fi:
    train_p2 = pickle.load(fi)
    

## アンサンブル

In [28]:
#　評価用
df_train = pd.concat(
    [train['target_label'],train_p1['pred'],train_p2['pred'],], 
                     axis=1, ignore_index=True).rename(columns={
    0: 'true', 1: 'pred1', 2: 'pred2',
}).dropna(axis=0)

df_train['ensemble'] = df_train['pred1']*0.7 + df_train['pred2']*0.3 

# 提出用
df_test = pd.concat(
    [test['id'],test_p1['target_label'],test_p2['target_label']], 
                    axis=1, ignore_index=True).rename(columns={
    0: 'id', 1: 'pred1', 2: 'pred2', 
})

df_test['ensemble'] = df_test['pred1']*0.7 + df_test['pred2']*0.3

def evaluate_ensemble(input_df, col_pred):
    print('[auc] model1:{:.4f},model2:{:.4f},  ->    emsamble]{:.4f}'.format(
        roc_auc_score(input_df['true'], input_df['pred1']),
        roc_auc_score(input_df['true'], input_df['pred2']),
        
        
        roc_auc_score(input_df['true'], input_df[col_pred]),
    ))

In [29]:
df_train

Unnamed: 0,true,pred1,pred2,ensemble
0,0.0,0.457839,0.931352,0.599893
1,0.0,0.323485,0.762955,0.455326
2,0.0,0.027774,0.019028,0.025150
3,0.0,0.028758,0.021631,0.026620
4,0.0,0.053912,0.463972,0.176930
...,...,...,...,...
51354,0.0,0.062445,0.060634,0.061901
51355,0.0,0.204498,0.510024,0.296156
51356,0.0,0.120706,0.755321,0.311090
51357,0.0,0.012176,0.037450,0.019758


In [30]:
evaluate_ensemble(df_train, col_pred='ensemble')

[auc] model1:0.8869,model2:0.8743,  ->    emsamble]0.8885


In [31]:
sub = df_test.copy()    

sub = sub[['id', 'ensemble']] .rename({'ensemble':'target_label'}, axis=1)
sub.to_csv('sub/submission_lgb+tab.csv', index=None, header=True,)

## スタッキング

In [32]:
x, y = df_train[['pred1', 'pred2', ]], df_train[['true']]
oof = np.zeros(len(x))
models = []

cv = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=123).split(x,y))
for nfold in np.arange(5):
    idx_tr, idx_va = cv[nfold][0], cv[nfold][1]
    x_tr, y_tr = x.loc[idx_tr, :], y.loc[idx_tr, :]
    x_va, y_va = x.loc[idx_va, :], y.loc[idx_tr, :]
    
    model = Lasso(alpha=0.01)
    model.fit(x_tr, y_tr)
    models.append(model)
    
    y_va_pred = model.predict(x_va)
    oof[idx_va] = y_va_pred
    
df_train['stacking'] = oof
df_train['stacking'] = df_train['stacking'].clip(lower=0, upper=1)
df_train.head(20)

Unnamed: 0,true,pred1,pred2,ensemble,stacking
0,0.0,0.457839,0.931352,0.599893,0.229338
1,0.0,0.323485,0.762955,0.455326,0.16755
2,0.0,0.027774,0.019028,0.02515,0.027276
3,0.0,0.028758,0.021631,0.02662,0.027478
4,0.0,0.053912,0.463972,0.17693,0.044154
5,1.0,0.875559,0.965412,0.902515,0.412261
6,0.0,0.004346,0.004183,0.004297,0.017574
7,0.0,0.05956,0.272705,0.123503,0.045653
8,0.0,0.026237,0.049704,0.033277,0.026706
9,0.0,0.049387,0.553939,0.200753,0.043246


In [33]:
evaluate_ensemble(df_train, col_pred='stacking')

[auc] model1:0.8869,model2:0.8743,  ->    emsamble]0.8879


In [34]:
sub = df_test.copy()    
sub = sub[['id', 'ensemble']].rename({'ensemble':'target_label'}, axis=1)
sub.to_csv('sub/submission_stack_w.csv', index=None, header=True,)

In [35]:
sub

Unnamed: 0,id,target_label
0,51359,0.111197
1,51360,0.034709
2,51361,0.277563
3,51362,0.061114
4,51363,0.332291
...,...,...
12835,64194,0.073713
12836,64195,0.117765
12837,64196,0.187775
12838,64197,0.952846
