In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
seed = 42
# Seed NumPy's global RNG and the stdlib `random` module so reruns are repeatable.
np.random.seed(seed)
python_random.seed(seed)
# NOTE(review): set_config presumably comes from the pycaret star-import and is
# called here before setup() has run — confirm the installed version allows that.
set_config('seed', seed)

In [2]:
# Folder that holds every input CSV.
PATH = './data/'

# Training inputs: error log, quality log and the problem table.
train_err, train_qua, train_prob = (
    pd.read_csv(PATH + fname)
    for fname in ('train_err_data.csv', 'train_quality_data.csv', 'train_problem_data.csv')
)

# Test inputs: only the error and quality logs are read for the test side.
test_err, test_qua = (
    pd.read_csv(PATH + fname)
    for fname in ('test_err_data.csv', 'test_quality_data.csv')
)

# Submission template; its user_id column drives the final prediction table.
sample_submssion = pd.read_csv(PATH + 'sample_submission.csv')

In [3]:
def chg_qua(x):
    """Collapse a value to a 0/1 flag.

    Returns 0 when ``x == 0`` and 1 for everything else. "Everything else"
    includes NaN and text such as ``'0'``, because neither compares equal to
    the integer 0.
    """
    return 0 if x == 0 else 1

In [None]:
# Show the last rows of each raw training table.
for frame in (train_err, train_qua, train_prob):
    display(frame.tail())

In [4]:
# Add the same three derived key columns to both error logs.
for err_log in (train_err, test_err):
    # 'time' is handled as text; its first 8 characters form the date key.
    err_log['date'] = err_log['time'].astype(str).str[:8]
    # Model name and firmware version joined into one key.
    err_log['model_fwver'] = err_log['model_nm'] + err_log['fwver']
    # Error type (as text) followed by the error code.
    err_log['errtype_code'] = err_log['errtype'].astype(str) + err_log['errcode']

key_cols = ['user_id', 'date', 'model_fwver', 'errtype_code']

# Distinct key combinations, shown for a quick sanity check.
train_err_0 = train_err[key_cols].drop_duplicates()
display(train_err_0.head())

test_err_0 = test_err[key_cols].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [5]:
# One row per user found in the problem table; chg_qua turns the per-user row
# count into a 0/1 flag.
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1['time'] = train_prob1['time'].map(chg_qua)
train_prob1.columns = ['user_id', 'prob']

# Attach the flag to every error-log row. fillna(0) is applied to the whole
# merged frame, so NaNs in every other column are replaced by 0 as well.
train_err = pd.merge(train_err, train_prob1, on='user_id', how='left').fillna(0)

In [6]:
# Rank each errtype_code by the sum of 'prob' over its log rows; tied sums share
# the lowest rank (method='min').
err_code = (
    train_err
    .groupby(['errtype_code'])['prob']
    .sum()
    .rank(method='min')
    .rename('err_code_rank')
    .reset_index()
)
err_code

Unnamed: 0,errtype_code,err_code_rank
0,0,940.0
1,10,2815.0
2,101,2853.0
3,111,2855.0
4,121,2856.0
...,...,...
2866,9C-14014,2727.0
2867,9V-21002,2737.0
2868,9V-21004,940.0
2869,9V-21005,2730.0


In [7]:
# Attach err_code_rank to every log row. Codes without a rank become 0 through
# fillna, which also replaces any other NaN in the frame.
train_err = pd.merge(train_err, err_code, on='errtype_code', how='left').fillna(0)
test_err = pd.merge(test_err, err_code, on='errtype_code', how='left').fillna(0)

In [None]:
# Inspect the training error log after the err_code_rank merge.
train_err

In [8]:
# Rank each date by the sum of 'prob' over that day's log rows; ties share the
# lowest rank.
date_rank = (
    train_err
    .groupby(['date'])['prob']
    .sum()
    .rank(method='min')
    .rename('date_rank')
    .reset_index()
)
date_rank.min(), date_rank.max()

(date         20201031
 date_rank           1
 dtype: object,
 date         20201202
 date_rank          33
 dtype: object)

In [9]:
# Attach date_rank to every log row; dates without a rank become 0 via fillna.
train_err = pd.merge(train_err, date_rank, on='date', how='left').fillna(0)
test_err = pd.merge(test_err, date_rank, on='date', how='left').fillna(0)
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,err_code_rank,date_rank
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,2869.0,31.0
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,2856.0,31.0
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,2855.0,31.0
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,2868.0,31.0
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,2864.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1,20201130,model_305.15.2138,151,0.0,2869.0,4.0
16554659,24999,20201130172625,model_3,05.15.2138,16,1,20201130,model_305.15.2138,161,0.0,2868.0,4.0
16554660,24999,20201130172625,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0
16554661,24999,20201130172631,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0


In [10]:
# Rank each model/firmware key by the sum of 'prob' over its log rows; ties
# share the lowest rank.
model_fwver_rank = (
    train_err
    .groupby(['model_fwver'])['prob']
    .sum()
    .rank(method='min')
    .rename('model_fwver_rank')
    .reset_index()
)
model_fwver_rank.min(), model_fwver_rank.max()

(model_fwver         model_004.22.1442
 model_fwver_rank                    1
 dtype: object,
 model_fwver         model_804.73.2571
 model_fwver_rank                   37
 dtype: object)

In [11]:
# Attach model_fwver_rank to every log row; keys without a rank become 0 via
# fillna.
train_err = pd.merge(train_err, model_fwver_rank, on='model_fwver', how='left').fillna(0)
test_err = pd.merge(test_err, model_fwver_rank, on='model_fwver', how='left').fillna(0)
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,err_code_rank,date_rank,model_fwver_rank
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,2869.0,31.0,32.0
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,2856.0,31.0,32.0
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,2855.0,31.0,32.0
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,2868.0,31.0,32.0
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,2864.0,31.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1,20201130,model_305.15.2138,151,0.0,2869.0,4.0,32.0
16554659,24999,20201130172625,model_3,05.15.2138,16,1,20201130,model_305.15.2138,161,0.0,2868.0,4.0,32.0
16554660,24999,20201130172625,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0,32.0
16554661,24999,20201130172631,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0,32.0


In [12]:
# Stack the (model_fwver, time) pairs of both logs so the ranking that follows
# is computed over train and test together.
model_time = pd.concat([train_err[['model_fwver','time']], test_err[['model_fwver','time']]])

In [13]:
# Rank each timestamp within its model_fwver group; equal timestamps share the
# lowest rank (method='min').
model_time['time_rank'] = model_time.groupby(['model_fwver'])['time'].rank(method='min')

In [14]:
# Keep one row per distinct (model_fwver, time, time_rank) combination.
model_time = model_time.drop_duplicates()

In [15]:
# Attach time_rank to the training log rows (no fillna is applied here).
train_err = train_err.merge(model_time, on=['model_fwver','time'], how='left')

In [16]:
# Attach time_rank to the test log rows (no fillna is applied here).
test_err = test_err.merge(model_time, on=['model_fwver','time'], how='left')

In [None]:
def _user_day_counts(err_log):
    """Summarise one error log per user.

    Returns a frame with ``user_id``, ``date_cnt`` (number of distinct dates
    with at least one row) and ``date_sum`` (total of the per-day counts of
    non-null ``time`` values).
    """
    per_day = err_log.groupby(['user_id', 'date']).count().reset_index()
    per_day = per_day[['user_id', 'date', 'time']]
    per_day.columns = ['user_id', 'date', 'date_cnt']

    per_user = (
        per_day
        .groupby('user_id')
        .agg({'date': 'count', 'date_cnt': 'sum'})
        .reset_index()
    )
    per_user.columns = ['user_id', 'date_cnt', 'date_sum']
    return per_user


train_err_1 = _user_day_counts(train_err)
display(train_err_1.head())

test_err_1 = _user_day_counts(test_err)
display(test_err_1.head())

train_err_1.shape

In [None]:
# Start the per-user feature tables as independent copies of the day-count
# summaries; later cells add columns to `train` and `test`.
train = train_err_1.copy()
test = test_err_1.copy()

In [None]:
# Only model/firmware keys seen in BOTH logs become feature columns.
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())
models = list(train_model.intersection(test_model))

# Create one zero-filled column per shared key in each feature table.
for frame in (train, test):
    for model in models:
        frame[model] = 0

train.shape, test.shape

In [None]:
# Number of log rows (non-null 'time' values) per (user_id, model_fwver) pair.
pair_keys = ['user_id', 'model_fwver']

train_err_2 = train_err.groupby(pair_keys)['time'].count().reset_index(name='model_fwver_cnt')
display(train_err_2.head())

test_err_2 = test_err.groupby(pair_keys)['time'].count().reset_index(name='model_fwver_cnt')
display(test_err_2.head())

In [None]:
# Fill the per-user model/firmware count columns in one vectorised step.
#
# train_err_2 / test_err_2 hold one row per (user_id, model_fwver) pair, so each
# can be pivoted into one row per user and one column per model_fwver. Compared
# with a per-row Python loop that masks the whole feature frame for every pair
# and accumulates with `+=`, this runs in a single pass and gives the same
# result however many times the cell is executed.
for feature_frame, pair_counts in ((train, train_err_2), (test, test_err_2)):
    if not models:
        # No shared model/firmware keys means there are no columns to fill.
        continue

    wide_counts = pair_counts.pivot(
        index='user_id', columns='model_fwver', values='model_fwver_cnt'
    )
    # Align rows with the feature frame and keep exactly the columns listed in
    # `models`; pairs that never occurred become 0. Keys that are not in
    # `models` are dropped, as they have no column in the feature frame.
    wide_counts = (
        wide_counts
        .reindex(index=feature_frame['user_id'].to_numpy(), columns=models)
        .fillna(0)
        .astype('int64')
    )
    # Positional assignment is safe: wide_counts was reindexed to the feature
    # frame's row order and to the column order of `models`.
    feature_frame[models] = wide_counts.to_numpy()

In [None]:
# Only error types seen in BOTH logs become feature columns.
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = list(train_error.intersection(test_error))

# Create one zero-filled 'E<errtype>' column per shared type in each table.
for frame in (train, test):
    for error in errors:
        frame['E' + str(error)] = 0

train.shape, test.shape

In [None]:
# Number of log rows (non-null 'time' values) per (user_id, errtype) pair.
type_keys = ['user_id', 'errtype']

train_err_3 = train_err.groupby(type_keys)['time'].count().reset_index(name='errtype_cnt')
display(train_err_3.head())

test_err_3 = test_err.groupby(type_keys)['time'].count().reset_index(name='errtype_cnt')
display(test_err_3.head())

In [None]:
# Fill the per-user 'E<errtype>' count columns in one vectorised step.
#
# train_err_3 / test_err_3 hold one row per (user_id, errtype) pair, so each can
# be pivoted into one row per user and one column per errtype. Compared with a
# per-row Python loop that masks the whole feature frame for every pair and
# accumulates with `+=`, this runs in a single pass and gives the same result
# however many times the cell is executed.
for feature_frame, type_counts in ((train, train_err_3), (test, test_err_3)):
    if not errors:
        # No shared error types means there are no columns to fill.
        continue

    wide_counts = type_counts.pivot(
        index='user_id', columns='errtype', values='errtype_cnt'
    )
    # Align rows with the feature frame and keep exactly the types listed in
    # `errors`; pairs that never occurred become 0.
    wide_counts = (
        wide_counts
        .reindex(index=feature_frame['user_id'].to_numpy(), columns=errors)
        .fillna(0)
        .astype('int64')
    )
    # Column names follow the 'E' + str(errtype) convention, built in the same
    # order as `errors` so the positional assignment lines up.
    errtype_cols = ['E' + str(error_type) for error_type in errors]
    feature_frame[errtype_cols] = wide_counts.to_numpy()

In [None]:
# Columns removed from the quality logs before aggregation.
dropped_qua_cols = ['quality_3', 'quality_4', 'time', 'fwver']

# Order matters: exact duplicate rows are removed while 'time' and 'fwver' are
# still present, then the columns are dropped and remaining NaNs become 0.
train_qua_0 = (
    train_qua
    .drop_duplicates()
    .drop(dropped_qua_cols, axis=1)
    .fillna(0)
)
test_qua_0 = (
    test_qua
    .drop_duplicates()
    .drop(dropped_qua_cols, axis=1)
    .fillna(0)
)

In [None]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

# Turn every listed quality column into a 0/1 flag with chg_qua.
# NOTE(review): chg_qua tests `x == 0`, so a column stored as text would map
# every entry (even '0') to 1 — confirm the dtypes of these columns.
for qua_frame in (train_qua_0, test_qua_0):
    for col in cols:
        qua_frame[col] = qua_frame[col].map(chg_qua)

In [None]:
# Per-user column sums of the cleaned quality tables, with user_id restored as
# a regular column.
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [None]:
# Left-join the per-user quality sums; users with no quality rows get 0 for
# those columns (fillna also replaces any other NaN in the frame).
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [None]:
# Write the feature tables to the working directory (without the index) so they
# can be reloaded by the next cell.
train.to_csv("./train.csv", index=False)
test.to_csv("./test.csv", index=False)

In [17]:
# Reload the cached feature tables from disk.
# NOTE(review): the feature-building cells above carry no execution count, so
# the saved outputs in this notebook appear to start from these files.
train  = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [18]:
# day_max: the largest number of log rows (non-null 'time') a user produced on
# any single date.
day_keys = ['user_id', 'date']

train_err_d1 = (
    train_err
    .groupby(day_keys)['time']
    .count()
    .groupby(level='user_id')
    .max()
    .rename('day_max')
    .reset_index()
)

test_err_d1 = (
    test_err
    .groupby(day_keys)['time']
    .count()
    .groupby(level='user_id')
    .max()
    .rename('day_max')
    .reset_index()
)

# No fillna here: users missing from the error log keep NaN in day_max.
train = pd.merge(train, train_err_d1, on='user_id', how='left')
test = pd.merge(test, test_err_d1, on='user_id', how='left')
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max
0,10000,30,317,0,0,0,0,0,0,0,...,0,4,0,4,0,4,4,0,0,20
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,1452
2,10002,29,306,0,0,0,0,0,0,0,...,2,22,4,22,0,22,22,1,0,17
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,24
4,10004,30,777,0,0,645,0,0,0,0,...,1,6,2,6,0,6,6,1,0,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,2,6,2,0,0,0,6,2,0,30
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
14997,24997,30,826,0,0,465,0,0,0,0,...,1,8,1,0,0,0,8,1,0,64
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [19]:
import datetime  # NOTE(review): not referenced in this cell; `datetime` is already imported as `dt` at the top.

# Day of week per log row, parsed from the 'date' text column
# (pandas numbering: 0 = Monday ... 6 = Sunday).
train_err['weekday'] = pd.to_datetime(train_err.date).dt.weekday
test_err['weekday'] = pd.to_datetime(test_err.date).dt.weekday

# One indicator column per weekday value (wd_0 ... wd_6), appended to the logs.
# NOTE(review): re-running this cell appends a second set of wd_* columns,
# because the logs are overwritten with the concatenated result.
train_err = pd.concat([train_err, pd.get_dummies(train_err['weekday'], prefix='wd')], axis=1)
test_err = pd.concat([test_err, pd.get_dummies(test_err['weekday'], prefix='wd')], axis=1)

# Per-user number of log rows on each weekday. The column list is hard-coded,
# so a log that lacks one of the seven weekdays would raise a KeyError here.
train_wd = train_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()
test_wd = test_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()

# user_id is the index of train_wd / test_wd; merge matches it by name.
# fillna(0) covers every column of the merged frame.
train = train.merge(train_wd, on='user_id', how='left').fillna(0)
test = test.merge(test_wd, on='user_id', how='left').fillna(0)

In [20]:
# Per-user sum and maximum of err_code_rank over all of the user's log rows.
err_rank_names = ['user_id', 'err_rank_sum', 'err_rank_max']

train_err_9 = train_err.groupby('user_id')['err_code_rank'].agg(['sum', 'max']).reset_index()
train_err_9.columns = err_rank_names

test_err_9 = test_err.groupby('user_id')['err_code_rank'].agg(['sum', 'max']).reset_index()
test_err_9.columns = err_rank_names

train = pd.merge(train, train_err_9, on='user_id', how='left').fillna(0)
test = pd.merge(test, test_err_9, on='user_id', how='left').fillna(0)

In [21]:
# Check the first log rows with the rank, weekday and indicator columns added.
train_err.head()

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,...,model_fwver_rank,time_rank,weekday,wd_0,wd_1,wd_2,wd_3,wd_4,wd_5,wd_6
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,...,32.0,13091.0,6,0,0,0,0,0,0,1
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,...,32.0,13590.0,6,0,0,0,0,0,0,1
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,...,32.0,13590.0,6,0,0,0,0,0,0,1
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,...,32.0,23520.0,6,0,0,0,0,0,0,1
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,...,32.0,23521.0,6,0,0,0,0,0,0,1


In [22]:
# Per-user sum and maximum of date_rank over all of the user's log rows.
date_rank_names = ['user_id', 'date_rank_sum', 'date_rank_max']

train_err_8 = train_err.groupby('user_id')['date_rank'].agg(['sum', 'max']).reset_index()
train_err_8.columns = date_rank_names

test_err_8 = test_err.groupby('user_id')['date_rank'].agg(['sum', 'max']).reset_index()
test_err_8.columns = date_rank_names

train = pd.merge(train, train_err_8, on='user_id', how='left').fillna(0)
test = pd.merge(test, test_err_8, on='user_id', how='left').fillna(0)

In [23]:
# Per-user sum and maximum of model_fwver_rank over all of the user's log rows.
fwver_rank_names = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

train_err_7 = train_err.groupby('user_id')['model_fwver_rank'].agg(['sum', 'max']).reset_index()
train_err_7.columns = fwver_rank_names

test_err_7 = test_err.groupby('user_id')['model_fwver_rank'].agg(['sum', 'max']).reset_index()
test_err_7.columns = fwver_rank_names

train = pd.merge(train, train_err_7, on='user_id', how='left').fillna(0)
test = pd.merge(test, test_err_7, on='user_id', how='left').fillna(0)

In [26]:
# Per-user minimum and maximum of time_rank over all of the user's log rows.
# NOTE(review): the saved outputs further down still list a 'time_rank_sum'
# column, so they were produced by an earlier version of this cell.
time_rank_names = ['user_id', 'time_rank_min', 'time_rank_max']

train_err_6 = train_err.groupby('user_id')['time_rank'].agg(['min', 'max']).reset_index()
train_err_6.columns = time_rank_names

test_err_6 = test_err.groupby('user_id')['time_rank'].agg(['min', 'max']).reset_index()
test_err_6.columns = time_rank_names

train = pd.merge(train, train_err_6, on='user_id', how='left').fillna(0)
test = pd.merge(test, test_err_6, on='user_id', how='left').fillna(0)

In [27]:
# Preview both feature tables, then compare their shapes.
for feature_table in (train, test):
    display(feature_table.head())

train.shape, test.shape

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_5,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,time_rank_sum,time_rank_max
0,10000,30,317,0,0,0,0,0,0,0,...,54.0,52.0,907345.0,2869.0,5840.0,33.0,10144.0,32.0,13091.0,3695261.0
1,10001,30,2365,0,0,0,0,0,379,0,...,117.0,139.0,6763505.0,2870.0,25937.0,33.0,81259.0,35.0,13563.0,5096898.0
2,10002,29,306,0,0,0,0,0,0,0,...,42.0,58.0,876071.0,2869.0,5764.0,33.0,9792.0,32.0,13562.0,3694745.0
3,10003,30,306,0,0,0,0,0,81,0,...,22.0,57.0,870419.0,2871.0,5388.0,33.0,10386.0,35.0,141675.0,5053042.0
4,10004,30,777,0,0,645,0,0,0,0,...,84.0,108.0,2226738.0,2871.0,14420.0,33.0,27576.0,36.0,27219.0,5764312.0


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_5,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,time_rank_sum,time_rank_max
0,30000,29,2750,0,0,0,0,0,0,0,...,1024.0,510.0,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,20289.0,9304648.0
1,30001,28,284,0,0,0,0,0,0,0,...,54.0,56.0,812544.0,2869.0,5767.0,33.0,9088.0,32.0,19621.0,3700260.0
2,30002,30,941,0,0,733,0,0,0,0,...,140.0,135.0,2694983.0,2871.0,16311.0,33.0,33252.0,36.0,7637.0,5807641.0
3,30003,28,371,0,0,246,0,0,0,0,...,24.0,71.0,1061508.0,2871.0,6783.0,33.0,12981.0,36.0,267623.0,5399825.0
4,30004,30,881,0,0,0,0,0,0,0,...,102.0,157.0,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,7648.0,8524743.0


((15000, 102), (14998, 102))

In [None]:
# Correlation heatmap of the per-user quality sums.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax)

In [None]:
# Correlation heatmap of the full training feature table.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax)

In [None]:
# List the column names of the test feature table.
test.columns

In [None]:
# Display the training feature table.
train

In [None]:
# Column dtypes and non-null counts of the test feature table.
test.info()

In [None]:
# Every test column except the first one, as a plain list.
cols = list(test.columns[1:])

In [None]:
# Per-column minimum and maximum of the training features.
# NOTE(review): these names shadow the built-in min() and max() for the rest of
# the kernel session.
min = train[cols].min()
max  = train[cols].max()

In [None]:
# Min-max scale every feature column using statistics from the training set
# only, so train and test share the same transformation.
#
# The statistics are computed here from `train[cols]` instead of relying on
# module-level variables, and arithmetic is aligned by column label rather than
# by position.
col_min = train[cols].min()
col_span = train[cols].max() - col_min
# A column that is constant in train has a span of 0; dividing by it would fill
# the column with NaN/inf. Dividing by 1 instead leaves such a column at 0 in
# train and at its offset from the training constant in test.
col_span = col_span.replace(0, 1)

train[cols] = (train[cols] - col_min) / col_span
test[cols] = (test[cols] - col_min) / col_span

In [28]:
# Rebuild the per-user 0/1 problem flag from the problem table.
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1['time'] = train_prob1['time'].map(chg_qua)
train_prob1.columns = ['user_id', 'prob']

# Users absent from the problem table get prob = 0. fillna(0) also replaces any
# NaN left in the feature columns.
train = pd.merge(train, train_prob1, on='user_id', how='left').fillna(0)

# The left join makes 'prob' float; store it as an integer label.
train['prob'] = train['prob'].astype(int)

In [None]:
# Column dtypes and non-null counts of the training table, label included.
train.info()

In [33]:
# Every column except the first and the last one.
train.columns[1:-1]

Index(['date_cnt', 'date_sum', 'model_403.11.1167', 'model_804.73.2571',
       'model_004.22.1750', 'model_610', 'model_68.5.3', 'model_204.33.1185',
       'model_705.66.3237', 'model_104.16.3571',
       ...
       'wd_5', 'wd_6', 'err_rank_sum', 'err_rank_max', 'date_rank_sum',
       'date_rank_max', 'model_fwver_rank_sum', 'model_fwver_rank_max',
       'time_rank_sum', 'time_rank_max'],
      dtype='object', length=101)

In [43]:
%%time
clf = setup(session_id=seed, data=train, target='prob'
           , numeric_features=train.columns[1:-1]
           , ignore_features=['err_rank_sum']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 103)"
4,Missing Values,False
5,Numeric Features,102
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 4.58 s


In [44]:
# Compare the candidate classifiers, sorted by AUC, skipping the model ids in
# `exclude`; n_select = 5 asks for the five best models, which the next cell
# passes on as an estimator list.
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7941,0.8186,0.504,0.8057,0.62,0.4889,0.5142,13.5725
1,Gradient Boosting Classifier,0.7931,0.8166,0.4834,0.8231,0.6087,0.4809,0.5125,5.4439
2,Light Gradient Boosting Machine,0.7908,0.8135,0.518,0.7808,0.6227,0.4861,0.5057,0.6955
3,Extra Trees Classifier,0.7914,0.8059,0.5017,0.7974,0.6158,0.4827,0.5071,0.6224
4,Ada Boost Classifier,0.782,0.8023,0.5026,0.7623,0.6054,0.4636,0.4829,1.2274
5,Extreme Gradient Boosting,0.7774,0.7968,0.5289,0.7292,0.6128,0.4622,0.4741,2.3419
6,Random Forest Classifier,0.7597,0.7668,0.4503,0.7239,0.5549,0.4025,0.4239,0.1269
7,Linear Discriminant Analysis,0.7491,0.7427,0.3294,0.8007,0.4667,0.3381,0.3952,0.1632


In [45]:
# Combine the selected models into one soft-voting ensemble, evaluated with
# 5-fold cross-validation.
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7852,0.8103,0.4814,0.7929,0.5991,0.4642,0.4911
1,0.8048,0.833,0.5257,0.8251,0.6422,0.5169,0.5417
2,0.7933,0.8228,0.4971,0.8093,0.6159,0.4854,0.5123
3,0.8029,0.8351,0.4957,0.8505,0.6264,0.5048,0.5387
4,0.7856,0.8022,0.49,0.7867,0.6039,0.4676,0.4922
Mean,0.7944,0.8207,0.498,0.8129,0.6175,0.4878,0.5152
SD,0.0083,0.0127,0.0149,0.0231,0.0156,0.0205,0.0218


In [46]:
# Score the blended model with no explicit data argument.
# NOTE(review): presumably this evaluates on PyCaret's internal hold-out split — confirm.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7927,0.8218,0.4833,0.8211,0.6085,0.4801,0.5112


In [34]:
%%time
# Finalise the blended model before predicting on the test users.
# NOTE(review): presumably finalize_model refits on the full training data — confirm.
# NOTE(review): this cell's saved execution count (34) is lower than that of the
# blend cell (45), so the stored outputs were not produced top to bottom.
final_model = finalize_model(blended)

Wall time: 3min 48s


In [35]:
# Build the prediction input in the submission file's user order. Users with no
# row in `test` are kept by the left join and get 0 for every feature.
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 102)

In [36]:
# Predict for every submission user with the finalised model.
predictions = predict_model(final_model, data = test_x)

In [37]:
# Copy the predicted score into the submission table; the assignment aligns the
# two frames on their row index.
# NOTE(review): confirm that 'Score' is the probability of prob == 1 in the
# installed PyCaret version (saved values below 0.5 suggest it is).
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.8488
1,30001,0.2298
2,30002,0.3197
3,30003,0.7273
4,30004,0.8144


In [38]:
# Write the submission file without the index.
# NOTE(review): nothing in this notebook creates ./submission/ — make sure the
# folder exists before running.
sample_submssion.to_csv("./submission/submission_20210203-1-1.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.8488
1,30001,0.2298
2,30002,0.3197
3,30003,0.7273
4,30004,0.8144
...,...,...
14994,44994,0.3187
14995,44995,0.2540
14996,44996,0.5401
14997,44997,0.7650
