In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
np.random.seed(42)
python_random.seed(42)
set_config('seed', 42)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [None]:
train_err

In [3]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(train_err_0.head())

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_err['model_fwver'] = test_err.model_nm + test_err.fwver
test_err['errtype_code'] = test_err.errtype.astype(str) + test_err.errcode

test_err_0 = test_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [4]:
train_err.shape, train_err_0.shape

((16554663, 9), (3480961, 4))

In [5]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(train_err_1.head())

test_err_1 = test_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
test_err_1.columns = ['user_id','date','date_cnt']
test_err_1 = test_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
test_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(test_err_1.head())

train_err_1.shape

Unnamed: 0,user_id,date_cnt,date_sum
0,10000,30,317
1,10001,30,2365
2,10002,29,306
3,10003,30,306
4,10004,30,777


Unnamed: 0,user_id,date_cnt,date_sum
0,30000,29,2750
1,30001,28,284
2,30002,30,941
3,30003,28,371
4,30004,30,881


(15000, 3)

In [6]:
train = train_err_1.copy()
test = test_err_1.copy()

In [7]:
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())
models = train_model & test_model
models = list(models)

In [8]:
for model in models:
    train[model] = 0
    test[model] = 0
    
train.shape, test.shape    

((15000, 34), (14998, 34))

In [None]:
train

In [9]:
train_err_2 = train_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
train_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
#train_err_2['model_fwver_rank'] = train_err_2.model_fwver_cnt.rank()
display(train_err_2.head())

test_err_2 = test_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
test_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
#test_err_2['model_fwver_rank'] = test_err_2.model_fwver_cnt.rank()
display(test_err_2.head())

Unnamed: 0,user_id,model_fwver,model_fwver_cnt
0,10000,model_305.15.2138,317
1,10001,model_204.33.1185,379
2,10001,model_204.33.1261,1986
3,10002,model_305.15.2138,306
4,10003,model_204.33.1185,81


Unnamed: 0,user_id,model_fwver,model_fwver_cnt
0,30000,model_104.16.3553,2320
1,30000,model_104.16.3571,367
2,30000,model_204.33.1261,63
3,30001,model_305.15.2138,284
4,30002,model_004.22.1750,733


In [10]:
for idx, col in tqdm(enumerate(train_err_2.values)):
    i = train[train.user_id == train_err_2.loc[idx].user_id].index
    c = train.columns[train.columns == train_err_2.loc[idx].model_fwver]
    
    train.loc[i, c] += train_err_2.loc[idx].model_fwver_cnt    

for idx, col in tqdm(enumerate(test_err_2.values)):
    i = test[test.user_id == test_err_2.loc[idx].user_id].index
    c = test.columns[test.columns == test_err_2.loc[idx].model_fwver]
    
    test.loc[i, c] += test_err_2.loc[idx].model_fwver_cnt    

24062it [01:08, 352.81it/s]
24096it [01:08, 351.86it/s]


In [11]:
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = train_error & test_error
errors = list(errors)

In [12]:
for error in errors:
    train[error] = 0
    test[error] = 0
    
train.shape, test.shape    

((15000, 75), (14998, 75))

In [13]:
# train_err > errtype => rank
train_err_3 = train_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
train_err_3.columns = ['user_id','errtype','errtype_cnt']
#train_err_3['errtype_rank'] = train_err_3.errtype_cnt.rank()
display(train_err_3.head())

# test_err > errtype => rank
test_err_3 = test_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
test_err_3.columns = ['user_id','errtype','errtype_cnt']
#test_err_3['errtype_rank'] = test_err_3.errtype_cnt.rank()
display(test_err_3.head())

Unnamed: 0,user_id,errtype,errtype_cnt
0,10000,3,8
1,10000,4,104
2,10000,6,1
3,10000,7,1
4,10000,10,7


Unnamed: 0,user_id,errtype,errtype_cnt
0,30000,5,62
1,30000,6,1
2,30000,7,1
3,30000,11,16
4,30000,12,16


In [14]:
for idx, col in tqdm(enumerate(train_err_3.values)):
    i = train[train.user_id == train_err_3.loc[idx].user_id].index
    c = train.columns[train.columns == train_err_3.loc[idx].errtype]
    
    train.loc[i, c] += train_err_3.loc[idx].errtype_cnt    

for idx, col in tqdm(enumerate(test_err_3.values)):
    i = test[test.user_id == test_err_3.loc[idx].user_id].index
    c = test.columns[test.columns == test_err_3.loc[idx].errtype]
    
    test.loc[i, c] += test_err_3.loc[idx].errtype_cnt    

231039it [11:12, 343.65it/s]
231241it [11:10, 344.89it/s]


In [None]:
train

In [None]:
# train_err > errtype_code => rank
train_err_4 = train_err.groupby(['model_fwver','errtype_code']).count().reset_index()[['model_fwver','errtype_code','user_id']]
train_err_4.columns = ['model_fwver','errtype_code','cnt']
train_err_4['rank'] = train_err_4.cnt.rank()
display(train_err_4.head())

In [None]:
# test_err > errtype_code => rank
test_err_4 = test_err.groupby(['model_fwver','errtype_code']).count().reset_index()[['model_fwver','errtype_code','user_id']]
test_err_4.columns = ['model_fwver','errtype_code','cnt']
test_err_4['rank'] = test_err_4.cnt.rank()
display(test_err_4.head())

In [None]:
train = train_err.merge(train_err_1, on=['user_id'], how='left').merge(train_err_4, on=['model_fwver','errtype_code'], how='left')#.merge(train_err_3, on='errtype_code', how='left')
train = train[['user_id','date_cnt','date_sum','model_fwver','errtype_code','cnt','rank']].drop_duplicates()
train = train.groupby(['user_id','date_cnt','date_sum']).agg({'errtype_code':['count'],'cnt':['sum'],'rank':['max']}).reset_index()
train.columns = ['user_id','date_cnt','date_sum','errtype_code_cnt','cnt_sum','rank_max']
train

In [None]:
test = test_err.merge(test_err_1, on=['user_id'], how='left').merge(test_err_4, on=['model_fwver','errtype_code'], how='left')#.merge(test_err_3, on='errtype_code', how='left')
test = test[['user_id','date_cnt','date_sum','model_fwver','errtype_code','cnt','rank']].drop_duplicates()
test = test.groupby(['user_id','date_cnt','date_sum']).agg({'errtype_code':['count'],'cnt':['sum'],'rank':['max']}).reset_index()
test.columns = ['user_id','date_cnt','date_sum','errtype_code_cnt','cnt_sum','rank_max']
test

In [15]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)
train_qua_0

Unnamed: 0,user_id,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,10000,0.0,0,0.0,0,0,0,0,0,4,0,0
6,10000,0.0,0,0.0,4,0,0,0,0,4,0,0
12,10000,0.0,0,0.0,0,0,0,0,0,8,0,0
22,10000,0.0,0,0.0,8,0,0,0,0,8,0,0
24,10002,0.0,0,0.0,0,0,0,0,1,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
828612,24997,0.0,0,0.0,0,0,0,0,0,17,0,0
828615,24997,0.0,0,0.0,2,0,0,0,0,17,0,0
828619,24997,0.0,0,0.0,1,0,0,0,0,17,0,0
828621,24997,0.0,0,0.0,3,0,0,0,0,17,0,0


In [16]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [17]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [18]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()
train_qua_1

Unnamed: 0,user_id,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,10000,0,0,0,4,0,4,0,4,4,0,0
1,10002,2,1,2,22,4,22,0,22,22,1,0
2,10004,1,1,1,6,2,6,0,6,6,1,0
3,10005,1,1,1,6,3,6,0,6,6,1,0
4,10006,0,0,0,6,1,6,0,6,6,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...
8276,24990,0,1,1,5,1,0,0,0,5,1,0
8277,24992,2,2,2,9,2,0,0,0,9,2,0
8278,24993,2,2,2,6,2,0,0,0,6,2,0
8279,24995,2,2,2,6,2,0,0,0,6,2,0


%%time
for col in cols:
    train_qua_1[col] = train_qua_1[col].apply(chg_qua)
    test_qua_1[col] = test_qua_1[col].apply(chg_qua)

In [None]:
cols=['quality_0', 'quality_1','quality_2','quality_5',  'quality_6','quality_7', 'quality_10',]

In [None]:
train_qua_1.drop(cols, axis=1, inplace=True)
test_qua_1.drop(cols, axis=1, inplace=True)

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(test_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [19]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [20]:
train_err_5 = train_err.groupby('model_fwver').count().reset_index()[['model_fwver','user_id']]
train_err_5.columns = ['model_fwver','model_fwver_cnt']
train_err_5['model_fwver_rank'] = train_err_5.model_fwver_cnt.rank()
display(train_err_5.head())

test_err_5 = test_err.groupby('model_fwver').count().reset_index()[['model_fwver','user_id']]
test_err_5.columns = ['model_fwver','model_fwver_cnt']
test_err_5['model_fwver_rank'] = test_err_5.model_fwver_cnt.rank()
display(test_err_5.head())

Unnamed: 0,model_fwver,model_fwver_cnt,model_fwver_rank
0,model_004.22.1442,2522,21.0
1,model_004.22.1656,39,7.0
2,model_004.22.1666,5,1.0
3,model_004.22.1684,5554,25.0
4,model_004.22.1750,2874213,36.0


Unnamed: 0,model_fwver,model_fwver_cnt,model_fwver_rank
0,model_004.22.1170,817,18.0
1,model_004.22.1448,840,21.0
2,model_004.22.1478,535,15.0
3,model_004.22.1608,12,2.0
4,model_004.22.1656,835,19.0


In [21]:
# train_err > errtype_code => rank
train_err_6 = train_err.groupby('errtype_code').count().reset_index()[['errtype_code','user_id']]
train_err_6.columns = ['errtype_code','errtype_code_cnt']
train_err_6['errtype_code_rank'] = train_err_6.errtype_code_cnt.rank()
display(train_err_6.head())

# test_err > errtype_code => rank
test_err_6 = test_err.groupby('errtype_code').count().reset_index()[['errtype_code','user_id']]
test_err_6.columns = ['errtype_code','errtype_code_cnt']
test_err_6['errtype_code_rank'] = test_err_6.errtype_code_cnt.rank()
display(test_err_6.head())

Unnamed: 0,errtype_code,errtype_code_cnt,errtype_code_rank
0,10,21079,2825.0
1,101,133403,2851.0
2,111,307030,2854.0
3,121,320181,2855.0
4,131,22843,2828.0


Unnamed: 0,errtype_code,errtype_code_cnt,errtype_code_rank
0,10,21381,2980.0
1,101,149008,3002.0
2,111,327050,3008.0
3,121,304129,3005.0
4,131,13944,2974.0


In [None]:
train

In [22]:
train_7 = train_err.merge(train_err_5, on='model_fwver', how='left').merge(train_err_6, on='errtype_code', how='left')
train_7 = train_7[['user_id','model_fwver_rank','errtype_code_rank']].drop_duplicates()
train_7 = train_7.groupby(['user_id']).agg({'model_fwver_rank':['max'],'errtype_code_rank':['max']}).reset_index()
train_7.columns = ['user_id','model_fwver_max','errtype_code_max']
train_7

Unnamed: 0,user_id,model_fwver_max,errtype_code_max
0,10000,34.0,2868.0
1,10001,35.0,2870.0
2,10002,34.0,2868.0
3,10003,35.0,2870.0
4,10004,36.0,2870.0
...,...,...,...
14995,24995,35.0,2870.0
14996,24996,34.0,2860.0
14997,24997,36.0,2870.0
14998,24998,36.0,2870.0


In [23]:
test_7 = test_err.merge(test_err_5, on='model_fwver', how='left').merge(test_err_6, on='errtype_code', how='left')
test_7 = test_7[['user_id','model_fwver_rank','errtype_code_rank']].drop_duplicates()
test_7 = test_7.groupby(['user_id']).agg({'model_fwver_rank':['max'],'errtype_code_rank':['max']}).reset_index()
test_7.columns = ['user_id','model_fwver_max','errtype_code_max']
test_7

Unnamed: 0,user_id,model_fwver_max,errtype_code_max
0,30000,40.0,3021.0
1,30001,37.0,3019.0
2,30002,39.0,3021.0
3,30003,39.0,3021.0
4,30004,40.0,3021.0
...,...,...,...
14993,44994,40.0,3021.0
14994,44995,39.0,3019.0
14995,44996,39.0,3021.0
14996,44997,39.0,3021.0


In [24]:
train['model_fwver_max'] = train_7['model_fwver_max'] 
train['errtype_code_max'] = train_7['errtype_code_max'] 

test['model_fwver_max'] = test_7['model_fwver_max'] 
test['errtype_code_max'] = test_7['errtype_code_max'] 

In [25]:
train

Unnamed: 0,user_id,date_cnt,date_sum,model_610,model_204.33.1171,model_305.15.2138,model_403.11.1167,model_705.66.3237,model_104.16.3439,model_804.73.2237,...,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,model_fwver_max,errtype_code_max
0,10000,30,317,0,0,317,0,0,0,0,...,4.0,0.0,4.0,0.0,4.0,4.0,0.0,0.0,34.0,2868.0
1,10001,30,2365,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,2870.0
2,10002,29,306,0,0,306,0,0,0,0,...,22.0,4.0,22.0,0.0,22.0,22.0,1.0,0.0,34.0,2868.0
3,10003,30,306,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,2870.0
4,10004,30,777,0,0,0,0,0,0,0,...,6.0,2.0,6.0,0.0,6.0,6.0,1.0,0.0,36.0,2870.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,6.0,2.0,0.0,0.0,0.0,6.0,2.0,0.0,35.0,2870.0
14996,24996,1,4,0,0,4,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,34.0,2860.0
14997,24997,30,826,0,0,0,0,0,0,0,...,8.0,1.0,0.0,0.0,0.0,8.0,1.0,0.0,36.0,2870.0
14998,24998,21,155,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,36.0,2870.0


In [26]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

Unnamed: 0,user_id,prob
0,10001,1
1,10004,1
2,10005,1
3,10006,1
4,10008,1
...,...,...
4995,24983,1
4996,24984,1
4997,24990,1
4998,24997,1


In [27]:
train = train.merge(train_prob1, on='user_id', how='left').fillna(0)

In [28]:
display(train.head())
display(test.head())

Unnamed: 0,user_id,date_cnt,date_sum,model_610,model_204.33.1171,model_305.15.2138,model_403.11.1167,model_705.66.3237,model_104.16.3439,model_804.73.2237,...,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,model_fwver_max,errtype_code_max,prob
0,10000,30,317,0,0,317,0,0,0,0,...,0.0,4.0,0.0,4.0,4.0,0.0,0.0,34.0,2868.0,0.0
1,10001,30,2365,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,2870.0,1.0
2,10002,29,306,0,0,306,0,0,0,0,...,4.0,22.0,0.0,22.0,22.0,1.0,0.0,34.0,2868.0,0.0
3,10003,30,306,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,35.0,2870.0,0.0
4,10004,30,777,0,0,0,0,0,0,0,...,2.0,6.0,0.0,6.0,6.0,1.0,0.0,36.0,2870.0,1.0


Unnamed: 0,user_id,date_cnt,date_sum,model_610,model_204.33.1171,model_305.15.2138,model_403.11.1167,model_705.66.3237,model_104.16.3439,model_804.73.2237,...,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,model_fwver_max,errtype_code_max
0,30000,29,2750,0,0,0,0,0,0,0,...,2.0,1.0,2.0,0.0,2.0,2.0,0.0,0.0,40.0,3021.0
1,30001,28,284,0,0,284,0,0,0,0,...,10.0,1.0,10.0,0.0,10.0,10.0,1.0,0.0,37.0,3019.0
2,30002,30,941,0,0,0,0,0,0,0,...,26.0,3.0,26.0,0.0,26.0,26.0,3.0,0.0,39.0,3021.0
3,30003,28,371,0,0,0,0,0,0,0,...,13.0,5.0,13.0,0.0,13.0,13.0,0.0,0.0,39.0,3021.0
4,30004,30,881,0,0,0,0,0,0,0,...,5.0,3.0,5.0,0.0,5.0,5.0,1.0,0.0,40.0,3021.0


In [None]:
train.shape, test.shape

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
test.columns

In [None]:
cols = ['date_cnt', 'date_sum', 'model_fwver_cnt', 'model_fwver_max','errtype_code_max']

In [None]:
min = train[cols].min()
max  = train[cols].max()

In [None]:
for i, col in enumerate(cols):
    train[col] = (train[col] - min[i]) / (max[i] - min[i])
    test[col] = (test[col] - min[i]) / (max[i] - min[i])

In [35]:
train.prob = train.prob.astype(int)

In [None]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']
train[cols] = train[cols].astype(int)
test[cols] = test[cols].astype(int)

In [None]:
train.info()

In [33]:
train.columns[1:88]

Index([         'date_cnt',          'date_sum',         'model_610',
       'model_204.33.1171', 'model_305.15.2138', 'model_403.11.1167',
       'model_705.66.3237', 'model_104.16.3439', 'model_804.73.2237',
       'model_305.15.2114', 'model_204.33.1261', 'model_204.33.1185',
       'model_504.82.1684',      'model_68.5.3', 'model_804.73.2571',
       'model_104.16.3571', 'model_403.11.1141', 'model_403.11.1149',
       'model_305.15.3104', 'model_004.22.1656', 'model_705.66.3571',
       'model_104.16.3569', 'model_004.22.1684', 'model_204.33.1149',
       'model_204.33.1125', 'model_004.22.1778', 'model_305.15.2092',
       'model_504.82.1778', 'model_004.22.1750', 'model_305.15.2120',
       'model_104.16.3553', 'model_504.82.1730', 'model_004.22.1666',
                         1,                   2,                   3,
                         4,                   5,                   6,
                         7,                   8,                   9,
                    

In [36]:
%%time
clf = setup(session_id=42, data=train, target='prob'
           , numeric_features=train.columns[1:88])

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 89)"
4,Missing Values,False
5,Numeric Features,88
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 6.22 s


In [37]:
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7945,0.8166,0.5046,0.8065,0.6206,0.4897,0.5152,11.4967
1,Gradient Boosting Classifier,0.7921,0.8134,0.4763,0.8266,0.604,0.4766,0.5101,3.5041
2,Light Gradient Boosting Machine,0.7918,0.8098,0.5134,0.7881,0.6217,0.4868,0.5081,0.5513
3,Extra Trees Classifier,0.7883,0.8011,0.504,0.7835,0.6133,0.4769,0.4989,0.6314
4,Ada Boost Classifier,0.7794,0.7986,0.4894,0.764,0.5964,0.4546,0.476,0.9148
5,Extreme Gradient Boosting,0.7802,0.7962,0.5309,0.7359,0.6166,0.4682,0.4806,1.8915
6,Random Forest Classifier,0.7684,0.7729,0.4717,0.7391,0.5757,0.4269,0.4474,0.1232
7,Linear Discriminant Analysis,0.7509,0.7409,0.3317,0.808,0.4702,0.3427,0.4008,0.175


In [38]:
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7843,0.806,0.4771,0.7933,0.5959,0.4609,0.4886
1,0.8095,0.8296,0.5271,0.8425,0.6485,0.5272,0.5544
2,0.7905,0.8165,0.49,0.8052,0.6092,0.4774,0.5049
3,0.8033,0.8305,0.4929,0.8561,0.6256,0.505,0.5404
4,0.7885,0.8017,0.4871,0.8005,0.6057,0.4726,0.4998
Mean,0.7952,0.8169,0.4949,0.8195,0.617,0.4886,0.5176
SD,0.0096,0.0118,0.017,0.025,0.0184,0.0241,0.0253


In [39]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7934,0.8172,0.488,0.8188,0.6115,0.4828,0.5128


In [40]:
%%time
final_model = finalize_model(blended)

Wall time: 3min 14s


In [41]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 88)

In [42]:
predictions = predict_model(final_model, data = test_x)

In [43]:
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.8379
1,30001,0.2808
2,30002,0.3719
3,30003,0.7762
4,30004,0.8101


In [44]:
sample_submssion.to_csv("./submission/submission_20210128-4.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.8379
1,30001,0.2808
2,30002,0.3719
3,30003,0.7762
4,30004,0.8101
...,...,...
14994,44994,0.3240
14995,44995,0.3523
14996,44996,0.5436
14997,44997,0.7590
