In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
seed = 42
np.random.seed(seed)
python_random.seed(seed)
set_config('seed', seed)

In [86]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [3]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [4]:
display(train_err.tail())
display(train_qua.tail())
display(train_prob.tail())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
16554658,24999,20201130163051,model_3,05.15.2138,15,1
16554659,24999,20201130172625,model_3,05.15.2138,16,1
16554660,24999,20201130172625,model_3,05.15.2138,4,0
16554661,24999,20201130172631,model_3,05.15.2138,4,0
16554662,24999,20201130210625,model_3,05.15.2138,15,1


Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
828619,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,1,0,0,0,0,17,0,0
828620,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828621,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,3,0,0,0,0,17,0,0
828622,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828623,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,9,0,0,0,0,17,0,0


Unnamed: 0,user_id,time
5424,20167,20201125120000
5425,16270,20201110120000
5426,19114,20201106230000
5427,21505,20201104110000
5428,18822,20201102120000


In [5]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(train_err_0.head())

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_err['model_fwver'] = test_err.model_nm + test_err.fwver
test_err['errtype_code'] = test_err.errtype.astype(str) + test_err.errcode

test_err_0 = test_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [6]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

train_err = train_err.merge(train_prob1, on='user_id', how='left').fillna(0)

In [7]:
err_code = train_err.groupby(['errtype_code'])['prob'].sum().rank(method='min').reset_index()
err_code.columns = ['errtype_code','err_code_rank']
err_code

Unnamed: 0,errtype_code,err_code_rank
0,0,940.0
1,10,2815.0
2,101,2853.0
3,111,2855.0
4,121,2856.0
...,...,...
2866,9C-14014,2727.0
2867,9V-21002,2737.0
2868,9V-21004,940.0
2869,9V-21005,2730.0


In [8]:
train_err = train_err.merge(err_code, on='errtype_code', how='left').fillna(0)
test_err = test_err.merge(err_code, on='errtype_code', how='left').fillna(0)

In [9]:
date_rank = train_err.groupby(['date'])['prob'].sum().rank(method='min').reset_index()
date_rank.columns = ['date','date_rank']
date_rank.min(), date_rank.max()

(date         20201031
 date_rank           1
 dtype: object,
 date         20201202
 date_rank          33
 dtype: object)

In [10]:
train_err = train_err.merge(date_rank, on='date', how='left').fillna(0)
test_err = test_err.merge(date_rank, on='date', how='left').fillna(0)

In [11]:
model_fwver_rank = train_err.groupby(['model_fwver'])['prob'].sum().rank(method='min').reset_index()
model_fwver_rank.columns = ['model_fwver','model_fwver_rank']
model_fwver_rank.min(), model_fwver_rank.max()

(model_fwver         model_004.22.1442
 model_fwver_rank                    1
 dtype: object,
 model_fwver         model_804.73.2571
 model_fwver_rank                   37
 dtype: object)

In [12]:
train_err = train_err.merge(model_fwver_rank, on='model_fwver', how='left').fillna(0)
test_err = test_err.merge(model_fwver_rank, on='model_fwver', how='left').fillna(0)

In [None]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(train_err_1.head())

test_err_1 = test_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
test_err_1.columns = ['user_id','date','date_cnt']
test_err_1 = test_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
test_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(test_err_1.head())

train_err_1.shape

In [None]:
train = train_err_1.copy()
test = test_err_1.copy()

In [None]:
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())
models = train_model & test_model
models = list(models)

for model in models:
    train[model] = 0
    test[model] = 0
    
train.shape, test.shape    

In [None]:
train_err_2 = train_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
train_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
display(train_err_2.head())

test_err_2 = test_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
test_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
display(test_err_2.head())

In [None]:
for idx, col in tqdm(enumerate(train_err_2.values)):
    i = train[train.user_id == train_err_2.loc[idx].user_id].index
    c = train.columns[train.columns == train_err_2.loc[idx].model_fwver]
    
    train.loc[i, c] += train_err_2.loc[idx].model_fwver_cnt    

for idx, col in tqdm(enumerate(test_err_2.values)):
    i = test[test.user_id == test_err_2.loc[idx].user_id].index
    c = test.columns[test.columns == test_err_2.loc[idx].model_fwver]
    
    test.loc[i, c] += test_err_2.loc[idx].model_fwver_cnt    

In [None]:
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = train_error & test_error
errors = list(errors)

for error in errors:
    train['E'+str(error)] = 0
    test['E'+str(error)] = 0
    
train.shape, test.shape    

In [None]:
train_err_3 = train_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
train_err_3.columns = ['user_id','errtype','errtype_cnt']
display(train_err_3.head())

test_err_3 = test_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
test_err_3.columns = ['user_id','errtype','errtype_cnt']
display(test_err_3.head())

In [None]:
for idx, col in tqdm(enumerate(train_err_3.values)):
    i = train[train.user_id == train_err_3.loc[idx].user_id].index
    c = train.columns[train.columns == 'E'+str(train_err_3.loc[idx].errtype)]
    
    train.loc[i, c] += train_err_3.loc[idx].errtype_cnt    

for idx, col in tqdm(enumerate(test_err_3.values)):
    i = test[test.user_id == test_err_3.loc[idx].user_id].index
    c = test.columns[test.columns == 'E'+str(test_err_3.loc[idx].errtype)]
    
    test.loc[i, c] += test_err_3.loc[idx].errtype_cnt    

In [None]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [None]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [None]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [None]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [None]:
train.to_csv("./train.csv", index=False)
test.to_csv("./test.csv", index=False)

In [13]:
train  = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [14]:
train_err_d1 = train_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
train_err_d1.columns = ['user_id', 'day_max']
train_err_d1

test_err_d1 = test_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
test_err_d1.columns = ['user_id', 'day_max']
test_err_d1

train = train.merge(train_err_d1, on='user_id', how='left')
test = test.merge(test_err_d1, on='user_id', how='left')
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max
0,10000,30,317,0,0,0,0,0,0,0,...,0,4,0,4,0,4,4,0,0,20
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,1452
2,10002,29,306,0,0,0,0,0,0,0,...,2,22,4,22,0,22,22,1,0,17
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,24
4,10004,30,777,0,0,645,0,0,0,0,...,1,6,2,6,0,6,6,1,0,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,2,6,2,0,0,0,6,2,0,30
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
14997,24997,30,826,0,0,465,0,0,0,0,...,1,8,1,0,0,0,8,1,0,64
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [15]:
import datetime

train_err['weekday'] = pd.to_datetime(train_err.date).dt.weekday
test_err['weekday'] = pd.to_datetime(test_err.date).dt.weekday

train_err = pd.concat([train_err, pd.get_dummies(train_err['weekday'], prefix='wd')], axis=1)
test_err = pd.concat([test_err, pd.get_dummies(test_err['weekday'], prefix='wd')], axis=1)

train_wd = train_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()
test_wd = test_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()

train = train.merge(train_wd, on='user_id', how='left').fillna(0)
test = test.merge(test_wd, on='user_id', how='left').fillna(0)

In [16]:
train_err_9 = train_err.groupby('user_id').agg({'err_code_rank':['sum','max']}).reset_index()
train_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

test_err_9 = test_err.groupby('user_id').agg({'err_code_rank':['sum','max']}).reset_index()
test_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

train = train.merge(train_err_9, on='user_id', how='left').fillna(0)
test = test.merge(test_err_9, on='user_id', how='left').fillna(0)

In [17]:
train_err_8 = train_err.groupby('user_id').agg({'date_rank':['sum','max']}).reset_index()
train_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

test_err_8 = test_err.groupby('user_id').agg({'date_rank':['sum','max']}).reset_index()
test_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

train = train.merge(train_err_8, on='user_id', how='left').fillna(0)
test = test.merge(test_err_8, on='user_id', how='left').fillna(0)

In [18]:
train_err_7 = train_err.groupby('user_id').agg({'model_fwver_rank':['sum','max']}).reset_index()
train_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

test_err_7 = test_err.groupby('user_id').agg({'model_fwver_rank':['sum','max']}).reset_index()
test_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

train = train.merge(train_err_7, on='user_id', how='left').fillna(0)
test = test.merge(test_err_7, on='user_id', how='left').fillna(0)

In [19]:
train_model_user = train_err[['user_id','model_nm']].drop_duplicates()
test_model_user = test_err[['user_id','model_nm']].drop_duplicates()

mode_nm = sorted(train_err.model_nm.unique())
for col in mode_nm:
    train[col] = 0
    test[col] = 0

In [20]:
for idx, col in tqdm(enumerate(train_model_user.values)):
    i = train[train.user_id == col[0]].index
    train.loc[i, col[1]] = 1

15704it [00:18, 857.56it/s]


In [21]:
for idx, col in tqdm(enumerate(test_model_user.values)):
    i = test[test.user_id == col[0]].index
    test.loc[i, col[1]] = 1

15657it [00:17, 898.63it/s]


In [23]:
train['model'] = ''
test['model'] = ''

for col in mode_nm:
    train['model'] += train[col].astype(str)
    test['model'] += test[col].astype(str)

In [24]:
col = ['model_0', 'model_1', 'model_2', 'model_3', 'model_4', 'model_5','model_6', 'model_7', 'model_8']
train.drop(col, axis=1, inplace=True)
test.drop(col, axis=1, inplace=True)

In [25]:
display(train.head())
display(test.head())

train.shape, test.shape

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_4,wd_5,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model
0,10000,30,317,0,0,0,0,0,0,0,...,43.0,54.0,52.0,907345.0,2869.0,5840.0,33.0,10144.0,32.0,100000
1,10001,30,2365,0,0,0,0,0,379,0,...,1534.0,117.0,139.0,6763505.0,2870.0,25937.0,33.0,81259.0,35.0,1000000
2,10002,29,306,0,0,0,0,0,0,0,...,34.0,42.0,58.0,876071.0,2869.0,5764.0,33.0,9792.0,32.0,100000
3,10003,30,306,0,0,0,0,0,81,0,...,37.0,22.0,57.0,870419.0,2871.0,5388.0,33.0,10386.0,35.0,1000000
4,10004,30,777,0,0,645,0,0,0,0,...,171.0,84.0,108.0,2226738.0,2871.0,14420.0,33.0,27576.0,36.0,100000000


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_4,wd_5,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model
0,30000,29,2750,0,0,0,0,0,0,0,...,223.0,1024.0,510.0,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,11000000
1,30001,28,284,0,0,0,0,0,0,0,...,18.0,54.0,56.0,812544.0,2869.0,5767.0,33.0,9088.0,32.0,100000
2,30002,30,941,0,0,733,0,0,0,0,...,99.0,140.0,135.0,2694983.0,2871.0,16311.0,33.0,33252.0,36.0,100000000
3,30003,28,371,0,0,246,0,0,0,0,...,92.0,24.0,71.0,1061508.0,2871.0,6783.0,33.0,12981.0,36.0,100000000
4,30004,30,881,0,0,0,0,0,0,0,...,105.0,102.0,157.0,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,11000000


((15000, 101), (14998, 101))

In [30]:
train_1 = train[(train.model == '100000000') | (train.model == '001000000')]
train_2 = train[~((train.model == '100000000') | (train.model == '001000000'))]

test_1 = test[(test.model == '100000000') | (test.model == '001000000')]
test_2 = test[~((test.model == '100000000') | (test.model == '001000000'))]

In [32]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']

In [43]:
train = train.merge(train_prob1, on='user_id', how='left').fillna(0)
train.prob = train.prob.astype(int)

In [33]:
train_1 = train_1.merge(train_prob1, on='user_id', how='left').fillna(0)
train_1.prob = train_1.prob.astype(int)

train_2 = train_2.merge(train_prob1, on='user_id', how='left').fillna(0)
train_2.prob = train_2.prob.astype(int)

## train_2

In [56]:
%%time
clf_1 = setup(session_id=seed, data=train_2, target='prob'
           , numeric_features=train.columns[1:-2] # model cat
            , ignore_features=['err_rank_sum']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(6853, 102)"
4,Missing Values,False
5,Numeric Features,100
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 11.3 s


In [57]:
best_1 = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.8024,0.8289,0.516,0.8186,0.6321,0.5067,0.5323,9.7371
1,Gradient Boosting Classifier,0.8007,0.8257,0.5041,0.8245,0.625,0.5001,0.5284,2.4215
2,Ada Boost Classifier,0.7892,0.8186,0.511,0.775,0.6154,0.4787,0.4986,0.6357
3,Light Gradient Boosting Machine,0.7965,0.8185,0.5331,0.7815,0.6334,0.4996,0.5173,0.4839
4,Extra Trees Classifier,0.7953,0.8127,0.511,0.7972,0.6222,0.491,0.5142,0.3387
5,Extreme Gradient Boosting,0.7874,0.8044,0.5406,0.7464,0.6265,0.4832,0.4959,1.2248
6,Random Forest Classifier,0.7844,0.7862,0.4807,0.7843,0.5953,0.4598,0.4859,0.121
7,Linear Discriminant Analysis,0.7686,0.7717,0.3734,0.8369,0.5153,0.3916,0.4471,0.1036


In [58]:
blended_1 = blend_models(estimator_list = best_1, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.8062,0.8326,0.5142,0.8359,0.6367,0.5146,0.5429
1,0.7958,0.829,0.4921,0.8168,0.6142,0.4867,0.5156
2,0.8206,0.8434,0.5552,0.8502,0.6718,0.5557,0.5796
3,0.804,0.8267,0.511,0.8308,0.6328,0.5092,0.5372
4,0.7956,0.8232,0.4826,0.827,0.6096,0.4838,0.516
Mean,0.8045,0.831,0.511,0.8321,0.633,0.51,0.5383
SD,0.0091,0.0069,0.025,0.011,0.022,0.0259,0.0234


In [59]:
pred_holdout_1 = predict_model(blended_1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.8035,0.8225,0.5214,0.8176,0.6367,0.5109,0.5352


In [60]:
%%time
final_model_1 = finalize_model(blended_1)

Wall time: 2min 26s


In [61]:
predictions_1 = pd.DataFrame()
predictions_1 = predict_model(final_model_1, data = test_2)
predictions_1

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score
0,30000.0,29.0,2750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,510.0,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,011000000,1.0,0.5882
1,30001.0,28.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.0,812544.0,2869.0,5767.0,33.0,9088.0,32.0,000100000,0.0,0.2273
2,,,,,,,,,,,...,,,,,,,,,,0.6181
3,,,,,,,,,,,...,,,,,,,,,,0.5673
4,30004.0,30.0,881.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,157.0,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,011000000,1.0,0.1474
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14985,44986.0,30.0,4003.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,971.0,11440237.0,2871.0,80116.0,33.0,148111.0,37.0,010000000,0.0,
14990,44991.0,27.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,79.0,813287.0,2869.0,5847.0,33.0,9088.0,32.0,000100000,0.0,
14992,44993.0,30.0,567.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,171.0,1622548.0,2869.0,10983.0,33.0,18144.0,32.0,000100000,0.0,
14993,44994.0,30.0,1115.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,245.0,3193105.0,2871.0,20790.0,33.0,41255.0,37.0,010000000,0.0,


In [44]:
%%time
clf = setup(session_id=seed, data=train, target='prob'
           , numeric_features=train.columns[1:-2] # model cat
            , ignore_features=['err_rank_sum']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 102)"
4,Missing Values,False
5,Numeric Features,100
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 4.76 s


In [45]:
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7934,0.8198,0.5034,0.8039,0.6189,0.4873,0.5126,13.6054
1,Gradient Boosting Classifier,0.7921,0.8178,0.4797,0.8229,0.6058,0.4777,0.5099,5.5975
2,Light Gradient Boosting Machine,0.79,0.8144,0.5106,0.784,0.6181,0.4822,0.5034,0.7024
3,Ada Boost Classifier,0.7877,0.805,0.5014,0.7845,0.6112,0.4749,0.4977,1.3469
4,Extra Trees Classifier,0.7901,0.8046,0.4943,0.7995,0.6107,0.4778,0.5038,0.6351
5,Extreme Gradient Boosting,0.7786,0.8007,0.5266,0.734,0.6131,0.464,0.4765,2.5658
6,Random Forest Classifier,0.7719,0.7756,0.464,0.7574,0.5752,0.4315,0.4558,0.125
7,Linear Discriminant Analysis,0.769,0.7591,0.374,0.8477,0.5186,0.3957,0.4531,0.2029


In [46]:
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7829,0.8087,0.4671,0.7976,0.5892,0.455,0.485
1,0.8095,0.835,0.53,0.8394,0.6497,0.5279,0.5542
2,0.7948,0.8241,0.4943,0.818,0.6162,0.4875,0.5163
3,0.8062,0.8342,0.4957,0.8653,0.6303,0.5118,0.5483
4,0.7904,0.8042,0.4943,0.8009,0.6113,0.4786,0.5047
Mean,0.7967,0.8212,0.4963,0.8242,0.6194,0.4922,0.5217
SD,0.0099,0.0127,0.02,0.0253,0.0201,0.0255,0.0262


In [47]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7954,0.8224,0.4867,0.8286,0.6132,0.4866,0.5184


In [48]:
%%time
final_model = finalize_model(blended)

Wall time: 4min 9s


In [49]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 101)

In [51]:
predictions = predict_model(final_model, data = test)
predictions

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score
0,30000,29,2750,0,0,0,0,0,0,0,...,510.0,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,011000000,1,0.6500
1,30001,28,284,0,0,0,0,0,0,0,...,56.0,812544.0,2869.0,5767.0,33.0,9088.0,32.0,000100000,0,0.2431
2,30002,30,941,0,0,733,0,0,0,0,...,135.0,2694983.0,2871.0,16311.0,33.0,33252.0,36.0,100000000,0,0.3113
3,30003,28,371,0,0,246,0,0,0,0,...,71.0,1061508.0,2871.0,6783.0,33.0,12981.0,36.0,100000000,1,0.7904
4,30004,30,881,0,0,0,0,0,0,0,...,157.0,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,011000000,0,0.4098
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14993,44994,30,1115,0,0,0,0,0,0,0,...,245.0,3193105.0,2871.0,20790.0,33.0,41255.0,37.0,010000000,0,0.2811
14994,44995,30,515,0,0,41,0,0,0,0,...,78.0,1475297.0,2869.0,10111.0,33.0,17118.0,36.0,100000000,0,0.2557
14995,44996,30,2233,0,0,1602,0,0,0,0,...,533.0,6366640.0,2871.0,42344.0,33.0,78495.0,36.0,100000000,0,0.4919
14996,44997,28,24671,0,0,21466,0,0,0,0,...,4329.0,70754656.0,2871.0,464041.0,33.0,878541.0,36.0,100000000,1,0.7584


In [62]:
display(predictions.head())
display(predictions_1.head())

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score
0,30000,29,2750,0,0,0,0,0,0,0,...,510.0,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,11000000,1,0.65
1,30001,28,284,0,0,0,0,0,0,0,...,56.0,812544.0,2869.0,5767.0,33.0,9088.0,32.0,100000,0,0.2431
2,30002,30,941,0,0,733,0,0,0,0,...,135.0,2694983.0,2871.0,16311.0,33.0,33252.0,36.0,100000000,0,0.3113
3,30003,28,371,0,0,246,0,0,0,0,...,71.0,1061508.0,2871.0,6783.0,33.0,12981.0,36.0,100000000,1,0.7904
4,30004,30,881,0,0,0,0,0,0,0,...,157.0,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,11000000,0,0.4098


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score
0,30000.0,29.0,2750.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,510.0,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,11000000.0,1.0,0.5882
1,30001.0,28.0,284.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,56.0,812544.0,2869.0,5767.0,33.0,9088.0,32.0,100000.0,0.0,0.2273
2,,,,,,,,,,,...,,,,,,,,,,0.6181
3,,,,,,,,,,,...,,,,,,,,,,0.5673
4,30004.0,30.0,881.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,157.0,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,11000000.0,1.0,0.1474


In [69]:
pred = predictions.merge(predictions_1[['user_id','Score']], on='user_id', how='left')
pred.head()

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score_x,Score_y
0,30000,29,2750,0,0,0,0,0,0,0,...,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,11000000,1,0.65,0.5882
1,30001,28,284,0,0,0,0,0,0,0,...,812544.0,2869.0,5767.0,33.0,9088.0,32.0,100000,0,0.2431,0.2273
2,30002,30,941,0,0,733,0,0,0,0,...,2694983.0,2871.0,16311.0,33.0,33252.0,36.0,100000000,0,0.3113,
3,30003,28,371,0,0,246,0,0,0,0,...,1061508.0,2871.0,6783.0,33.0,12981.0,36.0,100000000,1,0.7904,
4,30004,30,881,0,0,0,0,0,0,0,...,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,11000000,0,0.4098,0.1474


In [71]:
pred['Score'] = (pred['Score_x'] + pred['Score_y'])/2
pred.head()

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score_x,Score_y,Score
0,30000,29,2750,0,0,0,0,0,0,0,...,2871.0,57359.0,33.0,99055.0,37.0,11000000,1,0.65,0.5882,0.6191
1,30001,28,284,0,0,0,0,0,0,0,...,2869.0,5767.0,33.0,9088.0,32.0,100000,0,0.2431,0.2273,0.2352
2,30002,30,941,0,0,733,0,0,0,0,...,2871.0,16311.0,33.0,33252.0,36.0,100000000,0,0.3113,,
3,30003,28,371,0,0,246,0,0,0,0,...,2871.0,6783.0,33.0,12981.0,36.0,100000000,1,0.7904,,
4,30004,30,881,0,0,0,0,0,0,0,...,2870.0,16093.0,33.0,32051.0,37.0,11000000,0,0.4098,0.1474,0.2786


In [73]:
pred['Score'] = pred['Score'].fillna(pred['Score_x'])
pred.head()

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score_x,Score_y,Score
0,30000,29,2750,0,0,0,0,0,0,0,...,2871.0,57359.0,33.0,99055.0,37.0,11000000,1,0.65,0.5882,0.6191
1,30001,28,284,0,0,0,0,0,0,0,...,2869.0,5767.0,33.0,9088.0,32.0,100000,0,0.2431,0.2273,0.2352
2,30002,30,941,0,0,733,0,0,0,0,...,2871.0,16311.0,33.0,33252.0,36.0,100000000,0,0.3113,,0.3113
3,30003,28,371,0,0,246,0,0,0,0,...,2871.0,6783.0,33.0,12981.0,36.0,100000000,1,0.7904,,0.7904
4,30004,30,881,0,0,0,0,0,0,0,...,2870.0,16093.0,33.0,32051.0,37.0,11000000,0,0.4098,0.1474,0.2786


In [76]:
pred.tail()

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,Label,Score_x,Score_y,Score
14993,44994,30,1115,0,0,0,0,0,0,0,...,2871.0,20790.0,33.0,41255.0,37.0,10000000,0,0.2811,,0.2811
14994,44995,30,515,0,0,41,0,0,0,0,...,2869.0,10111.0,33.0,17118.0,36.0,100000000,0,0.2557,,0.2557
14995,44996,30,2233,0,0,1602,0,0,0,0,...,2871.0,42344.0,33.0,78495.0,36.0,100000000,0,0.4919,,0.4919
14996,44997,28,24671,0,0,21466,0,0,0,0,...,2871.0,464041.0,33.0,878541.0,36.0,100000000,1,0.7584,,0.7584
14997,44998,30,873,0,0,0,0,0,0,0,...,2871.0,16243.0,33.0,32301.0,37.0,10000000,0,0.3132,,0.3132


In [92]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
sample_submssion = sample_submssion.merge(pred[['user_id','Score']], on='user_id', how='left').fillna(0)

In [93]:
sample_submssion

Unnamed: 0,user_id,problem,Score
0,30000,0,0.6191
1,30001,0,0.2352
2,30002,0,0.3113
3,30003,0,0.7904
4,30004,0,0.2786
...,...,...,...
14994,44994,0,0.2811
14995,44995,0,0.2557
14996,44996,0,0.4919
14997,44997,0,0.7584


In [94]:
sample_submssion.drop('problem', axis=1, inplace=True)

In [99]:
sample_submssion.columns = ['user_id', 'problem']
sample_submssion.head()
sample_submssion.tail()

Unnamed: 0,user_id,problem
14994,44994,0.2811
14995,44995,0.2557
14996,44996,0.4919
14997,44997,0.7584
14998,44998,0.3132


In [100]:
sample_submssion.to_csv("./submission/submission_20210202-2.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.6191
1,30001,0.2352
2,30002,0.3113
3,30003,0.7904
4,30004,0.2786
...,...,...
14994,44994,0.2811
14995,44995,0.2557
14996,44996,0.4919
14997,44997,0.7584
