In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
seed = 42
np.random.seed(seed)
python_random.seed(seed)
set_config('seed', seed)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [3]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [4]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(train_err_0.head())

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_err['model_fwver'] = test_err.model_nm + test_err.fwver
test_err['errtype_code'] = test_err.errtype.astype(str) + test_err.errcode

test_err_0 = test_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [None]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(train_err_1.head())

test_err_1 = test_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
test_err_1.columns = ['user_id','date','date_cnt']
test_err_1 = test_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
test_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(test_err_1.head())

train_err_1.shape

In [None]:
train = train_err_1.copy()
test = test_err_1.copy()

In [None]:
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())
models = train_model & test_model
models = list(models)

for model in models:
    train[model] = 0
    test[model] = 0
    
train.shape, test.shape    

In [None]:
train_err_2 = train_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
train_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
display(train_err_2.head())

test_err_2 = test_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
test_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
display(test_err_2.head())

In [None]:
for idx, col in tqdm(enumerate(train_err_2.values)):
    i = train[train.user_id == train_err_2.loc[idx].user_id].index
    c = train.columns[train.columns == train_err_2.loc[idx].model_fwver]
    
    train.loc[i, c] += train_err_2.loc[idx].model_fwver_cnt    

for idx, col in tqdm(enumerate(test_err_2.values)):
    i = test[test.user_id == test_err_2.loc[idx].user_id].index
    c = test.columns[test.columns == test_err_2.loc[idx].model_fwver]
    
    test.loc[i, c] += test_err_2.loc[idx].model_fwver_cnt    

In [None]:
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = train_error & test_error
errors = list(errors)

for error in errors:
    train['E'+str(error)] = 0
    test['E'+str(error)] = 0
    
train.shape, test.shape    

In [None]:
train_err_3 = train_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
train_err_3.columns = ['user_id','errtype','errtype_cnt']
display(train_err_3.head())

test_err_3 = test_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
test_err_3.columns = ['user_id','errtype','errtype_cnt']
display(test_err_3.head())

In [None]:
for idx, col in tqdm(enumerate(train_err_3.values)):
    i = train[train.user_id == train_err_3.loc[idx].user_id].index
    c = train.columns[train.columns == 'E'+str(train_err_3.loc[idx].errtype)]
    
    train.loc[i, c] += train_err_3.loc[idx].errtype_cnt    

for idx, col in tqdm(enumerate(test_err_3.values)):
    i = test[test.user_id == test_err_3.loc[idx].user_id].index
    c = test.columns[test.columns == 'E'+str(test_err_3.loc[idx].errtype)]
    
    test.loc[i, c] += test_err_3.loc[idx].errtype_cnt    

In [None]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [None]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [None]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [None]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [None]:
train.to_csv("./train.csv", index=False)
test.to_csv("./test.csv", index=False)

In [5]:
train  = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [6]:
display(train.head())
display(test.head())

train.shape, test.shape

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,10000,30,317,0,0,0,0,0,0,0,...,0,0,4,0,4,0,4,4,0,0
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,0
2,10002,29,306,0,0,0,0,0,0,0,...,1,2,22,4,22,0,22,22,1,0
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,0
4,10004,30,777,0,0,645,0,0,0,0,...,1,1,6,2,6,0,6,6,1,0


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,30000,29,2750,0,0,0,0,0,0,0,...,0,0,2,1,2,0,2,2,0,0
1,30001,28,284,0,0,0,0,0,0,0,...,1,1,10,1,10,0,10,10,1,0
2,30002,30,941,0,0,733,0,0,0,0,...,3,3,26,3,26,0,26,26,3,0
3,30003,28,371,0,0,246,0,0,0,0,...,0,0,13,5,13,0,13,13,0,0
4,30004,30,881,0,0,0,0,0,0,0,...,1,1,5,3,5,0,5,5,1,0


((15000, 86), (14998, 86))

In [7]:
train_err_d1 = train_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
train_err_d1.columns = ['user_id', 'day_max']

test_err_d1 = test_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
test_err_d1.columns = ['user_id', 'day_max']

In [8]:
train = train.merge(train_err_d1, on='user_id', how='left')
test = test.merge(test_err_d1, on='user_id', how='left')
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max
0,10000,30,317,0,0,0,0,0,0,0,...,0,4,0,4,0,4,4,0,0,20
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,1452
2,10002,29,306,0,0,0,0,0,0,0,...,2,22,4,22,0,22,22,1,0,17
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,24
4,10004,30,777,0,0,645,0,0,0,0,...,1,6,2,6,0,6,6,1,0,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,2,6,2,0,0,0,6,2,0,30
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
14997,24997,30,826,0,0,465,0,0,0,0,...,1,8,1,0,0,0,8,1,0,64
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [11]:
train[['date_cnt','date_sum','day_max','prob']]

Unnamed: 0,date_cnt,date_sum,day_max,prob
0,30,317,20,0.0
1,30,2365,1452,1.0
2,29,306,17,0.0
3,30,306,24,0.0
4,30,777,102,1.0
...,...,...,...,...
14995,10,194,30,0.0
14996,1,4,4,0.0
14997,30,826,64,1.0
14998,21,155,37,1.0


In [42]:
col = list(train.columns)[3:34]
col.append('prob')

In [43]:
abs(train[col].corr()['prob']).sort_values(ascending=False)

prob                 1.000000
model_004.22.1750    0.102243
model_204.33.1261    0.091829
model_104.16.3553    0.087504
model_004.22.1778    0.071591
model_104.16.3571    0.058343
model_204.33.1185    0.045622
model_204.33.1149    0.021755
model_705.66.3237    0.020557
model_804.73.2571    0.020006
model_504.82.1778    0.015697
model_104.16.3569    0.013058
model_403.11.1167    0.012519
model_610            0.011730
model_705.66.3571    0.011670
model_504.82.1684    0.011604
model_504.82.1730    0.011547
model_004.22.1656    0.011547
model_305.15.2092    0.011547
model_403.11.1141    0.011089
model_68.5.3         0.010569
model_403.11.1149    0.009130
model_104.16.3439    0.009081
model_004.22.1666    0.007002
model_305.15.2114    0.005774
model_305.15.2138    0.004764
model_204.33.1125    0.004332
model_305.15.2120    0.004143
model_204.33.1171    0.003120
model_305.15.3104    0.003007
model_004.22.1684    0.001942
model_804.73.2237    0.000020
Name: prob, dtype: float64

In [38]:
col = list(train.columns[34:75])
col.append('prob')

In [41]:
abs(train[col].corr()['prob']).sort_values(ascending=False)

prob    1.000000
E20     0.287503
E18     0.259057
E16     0.212896
E15     0.211715
E37     0.177654
E36     0.177015
E40     0.147105
E26     0.142512
E33     0.123586
E19     0.121187
E30     0.119862
E21     0.114873
E25     0.089620
E22     0.088032
E23     0.086900
E41     0.085379
E31     0.084241
E17     0.076258
E5      0.066219
E35     0.062018
E1      0.061372
E32     0.060246
E34     0.054898
E7      0.053498
E6      0.051343
E38     0.044696
E12     0.038468
E39     0.038100
E11     0.035437
E14     0.023783
E13     0.021389
E24     0.018058
E42     0.017767
E10     0.017578
E8      0.016353
E27     0.014831
E28     0.014724
E9      0.012951
E3      0.007393
E4      0.005186
E2      0.001488
Name: prob, dtype: float64

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [9]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

Unnamed: 0,user_id,prob
0,10001,1
1,10004,1
2,10005,1
3,10006,1
4,10008,1
...,...,...
4995,24983,1
4996,24984,1
4997,24990,1
4998,24997,1


In [10]:
train = train.merge(train_prob1, on='user_id', how='left').fillna(0)
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max,prob
0,10000,30,317,0,0,0,0,0,0,0,...,4,0,4,0,4,4,0,0,20,0.0
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,1452,1.0
2,10002,29,306,0,0,0,0,0,0,0,...,22,4,22,0,22,22,1,0,17,0.0
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,24,0.0
4,10004,30,777,0,0,645,0,0,0,0,...,6,2,6,0,6,6,1,0,102,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,6,2,0,0,0,6,2,0,30,0.0
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,4,0.0
14997,24997,30,826,0,0,465,0,0,0,0,...,8,1,0,0,0,8,1,0,64,1.0
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,37,1.0


In [12]:
train.prob = train.prob.astype(int)

In [None]:
train.info()

In [None]:
train.columns[1:-1]

In [47]:
col = ['E3','E4','E2','model_004.22.1666','model_305.15.2114','model_305.15.2138'
,'model_204.33.1125','model_305.15.2120','model_204.33.1171','model_305.15.3104'
,'model_004.22.1684','model_804.73.2237']

In [49]:
%%time
clf = setup(session_id=seed, data=train, target='prob'
           , numeric_features=train.columns[1:-1]
           , ignore_features=col
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 88)"
4,Missing Values,False
5,Numeric Features,87
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 3.81 s


In [50]:
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7929,0.8183,0.5083,0.7973,0.6206,0.4877,0.5112,11.2748
1,Gradient Boosting Classifier,0.7903,0.8153,0.4771,0.8184,0.6025,0.4732,0.5051,3.4097
2,Light Gradient Boosting Machine,0.7909,0.8131,0.5169,0.7822,0.6223,0.486,0.506,0.5014
3,Extra Trees Classifier,0.7827,0.7995,0.4926,0.7733,0.6017,0.4623,0.4846,0.5685
4,Ada Boost Classifier,0.7749,0.7986,0.4831,0.7531,0.5884,0.4436,0.4644,0.8871
5,Extreme Gradient Boosting,0.7798,0.7952,0.5331,0.734,0.6175,0.4683,0.4802,1.6035
6,Random Forest Classifier,0.7652,0.7686,0.4609,0.7368,0.5667,0.4172,0.439,0.1202
7,Linear Discriminant Analysis,0.7512,0.7397,0.3297,0.8124,0.4689,0.3425,0.4019,0.1252


In [15]:
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7867,0.8055,0.4843,0.7958,0.6021,0.4679,0.4949
1,0.8038,0.8272,0.5229,0.8243,0.6399,0.5142,0.5393
2,0.7938,0.8171,0.4929,0.8156,0.6144,0.4851,0.5138
3,0.8019,0.8299,0.49,0.8532,0.6225,0.5012,0.5366
4,0.7808,0.7988,0.4871,0.7715,0.5972,0.457,0.4798
Mean,0.7934,0.8157,0.4954,0.8121,0.6152,0.4851,0.5129
SD,0.0088,0.012,0.014,0.0275,0.0152,0.0209,0.0232


In [16]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.792,0.8176,0.4833,0.8183,0.6077,0.4787,0.5094


In [51]:
%%time
final_model = finalize_model(blended)

Wall time: 3min 10s


In [52]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 87)

In [53]:
predictions = predict_model(final_model, data = test_x)

In [54]:
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.8421
1,30001,0.2141
2,30002,0.3608
3,30003,0.773
4,30004,0.7658


In [55]:
sample_submssion.to_csv("./submission/submission_20210129-2.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.8421
1,30001,0.2141
2,30002,0.3608
3,30003,0.7730
4,30004,0.7658
...,...,...
14994,44994,0.3182
14995,44995,0.2597
14996,44996,0.5260
14997,44997,0.7593
