In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
np.random.seed(42)
python_random.seed(42)
set_config('seed', 42)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [3]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(train_err_0.head())

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_err['model_fwver'] = test_err.model_nm + test_err.fwver
test_err['errtype_code'] = test_err.errtype.astype(str) + test_err.errcode

test_err_0 = test_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [4]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(train_err_1.head())

test_err_1 = test_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
test_err_1.columns = ['user_id','date','date_cnt']
test_err_1 = test_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
test_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(test_err_1.head())

Unnamed: 0,user_id,date_cnt,date_sum
0,10000,30,317
1,10001,30,2365
2,10002,29,306
3,10003,30,306
4,10004,30,777


Unnamed: 0,user_id,date_cnt,date_sum
0,30000,29,2750
1,30001,28,284
2,30002,30,941
3,30003,28,371
4,30004,30,881


In [5]:
train_err_2 = train_err.groupby('model_fwver').count().reset_index()[['model_fwver','user_id']]
train_err_2.columns = ['model_fwver','model_fwver_cnt']
train_err_2['model_fwver_rank'] = train_err_2.model_fwver_cnt.rank()
display(train_err_2.head())

test_err_2 = test_err.groupby('model_fwver').count().reset_index()[['model_fwver','user_id']]
test_err_2.columns = ['model_fwver','model_fwver_cnt']
test_err_2['model_fwver_rank'] = test_err_2.model_fwver_cnt.rank()
display(test_err_2.head())

Unnamed: 0,model_fwver,model_fwver_cnt,model_fwver_rank
0,model_004.22.1442,2522,21.0
1,model_004.22.1656,39,7.0
2,model_004.22.1666,5,1.0
3,model_004.22.1684,5554,25.0
4,model_004.22.1750,2874213,36.0


Unnamed: 0,model_fwver,model_fwver_cnt,model_fwver_rank
0,model_004.22.1170,817,18.0
1,model_004.22.1448,840,21.0
2,model_004.22.1478,535,15.0
3,model_004.22.1608,12,2.0
4,model_004.22.1656,835,19.0


In [6]:
# train_err > errtype_code => rank
train_err_3 = train_err.groupby('errtype_code').count().reset_index()[['errtype_code','user_id']]
train_err_3.columns = ['errtype_code','errtype_code_cnt']
train_err_3['errtype_code_rank'] = train_err_3.errtype_code_cnt.rank()
display(train_err_3.head())

# test_err > errtype_code => rank
test_err_3 = test_err.groupby('errtype_code').count().reset_index()[['errtype_code','user_id']]
test_err_3.columns = ['errtype_code','errtype_code_cnt']
test_err_3['errtype_code_rank'] = test_err_3.errtype_code_cnt.rank()
display(test_err_3.head())

Unnamed: 0,errtype_code,errtype_code_cnt,errtype_code_rank
0,10,21079,2825.0
1,101,133403,2851.0
2,111,307030,2854.0
3,121,320181,2855.0
4,131,22843,2828.0


Unnamed: 0,errtype_code,errtype_code_cnt,errtype_code_rank
0,10,21381,2980.0
1,101,149008,3002.0
2,111,327050,3008.0
3,121,304129,3005.0
4,131,13944,2974.0


In [7]:
train = train_err.merge(train_err_1, on=['user_id'], how='left').merge(train_err_2, on='model_fwver', how='left').merge(train_err_3, on='errtype_code', how='left')
train = train[['user_id','date_cnt','date_sum','model_fwver_rank','errtype_code_rank']].drop_duplicates()
train = train.groupby(['user_id','date_cnt','date_sum']).agg({'model_fwver_rank':['count', 'max'],'errtype_code_rank':['count', 'max']}).reset_index()
train.columns = ['user_id','date_cnt','date_sum','model_fwver_cnt','model_fwver_max','errtype_code_cnt','errtype_code_max']
train

Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max
0,10000,30,317,15,34.0,15,2868.0
1,10001,30,2365,46,35.0,46,2870.0
2,10002,29,306,17,34.0,17,2868.0
3,10003,30,306,36,35.0,36,2870.0
4,10004,30,777,36,36.0,36,2870.0
...,...,...,...,...,...,...,...
14995,24995,10,194,22,35.0,22,2870.0
14996,24996,1,4,4,34.0,4,2860.0
14997,24997,30,826,44,36.0,44,2870.0
14998,24998,21,155,24,36.0,24,2870.0


In [8]:
test = test_err.merge(test_err_1, on=['user_id'], how='left').merge(test_err_2, on='model_fwver', how='left').merge(test_err_3, on='errtype_code', how='left')
test = test[['user_id','date_cnt','date_sum','model_fwver_rank','errtype_code_rank']].drop_duplicates()
test = test.groupby(['user_id','date_cnt','date_sum']).agg({'model_fwver_rank':['count', 'max'],'errtype_code_rank':['count', 'max']}).reset_index()
test.columns = ['user_id','date_cnt','date_sum','model_fwver_cnt','model_fwver_max','errtype_code_cnt','errtype_code_max']
test

Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max
0,30000,29,2750,69,40.0,69,3021.0
1,30001,28,284,15,37.0,15,3019.0
2,30002,30,941,42,39.0,42,3021.0
3,30003,28,371,51,39.0,51,3021.0
4,30004,30,881,49,40.0,49,3021.0
...,...,...,...,...,...,...,...
14993,44994,30,1115,25,40.0,25,3021.0
14994,44995,30,515,27,39.0,27,3019.0
14995,44996,30,2233,44,39.0,44,3021.0
14996,44997,28,24671,49,39.0,49,3021.0


In [9]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [10]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [11]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [12]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [13]:
%%time
for col in cols:
    train_qua_1[col] = train_qua_1[col].apply(chg_qua)
    test_qua_1[col] = test_qua_1[col].apply(chg_qua)

Wall time: 64.8 ms


In [14]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [15]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

Unnamed: 0,user_id,prob
0,10001,1
1,10004,1
2,10005,1
3,10006,1
4,10008,1
...,...,...
4995,24983,1
4996,24984,1
4997,24990,1
4998,24997,1


In [16]:
train = train.merge(train_prob1, on='user_id', how='left').fillna(0)

In [17]:
display(train.head())
display(test.head())

Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,prob
0,10000,30,317,15,34.0,15,2868.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1,10001,30,2365,46,35.0,46,2870.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,10002,29,306,17,34.0,17,2868.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
3,10003,30,306,36,35.0,36,2870.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10004,30,777,36,36.0,36,2870.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0


Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,30000,29,2750,69,40.0,69,3021.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
1,30001,28,284,15,37.0,15,3019.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2,30002,30,941,42,39.0,42,3021.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
3,30003,28,371,51,39.0,51,3021.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,30004,30,881,49,40.0,49,3021.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0


In [18]:
train.shape, test.shape

((15000, 19), (14998, 18))

In [19]:
train.prob = train.prob.astype(int)

In [20]:
%%time
clf = setup(session_id=42, data=train, target='prob')

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 19)"
4,Missing Values,False
5,Numeric Features,7
6,Categorical Features,11
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 29.2 s


In [21]:
best = compare_models(sort = 'AUC', n_select = 5)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,Gradient Boosting Classifier,0.735,0.7471,0.3871,0.6792,0.4929,0.3316,0.3556,0.935
1,CatBoost Classifier,0.7329,0.7465,0.4037,0.6629,0.5013,0.3337,0.3529,6.3064
2,Ada Boost Classifier,0.7268,0.739,0.3931,0.6493,0.4895,0.3181,0.3369,0.3296
3,Light Gradient Boosting Machine,0.7326,0.7379,0.4037,0.6618,0.5012,0.3332,0.3522,0.234
4,Extreme Gradient Boosting,0.7182,0.7269,0.402,0.6187,0.487,0.3048,0.3184,0.7836
5,Linear Discriminant Analysis,0.724,0.7149,0.3311,0.6747,0.444,0.2881,0.3204,0.0545
6,Logistic Regression,0.7246,0.7107,0.3391,0.6722,0.4505,0.2927,0.3232,0.0925
7,Quadratic Discriminant Analysis,0.6566,0.6822,0.1834,0.6541,0.2056,0.0919,0.1611,0.025
8,Random Forest Classifier,0.6752,0.665,0.4106,0.5156,0.4567,0.2296,0.2327,0.1179
9,Naive Bayes,0.5905,0.6594,0.6023,0.4205,0.495,0.1687,0.1766,0.0064


In [22]:
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7271,0.7291,0.4014,0.646,0.4952,0.3219,0.339
1,0.7533,0.7633,0.43,0.7167,0.5375,0.3833,0.4066
2,0.7367,0.7467,0.4114,0.6713,0.5102,0.344,0.3633
3,0.7348,0.746,0.3886,0.6783,0.4941,0.3319,0.3555
4,0.7122,0.7296,0.3686,0.6143,0.4607,0.2808,0.2979
Mean,0.7328,0.743,0.4,0.6653,0.4995,0.3324,0.3525
SD,0.0134,0.0127,0.0207,0.0341,0.0249,0.0332,0.0353


In [23]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7336,0.7481,0.3967,0.6693,0.4981,0.3326,0.3537


In [24]:
%%time
final_model = finalize_model(blended)

Wall time: 1min 50s


In [25]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 18)

In [26]:
predictions = predict_model(final_model, data = test_x)

In [27]:
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.677
1,30001,0.1811
2,30002,0.6211
3,30003,0.774
4,30004,0.7353


In [28]:
sample_submssion.to_csv("./submission/submission_20210126-2.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.6770
1,30001,0.1811
2,30002,0.6211
3,30003,0.7740
4,30004,0.7353
...,...,...
14994,44994,0.3754
14995,44995,0.3821
14996,44996,0.6183
14997,44997,0.8104
