In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
seed = 42
np.random.seed(seed)
python_random.seed(seed)
set_config('seed', seed)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [3]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [4]:
display(train_err.tail())
display(train_qua.tail())
display(train_prob.tail())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
16554658,24999,20201130163051,model_3,05.15.2138,15,1
16554659,24999,20201130172625,model_3,05.15.2138,16,1
16554660,24999,20201130172625,model_3,05.15.2138,4,0
16554661,24999,20201130172631,model_3,05.15.2138,4,0
16554662,24999,20201130210625,model_3,05.15.2138,15,1


Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
828619,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,1,0,0,0,0,17,0,0
828620,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828621,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,3,0,0,0,0,17,0,0
828622,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828623,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,9,0,0,0,0,17,0,0


Unnamed: 0,user_id,time
5424,20167,20201125120000
5425,16270,20201110120000
5426,19114,20201106230000
5427,21505,20201104110000
5428,18822,20201102120000


In [5]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(train_err_0.head())

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_err['model_fwver'] = test_err.model_nm + test_err.fwver
test_err['errtype_code'] = test_err.errtype.astype(str) + test_err.errcode

test_err_0 = test_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [6]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

train_err = train_err.merge(train_prob1, on='user_id', how='left').fillna(0)

In [7]:
err_code = train_err.groupby(['errtype_code'])['prob'].sum().rank(method='min').reset_index()
err_code.columns = ['errtype_code','err_code_rank']
err_code

Unnamed: 0,errtype_code,err_code_rank
0,0,940.0
1,10,2815.0
2,101,2853.0
3,111,2855.0
4,121,2856.0
...,...,...
2866,9C-14014,2727.0
2867,9V-21002,2737.0
2868,9V-21004,940.0
2869,9V-21005,2730.0


In [8]:
train_err = train_err.merge(err_code, on='errtype_code', how='left').fillna(0)
test_err = test_err.merge(err_code, on='errtype_code', how='left').fillna(0)

In [9]:
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,err_code_rank
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,2869.0
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,2856.0
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,2855.0
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,2868.0
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,2864.0
...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1,20201130,model_305.15.2138,151,0.0,2869.0
16554659,24999,20201130172625,model_3,05.15.2138,16,1,20201130,model_305.15.2138,161,0.0,2868.0
16554660,24999,20201130172625,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0
16554661,24999,20201130172631,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0


In [None]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(train_err_1.head())

test_err_1 = test_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
test_err_1.columns = ['user_id','date','date_cnt']
test_err_1 = test_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
test_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(test_err_1.head())

train_err_1.shape

In [None]:
train = train_err_1.copy()
test = test_err_1.copy()

In [None]:
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())
models = train_model & test_model
models = list(models)

for model in models:
    train[model] = 0
    test[model] = 0
    
train.shape, test.shape    

In [None]:
train_err_2 = train_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
train_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
display(train_err_2.head())

test_err_2 = test_err.groupby(['user_id','model_fwver']).count().reset_index()[['user_id','model_fwver','time']]
test_err_2.columns = ['user_id','model_fwver','model_fwver_cnt']
display(test_err_2.head())

In [None]:
for idx, col in tqdm(enumerate(train_err_2.values)):
    i = train[train.user_id == train_err_2.loc[idx].user_id].index
    c = train.columns[train.columns == train_err_2.loc[idx].model_fwver]
    
    train.loc[i, c] += train_err_2.loc[idx].model_fwver_cnt    

for idx, col in tqdm(enumerate(test_err_2.values)):
    i = test[test.user_id == test_err_2.loc[idx].user_id].index
    c = test.columns[test.columns == test_err_2.loc[idx].model_fwver]
    
    test.loc[i, c] += test_err_2.loc[idx].model_fwver_cnt    

In [None]:
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = train_error & test_error
errors = list(errors)

for error in errors:
    train['E'+str(error)] = 0
    test['E'+str(error)] = 0
    
train.shape, test.shape    

In [None]:
train_err_3 = train_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
train_err_3.columns = ['user_id','errtype','errtype_cnt']
display(train_err_3.head())

test_err_3 = test_err.groupby(['user_id','errtype']).count().reset_index()[['user_id','errtype','time']]
test_err_3.columns = ['user_id','errtype','errtype_cnt']
display(test_err_3.head())

In [None]:
for idx, col in tqdm(enumerate(train_err_3.values)):
    i = train[train.user_id == train_err_3.loc[idx].user_id].index
    c = train.columns[train.columns == 'E'+str(train_err_3.loc[idx].errtype)]
    
    train.loc[i, c] += train_err_3.loc[idx].errtype_cnt    

for idx, col in tqdm(enumerate(test_err_3.values)):
    i = test[test.user_id == test_err_3.loc[idx].user_id].index
    c = test.columns[test.columns == 'E'+str(test_err_3.loc[idx].errtype)]
    
    test.loc[i, c] += test_err_3.loc[idx].errtype_cnt    

In [None]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [None]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [None]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [None]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [None]:
train.to_csv("./train.csv", index=False)
test.to_csv("./test.csv", index=False)

In [10]:
train  = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [11]:
train_err_d1 = train_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
train_err_d1.columns = ['user_id', 'day_max']
train_err_d1

Unnamed: 0,user_id,day_max
0,10000,20
1,10001,1452
2,10002,17
3,10003,24
4,10004,102
...,...,...
14995,24995,30
14996,24996,4
14997,24997,64
14998,24998,37


In [12]:
test_err_d1 = test_err.groupby(['user_id','date']).count().groupby('user_id')['time'].max().reset_index()
test_err_d1.columns = ['user_id', 'day_max']
test_err_d1

Unnamed: 0,user_id,day_max
0,30000,808
1,30001,22
2,30002,67
3,30003,75
4,30004,85
...,...,...
14993,44994,90
14994,44995,35
14995,44996,430
14996,44997,2606


In [13]:
train = train.merge(train_err_d1, on='user_id', how='left')
test = test.merge(test_err_d1, on='user_id', how='left')
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max
0,10000,30,317,0,0,0,0,0,0,0,...,0,4,0,4,0,4,4,0,0,20
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,1452
2,10002,29,306,0,0,0,0,0,0,0,...,2,22,4,22,0,22,22,1,0,17
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,24
4,10004,30,777,0,0,645,0,0,0,0,...,1,6,2,6,0,6,6,1,0,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,2,6,2,0,0,0,6,2,0,30
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
14997,24997,30,826,0,0,465,0,0,0,0,...,1,8,1,0,0,0,8,1,0,64
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [14]:
import datetime

train_err['weekday'] = pd.to_datetime(train_err.date).dt.weekday
test_err['weekday'] = pd.to_datetime(test_err.date).dt.weekday

train_err = pd.concat([train_err, pd.get_dummies(train_err['weekday'], prefix='wd')], axis=1)
test_err = pd.concat([test_err, pd.get_dummies(test_err['weekday'], prefix='wd')], axis=1)

train_wd = train_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()
test_wd = test_err.groupby('user_id')[['wd_0', 'wd_1', 'wd_2','wd_3', 'wd_4', 'wd_5', 'wd_6']].sum()

train = train.merge(train_wd, on='user_id', how='left').fillna(0)
test = test.merge(test_wd, on='user_id', how='left').fillna(0)

In [21]:
train_err_9 = train_err.groupby('user_id').agg({'err_code_rank':['sum','max']}).reset_index()
train_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

test_err_9 = test_err.groupby('user_id').agg({'err_code_rank':['sum','max']}).reset_index()
test_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

In [23]:
train = train.merge(train_err_9, on='user_id', how='left').fillna(0)
test = test.merge(test_err_9, on='user_id', how='left').fillna(0)

In [25]:
display(train.head())
display(test.head())

train.shape, test.shape

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,day_max,wd_0,wd_1,wd_2,wd_3,wd_4,wd_5,wd_6,err_rank_sum,err_rank_max
0,10000,30,317,0,0,0,0,0,0,0,...,20,37.0,41.0,45.0,45.0,43.0,54.0,52.0,907345.0,2869.0
1,10001,30,2365,0,0,0,0,0,379,0,...,1452,164.0,138.0,107.0,166.0,1534.0,117.0,139.0,6763505.0,2870.0
2,10002,29,306,0,0,0,0,0,0,0,...,17,52.0,34.0,46.0,40.0,34.0,42.0,58.0,876071.0,2869.0
3,10003,30,306,0,0,0,0,0,81,0,...,24,51.0,52.0,42.0,45.0,37.0,22.0,57.0,870419.0,2871.0
4,10004,30,777,0,0,645,0,0,0,0,...,102,96.0,144.0,115.0,59.0,171.0,84.0,108.0,2226738.0,2871.0


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,day_max,wd_0,wd_1,wd_2,wd_3,wd_4,wd_5,wd_6,err_rank_sum,err_rank_max
0,30000,29,2750,0,0,0,0,0,0,0,...,808,267.0,62.0,410.0,254.0,223.0,1024.0,510.0,7885129.0,2871.0
1,30001,28,284,0,0,0,0,0,0,0,...,22,43.0,32.0,57.0,24.0,18.0,54.0,56.0,812544.0,2869.0
2,30002,30,941,0,0,733,0,0,0,0,...,67,138.0,120.0,158.0,151.0,99.0,140.0,135.0,2694983.0,2871.0
3,30003,28,371,0,0,246,0,0,0,0,...,75,49.0,34.0,69.0,32.0,92.0,24.0,71.0,1061508.0,2871.0
4,30004,30,881,0,0,0,0,0,0,0,...,85,109.0,112.0,120.0,176.0,105.0,102.0,157.0,2518366.0,2870.0


((15000, 96), (14998, 96))

In [None]:
plt.figure(figsize=(8,8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
plt.figure(figsize=(12,12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r')

In [None]:
test.columns

In [None]:
train

In [None]:
test.info()

In [None]:
cols = test.columns[1:]
cols = list(cols)

In [None]:
min = train[cols].min()
max  = train[cols].max()

In [None]:
for i, col in enumerate(cols):
    train[col] = (train[col] - min[i]) / (max[i] - min[i])
    test[col] = (test[col] - min[i]) / (max[i] - min[i])

In [26]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

train = train.merge(train_prob1, on='user_id', how='left').fillna(0)
train

train.prob = train.prob.astype(int)

In [None]:
train.info()

In [31]:
train.columns[1:-1]

Index(['date_cnt', 'date_sum', 'model_403.11.1167', 'model_804.73.2571',
       'model_004.22.1750', 'model_610', 'model_68.5.3', 'model_204.33.1185',
       'model_705.66.3237', 'model_104.16.3571', 'model_004.22.1778',
       'model_104.16.3569', 'model_204.33.1171', 'model_305.15.2120',
       'model_504.82.1684', 'model_305.15.3104', 'model_504.82.1778',
       'model_104.16.3553', 'model_504.82.1730', 'model_403.11.1149',
       'model_104.16.3439', 'model_305.15.2092', 'model_305.15.2138',
       'model_004.22.1656', 'model_204.33.1125', 'model_204.33.1261',
       'model_004.22.1666', 'model_403.11.1141', 'model_705.66.3571',
       'model_305.15.2114', 'model_204.33.1149', 'model_804.73.2237',
       'model_004.22.1684', 'E1', 'E2', 'E3', 'E4', 'E5', 'E6', 'E7', 'E8',
       'E9', 'E10', 'E11', 'E12', 'E13', 'E14', 'E15', 'E16', 'E17', 'E18',
       'E19', 'E20', 'E21', 'E22', 'E23', 'E24', 'E25', 'E26', 'E27', 'E28',
       'E30', 'E31', 'E32', 'E33', 'E34', 'E35', 'E36', 'E37

In [36]:
%%time
clf = setup(session_id=seed, data=train, target='prob'
           , numeric_features=train.columns[1:-1]
           , ignore_features=['err_rank_sum']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 97)"
4,Missing Values,False
5,Numeric Features,96
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 4.11 s


In [37]:
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7933,0.8188,0.5046,0.8023,0.6193,0.4874,0.5122,13.0619
1,Gradient Boosting Classifier,0.7894,0.816,0.4743,0.8177,0.6,0.4705,0.5029,4.618
2,Light Gradient Boosting Machine,0.792,0.8127,0.5197,0.7833,0.6246,0.4889,0.5087,0.5806
3,Extra Trees Classifier,0.7871,0.8049,0.492,0.7905,0.6064,0.4711,0.4961,0.6332
4,Ada Boost Classifier,0.776,0.7989,0.4911,0.7503,0.5935,0.4482,0.4674,1.0714
5,Extreme Gradient Boosting,0.7792,0.7967,0.5317,0.7324,0.6159,0.4665,0.4784,1.9518
6,Random Forest Classifier,0.762,0.7671,0.4494,0.7339,0.5571,0.4071,0.4301,0.1213
7,Linear Discriminant Analysis,0.7503,0.739,0.332,0.8036,0.4698,0.3415,0.3986,0.1561


In [38]:
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7867,0.8107,0.4843,0.7958,0.6021,0.4679,0.4949
1,0.8014,0.8295,0.5214,0.8166,0.6364,0.5088,0.5331
2,0.7924,0.8216,0.4971,0.8056,0.6148,0.4834,0.5098
3,0.8057,0.835,0.5,0.8578,0.6318,0.512,0.5464
4,0.788,0.801,0.4929,0.7931,0.6079,0.4733,0.4984
Mean,0.7948,0.8196,0.4991,0.8138,0.6186,0.4891,0.5165
SD,0.0075,0.0124,0.0123,0.0235,0.0133,0.0181,0.02


In [39]:
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.792,0.8205,0.4827,0.819,0.6074,0.4785,0.5094


In [40]:
%%time
final_model = finalize_model(blended)

Wall time: 3min 36s


In [41]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 96)

In [42]:
predictions = predict_model(final_model, data = test_x)

In [43]:
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.8261
1,30001,0.2416
2,30002,0.3038
3,30003,0.7818
4,30004,0.7882


In [44]:
sample_submssion.to_csv("./submission/submission_20210131-4.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.8261
1,30001,0.2416
2,30002,0.3038
3,30003,0.7818
4,30004,0.7882
...,...,...
14994,44994,0.3254
14995,44995,0.2507
14996,44996,0.5235
14997,44997,0.7701
