In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
# Use one fixed seed for every random number generator touched in this notebook.
seed = 42
np.random.seed(seed)        # NumPy global RNG
python_random.seed(seed)    # Python standard-library `random` module
# NOTE(review): set_config comes from the pycaret star-import and is called here
# before setup() has run — confirm the installed PyCaret version supports that.
set_config('seed', seed)

In [2]:
# Folder that holds every input CSV used below.
PATH = './data/'

# Training inputs: error log, quality log and problem reports.
train_err, train_qua, train_prob = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train_err_data.csv', 'train_quality_data.csv', 'train_problem_data.csv')
)

# Test inputs: error log and quality log only.
test_err, test_qua = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('test_err_data.csv', 'test_quality_data.csv')
)

# Submission template; its user_id column drives the final prediction table.
sample_submssion = pd.read_csv(PATH + 'sample_submission.csv')

In [3]:
def chg_qua(x):
    """Collapse a value to a 0/1 "is non-zero" flag.

    Parameters
    ----------
    x : scalar
        A number, or a string holding a number (optionally with thousands
        separators such as ``'1,234'`` and surrounding whitespace).

    Returns
    -------
    int
        0 when ``x`` is zero (including numeric text such as ``'0'``),
        otherwise 1. Values that cannot be read as a number (NaN,
        non-numeric text, empty strings) yield 1.
    """
    if isinstance(x, str):
        # Columns read from CSV can hold numeric text; comparing '0' == 0 is
        # False, which would wrongly flag a zero reading as non-zero.
        try:
            x = float(x.replace(',', '').strip())
        except ValueError:
            return 1
    return 0 if x == 0 else 1

In [4]:
# Add the same three derived key columns to both error logs (frames are modified in place).
for err_log in (train_err, test_err):
    # First 8 characters of the stringified timestamp.
    err_log['date'] = err_log['time'].astype(str).str[:8]
    # Model name and firmware version concatenated into one key.
    err_log['model_fwver'] = err_log['model_nm'] + err_log['fwver']
    # Error type (as text) concatenated with the error code.
    err_log['errtype_code'] = err_log['errtype'].astype(str) + err_log['errcode']

key_cols = ['user_id', 'date', 'model_fwver', 'errtype_code']

# Distinct key combinations per log, shown for a quick sanity check.
train_err_0 = train_err[key_cols].drop_duplicates()
display(train_err_0.head())

test_err_0 = test_err[key_cols].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [None]:
# Per-user activity summary built in two steps:
#   1. count log rows (non-null `time`) per (user, day);
#   2. per user, count the days with logs (`date_cnt`) and total the rows (`date_sum`).
train_daily = (
    train_err.groupby(['user_id', 'date'])['time']
    .count()
    .reset_index(name='date_cnt')
)
train_err_1 = (
    train_daily.groupby('user_id')
    .agg({'date': 'count', 'date_cnt': 'sum'})
    .reset_index()
)
train_err_1.columns = ['user_id', 'date_cnt', 'date_sum']
display(train_err_1.head())

test_daily = (
    test_err.groupby(['user_id', 'date'])['time']
    .count()
    .reset_index(name='date_cnt')
)
test_err_1 = (
    test_daily.groupby('user_id')
    .agg({'date': 'count', 'date_cnt': 'sum'})
    .reset_index()
)
test_err_1.columns = ['user_id', 'date_cnt', 'date_sum']
display(test_err_1.head())

train_err_1.shape

In [None]:
# Start both feature tables from copies of the per-user activity counts,
# so later column additions do not touch train_err_1 / test_err_1.
train, test = train_err_1.copy(), test_err_1.copy()

In [None]:
# Model/firmware keys that occur in BOTH error logs; only these become feature columns.
train_model = set(train_err['model_fwver'].unique())
test_model = set(test_err['model_fwver'].unique())
models = list(train_model.intersection(test_model))

# Create one zero-initialised counter column per shared key in each feature table.
for feature_table in (train, test):
    for model in models:
        feature_table[model] = 0

train.shape, test.shape

In [None]:
# Number of log rows (non-null `time`) per user and model/firmware key.
train_err_2 = (
    train_err.groupby(['user_id', 'model_fwver'])['time']
    .count()
    .reset_index(name='model_fwver_cnt')
)
display(train_err_2.head())

test_err_2 = (
    test_err.groupby(['user_id', 'model_fwver'])['time']
    .count()
    .reset_index(name='model_fwver_cnt')
)
display(test_err_2.head())

In [None]:
# Add every user's per-model/firmware log count to the matching counter column
# of the feature tables.
#
# The counts are pivoted to one row per user and one column per key, aligned to
# the feature table by user_id, and added in a single vectorised step. This
# replaces a Python loop that scanned the whole feature table once per count
# row. Keys without a matching feature column are skipped, and users without a
# count for a key contribute 0.
for features, counts in ((train, train_err_2), (test, test_err_2)):
    wide = counts.pivot(index='user_id', columns='model_fwver', values='model_fwver_cnt')

    # Only keys that already exist as feature columns receive counts.
    shared = [key for key in wide.columns if key in features.columns]
    if not shared:
        continue

    increments = (
        wide[shared]
        .reindex(features['user_id'])   # one row per feature-table row, same order
        .fillna(0)                      # user/key pairs that never occurred
        .astype('int64')
        .to_numpy()
    )
    features[shared] = features[shared].to_numpy() + increments

In [None]:
# Error types that occur in BOTH error logs; only these become feature columns.
train_error = set(train_err['errtype'].unique())
test_error = set(test_err['errtype'].unique())
errors = list(train_error.intersection(test_error))

# One zero-initialised counter column per shared error type, named 'E<errtype>'.
for feature_table in (train, test):
    for error in errors:
        feature_table['E' + str(error)] = 0

train.shape, test.shape

In [None]:
# Number of log rows (non-null `time`) per user and error type.
train_err_3 = (
    train_err.groupby(['user_id', 'errtype'])['time']
    .count()
    .reset_index(name='errtype_cnt')
)
display(train_err_3.head())

test_err_3 = (
    test_err.groupby(['user_id', 'errtype'])['time']
    .count()
    .reset_index(name='errtype_cnt')
)
display(test_err_3.head())

In [None]:
# Add every user's per-error-type log count to the matching 'E<errtype>' counter
# column of the feature tables.
#
# The counts are pivoted to one row per user and one column per error type,
# aligned to the feature table by user_id, and added in a single vectorised
# step. This replaces a Python loop that scanned the whole feature table once
# per count row. Error types without a matching feature column are skipped, and
# users without a count for a type contribute 0.
for features, counts in ((train, train_err_3), (test, test_err_3)):
    wide = counts.pivot(index='user_id', columns='errtype', values='errtype_cnt')
    # Feature columns are named 'E' + the error type as text.
    wide.columns = ['E' + str(code) for code in wide.columns]

    # Only error types that already exist as feature columns receive counts.
    shared = [name for name in wide.columns if name in features.columns]
    if not shared:
        continue

    increments = (
        wide[shared]
        .reindex(features['user_id'])   # one row per feature-table row, same order
        .fillna(0)                      # user/type pairs that never occurred
        .astype('int64')
        .to_numpy()
    )
    features[shared] = features[shared].to_numpy() + increments

In [None]:
# Quality-log cleanup, in this order: remove exact duplicate rows, drop the
# columns that are not used as features, then replace missing values with 0.
qua_dropped = ['quality_3', 'quality_4', 'time', 'fwver']

train_qua_0 = train_qua.drop_duplicates().drop(columns=qua_dropped).fillna(0)
test_qua_0 = test_qua.drop_duplicates().drop(columns=qua_dropped).fillna(0)

In [None]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

# Turn every listed quality column into a 0/1 flag via chg_qua, in both tables.
for qua_frame in (train_qua_0, test_qua_0):
    for qua_col in cols:
        qua_frame[qua_col] = qua_frame[qua_col].map(chg_qua)

In [None]:
# Sum the quality columns per user; user_id is kept as a regular column.
train_qua_1 = train_qua_0.groupby('user_id', as_index=False).sum()
test_qua_1 = test_qua_0.groupby('user_id', as_index=False).sum()

In [None]:
# Attach the per-user quality sums; users without quality rows get 0 everywhere.
train = pd.merge(train, train_qua_1, on='user_id', how='left').fillna(0)
test = pd.merge(test, test_qua_1, on='user_id', how='left').fillna(0)

In [None]:
# Persist both feature tables (without the index) so later cells can reload them.
for feature_table, out_path in ((train, "./train.csv"), (test, "./test.csv")):
    feature_table.to_csv(out_path, index=False)

In [5]:
# Reload the feature tables from the CSV files in the working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [6]:
# Busiest day per user: count log rows (non-null `time`) per (user, day),
# then keep the largest daily count for each user as `day_max`.
train_err_d1 = (
    train_err.groupby(['user_id', 'date'])['time']
    .count()
    .groupby(level='user_id')
    .max()
    .reset_index(name='day_max')
)

test_err_d1 = (
    test_err.groupby(['user_id', 'date'])['time']
    .count()
    .groupby(level='user_id')
    .max()
    .reset_index(name='day_max')
)

# Left joins keep every row of the feature tables.
train = pd.merge(train, train_err_d1, on='user_id', how='left')
test = pd.merge(test, test_err_d1, on='user_id', how='left')
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max
0,10000,30,317,0,0,0,0,0,0,0,...,0,4,0,4,0,4,4,0,0,20
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,1452
2,10002,29,306,0,0,0,0,0,0,0,...,2,22,4,22,0,22,22,1,0,17
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,24
4,10004,30,777,0,0,645,0,0,0,0,...,1,6,2,6,0,6,6,1,0,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,2,6,2,0,0,0,6,2,0,30
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
14997,24997,30,826,0,0,465,0,0,0,0,...,1,8,1,0,0,0,8,1,0,64
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [None]:
# Inspection only: list the columns currently present on the training error log.
train_err.columns

In [7]:
import datetime

# Day of week for every log row, parsed from the `date` key.
train_err['weekday'] = pd.to_datetime(train_err['date']).dt.weekday
test_err['weekday'] = pd.to_datetime(test_err['date']).dt.weekday

# One indicator column per weekday value ('wd_<n>'), appended to the log frames.
train_wd_flags = pd.get_dummies(train_err['weekday'], prefix='wd')
test_wd_flags = pd.get_dummies(test_err['weekday'], prefix='wd')
train_err = pd.concat([train_err, train_wd_flags], axis=1)
test_err = pd.concat([test_err, test_wd_flags], axis=1)

# Per-user number of log rows on each weekday.
wd_cols = ['wd_0', 'wd_1', 'wd_2', 'wd_3', 'wd_4', 'wd_5', 'wd_6']
train_wd = train_err.groupby('user_id')[wd_cols].sum()
test_wd = test_err.groupby('user_id')[wd_cols].sum()

# Attach to the feature tables; anything still missing after the join becomes 0.
train = pd.merge(train, train_wd, on='user_id', how='left').fillna(0)
test = pd.merge(test, test_wd, on='user_id', how='left').fillna(0)

In [8]:
# Characters 8-9 of the stringified timestamp, kept as text (e.g. '00' ... '23').
train_err['hour'] = train_err['time'].astype(str).str[8:10]
test_err['hour'] = test_err['time'].astype(str).str[8:10]

# One indicator column per hour value ('h_<hh>'), appended to the log frames.
train_hr_flags = pd.get_dummies(train_err['hour'], prefix='h')
test_hr_flags = pd.get_dummies(test_err['hour'], prefix='h')
train_err = pd.concat([train_err, train_hr_flags], axis=1)
test_err = pd.concat([test_err, test_hr_flags], axis=1)

# 'h_00' through 'h_23'.
col = ['h_%02d' % hr for hr in range(24)]

# Per-user number of log rows in each hour of the day.
train_hr = train_err.groupby('user_id')[col].sum()
test_hr = test_err.groupby('user_id')[col].sum()

# Attach to the feature tables; anything still missing after the join becomes 0.
train = pd.merge(train, train_hr, on='user_id', how='left').fillna(0)
test = pd.merge(test, test_hr, on='user_id', how='left').fillna(0)

In [9]:
# Preview the first rows of both feature tables, then report their shapes.
display(train.head(), test.head())

train.shape, test.shape

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,h_14,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23
0,10000,30,317,0,0,0,0,0,0,0,...,10.0,9.0,6.0,15.0,19.0,12.0,0.0,10.0,5.0,2.0
1,10001,30,2365,0,0,0,0,0,379,0,...,183.0,193.0,210.0,153.0,85.0,230.0,176.0,258.0,70.0,45.0
2,10002,29,306,0,0,0,0,0,0,0,...,4.0,27.0,5.0,6.0,5.0,28.0,24.0,30.0,13.0,10.0
3,10003,30,306,0,0,0,0,0,81,0,...,0.0,8.0,3.0,5.0,5.0,19.0,39.0,30.0,2.0,3.0
4,10004,30,777,0,0,645,0,0,0,0,...,8.0,7.0,18.0,22.0,69.0,111.0,43.0,31.0,70.0,30.0


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,h_14,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23
0,30000,29,2750,0,0,0,0,0,0,0,...,105.0,284.0,157.0,133.0,143.0,522.0,475.0,97.0,113.0,55.0
1,30001,28,284,0,0,0,0,0,0,0,...,9.0,5.0,4.0,14.0,9.0,24.0,12.0,8.0,9.0,19.0
2,30002,30,941,0,0,733,0,0,0,0,...,32.0,47.0,58.0,63.0,92.0,38.0,10.0,23.0,14.0,16.0
3,30003,28,371,0,0,246,0,0,0,0,...,6.0,11.0,29.0,13.0,10.0,31.0,13.0,17.0,20.0,10.0
4,30004,30,881,0,0,0,0,0,0,0,...,66.0,55.0,34.0,32.0,31.0,27.0,32.0,24.0,13.0,59.0


((15000, 118), (14998, 118))

In [None]:
# Annotated correlation heatmap of the per-user quality sums.
fig, ax = plt.subplots(figsize=(8,8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax)

In [None]:
# Annotated correlation heatmap of the full training feature table.
fig, ax = plt.subplots(figsize=(12,12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax)

In [None]:
# Inspection only: list the columns of the test feature table.
test.columns

In [None]:
# Inspection only: display the current training feature table.
train

In [None]:
# Inspection only: print dtypes and non-null counts of the test feature table.
test.info()

In [None]:
# Every test column except the first one, as a plain list.
cols = list(test.columns[1:])

In [None]:
# Column-wise minimum and maximum of the TRAIN features listed in `cols`.
# NOTE: these names shadow the built-in min()/max() for the rest of the session.
min, max = train[cols].min(), train[cols].max()

In [None]:
# Min-max scale every column in `cols`. Both tables are scaled with the TRAIN
# minimum/maximum (the `min` / `max` Series), so train and test share one scale.
for col in cols:
    # Look the statistics up by column label rather than by integer position.
    lo, hi = min[col], max[col]
    span = hi - lo
    if span == 0:
        # Column is constant in train: dividing by 0 would produce NaN/inf,
        # so leave the values merely shifted by the minimum.
        span = 1
    train[col] = (train[col] - lo) / span
    test[col] = (test[col] - lo) / span

In [10]:
# Build the binary target: one row per user that appears in the problem
# reports, with the report count collapsed to a 0/1 flag by chg_qua.
train_prob1 = train_prob.groupby('user_id', as_index=False).count()
train_prob1['time'] = train_prob1['time'].map(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

Unnamed: 0,user_id,prob
0,10001,1
1,10004,1
2,10005,1
3,10006,1
4,10008,1
...,...,...
4995,24983,1
4996,24984,1
4997,24990,1
4998,24997,1


In [11]:
# Attach the target; users absent from the problem reports get prob = 0.
train = pd.merge(train, train_prob1, on='user_id', how='left').fillna(0)
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,h_15,h_16,h_17,h_18,h_19,h_20,h_21,h_22,h_23,prob
0,10000,30,317,0,0,0,0,0,0,0,...,9.0,6.0,15.0,19.0,12.0,0.0,10.0,5.0,2.0,0.0
1,10001,30,2365,0,0,0,0,0,379,0,...,193.0,210.0,153.0,85.0,230.0,176.0,258.0,70.0,45.0,1.0
2,10002,29,306,0,0,0,0,0,0,0,...,27.0,5.0,6.0,5.0,28.0,24.0,30.0,13.0,10.0,0.0
3,10003,30,306,0,0,0,0,0,81,0,...,8.0,3.0,5.0,5.0,19.0,39.0,30.0,2.0,3.0,0.0
4,10004,30,777,0,0,645,0,0,0,0,...,7.0,18.0,22.0,69.0,111.0,43.0,31.0,70.0,30.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,0.0,14.0,11.0,11.0,20.0,4.0,14.0,18.0,16.0,0.0
14996,24996,1,4,0,0,0,0,0,0,0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14997,24997,30,826,0,0,465,0,0,0,0,...,8.0,11.0,16.0,35.0,19.0,65.0,39.0,62.0,41.0,1.0
14998,24998,21,155,0,0,8,0,0,0,0,...,4.0,1.0,1.0,37.0,3.0,2.0,2.0,0.0,0.0,1.0


In [12]:
# Store the target as integers.
train['prob'] = train['prob'].astype(int)

In [None]:
# Inspection only: print dtypes and non-null counts of the training table.
train.info()

In [None]:
# Inspection only: the columns between the first and the last one,
# i.e. the slice that is later passed to setup() as numeric features.
train.columns[1:-1]

In [13]:
%%time
clf = setup(session_id=seed, data=train, target='prob'
           , numeric_features=train.columns[1:-1])

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 119)"
4,Missing Values,False
5,Numeric Features,118
6,Categorical Features,0
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 28 s


In [14]:
# Compare the candidate models, ranked by AUC, and keep the 5 best (n_select);
# the model ids listed in `exclude` are left out of the comparison.
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7916,0.8181,0.4989,0.8014,0.6146,0.4823,0.5079,18.6183
1,Light Gradient Boosting Machine,0.7888,0.814,0.5117,0.7796,0.6174,0.4802,0.5008,1.0217
2,Gradient Boosting Classifier,0.7872,0.8139,0.472,0.811,0.5964,0.4653,0.4969,8.6065
3,Extra Trees Classifier,0.7829,0.8025,0.4786,0.7873,0.5952,0.4586,0.4852,0.7592
4,Ada Boost Classifier,0.7765,0.7954,0.494,0.7498,0.5954,0.45,0.4687,1.8638
5,Extreme Gradient Boosting,0.7809,0.7951,0.5294,0.7397,0.6167,0.4693,0.4824,3.0221
6,Random Forest Classifier,0.7577,0.7588,0.4311,0.7324,0.5424,0.3924,0.418,0.1718
7,Linear Discriminant Analysis,0.75,0.737,0.3363,0.7965,0.4727,0.3426,0.3974,0.2464


In [15]:
# Blend the selected models from compare_models with soft voting, evaluated on 5 folds.
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7781,0.8104,0.4686,0.7773,0.5847,0.4457,0.4722
1,0.8043,0.8281,0.5243,0.8247,0.641,0.5155,0.5405
2,0.7867,0.8208,0.4886,0.7917,0.6042,0.4692,0.4948
3,0.7995,0.8337,0.48,0.855,0.6148,0.4934,0.5309
4,0.7842,0.7989,0.4829,0.7879,0.5988,0.4625,0.4885
Mean,0.7906,0.8184,0.4889,0.8073,0.6087,0.4773,0.5054
SD,0.0098,0.0125,0.0189,0.0286,0.0189,0.0245,0.026


In [16]:
# Score the blended model; with no `data` argument PyCaret reports hold-out metrics.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7885,0.8209,0.4773,0.81,0.6007,0.4696,0.4999


In [17]:
%%time
# Produce the final model from the blend; this is the model used for the test predictions.
final_model = finalize_model(blended)

Wall time: 4min 35s


In [18]:
# Align the test features to the submission's user_id list; users that have no
# feature row get 0 for every feature.
submission_ids = sample_submssion[['user_id']]
test_x = pd.merge(submission_ids, test, on='user_id', how='left').fillna(0)
test_x.shape

(14999, 118)

In [19]:
# Predict on the aligned test features with the finalized model.
predictions = predict_model(final_model, data = test_x)

In [20]:
# Copy the model score into the submission; rows are matched by index
# (test_x was built from sample_submssion's user_id column).
# NOTE(review): the meaning of `Score` depends on the PyCaret version — in some
# releases it is the probability of the predicted label rather than of class 1.
# Confirm against the installed version before submitting.
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.8327
1,30001,0.2317
2,30002,0.3079
3,30003,0.7535
4,30004,0.7906


In [21]:
import os

# to_csv does not create missing folders, so make sure the output directory
# exists before writing the submission file.
os.makedirs("./submission", exist_ok=True)
sample_submssion.to_csv("./submission/submission_20210131-3.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.8327
1,30001,0.2317
2,30002,0.3079
3,30003,0.7535
4,30004,0.7906
...,...,...
14994,44994,0.3706
14995,44995,0.2911
14996,44996,0.5197
14997,44997,0.7664
