In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
# One seed shared by every source of randomness configured in this notebook.
seed = 42
np.random.seed(seed)        # NumPy global RNG
python_random.seed(seed)    # Python's built-in `random` module
# NOTE(review): `set_config` comes from the `pycaret.classification` star import and is
# called here before any `setup(...)` call — confirm PyCaret accepts that ordering.
set_config('seed', seed)

In [2]:
# Folder holding the raw CSV files.
PATH = './data/'

# Training tables: error log, quality log and problem reports.
train_err, train_qua, train_prob = (
    pd.read_csv(PATH + file_name)
    for file_name in ('train_err_data.csv', 'train_quality_data.csv', 'train_problem_data.csv')
)

# Test tables: error log and quality log (there is no problem table for test).
test_err, test_qua = (
    pd.read_csv(PATH + file_name)
    for file_name in ('test_err_data.csv', 'test_quality_data.csv')
)

# Submission template.
sample_submssion = pd.read_csv(PATH + 'sample_submission.csv')

In [3]:
def chg_qua(x):
    """Binarise a value: return 0 when ``x == 0``, otherwise 1."""
    return 0 if x == 0 else 1

In [4]:
# Show the last rows of each raw training table.
for raw_table in (train_err, train_qua, train_prob):
    display(raw_table.tail())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
16554658,24999,20201130163051,model_3,05.15.2138,15,1
16554659,24999,20201130172625,model_3,05.15.2138,16,1
16554660,24999,20201130172625,model_3,05.15.2138,4,0
16554661,24999,20201130172631,model_3,05.15.2138,4,0
16554662,24999,20201130210625,model_3,05.15.2138,15,1


Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
828619,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,1,0,0,0,0,17,0,0
828620,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828621,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,3,0,0,0,0,17,0,0
828622,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828623,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,9,0,0,0,0,17,0,0


Unnamed: 0,user_id,time
5424,20167,20201125120000
5425,16270,20201110120000
5426,19114,20201106230000
5427,21505,20201104110000
5428,18822,20201102120000


In [5]:
# Add the same three helper columns to both error logs:
#   date         - first 8 characters of the stringified `time`
#   model_fwver  - `model_nm` and `fwver` concatenated
#   errtype_code - stringified `errtype` followed by `errcode`
# NOTE(review): both concatenations use no separator; confirm that distinct
# (errtype, errcode) pairs can never produce the same `errtype_code` string.
for err_log in (train_err, test_err):
    err_log['date'] = err_log['time'].astype(str).str[:8]
    err_log['model_fwver'] = err_log['model_nm'] + err_log['fwver']
    err_log['errtype_code'] = err_log['errtype'].astype(str) + err_log['errcode']

key_cols = ['user_id', 'date', 'model_fwver', 'errtype_code']

# De-duplicated key combinations, shown for a quick sanity check.
train_err_0 = train_err[key_cols].drop_duplicates()
display(train_err_0.head())

test_err_0 = test_err[key_cols].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [6]:
# One row per user_id found in the problem table; the per-user report count is
# binarised with `chg_qua` and exposed as the `prob` flag.
train_prob1 = (
    train_prob
    .groupby('user_id')
    .count()
    .reset_index()
)
train_prob1['time'] = train_prob1['time'].map(chg_qua)
train_prob1.columns = ['user_id', 'prob']

# Attach the flag to every error row. `fillna(0)` is applied to the whole merged
# frame, so missing values in any column (not only `prob`) become 0.
train_err = pd.merge(train_err, train_prob1, how='left', on='user_id').fillna(0)

In [7]:
# Rank every errtype_code by the sum of `prob` over its training rows
# (ties share the lowest rank because of method='min').
prob_per_code = train_err.groupby(['errtype_code'])['prob'].sum()
err_code = prob_per_code.rank(method='min').reset_index()
err_code.columns = ['errtype_code', 'err_code_rank']
err_code

Unnamed: 0,errtype_code,err_code_rank
0,0,940.0
1,10,2815.0
2,101,2853.0
3,111,2855.0
4,121,2856.0
...,...,...
2866,9C-14014,2727.0
2867,9V-21002,2737.0
2868,9V-21004,940.0
2869,9V-21005,2730.0


In [8]:
# Attach the training-derived code rank to both logs; after the merge every
# missing value in the frame (including unmatched ranks) is replaced by 0.
train_err = pd.merge(train_err, err_code, how='left', on='errtype_code').fillna(0)
test_err = pd.merge(test_err, err_code, how='left', on='errtype_code').fillna(0)

In [9]:
# Inspect the training error log after the rank merge.
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,err_code_rank
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,2869.0
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,2856.0
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,2855.0
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,2868.0
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,2864.0
...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1,20201130,model_305.15.2138,151,0.0,2869.0
16554659,24999,20201130172625,model_3,05.15.2138,16,1,20201130,model_305.15.2138,161,0.0,2868.0
16554660,24999,20201130172625,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0
16554661,24999,20201130172631,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0


In [10]:
# Rank each calendar date by the sum of `prob` over its training rows.
prob_per_date = train_err.groupby(['date'])['prob'].sum()
date_rank = prob_per_date.rank(method='min').reset_index()
date_rank.columns = ['date', 'date_rank']

# Column-wise minimum and maximum of the rank table.
date_rank.min(), date_rank.max()

(date         20201031
 date_rank           1
 dtype: object,
 date         20201202
 date_rank          33
 dtype: object)

In [11]:
# Attach the date rank to both logs; the frame-wide fillna(0) turns unmatched
# dates (and any other missing value) into 0.
train_err = pd.merge(train_err, date_rank, how='left', on='date').fillna(0)
test_err = pd.merge(test_err, date_rank, how='left', on='date').fillna(0)
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,err_code_rank,date_rank
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,2869.0,31.0
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,2856.0,31.0
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,2855.0,31.0
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,2868.0,31.0
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,2864.0,31.0
...,...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1,20201130,model_305.15.2138,151,0.0,2869.0,4.0
16554659,24999,20201130172625,model_3,05.15.2138,16,1,20201130,model_305.15.2138,161,0.0,2868.0,4.0
16554660,24999,20201130172625,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0
16554661,24999,20201130172631,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0


In [12]:
# Rank each model/firmware combination by the sum of `prob` over its training rows.
prob_per_model = train_err.groupby(['model_fwver'])['prob'].sum()
model_fwver_rank = prob_per_model.rank(method='min').reset_index()
model_fwver_rank.columns = ['model_fwver', 'model_fwver_rank']

# Column-wise minimum and maximum of the rank table.
model_fwver_rank.min(), model_fwver_rank.max()

(model_fwver         model_004.22.1442
 model_fwver_rank                    1
 dtype: object,
 model_fwver         model_804.73.2571
 model_fwver_rank                   37
 dtype: object)

In [13]:
# Attach the model/firmware rank to both logs; missing values become 0 frame-wide.
train_err = pd.merge(train_err, model_fwver_rank, how='left', on='model_fwver').fillna(0)
test_err = pd.merge(test_err, model_fwver_rank, how='left', on='model_fwver').fillna(0)
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,err_code_rank,date_rank,model_fwver_rank
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,2869.0,31.0,32.0
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,2856.0,31.0,32.0
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,2855.0,31.0,32.0
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,2868.0,31.0,32.0
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,2864.0,31.0,32.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1,20201130,model_305.15.2138,151,0.0,2869.0,4.0,32.0
16554659,24999,20201130172625,model_3,05.15.2138,16,1,20201130,model_305.15.2138,161,0.0,2868.0,4.0,32.0
16554660,24999,20201130172625,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0,32.0
16554661,24999,20201130172631,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,2864.0,4.0,32.0


In [14]:
# Stack the (model_fwver, time) pairs of the training and test logs, train first.
model_time = pd.concat(
    [err_log[['model_fwver', 'time']] for err_log in (train_err, test_err)]
)

In [15]:
# Rank each timestamp within its model/firmware group (ties take the lowest rank).
time_by_model = model_time.groupby(['model_fwver'])['time']
model_time['time_rank'] = time_by_model.rank(method='min')

In [16]:
# Keep one row per distinct (model_fwver, time, time_rank) so the merges that
# use this table as a lookup do not multiply rows.
model_time = model_time.drop_duplicates()

In [17]:
# Look up `time_rank` for every training error row.
train_err = pd.merge(train_err, model_time, how='left', on=['model_fwver', 'time'])

In [18]:
# Look up `time_rank` for every test error row.
test_err = pd.merge(test_err, model_time, how='left', on=['model_fwver', 'time'])

In [None]:
# Per user: `date_cnt` = number of distinct dates with at least one error row,
# `date_sum` = total number of error rows (non-null `time` values).
daily_train = train_err.groupby(['user_id', 'date'])['time'].count().reset_index()
daily_train.columns = ['user_id', 'date', 'date_cnt']
train_err_1 = (
    daily_train
    .groupby('user_id')
    .agg({'date': 'count', 'date_cnt': 'sum'})
    .reset_index()
)
train_err_1.columns = ['user_id', 'date_cnt', 'date_sum']
display(train_err_1.head())

daily_test = test_err.groupby(['user_id', 'date'])['time'].count().reset_index()
daily_test.columns = ['user_id', 'date', 'date_cnt']
test_err_1 = (
    daily_test
    .groupby('user_id')
    .agg({'date': 'count', 'date_cnt': 'sum'})
    .reset_index()
)
test_err_1.columns = ['user_id', 'date_cnt', 'date_sum']
display(test_err_1.head())

train_err_1.shape

In [None]:
# Start the per-user feature tables from independent copies of the date counts.
train, test = train_err_1.copy(), test_err_1.copy()

In [None]:
# Model/firmware combinations that occur in both logs.
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())
models = list(train_model & test_model)

# Create one zero-initialised count column per shared combination in each table.
for feature_table in (train, test):
    for model in models:
        feature_table[model] = 0

train.shape, test.shape

In [None]:
# Number of error rows per (user, model/firmware) pair, for each log.
train_err_2 = train_err.groupby(['user_id', 'model_fwver'])['time'].count().reset_index()
train_err_2.columns = ['user_id', 'model_fwver', 'model_fwver_cnt']
display(train_err_2.head())

test_err_2 = test_err.groupby(['user_id', 'model_fwver'])['time'].count().reset_index()
test_err_2.columns = ['user_id', 'model_fwver', 'model_fwver_cnt']
display(test_err_2.head())

In [None]:
# Add each user's per-model/firmware error counts to the matching, already
# existing columns of `train` / `test`.
#
# The previous implementation looped over every (user, model) row and rescanned
# the whole feature table for each one. Here the long count table is pivoted to
# wide form once and aligned on user_id, which produces the same integer counts.
# Combinations without a column in the feature table are skipped, exactly as
# before (the old column mask was simply empty for them).
for feature_table, count_table in ((train, train_err_2), (test, test_err_2)):
    wide_counts = count_table.pivot(
        index='user_id', columns='model_fwver', values='model_fwver_cnt'
    )
    shared_cols = [name for name in wide_counts.columns if name in feature_table.columns]
    if not shared_cols:
        continue
    # Align rows to the feature table's user order; users without a count add 0.
    increments = (
        wide_counts[shared_cols]
        .reindex(feature_table['user_id'])
        .fillna(0)
        .astype('int64')
        .values
    )
    feature_table[shared_cols] = feature_table[shared_cols].values + increments

In [None]:
# Error types that occur in both logs.
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = list(train_error & test_error)

# Create one zero-initialised 'E<errtype>' count column per shared type in each table.
for feature_table in (train, test):
    for error in errors:
        feature_table['E' + str(error)] = 0

train.shape, test.shape

In [None]:
# Number of error rows per (user, errtype) pair, for each log.
train_err_3 = train_err.groupby(['user_id', 'errtype'])['time'].count().reset_index()
train_err_3.columns = ['user_id', 'errtype', 'errtype_cnt']
display(train_err_3.head())

test_err_3 = test_err.groupby(['user_id', 'errtype'])['time'].count().reset_index()
test_err_3.columns = ['user_id', 'errtype', 'errtype_cnt']
display(test_err_3.head())

In [None]:
# Add each user's per-errtype error counts to the matching, already existing
# 'E<errtype>' columns of `train` / `test`.
#
# The previous implementation looped over every (user, errtype) row and rescanned
# the whole feature table for each one. Here the long count table is pivoted to
# wide form once and aligned on user_id, which produces the same integer counts.
# Error types without an 'E<errtype>' column in the feature table are skipped,
# exactly as before.
for feature_table, count_table in ((train, train_err_3), (test, test_err_3)):
    wide_counts = count_table.pivot(
        index='user_id', columns='errtype', values='errtype_cnt'
    )
    # Match the naming used when the zero columns were created.
    wide_counts.columns = ['E' + str(errtype) for errtype in wide_counts.columns]
    shared_cols = [name for name in wide_counts.columns if name in feature_table.columns]
    if not shared_cols:
        continue
    # Align rows to the feature table's user order; users without a count add 0.
    increments = (
        wide_counts[shared_cols]
        .reindex(feature_table['user_id'])
        .fillna(0)
        .astype('int64')
        .values
    )
    feature_table[shared_cols] = feature_table[shared_cols].values + increments

In [19]:
# Quality logs: remove exact duplicate rows, drop the columns that are not used
# as features, then replace every missing value with 0.
unused_quality_cols = ['quality_3', 'quality_4', 'time', 'fwver']

train_qua_0 = (
    train_qua
    .drop_duplicates()
    .drop(columns=unused_quality_cols)
    .fillna(0)
)
test_qua_0 = (
    test_qua
    .drop_duplicates()
    .drop(columns=unused_quality_cols)
    .fillna(0)
)

In [None]:
# Quality columns that are binarised (0 stays 0, anything else becomes 1).
# NOTE(review): this overwrites the raw values in place; the later cells that call
# `.str.replace(',', '')` on quality_5/7/8/9/10 appear to expect the raw values —
# confirm the intended execution order of these cells.
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

for quality_table in (train_qua_0, test_qua_0):
    for col in cols:
        quality_table[col] = quality_table[col].map(chg_qua)

In [None]:
# Per-user column sums of the quality tables.
train_qua_1, test_qua_1 = (
    quality_table.groupby('user_id').sum().reset_index()
    for quality_table in (train_qua_0, test_qua_0)
)

In [None]:
# Attach the per-user quality sums; users without quality rows (and any other
# missing value in the frame) get 0.
train = pd.merge(train, train_qua_1, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_qua_1, how='left', on='user_id').fillna(0)

In [20]:
# Strip commas from these text-valued training quality columns and convert them
# to float. Non-string entries come out of `.str.replace` as NaN.
for quality_col in ('quality_5', 'quality_7', 'quality_8', 'quality_9', 'quality_10'):
    without_commas = train_qua_0[quality_col].str.replace(',', '')
    train_qua_0[quality_col] = without_commas.astype(float)

In [21]:
# Strip commas from these text-valued test quality columns and convert them
# to float. Non-string entries come out of `.str.replace` as NaN.
for quality_col in ('quality_5', 'quality_7', 'quality_8', 'quality_9', 'quality_10'):
    without_commas = test_qua_0[quality_col].str.replace(',', '')
    test_qua_0[quality_col] = without_commas.astype(float)

In [22]:
# Replace the NaNs left by the float conversion with 0 in both quality tables.
train_qua_0, test_qua_0 = train_qua_0.fillna(0), test_qua_0.fillna(0)

In [23]:
# Preview the per-user sums of the test quality table (result is not stored).
test_qua_0.groupby('user_id').sum().reset_index()

Unnamed: 0,user_id,quality_0,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,30000,0.0,0.0,2.0,5,10.0,0.0,0.0,4.0,0,0
1,30001,-1.0,-1.0,2561.0,-1,0.0,0.0,0.0,10245.0,-1,0
2,30002,-3.0,-3.0,130.0,-3,0.0,0.0,0.0,384.0,-3,0
3,30003,0.0,0.0,9.0,220,560.0,0.0,0.0,30.0,0,0
4,30004,-1.0,-1.0,1.0,32,90.0,0.0,0.0,4.0,-1,0
...,...,...,...,...,...,...,...,...,...,...,...
8263,44990,-2.0,-2.0,44.0,-2,0.0,0.0,0.0,141.0,-2,0
8264,44993,8.0,8.0,13.0,108,348.0,0.0,0.0,43.0,-1,0
8265,44994,-1.0,-1.0,1.0,-1,0.0,0.0,0.0,4.0,-1,0
8266,44996,-2.0,-2.0,31.0,-2,0.0,0.0,0.0,109.0,-2,0


In [24]:
# Per-user sum of every numeric quality column, named '<column>_s'.
#
# `numeric_only=True` makes the exclusion of non-numeric columns explicit instead
# of relying on pandas silently dropping them, and `add_suffix` derives the names
# from the columns actually present rather than from a hand-maintained positional
# list that must match in length and order.
# The suffix is added before `reset_index`, so `user_id` keeps its name.
train_qua_3 = (
    train_qua_0
    .groupby('user_id')
    .sum(numeric_only=True)
    .add_suffix('_s')
    .reset_index()
)

# NOTE(review): the original column list for test had no `quality_1_s`, presumably
# because `quality_1` is non-numeric there and is therefore left out — confirm.
test_qua_3 = (
    test_qua_0
    .groupby('user_id')
    .sum(numeric_only=True)
    .add_suffix('_s')
    .reset_index()
)

In [25]:
# Per-user mean of every numeric quality column, named '<column>_m'.
#
# `numeric_only=True` makes the exclusion of non-numeric columns explicit
# (newer pandas raises on a non-numeric column instead of dropping it), and
# `add_suffix` derives the names from the columns actually present rather than
# from a hand-maintained positional list that must match in length and order.
# The suffix is added before `reset_index`, so `user_id` keeps its name.
train_qua_4 = (
    train_qua_0
    .groupby('user_id')
    .mean(numeric_only=True)
    .add_suffix('_m')
    .reset_index()
)

# NOTE(review): the original column list for test had no `quality_1_m`, presumably
# because `quality_1` is non-numeric there and is therefore left out — confirm.
test_qua_4 = (
    test_qua_0
    .groupby('user_id')
    .mean(numeric_only=True)
    .add_suffix('_m')
    .reset_index()
)

In [None]:
# Inspect the per-user quality means for the test set.
test_qua_4

In [26]:
# Remove the quality_1 aggregates from the training tables.
train_qua_3 = train_qua_3.drop(columns='quality_1_s')
train_qua_4 = train_qua_4.drop(columns='quality_1_m')

In [None]:
# Persist the per-user feature tables (without the index) in the working directory.
for feature_table, csv_path in ((train, "./train.csv"), (test, "./test.csv")):
    feature_table.to_csv(csv_path, index=False)

In [27]:
# Reload the per-user feature tables saved earlier.
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))

In [28]:
# `day_max`: the largest number of error rows a user logged on a single date.
train_err_d1 = (
    train_err
    .groupby(['user_id', 'date'])['time']
    .count()
    .groupby(level='user_id')
    .max()
    .reset_index()
)
train_err_d1.columns = ['user_id', 'day_max']

test_err_d1 = (
    test_err
    .groupby(['user_id', 'date'])['time']
    .count()
    .groupby(level='user_id')
    .max()
    .reset_index()
)
test_err_d1.columns = ['user_id', 'day_max']

train = pd.merge(train, train_err_d1, how='left', on='user_id')
test = pd.merge(test, test_err_d1, how='left', on='user_id')
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max
0,10000,30,317,0,0,0,0,0,0,0,...,0,4,0,4,0,4,4,0,0,20
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,1452
2,10002,29,306,0,0,0,0,0,0,0,...,2,22,4,22,0,22,22,1,0,17
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,24
4,10004,30,777,0,0,645,0,0,0,0,...,1,6,2,6,0,6,6,1,0,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,2,6,2,0,0,0,6,2,0,30
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
14997,24997,30,826,0,0,465,0,0,0,0,...,1,8,1,0,0,0,8,1,0,64
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [29]:
import datetime

# Names of the seven weekday indicator columns (pandas weekday: Monday=0 ... Sunday=6).
wd_cols = ['wd_%d' % day for day in range(7)]


def _add_weekday_dummies(err_log, wd_cols=wd_cols):
    """Return `err_log` with a `weekday` column and one 0/1 column per weekday.

    `err_log.date` is parsed as 'YYYYMMDD'. All seven `wd_*` columns are always
    present (a weekday that never occurs is all zeros), and indicator columns left
    over from a previous run of this cell are replaced instead of duplicated.
    """
    err_log['weekday'] = pd.to_datetime(err_log.date, format='%Y%m%d').dt.weekday
    dummies = (
        pd.get_dummies(err_log['weekday'], prefix='wd')
        .reindex(columns=wd_cols, fill_value=0)
    )
    return pd.concat([err_log.drop(columns=wd_cols, errors='ignore'), dummies], axis=1)


train_err = _add_weekday_dummies(train_err)
test_err = _add_weekday_dummies(test_err)

# Per-user number of error rows on each weekday.
train_wd = train_err.groupby('user_id')[wd_cols].sum()
test_wd = test_err.groupby('user_id')[wd_cols].sum()

train = train.merge(train_wd, on='user_id', how='left').fillna(0)
test = test.merge(test_wd, on='user_id', how='left').fillna(0)

In [30]:
# Per-user sum and maximum of `err_code_rank`.
train_err_9 = train_err.groupby('user_id')['err_code_rank'].agg(['sum', 'max']).reset_index()
train_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

test_err_9 = test_err.groupby('user_id')['err_code_rank'].agg(['sum', 'max']).reset_index()
test_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

train = pd.merge(train, train_err_9, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_err_9, how='left', on='user_id').fillna(0)

In [31]:
# Preview the first rows of the training error log with the derived columns.
train_err.head()

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob,...,model_fwver_rank,time_rank,weekday,wd_0,wd_1,wd_2,wd_3,wd_4,wd_5,wd_6
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,...,32.0,13091.0,6,0,0,0,0,0,0,1
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,...,32.0,13590.0,6,0,0,0,0,0,0,1
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,...,32.0,13590.0,6,0,0,0,0,0,0,1
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,...,32.0,23520.0,6,0,0,0,0,0,0,1
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,...,32.0,23521.0,6,0,0,0,0,0,0,1


In [32]:
# Per-user sum and maximum of `date_rank`.
train_err_8 = train_err.groupby('user_id')['date_rank'].agg(['sum', 'max']).reset_index()
train_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

test_err_8 = test_err.groupby('user_id')['date_rank'].agg(['sum', 'max']).reset_index()
test_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

train = pd.merge(train, train_err_8, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_err_8, how='left', on='user_id').fillna(0)

In [33]:
# Per-user sum and maximum of `model_fwver_rank`.
train_err_7 = train_err.groupby('user_id')['model_fwver_rank'].agg(['sum', 'max']).reset_index()
train_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

test_err_7 = test_err.groupby('user_id')['model_fwver_rank'].agg(['sum', 'max']).reset_index()
test_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

train = pd.merge(train, train_err_7, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_err_7, how='left', on='user_id').fillna(0)

In [34]:
# Per-user minimum and maximum of `time_rank`.
train_err_6 = train_err.groupby('user_id')['time_rank'].agg(['min', 'max']).reset_index()
train_err_6.columns = ['user_id', 'time_rank_min', 'time_rank_max']

test_err_6 = test_err.groupby('user_id')['time_rank'].agg(['min', 'max']).reset_index()
test_err_6.columns = ['user_id', 'time_rank_min', 'time_rank_max']

train = pd.merge(train, train_err_6, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_err_6, how='left', on='user_id').fillna(0)

In [35]:
# Attach the per-user quality sums ('_s' columns); missing values become 0.
train = pd.merge(train, train_qua_3, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_qua_3, how='left', on='user_id').fillna(0)

In [36]:
# Attach the per-user quality means ('_m' columns); missing values become 0.
train = pd.merge(train, train_qua_4, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_qua_4, how='left', on='user_id').fillna(0)

In [91]:
# Preview both feature tables, then compare their shapes.
for feature_table in (train, test):
    display(feature_table.head())

train.shape, test.shape

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_12_m,date_mean_sum,date_mean_mean,date_mean_min,date_mean_max,prob,d_20201031,d_20201202,date_x,date_y
0,10000,30,317,0,0,0,0,0,0,0,...,0.0,144.892059,0.457073,0.425244,0.49899,0,0,0,0,0
1,10001,30,2365,0,0,0,0,0,379,0,...,0.0,1042.001715,0.440593,0.425244,0.49899,1,0,0,0,0
2,10002,29,306,0,0,0,0,0,0,0,...,0.0,140.090511,0.457812,0.425244,0.49899,0,0,0,0,0
3,10003,30,306,0,0,0,0,0,81,0,...,0.0,139.440738,0.455689,0.425244,0.49899,0,0,0,0,0
4,10004,30,777,0,0,645,0,0,0,0,...,0.0,357.425452,0.460007,0.425244,0.49899,1,0,0,0,0


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_11_m,quality_12_m,date_mean_sum,date_mean_mean,date_mean_min,date_mean_max,d_20201031,d_20201202,date_x,date_y
0,30000,29,2750,0,0,0,0,0,0,0,...,0.0,0.0,1255.162878,0.456423,0.425244,0.49899,0,0,0,0
1,30001,28,284,0,0,0,0,0,0,0,...,-0.1,0.0,130.987454,0.461223,0.425244,0.49899,0,0,0,0
2,30002,30,941,0,0,733,0,0,0,0,...,-0.115385,0.0,428.731842,0.455613,0.425244,0.49899,0,0,0,0
3,30003,28,371,0,0,246,0,0,0,0,...,0.0,0.0,169.332979,0.456423,0.425244,0.49899,0,0,0,0
4,30004,30,881,0,0,0,0,0,0,0,...,-0.2,0.0,401.420888,0.455642,0.425244,0.49899,0,0,0,0


((15000, 131), (14998, 130))

In [139]:
# Inspect the training error log.
train_err

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode,date,model_fwver,errtype_code,prob_x,...,time_rank,weekday,wd_0,wd_1,wd_2,wd_3,wd_4,wd_5,wd_6,date_mean
0,10000,20201101025616,model_3,05.15.2138,15,1,20201101,model_305.15.2138,151,0.0,...,13091.0,6,0,0,0,0,0,0,1,0.480152
1,10000,20201101030309,model_3,05.15.2138,12,1,20201101,model_305.15.2138,121,0.0,...,13590.0,6,0,0,0,0,0,0,1,0.480152
2,10000,20201101030309,model_3,05.15.2138,11,1,20201101,model_305.15.2138,111,0.0,...,13590.0,6,0,0,0,0,0,0,1,0.480152
3,10000,20201101050514,model_3,05.15.2138,16,1,20201101,model_305.15.2138,161,0.0,...,23520.0,6,0,0,0,0,0,0,1,0.480152
4,10000,20201101050515,model_3,05.15.2138,4,0,20201101,model_305.15.2138,40,0.0,...,23521.0,6,0,0,0,0,0,0,1,0.480152
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16554658,24999,20201130163051,model_3,05.15.2138,15,1,20201130,model_305.15.2138,151,0.0,...,3671153.0,0,1,0,0,0,0,0,0,0.429518
16554659,24999,20201130172625,model_3,05.15.2138,16,1,20201130,model_305.15.2138,161,0.0,...,3675038.0,0,1,0,0,0,0,0,0,0.429518
16554660,24999,20201130172625,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,...,3675038.0,0,1,0,0,0,0,0,0,0.429518
16554661,24999,20201130172631,model_3,05.15.2138,4,0,20201130,model_305.15.2138,40,0.0,...,3675043.0,0,1,0,0,0,0,0,0,0.429518


In [None]:
# Every date that appears in either error log.
day_all = set(train_err.date.unique()).union(test_err.date.unique())

In [156]:
# Turn the set of dates into an ascending list (rebinds `day_all` from set to list).
day_all = sorted(day_all)

In [159]:
# Dates present in the training log but absent from the test log.
set(train_err.date.unique()).difference(test_err.date.unique())

set()

In [162]:
# Latest date string in the training error log.
train_err.date.max()

'20201202'

In [160]:
# Dates present in the test log but absent from the training log.
set(test_err.date.unique()).difference(train_err.date.unique())

{'20201203', '20201204', '20201205', '20201209', '20201211', '20201214'}

In [53]:
# Mean of `prob` over the training error rows of each date, sorted ascending.
date_mean = train_err.groupby('date')['prob'].mean().sort_values()
date_mean_df = date_mean.to_frame().reset_index()
date_mean_df

Unnamed: 0,date,prob
0,20201202,0.0
1,20201126,0.425244
2,20201127,0.428967
3,20201122,0.429482
4,20201130,0.429518
5,20201123,0.436371
6,20201118,0.437313
7,20201108,0.440335
8,20201129,0.440505
9,20201115,0.440629


In [None]:
# Name the value column `date_mean` so it cannot collide with `prob` in a merge.
# NOTE(review): this cell has no execution count while the rename cells after the
# merge still handle `prob_y` / `prob` — confirm whether it is meant to run.
date_mean_df.columns = ['date','date_mean']

In [54]:
# Attach the per-date mean to every error row; dates missing from the lookup
# (and any other missing value in the frame) end up as 0.
train_err = pd.merge(train_err, date_mean_df, how='left', on='date').fillna(0)
test_err = pd.merge(test_err, date_mean_df, how='left', on='date').fillna(0)

In [57]:
# If the merged lookup column was still called `prob`, it collides with the
# existing `prob` column of `train_err` and arrives as `prob_y`; rename it.
# `rename` ignores missing labels, so this is a no-op when `prob_y` is absent.
train_err.rename(columns={'prob_y':'date_mean'}, inplace=True)

In [58]:
# Same rename for the test log, where a merged `prob` column keeps its name.
# This is a no-op when `test_err` has no `prob` column.
test_err.rename(columns={'prob':'date_mean'}, inplace=True)

In [59]:
# Per-user sum, mean, minimum and maximum of `date_mean`.
date_mean_stats = ['sum', 'mean', 'min', 'max']
date_mean_names = ['user_id', 'date_mean_sum', 'date_mean_mean', 'date_mean_min', 'date_mean_max']

train_err_5 = train_err.groupby('user_id')['date_mean'].agg(date_mean_stats).reset_index()
train_err_5.columns = date_mean_names

test_err_5 = test_err.groupby('user_id')['date_mean'].agg(date_mean_stats).reset_index()
test_err_5.columns = date_mean_names

train = pd.merge(train, train_err_5, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_err_5, how='left', on='user_id').fillna(0)

In [82]:
# Zero-initialised flag columns for the two dates 20201031 and 20201202.
for feature_table in (train, test):
    for flag_name in ('d_20201031', 'd_20201202'):
        feature_table[flag_name] = 0

In [84]:
# Distinct (user_id, date) pairs of each error log.
train_dates, test_dates = (
    err_log[['user_id', 'date']].drop_duplicates()
    for err_log in (train_err, test_err)
)

In [87]:
# Users with at least one error row dated 20201031.
train_d20201031 = train_dates.loc[train_dates['date'].eq('20201031')]
test_d20201031 = test_dates.loc[test_dates['date'].eq('20201031')]

In [88]:
# Users with at least one error row dated 20201202.
train_d20201202 = train_dates.loc[train_dates['date'].eq('20201202')]
test_d20201202 = test_dates.loc[test_dates['date'].eq('20201202')]

In [112]:
# Overwrite the `date` column with the constant 1 so that it acts as a
# presence flag once merged into the per-user tables.
train_d20201031 = train_d20201031.assign(date=1)
test_d20201031 = test_d20201031.assign(date=1)

train_d20201202 = train_d20201202.assign(date=1)
test_d20201202 = test_d20201202.assign(date=1)

In [113]:
# Inspect the training users flagged for 20201202.
train_d20201202

Unnamed: 0,user_id,date
4378222,14187,1
5119339,14779,1
11961601,20822,1


In [118]:
# Attach the 20201031 presence flag (arrives as a column named `date`);
# users without it, and any other missing value, get 0.
train = pd.merge(train, train_d20201031, how='left', on=['user_id']).fillna(0)
test = pd.merge(test, test_d20201031, how='left', on='user_id').fillna(0)

In [119]:
# Attach the 20201202 presence flag (also a column named `date`, so pandas
# suffixes a clashing pair as `date_x` / `date_y`); missing values become 0.
train = pd.merge(train, train_d20201202, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_d20201202, how='left', on='user_id').fillna(0)

In [None]:
# Annotated heatmap of the pairwise correlations between the training columns.
corr_fig, corr_ax = plt.subplots(figsize=(12,12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=corr_ax)

In [None]:
# List the column names of the test feature table.
test.columns

In [115]:
# Remove the date flag columns from `train` in place.
# NOTE(review): this cell sits after the flag merges but has a lower execution
# count, and later cells still read `date_x` / `date_y` — run top to bottom it
# would delete them. Confirm the intended position of this cell.
train.drop(['d_20201031','d_20201202','date_x','date_y'], axis=1, inplace=True)

In [116]:
# Remove the date flag columns from `test` in place.
# NOTE(review): this cell sits after the flag merges but has a lower execution
# count, and later cells still read `date_x` — run top to bottom it would
# delete that column. Confirm the intended position of this cell.
test.drop(['d_20201031','d_20201202','date_x','date_y'], axis=1, inplace=True)

In [120]:
# Preview both feature tables, then compare their shapes.
for feature_table in (train, test):
    display(feature_table.head())

train.shape, test.shape

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_9_m,quality_10_m,quality_11_m,quality_12_m,date_mean_sum,date_mean_mean,date_mean_min,date_mean_max,date_x,date_y
0,10000,30,317,0,0,0,0,0,0,0,...,0.0,6.0,0.0,0.0,144.892059,0.457073,0.425244,0.49899,0.0,0.0
1,10001,30,2365,0,0,0,0,0,379,0,...,0.0,0.0,0.0,0.0,1042.001715,0.440593,0.425244,0.49899,0.0,0.0
2,10002,29,306,0,0,0,0,0,0,0,...,0.090909,4.318182,-0.045455,0.0,140.090511,0.457812,0.425244,0.49899,0.0,0.0
3,10003,30,306,0,0,0,0,0,81,0,...,0.0,0.0,0.0,0.0,139.440738,0.455689,0.425244,0.49899,0.0,0.0
4,10004,30,777,0,0,645,0,0,0,0,...,0.0,2.333333,-0.166667,0.0,357.425452,0.460007,0.425244,0.49899,0.0,0.0


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_9_m,quality_10_m,quality_11_m,quality_12_m,date_mean_sum,date_mean_mean,date_mean_min,date_mean_max,date_x,date_y
0,30000,29,2750,0,0,0,0,0,0,0,...,0.0,2.0,0.0,0.0,1255.162878,0.456423,0.425244,0.49899,0.0,0.0
1,30001,28,284,0,0,0,0,0,0,0,...,0.0,1024.5,-0.1,0.0,130.987454,0.461223,0.425244,0.49899,0.0,0.0
2,30002,30,941,0,0,733,0,0,0,0,...,0.0,14.769231,-0.115385,0.0,428.731842,0.455613,0.425244,0.49899,0.0,0.0
3,30003,28,371,0,0,246,0,0,0,0,...,0.0,2.307692,0.0,0.0,169.332979,0.456423,0.425244,0.49899,0.0,0.0
4,30004,30,881,0,0,0,0,0,0,0,...,0.0,0.8,-0.2,0.0,401.420888,0.455642,0.425244,0.49899,0.0,0.0


((15000, 128), (14998, 128))

In [121]:
# Distribution of the `date_x` flag in the training table.
train.date_x.value_counts()

0.0    14995
1.0        5
Name: date_x, dtype: int64

In [122]:
# Distribution of the `date_x` flag in the test table.
test.date_x.value_counts()

0.0    14994
1.0        4
Name: date_x, dtype: int64

In [None]:
# Min-max scale the columns listed in `cols`. The statistics come from `train`
# only and are applied unchanged to `test`, so both tables share one scale.
#
# The statistics are kept in dedicated names instead of `min` / `max` (which
# shadowed the Python builtins for the rest of the session) and are aligned by
# column label rather than by integer position.
feat_min = train[cols].min()
feat_max = train[cols].max()

# A column that is constant in `train` has zero range; divide by 1 there so it
# scales to 0 instead of producing 0/0 = NaN.
feat_range = (feat_max - feat_min).replace(0, 1)

train[cols] = (train[cols] - feat_min) / feat_range
test[cols] = (test[cols] - feat_min) / feat_range

In [125]:
# Rebuild the per-user `prob` flag from the problem table: every user_id found
# there gets its report count binarised with `chg_qua`.
train_prob1 = (
    train_prob
    .groupby('user_id')
    .count()
    .reset_index()
)
train_prob1['time'] = train_prob1['time'].map(chg_qua)
train_prob1.columns = ['user_id', 'prob']

# Attach the flag as the target column; users absent from the problem table
# (and any other missing value in the frame) get 0. Store the target as int.
train = pd.merge(train, train_prob1, how='left', on='user_id').fillna(0)
train['prob'] = train['prob'].astype(int)

In [None]:
# Print the dtype and memory summary of the training table.
train.info()

In [130]:
# List the column names from position 80 up to, but excluding, the last column.
train.columns[80:-1]

Index(['quality_7', 'quality_8', 'quality_9', 'quality_10', 'quality_11',
       'quality_12', 'day_max', 'wd_0', 'wd_1', 'wd_2', 'wd_3', 'wd_4', 'wd_5',
       'wd_6', 'err_rank_sum', 'err_rank_max', 'date_rank_sum',
       'date_rank_max', 'model_fwver_rank_sum', 'model_fwver_rank_max',
       'time_rank_min', 'time_rank_max', 'quality_0_s', 'quality_2_s',
       'quality_5_s', 'quality_6_s', 'quality_7_s', 'quality_8_s',
       'quality_9_s', 'quality_10_s', 'quality_11_s', 'quality_12_s',
       'quality_0_m', 'quality_2_m', 'quality_5_m', 'quality_6_m',
       'quality_7_m', 'quality_8_m', 'quality_9_m', 'quality_10_m',
       'quality_11_m', 'quality_12_m', 'date_mean_sum', 'date_mean_mean',
       'date_mean_min', 'date_mean_max', 'date_x', 'date_y'],
      dtype='object')

In [135]:
%%time
clf = setup(session_id=seed, data=train, target='prob'
           , numeric_features=train.columns[1:-3]
           , ignore_features=['err_rank_sum','quality_0', 'quality_1',
       'quality_2', 'quality_5', 'quality_6', 'quality_7', 'quality_8',
       'quality_9', 'quality_10', 'quality_11', 'quality_12'
      , 'date_mean_sum', 'date_mean_mean', 'date_mean_min', 'date_mean_max'
                              ,'date_y'
                             ]
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 129)"
4,Missing Values,False
5,Numeric Features,126
6,Categorical Features,2
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 1.92 s


In [136]:
# Model ids left out of the comparison.
excluded_models = ['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']

# Compare the remaining models, sorted by AUC, and keep the top five.
best = compare_models(
    exclude=excluded_models,
    sort='AUC',
    n_select=5,
)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7955,0.8202,0.5086,0.8063,0.6235,0.493,0.5177,14.8483
1,Gradient Boosting Classifier,0.796,0.8186,0.4871,0.831,0.614,0.4879,0.5202,6.109
2,Light Gradient Boosting Machine,0.7918,0.8159,0.5183,0.7838,0.6236,0.488,0.5081,0.7853
3,Extra Trees Classifier,0.7914,0.808,0.506,0.7937,0.618,0.484,0.5072,0.6951
4,Ada Boost Classifier,0.7806,0.805,0.4974,0.7618,0.6017,0.4595,0.4794,1.3932
5,Extreme Gradient Boosting,0.7852,0.8035,0.5409,0.7454,0.6266,0.4812,0.4936,2.4354
6,Random Forest Classifier,0.7619,0.7722,0.4537,0.7298,0.5593,0.4082,0.4299,0.1284
7,Linear Discriminant Analysis,0.7498,0.7458,0.3349,0.7971,0.4715,0.3416,0.3969,0.2016


In [137]:
# Blend the selected models with method='soft', using 5 folds.
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7857,0.8116,0.4786,0.7976,0.5982,0.4643,0.4924
1,0.8086,0.8325,0.5343,0.8311,0.6504,0.5271,0.5514
2,0.79,0.8233,0.4914,0.8019,0.6094,0.4769,0.5036
3,0.8019,0.8367,0.4986,0.843,0.6266,0.5036,0.5357
4,0.7904,0.8034,0.4943,0.8009,0.6113,0.4786,0.5047
Mean,0.7953,0.8215,0.4994,0.8149,0.6192,0.4901,0.5176
SD,0.0085,0.0125,0.0187,0.0185,0.018,0.0225,0.0222


In [138]:
# Score the blended model with `predict_model` and no explicit data argument.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7943,0.8249,0.486,0.8247,0.6116,0.4841,0.5154


In [None]:
%%time
# Produce the final model object from the blended ensemble via `finalize_model`.
final_model = finalize_model(blended)

In [None]:
# Order the test features by the submission template's user_id column; users
# without feature rows get 0 in every feature.
submission_users = sample_submssion[['user_id']]
test_x = pd.merge(submission_users, test, how='left', on='user_id').fillna(0)
test_x.shape

In [None]:
# Score the submission users with the finalised model.
predictions = predict_model(final_model, data = test_x)

In [None]:
# Copy the `Score` column of the predictions into the submission frame
# (pandas aligns the assignment on the row index).
# NOTE(review): presumably `Score` is the positive-class probability; its meaning
# depends on the installed PyCaret version — verify before submitting.
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

In [None]:
# Write the submission file (without the index) and display the frame.
submission_path = "./submission/submission_20210203-3.csv"
sample_submssion.to_csv(submission_path, index=False)
sample_submssion