In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
# Single seed shared by every random number generator used in the notebook.
seed = 42
np.random.seed(seed)
python_random.seed(seed)
# NOTE(review): `set_config` is presumably the PyCaret helper brought in by the
# star import above; it is called here before `setup()` has run — confirm that
# this call has the intended effect at this point.
set_config('seed', seed)

In [2]:
# Folder that holds the raw CSV files.
PATH = './data/'

# Training tables: error log, quality log and problem reports.
train_err, train_qua, train_prob = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('train_err_data.csv', 'train_quality_data.csv', 'train_problem_data.csv')
)

# Test tables: error log and quality log.
test_err, test_qua = (
    pd.read_csv(PATH + csv_name)
    for csv_name in ('test_err_data.csv', 'test_quality_data.csv')
)

# Submission template; its user_id column drives the final prediction frame.
sample_submssion = pd.read_csv(PATH + 'sample_submission.csv')

In [3]:
def chg_qua(x):
    """Binarise a value: return 0 when ``x == 0`` and 1 for anything else.

    NOTE(review): values that do not compare equal to 0 — including the string
    ``'0'`` and NaN — map to 1; confirm the columns passed in are numeric.
    """
    return 0 if x == 0 else 1

In [4]:
# Show the last rows of each raw training table, one after another.
display(train_err.tail(), train_qua.tail(), train_prob.tail())

Unnamed: 0,user_id,time,model_nm,fwver,errtype,errcode
16554658,24999,20201130163051,model_3,05.15.2138,15,1
16554659,24999,20201130172625,model_3,05.15.2138,16,1
16554660,24999,20201130172625,model_3,05.15.2138,4,0
16554661,24999,20201130172631,model_3,05.15.2138,4,0
16554662,24999,20201130210625,model_3,05.15.2138,15,1


Unnamed: 0,time,user_id,fwver,quality_0,quality_1,quality_2,quality_3,quality_4,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
828619,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,1,0,0,0,0,17,0,0
828620,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828621,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,3,0,0,0,0,17,0,0
828622,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,0,0,0,0,0,17,0,0
828623,20201124033000,24997,04.22.1778,0.0,0,0.0,0,0,9,0,0,0,0,17,0,0


Unnamed: 0,user_id,time
5424,20167,20201125120000
5425,16270,20201110120000
5426,19114,20201106230000
5427,21505,20201104110000
5428,18822,20201102120000


In [5]:
def _add_err_keys(err_df):
    """Add the derived key columns to an error log and return its distinct keys.

    Adds, in place:
      * ``date``         - first 8 characters of ``time`` rendered as a string
      * ``model_fwver``  - ``model_nm`` concatenated with ``fwver``
      * ``errtype_code`` - ``errtype`` (as string) concatenated with ``errcode``

    Returns the de-duplicated (user_id, date, model_fwver, errtype_code) rows.
    """
    err_df['date'] = err_df['time'].astype(str).str.slice(0, 8)
    err_df['model_fwver'] = err_df.model_nm + err_df.fwver
    err_df['errtype_code'] = err_df.errtype.astype(str) + err_df.errcode
    key_cols = ['user_id', 'date', 'model_fwver', 'errtype_code']
    return err_df[key_cols].drop_duplicates()


train_err_0 = _add_err_keys(train_err)
display(train_err_0.head())

test_err_0 = _add_err_keys(test_err)
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


Unnamed: 0,user_id,date,model_fwver,errtype_code
0,30000,20201101,model_104.16.3553,311
1,30000,20201101,model_104.16.3553,332
2,30000,20201101,model_104.16.3553,151
3,30000,20201101,model_104.16.3553,221
4,30000,20201101,model_104.16.3553,111


In [6]:
# One row per user that appears in the problem reports; the report count is
# binarised with chg_qua so `prob` is a 0/1 flag.
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1['time'] = train_prob1['time'].apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']

# Attach the flag to every error-log row. fillna(0) is applied to the whole
# frame, so users without a report get prob = 0 and any other NaN also becomes 0.
train_err = train_err.merge(train_prob1, how='left', on='user_id')
train_err = train_err.fillna(0)

In [7]:
# Rank every errtype_code by the summed `prob` of its rows (ties share the
# lowest rank); a higher rank means more rows from flagged users.
prob_per_code = train_err.groupby('errtype_code')['prob'].sum()
err_code = prob_per_code.rank(method='min').reset_index()
err_code.columns = ['errtype_code', 'err_code_rank']
err_code

Unnamed: 0,errtype_code,err_code_rank
0,0,940.0
1,10,2815.0
2,101,2853.0
3,111,2855.0
4,121,2856.0
...,...,...
2866,9C-14014,2727.0
2867,9V-21002,2737.0
2868,9V-21004,940.0
2869,9V-21005,2730.0


In [8]:
# Attach the errtype_code rank to both logs; codes without a rank (and every
# other NaN in the frame) end up as 0 because fillna covers all columns.
train_err = train_err.merge(err_code, how='left', on='errtype_code')
train_err = train_err.fillna(0)

test_err = test_err.merge(err_code, how='left', on='errtype_code')
test_err = test_err.fillna(0)

In [9]:
# Rank every calendar date by the summed `prob` of its rows (ties share the lowest rank).
prob_per_date = train_err.groupby('date')['prob'].sum()
date_rank = prob_per_date.rank(method='min').reset_index()
date_rank.columns = ['date', 'date_rank']
date_rank.min(), date_rank.max()

(date         20201031
 date_rank           1
 dtype: object,
 date         20201202
 date_rank          33
 dtype: object)

In [10]:
# Attach the date rank to both logs; dates without a rank (and any other NaN) become 0.
train_err = train_err.merge(date_rank, how='left', on='date')
train_err = train_err.fillna(0)

test_err = test_err.merge(date_rank, how='left', on='date')
test_err = test_err.fillna(0)

In [11]:
# Rank every model/firmware combination by the summed `prob` of its rows
# (ties share the lowest rank).
prob_per_model_fwver = train_err.groupby('model_fwver')['prob'].sum()
model_fwver_rank = prob_per_model_fwver.rank(method='min').reset_index()
model_fwver_rank.columns = ['model_fwver', 'model_fwver_rank']
model_fwver_rank.min(), model_fwver_rank.max()

(model_fwver         model_004.22.1442
 model_fwver_rank                    1
 dtype: object,
 model_fwver         model_804.73.2571
 model_fwver_rank                   37
 dtype: object)

In [12]:
# Attach the model/firmware rank to both logs; combinations without a rank
# (and any other NaN) become 0.
train_err = train_err.merge(model_fwver_rank, how='left', on='model_fwver')
train_err = train_err.fillna(0)

test_err = test_err.merge(model_fwver_rank, how='left', on='model_fwver')
test_err = test_err.fillna(0)

In [None]:
def _summarise_activity(err_df):
    """Return one row per user describing how active its error log is.

    Output columns:
      * ``user_id``
      * ``date_cnt`` - number of distinct dates on which the user has rows
      * ``date_sum`` - number of rows (non-null ``time``) over all dates
    """
    rows_per_day = err_df.groupby(['user_id', 'date'])['time'].count().reset_index()
    rows_per_day.columns = ['user_id', 'date', 'date_cnt']

    per_user = (
        rows_per_day
        .groupby('user_id')
        .agg({'date': 'count', 'date_cnt': 'sum'})
        .reset_index()
    )
    per_user.columns = ['user_id', 'date_cnt', 'date_sum']
    return per_user


train_err_1 = _summarise_activity(train_err)
display(train_err_1.head())

test_err_1 = _summarise_activity(test_err)
display(test_err_1.head())

train_err_1.shape

In [None]:
# Start the feature tables from independent copies of the per-user activity summaries.
train, test = train_err_1.copy(), test_err_1.copy()

In [None]:
# Model/firmware combinations that occur in BOTH logs get one counter column each.
train_model = set(train_err.model_fwver.unique())
test_model = set(test_err.model_fwver.unique())

# Iterating a set of strings yields a different order from one interpreter run
# to the next (hash randomisation), which made the feature column order - and
# therefore the cached CSVs and anything order-sensitive downstream - vary
# between runs. Sorting pins the order; key=str keeps the sort working even if
# a non-string placeholder (e.g. the 0 written by fillna) is among the values.
models = sorted(train_model & test_model, key=str)

# Zero-initialise the counters; they are filled in by a later cell.
for model in models:
    train[model] = 0
    test[model] = 0

train.shape, test.shape

In [None]:
# Long table of row counts per (user, model/firmware) pair for each log.
train_err_2 = train_err.groupby(['user_id', 'model_fwver'])['time'].count().reset_index()
train_err_2.columns = ['user_id', 'model_fwver', 'model_fwver_cnt']
display(train_err_2.head())

test_err_2 = test_err.groupby(['user_id', 'model_fwver'])['time'].count().reset_index()
test_err_2.columns = ['user_id', 'model_fwver', 'model_fwver_cnt']
display(test_err_2.head())

In [None]:
def _add_model_fwver_counts(features, counts):
    """Add per-user model/firmware row counts onto the matching feature columns.

    Parameters
    ----------
    features : DataFrame
        Feature table with a ``user_id`` column and one counter column per
        model/firmware combination; modified in place.
    counts : DataFrame
        Long table with columns ``user_id``, ``model_fwver`` and
        ``model_fwver_cnt``.

    Combinations that have no column of the same name in ``features`` are
    ignored and users missing from ``counts`` receive 0, exactly as in the
    previous row-by-row loop. That loop rescanned the whole feature table for
    every row of ``counts``; here the counts are reshaped once and added in a
    single vectorised step.
    """
    known = counts[counts['model_fwver'].isin(features.columns)]
    if known.empty:
        return

    # user_id x model_fwver matrix of counts (duplicates, if any, are summed).
    wide = (
        known
        .groupby(['user_id', 'model_fwver'])['model_fwver_cnt']
        .sum()
        .unstack(fill_value=0)
    )
    # Put the rows in the same order as the feature table.
    wide = wide.reindex(index=features['user_id'], fill_value=0)

    target_cols = list(wide.columns)
    features[target_cols] = features[target_cols].values + wide.values


_add_model_fwver_counts(train, train_err_2)
_add_model_fwver_counts(test, test_err_2)

In [None]:
# Error types that occur in both logs get one zero-initialised counter column
# each, named 'E' followed by the error type.
train_error = set(train_err.errtype.unique())
test_error = set(test_err.errtype.unique())
errors = list(train_error.intersection(test_error))

for error in errors:
    counter_col = 'E' + str(error)
    train[counter_col] = 0
    test[counter_col] = 0

train.shape, test.shape

In [None]:
# Long table of row counts per (user, error type) pair for each log.
train_err_3 = train_err.groupby(['user_id', 'errtype'])['time'].count().reset_index()
train_err_3.columns = ['user_id', 'errtype', 'errtype_cnt']
display(train_err_3.head())

test_err_3 = test_err.groupby(['user_id', 'errtype'])['time'].count().reset_index()
test_err_3.columns = ['user_id', 'errtype', 'errtype_cnt']
display(test_err_3.head())

In [None]:
def _add_errtype_counts(features, counts):
    """Add per-user error-type row counts onto the matching ``E<errtype>`` columns.

    Parameters
    ----------
    features : DataFrame
        Feature table with a ``user_id`` column and ``E<errtype>`` counter
        columns; modified in place.
    counts : DataFrame
        Long table with columns ``user_id``, ``errtype`` and ``errtype_cnt``.

    Error types without an ``E<errtype>`` column in ``features`` are ignored
    and users missing from ``counts`` receive 0, exactly as in the previous
    row-by-row loop. That loop rescanned the whole feature table for every row
    of ``counts``; here the counts are reshaped once and added in a single
    vectorised step.
    """
    labelled = counts.assign(counter_col='E' + counts['errtype'].astype(str))
    known = labelled[labelled['counter_col'].isin(features.columns)]
    if known.empty:
        return

    # user_id x counter-column matrix of counts (duplicates, if any, are summed).
    wide = (
        known
        .groupby(['user_id', 'counter_col'])['errtype_cnt']
        .sum()
        .unstack(fill_value=0)
    )
    # Put the rows in the same order as the feature table.
    wide = wide.reindex(index=features['user_id'], fill_value=0)

    target_cols = list(wide.columns)
    features[target_cols] = features[target_cols].values + wide.values


_add_errtype_counts(train, train_err_3)
_add_errtype_counts(test, test_err_3)

In [None]:
# Quality logs: drop exact duplicate rows first, then remove the columns that
# are not used as features, and finally replace missing values with 0.
unused_qua_cols = ['quality_3', 'quality_4', 'time', 'fwver']

train_qua_0 = train_qua.drop_duplicates().drop(unused_qua_cols, axis=1).fillna(0)
test_qua_0 = test_qua.drop_duplicates().drop(unused_qua_cols, axis=1).fillna(0)

In [None]:
# Quality columns kept as features.
cols = [
    'quality_0', 'quality_1', 'quality_2', 'quality_5',
    'quality_6', 'quality_7', 'quality_8', 'quality_9',
    'quality_10', 'quality_11', 'quality_12',
]

# Turn every quality value into a 0/1 flag (see chg_qua) in both tables.
for qua_table in (train_qua_0, test_qua_0):
    for col in cols:
        qua_table[col] = qua_table[col].map(chg_qua)

In [None]:
# Per-user sums of the quality flags for the train and test quality tables.
train_qua_1, test_qua_1 = (
    qua_table.groupby('user_id').sum().reset_index()
    for qua_table in (train_qua_0, test_qua_0)
)

In [None]:
# Add the quality sums to the feature tables; users without quality rows
# (and any other NaN in the frame) get 0.
train = train.merge(train_qua_1, how='left', on='user_id')
train = train.fillna(0)

test = test.merge(test_qua_1, how='left', on='user_id')
test = test.fillna(0)

In [None]:
# Persist both feature tables (without the index) so they can be reloaded later.
for feature_table, csv_path in ((train, "./train.csv"), (test, "./test.csv")):
    feature_table.to_csv(csv_path, index=False)

In [13]:
# Reload the feature tables from the CSV files in the working directory
# (presumably the cache written by the cell above, so the slow feature cells
# need not be re-run).
train, test = pd.read_csv('train.csv'), pd.read_csv('test.csv')

In [14]:
def _busiest_day(err_df):
    """Return, per user, the largest number of log rows recorded on a single date.

    Output columns: ``user_id`` and ``day_max``.
    """
    rows_per_day = err_df.groupby(['user_id', 'date'])['time'].count()
    busiest = rows_per_day.groupby(level='user_id').max().reset_index()
    busiest.columns = ['user_id', 'day_max']
    return busiest


train_err_d1 = _busiest_day(train_err)
test_err_d1 = _busiest_day(test_err)

# Left joins without fillna: users missing from the log keep NaN in day_max here.
train = train.merge(train_err_d1, how='left', on='user_id')
test = test.merge(test_err_d1, how='left', on='user_id')
train

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,day_max
0,10000,30,317,0,0,0,0,0,0,0,...,0,4,0,4,0,4,4,0,0,20
1,10001,30,2365,0,0,0,0,0,379,0,...,0,0,0,0,0,0,0,0,0,1452
2,10002,29,306,0,0,0,0,0,0,0,...,2,22,4,22,0,22,22,1,0,17
3,10003,30,306,0,0,0,0,0,81,0,...,0,0,0,0,0,0,0,0,0,24
4,10004,30,777,0,0,645,0,0,0,0,...,1,6,2,6,0,6,6,1,0,102
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,2,6,2,0,0,0,6,2,0,30
14996,24996,1,4,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,4
14997,24997,30,826,0,0,465,0,0,0,0,...,1,8,1,0,0,0,8,1,0,64
14998,24998,21,155,0,0,8,0,0,0,0,...,0,0,0,0,0,0,0,0,0,37


In [15]:
import datetime

# Names of all seven weekday indicator columns (pandas: 0 = Monday ... 6 = Sunday).
wd_cols = ['wd_%d' % day for day in range(7)]

# `date` holds the first 8 characters of `time` (YYYYMMDD); giving the format
# explicitly avoids per-value format inference on the full error logs.
train_err['weekday'] = pd.to_datetime(train_err.date, format='%Y%m%d').dt.weekday
test_err['weekday'] = pd.to_datetime(test_err.date, format='%Y%m%d').dt.weekday

# get_dummies only creates columns for weekdays that actually occur; reindexing
# to the full list guarantees wd_0..wd_6 exist in both logs, so the column
# selection below cannot fail when a weekday is absent from one of them.
train_wd_flags = pd.get_dummies(train_err['weekday'], prefix='wd').reindex(columns=wd_cols, fill_value=0)
test_wd_flags = pd.get_dummies(test_err['weekday'], prefix='wd').reindex(columns=wd_cols, fill_value=0)

train_err = pd.concat([train_err, train_wd_flags], axis=1)
test_err = pd.concat([test_err, test_wd_flags], axis=1)

# Number of log rows per user and weekday.
train_wd = train_err.groupby('user_id')[wd_cols].sum()
test_wd = test_err.groupby('user_id')[wd_cols].sum()

train = train.merge(train_wd, on='user_id', how='left').fillna(0)
test = test.merge(test_wd, on='user_id', how='left').fillna(0)

In [16]:
# Per-user sum and maximum of the errtype_code rank.
train_err_9 = train_err.groupby('user_id')['err_code_rank'].agg(['sum', 'max']).reset_index()
train_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

test_err_9 = test_err.groupby('user_id')['err_code_rank'].agg(['sum', 'max']).reset_index()
test_err_9.columns = ['user_id', 'err_rank_sum', 'err_rank_max']

train = train.merge(train_err_9, how='left', on='user_id')
train = train.fillna(0)
test = test.merge(test_err_9, how='left', on='user_id')
test = test.fillna(0)

In [17]:
# Per-user sum and maximum of the date rank.
train_err_8 = train_err.groupby('user_id')['date_rank'].agg(['sum', 'max']).reset_index()
train_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

test_err_8 = test_err.groupby('user_id')['date_rank'].agg(['sum', 'max']).reset_index()
test_err_8.columns = ['user_id', 'date_rank_sum', 'date_rank_max']

train = train.merge(train_err_8, how='left', on='user_id')
train = train.fillna(0)
test = test.merge(test_err_8, how='left', on='user_id')
test = test.fillna(0)

In [18]:
# Per-user sum and maximum of the model/firmware rank.
train_err_7 = train_err.groupby('user_id')['model_fwver_rank'].agg(['sum', 'max']).reset_index()
train_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

test_err_7 = test_err.groupby('user_id')['model_fwver_rank'].agg(['sum', 'max']).reset_index()
test_err_7.columns = ['user_id', 'model_fwver_rank_sum', 'model_fwver_rank_max']

train = train.merge(train_err_7, how='left', on='user_id')
train = train.fillna(0)
test = test.merge(test_err_7, how='left', on='user_id')
test = test.fillna(0)

In [19]:
# Distinct (user, model name) pairs found in each error log.
train_model_user = train_err[['user_id', 'model_nm']].drop_duplicates()
test_model_user = test_err[['user_id', 'model_nm']].drop_duplicates()

# One zero-initialised indicator column per model name seen in the TRAIN log.
mode_nm = sorted(train_err.model_nm.unique())
for feature_table in (train, test):
    for col in mode_nm:
        feature_table[col] = 0

In [20]:
# Set the indicator of every model name a train user has appeared with to 1.
for _, (uid, model_name) in tqdm(enumerate(train_model_user.values)):
    user_rows = train.index[train.user_id == uid]
    train.loc[user_rows, model_name] = 1

15704it [00:15, 1007.36it/s]


In [21]:
# Set the indicator of every model name a test user has appeared with to 1.
for _, (uid, model_name) in tqdm(enumerate(test_model_user.values)):
    user_rows = test.index[test.user_id == uid]
    test.loc[user_rows, model_name] = 1

15657it [00:17, 882.05it/s]


In [22]:
# `model` = sum of the per-model indicator columns, i.e. how many distinct
# model names each user has appeared with.
for feature_table in (train, test):
    feature_table['model'] = 0
    for col in mode_nm:
        feature_table['model'] += feature_table[col]

In [23]:
# The individual indicator columns are no longer needed once `model` holds their sum.
col = [
    'model_0', 'model_1', 'model_2',
    'model_3', 'model_4', 'model_5',
    'model_6', 'model_7', 'model_8',
]
train = train.drop(columns=col)
test = test.drop(columns=col)

In [24]:
# Preview both feature tables, then report their shapes.
display(train.head(), test.head())

(train.shape, test.shape)

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_4,wd_5,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model
0,10000,30,317,0,0,0,0,0,0,0,...,43.0,54.0,52.0,907345.0,2869.0,5840.0,33.0,10144.0,32.0,1
1,10001,30,2365,0,0,0,0,0,379,0,...,1534.0,117.0,139.0,6763505.0,2870.0,25937.0,33.0,81259.0,35.0,1
2,10002,29,306,0,0,0,0,0,0,0,...,34.0,42.0,58.0,876071.0,2869.0,5764.0,33.0,9792.0,32.0,1
3,10003,30,306,0,0,0,0,0,81,0,...,37.0,22.0,57.0,870419.0,2871.0,5388.0,33.0,10386.0,35.0,1
4,10004,30,777,0,0,645,0,0,0,0,...,171.0,84.0,108.0,2226738.0,2871.0,14420.0,33.0,27576.0,36.0,1


Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_4,wd_5,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model
0,30000,29,2750,0,0,0,0,0,0,0,...,223.0,1024.0,510.0,7885129.0,2871.0,57359.0,33.0,99055.0,37.0,2
1,30001,28,284,0,0,0,0,0,0,0,...,18.0,54.0,56.0,812544.0,2869.0,5767.0,33.0,9088.0,32.0,1
2,30002,30,941,0,0,733,0,0,0,0,...,99.0,140.0,135.0,2694983.0,2871.0,16311.0,33.0,33252.0,36.0,1
3,30003,28,371,0,0,246,0,0,0,0,...,92.0,24.0,71.0,1061508.0,2871.0,6783.0,33.0,12981.0,36.0,1
4,30004,30,881,0,0,0,0,0,0,0,...,105.0,102.0,157.0,2518366.0,2870.0,16093.0,33.0,32051.0,37.0,2


((15000, 101), (14998, 101))

In [25]:
# Distribution of the number of distinct model names per user, train then test.
display(train['model'].value_counts(), test['model'].value_counts())

1    14297
2      702
3        1
Name: model, dtype: int64

1    14339
2      659
Name: model, dtype: int64

In [36]:
# Users whose `model` count equals 1 (appeared with exactly one model name).
train.loc[train['model'] == 1]

Unnamed: 0,user_id,date_cnt,date_sum,model_403.11.1167,model_804.73.2571,model_004.22.1750,model_610,model_68.5.3,model_204.33.1185,model_705.66.3237,...,wd_5,wd_6,err_rank_sum,err_rank_max,date_rank_sum,date_rank_max,model_fwver_rank_sum,model_fwver_rank_max,model,prob
0,10000,30,317,0,0,0,0,0,0,0,...,54.0,52.0,907345.0,2869.0,5840.0,33.0,10144.0,32.0,1,0
1,10001,30,2365,0,0,0,0,0,379,0,...,117.0,139.0,6763505.0,2870.0,25937.0,33.0,81259.0,35.0,1,1
2,10002,29,306,0,0,0,0,0,0,0,...,42.0,58.0,876071.0,2869.0,5764.0,33.0,9792.0,32.0,1,0
3,10003,30,306,0,0,0,0,0,81,0,...,22.0,57.0,870419.0,2871.0,5388.0,33.0,10386.0,35.0,1,0
4,10004,30,777,0,0,645,0,0,0,0,...,84.0,108.0,2226738.0,2871.0,14420.0,33.0,27576.0,36.0,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14995,24995,10,194,0,0,0,0,0,0,0,...,39.0,34.0,554483.0,2870.0,3353.0,33.0,6790.0,35.0,1,0
14996,24996,1,4,0,0,0,0,0,0,0,...,0.0,0.0,11350.0,2861.0,16.0,4.0,128.0,32.0,1,0
14997,24997,30,826,0,0,465,0,0,0,0,...,118.0,133.0,2365691.0,2871.0,16077.0,33.0,28653.0,36.0,1,1
14998,24998,21,155,0,0,8,0,0,0,0,...,12.0,24.0,443241.0,2870.0,2548.0,33.0,5139.0,36.0,1,1


In [None]:
# Annotated correlation heatmap of every column of the training table.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax)

In [None]:
# Inspect the column names of the test feature table.
test.columns

In [None]:
# Display the training feature table as it stands at this point.
train

In [None]:
# Print dtypes, non-null counts and memory usage of the test feature table.
test.info()

In [None]:
# Every test column except the first one, as a plain list of names.
cols = test.columns[1:].tolist()

In [None]:
# Per-column minimum and maximum of the TRAIN features; the scaling loop in the
# next cell reads both Series.
# NOTE(review): these names shadow the builtins `min` and `max` for the rest of
# the kernel session — renaming needs a matching change in the scaling cell.
min = train[cols].min()
max  = train[cols].max()

In [None]:
# Min-max scale every feature column with the TRAIN minimum and maximum.
# `min` and `max` here are the per-column Series computed in the previous
# cell, not the builtins.
col_range = max - min

# A column that is constant in train has a zero range; dividing by it filled
# the column with NaN/inf. Dividing by 1 instead leaves such columns at
# (value - min), i.e. 0 for train.
col_range = col_range.replace(0, 1)

# DataFrame/Series arithmetic aligns on column labels, which replaces the
# position-based `min[i]` lookups on label-indexed Series.
train[cols] = (train[cols] - min) / col_range
test[cols] = (test[cols] - min) / col_range

In [26]:
# Rebuild the per-user 0/1 problem flag (report count binarised with chg_qua).
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1['time'] = train_prob1['time'].apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']

# Attach it as the target column; users without a report (and any other NaN
# in the frame) get 0, then the target is cast to int.
train = train.merge(train_prob1, how='left', on='user_id')
train = train.fillna(0)
train['prob'] = train['prob'].astype(int)

In [None]:
# Print dtypes, non-null counts and memory usage of the training table.
train.info()

In [None]:
# Inspect the training column names, skipping the first column and the last ten.
train.columns[1:-10]

In [27]:
%%time
# Initialise the PyCaret experiment with `prob` as target.
# numeric_features covers every column except the first one and the last two
# (`model`, `prob`); `err_rank_sum` is excluded via ignore_features.
# NOTE(review): `user_id` (first column) is neither ignored nor dropped, so it
# is presumably passed to the models as a feature — confirm this is intended.
clf = setup(session_id=seed, data=train, target='prob'
           , numeric_features=train.columns[1:-2] # model cat
            , ignore_features=['err_rank_sum']
           )

Setup Succesfully Completed!


Unnamed: 0,Description,Value
0,session_id,42
1,Target Type,Binary
2,Label Encoded,
3,Original Data,"(15000, 102)"
4,Missing Values,False
5,Numeric Features,100
6,Categorical Features,1
7,Ordinal Features,False
8,High Cardinality Features,False
9,High Cardinality Method,


Wall time: 14.8 s


In [28]:
# Compare PyCaret's candidate classifiers (minus the excluded ones), sorted by
# AUC; n_select = 5 asks for the five top-ranked models back.
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
0,CatBoost Classifier,0.7953,0.8202,0.5077,0.8069,0.6231,0.4924,0.5174,13.3591
1,Gradient Boosting Classifier,0.7948,0.8184,0.4871,0.826,0.6126,0.4855,0.5169,5.1254
2,Light Gradient Boosting Machine,0.7924,0.8153,0.5151,0.7891,0.623,0.4884,0.5097,0.662
3,Extra Trees Classifier,0.792,0.8063,0.4963,0.8052,0.6139,0.4823,0.5089,0.6694
4,Ada Boost Classifier,0.7851,0.8046,0.5026,0.7735,0.6091,0.4701,0.4909,1.1915
5,Extreme Gradient Boosting,0.7827,0.8022,0.5346,0.7422,0.621,0.4744,0.4873,2.3672
6,Random Forest Classifier,0.7725,0.7754,0.4729,0.7531,0.5805,0.4355,0.4579,0.1245
7,Linear Discriminant Analysis,0.7689,0.7607,0.3763,0.8438,0.52,0.3962,0.4524,0.1896


In [29]:
# Blend the selected models with soft voting, evaluated with 5 folds.
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7824,0.81,0.4714,0.7914,0.5909,0.4553,0.4836
1,0.8086,0.8353,0.5343,0.8311,0.6504,0.5271,0.5514
2,0.7948,0.8217,0.5014,0.8106,0.6196,0.4895,0.516
3,0.8062,0.8342,0.4914,0.8709,0.6283,0.5106,0.5489
4,0.787,0.805,0.4829,0.7991,0.602,0.4684,0.4961
Mean,0.7958,0.8213,0.4963,0.8206,0.6182,0.4902,0.5192
SD,0.0103,0.0123,0.0214,0.0285,0.0208,0.0263,0.0273


In [30]:
# Score the blended model; with no `data` argument PyCaret presumably uses the
# hold-out split created by setup() — confirm.
pred_holdout = predict_model(blended)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7934,0.8216,0.4833,0.8239,0.6092,0.4815,0.5131


In [31]:
%%time
# Finalise the blended model (presumably a refit on the full training data,
# hold-out included — confirm against the PyCaret docs).
final_model = finalize_model(blended)

Wall time: 3min 43s


In [32]:
# Build the prediction frame in submission order: every user_id from the
# template, left-joined with the test features; users without features get 0.
submission_users = sample_submssion[['user_id']]
test_x = submission_users.merge(test, how='left', on='user_id')
test_x = test_x.fillna(0)
test_x.shape

(14999, 101)

In [33]:
# Run the finalised model on the submission-ordered test features.
predictions = predict_model(final_model, data = test_x)

In [34]:
# Copy the model score into the submission template (assignment aligns on the index).
# NOTE(review): depending on the PyCaret version, `Score` may be the
# probability of the predicted label rather than of class 1 — confirm for the
# installed version before submitting.
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

Unnamed: 0,user_id,problem
0,30000,0.7292
1,30001,0.3887
2,30002,0.4505
3,30003,0.7022
4,30004,0.5716


In [35]:
# Write the submission file (without the index); the ./submission/ folder must
# already exist because to_csv does not create directories.
sample_submssion.to_csv("./submission/submission_20210202-1-1.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.7292
1,30001,0.3887
2,30002,0.4505
3,30003,0.7022
4,30004,0.5716
...,...,...
14994,44994,0.5070
14995,44995,0.3934
14996,44996,0.5861
14997,44997,0.7135
