In [None]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
# Single seed value shared by every random number generator seeded in this cell.
seed = 42
np.random.seed(seed)        # NumPy's global RNG
python_random.seed(seed)    # Python's stdlib `random` module (imported above under an alias)
# NOTE(review): set_config is pulled in by the star-import of pycaret.classification and is
# called here before setup() has run — confirm the installed PyCaret version allows that.
set_config('seed', seed)

In [None]:
# Folder that holds every input CSV read by this notebook.
PATH = './data/'

# Training inputs: error log, quality log and problem reports.
train_err = pd.read_csv(f'{PATH}train_err_data.csv')
train_qua = pd.read_csv(f'{PATH}train_quality_data.csv')
train_prob = pd.read_csv(f'{PATH}train_problem_data.csv')

# Test inputs: error log and quality log.
test_err = pd.read_csv(f'{PATH}test_err_data.csv')
test_qua = pd.read_csv(f'{PATH}test_quality_data.csv')

# Submission template (the variable name's spelling is kept because later cells use it).
sample_submssion = pd.read_csv(f'{PATH}sample_submission.csv')

In [None]:
def chg_qua(x):
    """Binarise a value: return 0 when ``x == 0`` and 1 for anything else."""
    return 0 if x == 0 else 1

In [None]:
# Add the same three derived key columns to both error logs (in place, later cells rely on them):
#   date         - first 8 characters of `time` rendered as a string
#   model_fwver  - model_nm concatenated with fwver
#   errtype_code - errtype (as string) concatenated with errcode
for err_log in (train_err, test_err):
    err_log['date'] = err_log['time'].astype(str).str[:8]
    err_log['model_fwver'] = err_log['model_nm'] + err_log['fwver']
    err_log['errtype_code'] = err_log['errtype'].astype(str) + err_log['errcode']

# Distinct (user, date, model/firmware, error) combinations, previewed for each split.
key_cols = ['user_id', 'date', 'model_fwver', 'errtype_code']

train_err_0 = train_err[key_cols].drop_duplicates()
display(train_err_0.head())

test_err_0 = test_err[key_cols].drop_duplicates()
display(test_err_0.head())

In [None]:
def _per_user_day_stats(err_log):
    """Summarise an error log per user.

    Returns a frame with columns ``user_id``, ``date_cnt`` (number of distinct
    dates the user appears on) and ``date_sum`` (total number of non-null
    ``time`` entries across those dates).
    """
    per_day = err_log.groupby(['user_id', 'date'])['time'].count().reset_index()
    per_day.columns = ['user_id', 'date', 'date_cnt']

    per_user = (
        per_day.groupby('user_id')
        .agg({'date': 'count', 'date_cnt': 'sum'})
        .reset_index()
    )
    per_user.columns = ['user_id', 'date_cnt', 'date_sum']
    return per_user


train_err_1 = _per_user_day_stats(train_err)
display(train_err_1.head())

test_err_1 = _per_user_day_stats(test_err)
display(test_err_1.head())

train_err_1.shape

In [None]:
# Start the per-user feature tables from independent copies of the day statistics.
train, test = train_err_1.copy(), test_err_1.copy()

In [None]:
# Model/firmware keys that occur in BOTH splits; only these become feature columns.
# Missing keys are dropped so no NaN-named column is created.
train_model = set(train_err.model_fwver.dropna().unique())
test_model = set(test_err.model_fwver.dropna().unique())

# Sort the intersection: iterating a set of strings yields a different order from one
# interpreter session to the next, which would make the feature column order
# (and anything downstream that depends on it) non-reproducible.
models = sorted(train_model & test_model)

# Zero-initialised count column per shared model/firmware key.
for model in models:
    train[model] = 0
    test[model] = 0

train.shape, test.shape

In [None]:
# Long-format counts: one row per (user_id, model_fwver) with the number of
# non-null `time` entries for that pair.
count_keys = ['user_id', 'model_fwver']

train_err_2 = train_err.groupby(count_keys)['time'].count().reset_index()
train_err_2.columns = count_keys + ['model_fwver_cnt']
display(train_err_2.head())

test_err_2 = test_err.groupby(count_keys)['time'].count().reset_index()
test_err_2.columns = count_keys + ['model_fwver_cnt']
display(test_err_2.head())

In [None]:
# Add the per-user model/firmware counts onto the matching feature columns.
# The long table is pivoted once and aligned on user_id instead of scanning the whole
# feature table for every (user, key) row, which was quadratic in practice.
# Keys without a matching column in the feature table are ignored, as before.
# The counts are ADDED to the current column values (the columns start at 0), so
# re-running this cell without re-running the zero-initialisation cell double counts.
for frame, counts in ((train, train_err_2), (test, test_err_2)):
    wide = counts.pivot(index='user_id', columns='model_fwver', values='model_fwver_cnt')
    shared = [key for key in wide.columns if key in frame.columns]
    if not shared:
        continue
    # Align rows to the feature table's user order; pairs that never occur count as 0.
    increments = wide[shared].reindex(index=frame['user_id']).fillna(0).astype('int64')
    frame[shared] = frame[shared].to_numpy() + increments.to_numpy()

In [None]:
# Error types that occur in BOTH splits; only these become feature columns.
train_error = set(train_err.errtype.dropna().unique())
test_error = set(test_err.errtype.dropna().unique())

# Sorted so the 'E<errtype>' columns are always created in the same order.
errors = sorted(train_error & test_error)

# Zero-initialised count column per shared error type, named 'E<errtype>'.
for error in errors:
    train['E' + str(error)] = 0
    test['E' + str(error)] = 0

train.shape, test.shape

In [None]:
# Long-format counts: one row per (user_id, errtype) with the number of
# non-null `time` entries for that pair.
errtype_keys = ['user_id', 'errtype']

train_err_3 = train_err.groupby(errtype_keys)['time'].count().reset_index()
train_err_3.columns = errtype_keys + ['errtype_cnt']
display(train_err_3.head())

test_err_3 = test_err.groupby(errtype_keys)['time'].count().reset_index()
test_err_3.columns = errtype_keys + ['errtype_cnt']
display(test_err_3.head())

In [None]:
# Add the per-user error-type counts onto the matching 'E<errtype>' feature columns.
# The long table is pivoted once and aligned on user_id instead of scanning the whole
# feature table for every (user, errtype) row, which was quadratic in practice.
# Error types without a matching column in the feature table are ignored, as before.
# The counts are ADDED to the current column values (the columns start at 0), so
# re-running this cell without re-running the zero-initialisation cell double counts.
for frame, counts in ((train, train_err_3), (test, test_err_3)):
    wide = counts.pivot(index='user_id', columns='errtype', values='errtype_cnt')
    wide.columns = ['E' + str(code) for code in wide.columns]
    shared = [name for name in wide.columns if name in frame.columns]
    if not shared:
        continue
    # Align rows to the feature table's user order; pairs that never occur count as 0.
    increments = wide[shared].reindex(index=frame['user_id']).fillna(0).astype('int64')
    frame[shared] = frame[shared].to_numpy() + increments.to_numpy()

In [None]:
# Quality-log cleanup, identical for both splits:
# drop exact duplicate rows, remove the columns that are not used as features,
# then replace every missing value with 0.
_unused_qua_cols = ['quality_3', 'quality_4', 'time', 'fwver']

train_qua_0 = train_qua.drop_duplicates().drop(columns=_unused_qua_cols).fillna(0)
test_qua_0 = test_qua.drop_duplicates().drop(columns=_unused_qua_cols).fillna(0)

In [None]:
# Quality columns to binarise.
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5', 'quality_6', 'quality_7',
        'quality_8', 'quality_9', 'quality_10', 'quality_11', 'quality_12']

# 0 stays 0, every other value becomes 1. This is the same mapping as chg_qua, done as one
# vectorised comparison per frame instead of a Python call per cell.
train_qua_0[cols] = train_qua_0[cols].ne(0).astype('int64')
test_qua_0[cols] = test_qua_0[cols].ne(0).astype('int64')

In [None]:
# Per-user column sums of the cleaned quality frames; `user_id` stays a regular column.
train_qua_1 = train_qua_0.groupby('user_id', as_index=False).sum()
test_qua_1 = test_qua_0.groupby('user_id', as_index=False).sum()

In [None]:
# Left-join the per-user quality sums onto the feature tables;
# users without quality rows end up with 0 in those columns.
train = pd.merge(train, train_qua_1, how='left', on='user_id').fillna(0)
test = pd.merge(test, test_qua_1, how='left', on='user_id').fillna(0)

In [None]:
# Persist the feature tables built so far to the working directory (row index not written).
train.to_csv("./train.csv", index=False)
test.to_csv("./test.csv", index=False)

In [None]:
# Reload the feature tables from the CSV files written by the previous cell,
# replacing the in-memory `train` / `test`.
train  = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [None]:
# Preview the first rows of both feature tables, then show their shapes as the cell output.
display(train.head())
display(test.head())

train.shape, test.shape

In [None]:
# Ten features most positively correlated with the target.
# `prob` is only merged into `train` in a later cell, so on a fresh top-to-bottom run the
# column does not exist yet and indexing it raises KeyError. When it is missing, derive the
# same flag here: 1 for users with at least one problem report that has a non-null time.
if 'prob' in train.columns:
    corr_input = train
else:
    reported_users = train_prob.dropna(subset=['time'])['user_id']
    corr_input = train.assign(prob=train['user_id'].isin(reported_users).astype(int))

corr_input.corr()['prob'].sort_values(ascending=False)[:10]

In [None]:
def _busiest_day_count(err_log):
    """Return one row per user with ``day_max``: the largest per-date count of
    non-null ``time`` entries that user has in ``err_log``."""
    per_day = err_log.groupby(['user_id', 'date'])['time'].count()
    peak = per_day.groupby(level='user_id').max().reset_index()
    peak.columns = ['user_id', 'day_max']
    return peak


train_err_d1 = _busiest_day_count(train_err)
test_err_d1 = _busiest_day_count(test_err)

# Attach the new feature to both tables.
train = pd.merge(train, train_err_d1, how='left', on='user_id')
test = pd.merge(test, test_err_d1, how='left', on='user_id')
train

In [None]:
# Bare expression in the middle of the cell: evaluated but not displayed.
train.columns

# Hand-picked column groups used by the plotting cells. Every group ends with 'prob' so it
# can be used as the hue; note that 'prob' is only merged into `train` in a later cell.
# Day-level statistics.
col1 = ['date_cnt', 'date_sum', 'day_max','prob']
# Model/firmware count columns, split into several groups.
col2 = ['model_403.11.1167', 'model_004.22.1750', 'model_610'
        , 'model_68.5.3','model_204.33.1185', 'model_705.66.3237','model_104.16.3571','prob']
col4 = ['model_004.22.1778', 'model_104.16.3569', 'model_204.33.1171', 'model_305.15.2120', 'model_504.82.1684', 'model_305.15.3104','prob']
col5 = ['model_504.82.1778', 'model_104.16.3553', 'model_504.82.1730', 'model_403.11.1149', 'model_104.16.3439', 'model_305.15.2092','prob']
col6 = ['model_305.15.2138', 'model_004.22.1656', 'model_204.33.1125', 'model_204.33.1261', 'model_004.22.1666', 'model_403.11.1141','prob']
col7 = ['model_705.66.3571', 'model_305.15.2114', 'model_204.33.1149','model_804.73.2237', 'model_004.22.1684','prob']
# Error-type count columns ('E29' is not listed in any group).
col8 = ['E1', 'E2', 'E3', 'E4', 'E5','prob']
col81 =  ['E6', 'E7', 'E8', 'E9', 'E10','prob']
col9 = [ 'E11', 'E12', 'E13', 'E14', 'E15','prob']
col91 = [ 'E16','E17', 'E18', 'E19', 'E20','prob']
col10 = [ 'E21', 'E22', 'E23', 'E24', 'E25','prob']
col101 = [ 'E26','E27', 'E28', 'E30', 'E31','prob']
col11 = [ 'E32', 'E33', 'E34', 'E35', 'E36','prob']
col111 = [ 'E37','E38', 'E39', 'E40', 'E41', 'E42','prob']
# Quality columns.
col12 = ['quality_0', 'quality_1', 'quality_2', 'quality_5', 'quality_6','prob']
col13 = ['quality_9', 'quality_10', 'quality_11', 'quality_12', 'quality_7', 'quality_8','prob']

In [None]:
# Write the summary statistics of `train` to desc.csv in the working directory.
train.describe().to_csv('desc.csv')

In [None]:
# Column group of four error-type counts plus the target; `colE` is reassigned by later cells.
colE = ['E2','E4','E31','E37', 'prob']

In [None]:
# Frequency of each distinct value in the E37 count column.
train['E37'].value_counts()

In [None]:
# Model/firmware count columns to plot against each other, coloured by the target.
colE = col = ['model_004.22.1666',
              'model_705.66.3571',
              'model_305.15.2114',
              'model_004.22.1684',
              'prob']

# `prob` is only merged into `train` in a later cell, so on a fresh top-to-bottom run
# train[colE] raises KeyError. When the column is missing, derive the same flag here:
# 1 for users with at least one problem report that has a non-null time.
if 'prob' in train.columns:
    plot_data = train
else:
    reported_users = train_prob.dropna(subset=['time'])['user_id']
    plot_data = train.assign(prob=train['user_id'].isin(reported_users).astype(int))

sns.pairplot(plot_data[colE], hue='prob')
plt.show()

In [None]:
# Model/firmware count columns to plot against each other, coloured by the target.
colE = col = ['model_104.16.3569',
              'model_504.82.1730',
              'model_305.15.2092',
              'model_305.15.2138',
              'model_004.22.1656',
              'prob']

# `prob` is only merged into `train` in a later cell, so on a fresh top-to-bottom run
# train[colE] raises KeyError. When the column is missing, derive the same flag here:
# 1 for users with at least one problem report that has a non-null time.
if 'prob' in train.columns:
    plot_data = train
else:
    reported_users = train_prob.dropna(subset=['time'])['user_id']
    plot_data = train.assign(prob=train['user_id'].isin(reported_users).astype(int))

sns.pairplot(plot_data[colE], hue='prob')
plt.show()

In [None]:
# Columns whose value distribution should be printed.
# The list previously began with a stray comma (`[ , '...']`), which is a SyntaxError
# and stopped the cell from running at all.
col = ['model_004.22.1684']

for c in col:
    print(c)
    print(train[c].value_counts())

In [None]:
# Pair plot of the `col4` column group, coloured by the target.
# `prob` is only merged into `train` in a later cell, so on a fresh top-to-bottom run
# train[col4] raises KeyError. When the column is missing, derive the same flag here:
# 1 for users with at least one problem report that has a non-null time.
if 'prob' in train.columns:
    plot_data = train
else:
    reported_users = train_prob.dropna(subset=['time'])['user_id']
    plot_data = train.assign(prob=train['user_id'].isin(reported_users).astype(int))

sns.pairplot(plot_data[col4], hue='prob')
plt.show()

In [None]:
# Annotated correlation heatmap of the per-user quality sums on an 8x8 inch figure.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(train_qua_1.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax)

In [None]:
# Annotated correlation heatmap of every column in `train` on a 12x12 inch figure.
fig, ax = plt.subplots(figsize=(12, 12))
sns.heatmap(train.corr(), annot=True, fmt='.2f', cmap='RdYlGn_r', ax=ax)

In [None]:
# Show the column index of the test feature table.
test.columns

In [None]:
# Display the training feature table.
train

In [None]:
# Print dtypes, non-null counts and memory usage of the test feature table.
test.info()

In [None]:
# Every test column except the first one, as a plain Python list.
cols = test.columns[1:].tolist()

In [None]:
# Min-max scale every feature column using statistics from the TRAIN split only, and apply
# the same transform to the test split.
# - The statistics are no longer stored under the names `min` / `max`, which shadowed the
#   Python builtins for the rest of the session.
# - Columns are matched by label instead of by integer position into a label-indexed Series.
# - A column that is constant in train has range 0; dividing by it gave NaN in train and
#   +/-inf in test. Its range is treated as 1, so such a column becomes 0 in train.
col_min = train[cols].min()
col_max = train[cols].max()
col_range = (col_max - col_min).replace(0, 1)

train[cols] = (train[cols] - col_min) / col_range
test[cols] = (test[cols] - col_min) / col_range

In [None]:
# Build the binary target: count the problem reports per user, then turn the count into
# a flag with chg_qua (0 stays 0, anything else becomes 1).
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1['time'] = train_prob1['time'].map(chg_qua)
train_prob1.columns = ['user_id', 'prob']

# Attach the flag to the features; users with no report (and any other missing value in
# the table) are filled with 0, and the target is stored as an integer.
train = pd.merge(train, train_prob1, how='left', on='user_id').fillna(0)
train['prob'] = train['prob'].astype(int)

In [None]:
# Print dtypes, non-null counts and memory usage of the training table (now including `prob`).
train.info()

In [None]:
# Columns between the first and the last one; the same slice is passed to setup() below.
train.columns[1:-1]

In [None]:
# NOTE(review): this list is not referenced by any later cell in this notebook — confirm whether it is still needed.
cat = ['model_004.22.1666','model_305.15.2114','E30','E36','E37']

In [None]:
%%time
# Initialise the PyCaret classification experiment on `train` with `prob` as the target,
# using the notebook-wide seed as session_id.
# NOTE(review): train.columns[1:-1] skips the first and last column, so `user_id` is neither
# declared numeric nor listed in ignore_features — confirm it is not meant to be excluded.
# NOTE(review): numeric_features is passed a pandas Index and still contains the six columns
# listed in ignore_features — confirm the installed PyCaret version accepts both.
clf = setup(session_id=seed, data=train, target='prob'
            , ignore_features=['model_104.16.3569', 'model_504.82.1730', 'model_305.15.2092', 'model_004.22.1656', 'model_004.22.1666',  'model_305.15.2114']
           , numeric_features=train.columns[1:-1]
           )

In [None]:
# Compare candidate models sorted by AUC, requesting five via n_select and leaving out
# the estimator ids listed in `exclude`.
best = compare_models(sort = 'AUC', n_select = 5
                     ,exclude=['lr', 'knn', 'nb', 'dt', 'svm', 'rbfsvm', 'ridge','qda']
                     )

In [None]:
# Blend the models selected above with method='soft', evaluated with fold=5.
blended = blend_models(estimator_list = best, fold = 5, method = 'soft')

In [None]:
# predict_model is called without a `data` argument; presumably this scores the hold-out
# split created by setup() — confirm against the PyCaret documentation.
pred_holdout = predict_model(blended)

In [None]:
%%time
# Produce the final model from the blended estimator; presumably refit on all of the
# data given to setup() — confirm against the PyCaret documentation.
final_model = finalize_model(blended)

In [None]:
# Put the test features in the order of the submission template; template users with no
# feature row get 0 everywhere.
submission_ids = sample_submssion[['user_id']]
test_x = pd.merge(submission_ids, test, how='left', on='user_id').fillna(0)
test_x.shape

In [None]:
# Score the template-ordered test features with the finalised model.
predictions = predict_model(final_model, data = test_x)

In [None]:
# Copy the model output into the submission column (assignment aligns on the row index).
# NOTE(review): depending on the PyCaret version, 'Score' may be the probability of the
# PREDICTED label rather than of class 1 — confirm, since rows predicted as 0 would then
# carry an inverted probability.
sample_submssion['problem'] = predictions['Score']
sample_submssion.head()

In [None]:
# Write the submission file (row index not written), then display the frame.
# NOTE(review): nothing in this notebook creates ./submission/ — confirm the folder exists
# before running.
sample_submssion.to_csv("./submission/submission_20210130-2.csv", index = False)
sample_submssion