In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
np.random.seed(42)
python_random.seed(42)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [86]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(train_err_0.head())

test_err['date'] = test_err['time'].astype(str).str.slice(0,8)
test_err['model_fwver'] = test_err.model_nm + test_err.fwver
test_err['errtype_code'] = test_err.errtype.astype(str) + test_err.errcode

test_err_0 = test_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
display(test_err_0.head())

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


In [106]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(train_err_1.head())

test_err_1 = test_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
test_err_1.columns = ['user_id','date','date_cnt']
test_err_1 = test_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
test_err_1.columns = ['user_id','date_cnt', 'date_sum']
display(test_err_1.head())

Unnamed: 0,user_id,date_cnt,date_sum
0,10000,30,317
1,10001,30,2365
2,10002,29,306
3,10003,30,306
4,10004,30,777


In [88]:
train_err_2 = train_err.groupby('model_fwver').count().reset_index()[['model_fwver','user_id']]
train_err_2.columns = ['model_fwver','model_fwver_cnt']
train_err_2['model_fwver_rank'] = train_err_2.model_fwver_cnt.rank()
display(train_err_2.head())

test_err_2 = test_err.groupby('model_fwver').count().reset_index()[['model_fwver','user_id']]
test_err_2.columns = ['model_fwver','model_fwver_cnt']
test_err_2['model_fwver_rank'] = test_err_2.model_fwver_cnt.rank()
display(test_err_2.head())

Unnamed: 0,model_fwver,model_fwver_cnt,model_fwver_rank
0,model_004.22.1442,2522,21.0
1,model_004.22.1656,39,7.0
2,model_004.22.1666,5,1.0
3,model_004.22.1684,5554,25.0
4,model_004.22.1750,2874213,36.0


In [89]:
# train_err > errtype_code => rank
train_err_3 = train_err.groupby('errtype_code').count().reset_index()[['errtype_code','user_id']]
train_err_3.columns = ['errtype_code','errtype_code_cnt']
train_err_3['errtype_code_rank'] = train_err_3.errtype_code_cnt.rank()
display(train_err_3.head())

# test_err > errtype_code => rank
test_err_3 = test_err.groupby('errtype_code').count().reset_index()[['errtype_code','user_id']]
test_err_3.columns = ['errtype_code','errtype_code_cnt']
test_err_3['errtype_code_rank'] = test_err_3.errtype_code_cnt.rank()
display(test_err_3.head())

Unnamed: 0,errtype_code,errtype_code_cnt,errtype_code_rank
0,10,21079,2825.0
1,101,133403,2851.0
2,111,307030,2854.0
3,121,320181,2855.0
4,131,22843,2828.0


In [90]:
train = train_err.merge(train_err_1, on=['user_id'], how='left').merge(train_err_2, on='model_fwver', how='left').merge(train_err_3, on='errtype_code', how='left')
train = train[['user_id','date_cnt','date_sum','model_fwver_rank','errtype_code_rank']].drop_duplicates()
train = train.groupby(['user_id','date_cnt','date_sum']).agg({'model_fwver_rank':['count', 'max'],'errtype_code_rank':['count', 'max']}).reset_index()
train.columns = ['user_id','date_cnt','date_sum','model_fwver_cnt','model_fwver_max','errtype_code_cnt','errtype_code_max']
train

In [109]:
test = test_err.merge(test_err_1, on=['user_id'], how='left').merge(test_err_2, on='model_fwver', how='left').merge(test_err_3, on='errtype_code', how='left')
test = test[['user_id','date_cnt','date_sum','model_fwver_rank','errtype_code_rank']].drop_duplicates()
test = test.groupby(['user_id','date_cnt','date_sum']).agg({'model_fwver_rank':['count', 'max'],'errtype_code_rank':['count', 'max']}).reset_index()
test.columns = ['user_id','date_cnt','date_sum','model_fwver_cnt','model_fwver_max','errtype_code_cnt','errtype_code_max']
test

Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max
0,30000,29,2750,69,40.0,69,3021.0
1,30001,28,284,15,37.0,15,3019.0
2,30002,30,941,42,39.0,42,3021.0
3,30003,28,371,51,39.0,51,3021.0
4,30004,30,881,49,40.0,49,3021.0
...,...,...,...,...,...,...,...
14993,44994,30,1115,25,40.0,25,3021.0
14994,44995,30,515,27,39.0,27,3019.0
14995,44996,30,2233,44,39.0,44,3021.0
14996,44997,28,24671,49,39.0,49,3021.0


In [65]:
train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [69]:
def chg_qua(x):
    if x == 0:
        return 0
    else:
        return 1

In [68]:
cols = ['quality_0', 'quality_1', 'quality_2', 'quality_5','quality_6', 'quality_7', 'quality_8', 'quality_9', 'quality_10','quality_11', 'quality_12']

%%time
for col in cols:
    train_qua_0[col] = train_qua_0[col].apply(chg_qua)
    test_qua_0[col] = test_qua_0[col].apply(chg_qua)

In [71]:
train_qua_1 = train_qua_0.groupby('user_id').sum().reset_index()
test_qua_1 = test_qua_0.groupby('user_id').sum().reset_index()

In [74]:
%%time
for col in cols:
    train_qua_1[col] = train_qua_1[col].apply(chg_qua)
    test_qua_1[col] = test_qua_1[col].apply(chg_qua)

Wall time: 77.4 ms


In [96]:
train = train.merge(train_qua_1, on='user_id', how='left').fillna(0)
test = test.merge(test_qua_1, on='user_id', how='left').fillna(0)

In [85]:
train_prob1 = train_prob.groupby('user_id').count().reset_index()
train_prob1.time = train_prob1.time.apply(chg_qua)
train_prob1.columns = ['user_id', 'prob']
train_prob1

Unnamed: 0,user_id,time
0,10001,1
1,10004,1
2,10005,1
3,10006,1
4,10008,1
...,...,...
4995,24983,1
4996,24984,1
4997,24990,1
4998,24997,1


In [101]:
train = train.merge(train_prob1, on='user_id', how='left').fillna(0)

In [112]:
display(train.head())
display(test.head())

Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12,prob
0,10000,30,317,15,34.0,15,2868.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,0.0,0.0
1,10001,30,2365,46,35.0,46,2870.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,10002,29,306,17,34.0,17,2868.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,0.0
3,10003,30,306,36,35.0,36,2870.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,10004,30,777,36,36.0,36,2870.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0


Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max,quality_0,quality_1,quality_2,quality_5,quality_6,quality_7,quality_8,quality_9,quality_10,quality_11,quality_12
0,30000,29,2750,69,40.0,69,3021.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
1,30001,28,284,15,37.0,15,3019.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
2,30002,30,941,42,39.0,42,3021.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0
3,30003,28,371,51,39.0,51,3021.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0
4,30004,30,881,49,40.0,49,3021.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0


In [113]:
train.shape, test.shape

((15000, 19), (14998, 18))

In [114]:
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True

In [115]:
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }

In [124]:
train_x = train.iloc[:,:-1]
train_y = train.prob

In [129]:
train_x.iloc[0]

user_id             10000.0
date_cnt               30.0
date_sum              317.0
model_fwver_cnt        15.0
model_fwver_max        34.0
errtype_code_cnt       15.0
errtype_code_max     2868.0
quality_0               0.0
quality_1               0.0
quality_2               0.0
quality_5               1.0
quality_6               0.0
quality_7               1.0
quality_8               0.0
quality_9               1.0
quality_10              1.0
quality_11              0.0
quality_12              0.0
Name: 0, dtype: float64

In [130]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x.iloc[train_idx]
    y = train_y.iloc[train_idx]
    valid_x = train_x.iloc[val_idx]
    valid_y = train_y.iloc[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 3
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[13]	valid_0's auc: 0.736742	valid_0's pr_auc: 0.709859
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[5]	valid_0's auc: 0.744612	valid_0's pr_auc: 0.241711
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[14]	valid_0's auc: 0.728384	valid_0's pr_auc: 0.279373
Training until validation scores don't improve for 3 rounds
Early stopping, best iteration is:
[3]	valid_0's auc: 0.731444	valid_0's pr_auc: 0.376015
Training until validation scores don't improve for 3 rounds
[20]	valid_0's auc: 0.735225	valid_0's pr_auc: 0.960405
Early stopping, best iteration is:
[22]	valid_0's auc: 0.736092	valid_0's pr_auc: 0.960643


In [131]:
print(np.mean(auc_scores))

0.7354545819878867


In [147]:
test_x = sample_submssion[['user_id']].merge(test, on='user_id', how='left').fillna(0)
test.shape

(14998, 18)

In [148]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    print(pred_y)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

[0.6781263  0.20821687 0.53389307 ... 0.52816749 0.66488272 0.30148626]
[0.57981061 0.35593639 0.50527187 ... 0.50527187 0.57981061 0.37792155]
[0.75964764 0.221149   0.59396895 ... 0.63435181 0.71643031 0.36117632]
[0.50721633 0.3641099  0.45225508 ... 0.45225508 0.49770818 0.36695013]
[0.65954009 0.09648399 0.3695817  ... 0.39719919 0.62209288 0.14782407]


In [149]:
pred_y_list

[array([[0.6781263 ],
        [0.20821687],
        [0.53389307],
        ...,
        [0.52816749],
        [0.66488272],
        [0.30148626]]),
 array([[0.57981061],
        [0.35593639],
        [0.50527187],
        ...,
        [0.50527187],
        [0.57981061],
        [0.37792155]]),
 array([[0.75964764],
        [0.221149  ],
        [0.59396895],
        ...,
        [0.63435181],
        [0.71643031],
        [0.36117632]]),
 array([[0.50721633],
        [0.3641099 ],
        [0.45225508],
        ...,
        [0.45225508],
        [0.49770818],
        [0.36695013]]),
 array([[0.65954009],
        [0.09648399],
        [0.3695817 ],
        ...,
        [0.39719919],
        [0.62209288],
        [0.14782407]])]

In [150]:
pred_ensemble.shape

(14999, 1)

In [151]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [152]:
sample_submssion.to_csv("./submission/submission_20210126-1.csv", index = False)
sample_submssion

Unnamed: 0,user_id,problem
0,30000,0.636868
1,30001,0.249179
2,30002,0.490994
3,30003,0.631513
4,30004,0.562499
...,...,...
14994,44994,0.327306
14995,44995,0.331693
14996,44996,0.503449
14997,44997,0.616185
