In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random

import lightgbm as lgb
import re
from sklearn.metrics import *
from sklearn.model_selection import KFold

import warnings
warnings.filterwarnings(action='ignore')

from pycaret.classification import *

import random as python_random
np.random.seed(42)
python_random.seed(42)

In [2]:
PATH = './data/'

train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
train_prob = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [3]:
train_err['date'] = train_err['time'].astype(str).str.slice(0,8)
train_err['model_fwver'] = train_err.model_nm + train_err.fwver
train_err['errtype_code'] = train_err.errtype.astype(str) + train_err.errcode

train_err_0 = train_err[['user_id','date','model_fwver','errtype_code']].drop_duplicates()
train_err_0.head()

Unnamed: 0,user_id,date,model_fwver,errtype_code
0,10000,20201101,model_305.15.2138,151
1,10000,20201101,model_305.15.2138,121
2,10000,20201101,model_305.15.2138,111
3,10000,20201101,model_305.15.2138,161
4,10000,20201101,model_305.15.2138,40


In [16]:
train_err_1 = train_err.groupby(['user_id','date']).count().reset_index()[['user_id','date','time']]
train_err_1.columns = ['user_id','date','date_cnt']
train_err_1 = train_err_1.groupby('user_id').agg({'date':'count','date_cnt': 'sum'}).reset_index()
train_err_1.columns = ['user_id','date_cnt', 'date_sum']
train_err_1.head()

Unnamed: 0,user_id,date_cnt,date_sum
0,10000,30,317
1,10001,30,2365
2,10002,29,306
3,10003,30,306
4,10004,30,777


In [17]:
train_err_2 = train_err.groupby('model_fwver').count().reset_index()[['model_fwver','user_id']]
train_err_2.columns = ['model_fwver','model_fwver_cnt']
train_err_2['model_fwver_rank'] = train_err_2.model_fwver_cnt.rank()
train_err_2.head()

Unnamed: 0,model_fwver,model_fwver_cnt,model_fwver_rank
0,model_004.22.1442,2522,21.0
1,model_004.22.1656,39,7.0
2,model_004.22.1666,5,1.0
3,model_004.22.1684,5554,25.0
4,model_004.22.1750,2874213,36.0


In [18]:
# train_err > errtype_code => rank
train_err_3 = train_err.groupby('errtype_code').count().reset_index()[['errtype_code','user_id']]
train_err_3.columns = ['errtype_code','errtype_code_cnt']
train_err_3['errtype_code_rank'] = train_err_3.errtype_code_cnt.rank()
train_err_3.head()

Unnamed: 0,errtype_code,errtype_code_cnt,errtype_code_rank
0,10,21079,2825.0
1,101,133403,2851.0
2,111,307030,2854.0
3,121,320181,2855.0
4,131,22843,2828.0


In [36]:
train = train_err.merge(train_err_1, on=['user_id'], how='left').merge(train_err_2, on='model_fwver', how='left').merge(train_err_3, on='errtype_code', how='left')

In [40]:
train = train[['user_id','date_cnt','date_sum','model_fwver_rank','errtype_code_rank']].drop_duplicates()

In [47]:
train = train.groupby(['user_id','date_cnt','date_sum']).agg({'model_fwver_rank':['count', 'max'],'errtype_code_rank':['count', 'max']}).reset_index()

In [50]:
train.columns = ['user_id','date_cnt','date_sum','model_fwver_cnt','model_fwver_max','errtype_code_cnt','errtype_code_max']
train

Unnamed: 0,user_id,date_cnt,date_sum,model_fwver_cnt,model_fwver_max,errtype_code_cnt,errtype_code_max
0,10000,30,317,15,34.0,15,2868.0
1,10001,30,2365,46,35.0,46,2870.0
2,10002,29,306,17,34.0,17,2868.0
3,10003,30,306,36,35.0,36,2870.0
4,10004,30,777,36,36.0,36,2870.0
...,...,...,...,...,...,...,...
14995,24995,10,194,22,35.0,22,2870.0
14996,24996,1,4,4,34.0,4,2860.0
14997,24997,30,826,44,36.0,44,2870.0
14998,24998,21,155,24,36.0,24,2870.0


In [171]:
train_qua = pd.read_csv(PATH+'train_quality_data.csv')
test_qua  = pd.read_csv(PATH+'test_quality_data.csv')

train_qua_0 = train_qua.drop_duplicates()
test_qua_0 = test_qua.drop_duplicates()

In [172]:
train_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)
test_qua_0.drop(['quality_3','quality_4','time','fwver'], axis=1, inplace=True)

In [175]:
train_qua_0.fillna(0, inplace=True)
test_qua_0.fillna(0, inplace=True)

In [197]:
train_qua_0.index

Int64Index([     0,      6,     12,     22,     24,     26,     36,     47,
                48,     54,
            ...
            828589, 828597, 828600, 828602, 828608, 828612, 828615, 828619,
            828621, 828623],
           dtype='int64', length=284202)

In [211]:
train_qua_0.loc[0,'quality_10']

'4'

In [213]:
for idx in tqdm(train_qua_0.index):
    for col in tqdm(['quality_10']):#train_qua_0.columns:
        if train_qua_0.loc[idx, col] != 0:
            train_qua_0.loc[idx, col] = 1

 65%|███████████████████████████████████████████████▏                         | 183608/284202 [12:35<06:53, 243.15it/s]


KeyboardInterrupt: 

In [187]:
for idx in tqdm(test_qua_0.index):
    for col in test_qua_0.columns:
        if test_qua_0.iloc[idx][col] != 0:
            test_qua_0.iloc[idx][col] = 1

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [None]:
train_qua_1 = train_qua.drop_duplicates().groupby('user_id').count().reset_index()[['user_id', 'time']]
train_qua_1.columns = ['user_id', 'qua_cnt']
#828624-284202 = 544422

In [None]:
train_prob_1 = train_prob.groupby('user_id').count().reset_index()
train_prob_1.columns = ['user_id', 'prob_cnt']

In [None]:
train = train_err_1.merge(train_qua_1, on='user_id', how='left').merge(train_prob_1, on='user_id', how='left').fillna(0)
train.head()

In [None]:
train.describe()

In [None]:
train.groupby('prob_cnt').count()

In [None]:
train[train.prob_cnt == 5]

In [None]:
train_err_1[train_err_1.user_id == 24407] # 1910

In [None]:
train_qua_1[train_qua_1.user_id == 24407] # 2

In [None]:
train_err[train_err.user_id == 24407] # 1910

In [None]:
# 데이터 설명을 확인하면
# ueser_id가 10000부터 24999까지 총 15000개가 연속적으로 존재.
display(train_err.head())
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

In [None]:
print(np.sort(train_err.errtype.unique()))
# errtype이 1부터 42까지 29를 제외한 41개가 존재.

In [None]:
# user_id와 errtype만을 사용하여 데이터 셋 생성
# 모든 일자에 대해서 errtype별 발생 건수를 count
# pandas의 groupby를 활용할 경우 큰 연산 자원이 필요.
# numpy로 placeholder를 만들어 구현함.
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

for person_idx, err in tqdm(id_error):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - train_user_id_min,err - 1] += 1
error.shape

In [None]:
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True

In [None]:
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
params =      {
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'seed': 1015
                }

In [None]:
k_fold = KFold(n_splits=5, shuffle=True, random_state=42)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 3
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

In [None]:
print(np.mean(auc_scores))

In [None]:
test_err  = pd.read_csv(PATH+'test_err_data.csv')
display(test_err.head())

In [None]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [None]:
id_error = test_err[['user_id','errtype']].values
test_x = np.zeros((test_user_number,42))
for person_idx, err in tqdm(id_error):
    # person_idx - test_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    test_x[person_idx - test_user_id_min,err - 1] += 1
test_x = test_x.reshape(test_x.shape[0],-1)
print(test_x.shape)

In [None]:
# 예측
pred_y_list = []
for model in models:
    pred_y = model.predict(test_x)
    print(pred_y)
    pred_y_list.append(pred_y.reshape(-1,1))
    
pred_ensemble = np.mean(pred_y_list, axis = 0)

In [None]:
pred_y_list

In [None]:
pred_ensemble

In [None]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')

In [None]:
sample_submssion['problem'] = pred_ensemble.reshape(-1)

In [None]:
sample_submssion.to_csv("./submission/submission_20210125-1.csv", index = False)
sample_submssion