In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
import seaborn as sns
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

# 필요한 함수 정의
def make_datetime(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    year  = int(x[:4])
    month = int(x[4:6])
    day   = int(x[6:8])
    hour  = int(x[8:10])
    #mim  = int(x[10:12])
    #sec  = int(x[12:])
    return dt.datetime(year, month, day, hour)

def string2num(x):
    # (,)( )과 같은 불필요한 데이터 정제
    x = re.sub(r"[^0-9]+", '', str(x))
    if x =='':
        return 0
    else:
        return int(x)


PATH = './data/'

In [2]:
train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_quality  = pd.read_csv(PATH+'train_quality_data.csv')
train_problem  = pd.read_csv(PATH+'train_problem_data.csv')

test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_quality  = pd.read_csv(PATH+'test_quality_data.csv')

In [3]:
# train_quality 데이터에 대해서 strin2num 함수 적용 int로 변환
train_quality['quality_10'] = train_quality['quality_10'].apply(string2num)

In [4]:
# quality 10 에 대해서 행값 평균이 200이 넘는 경우 추출
train_quality.groupby('user_id').quality_10.mean()[train_quality.groupby('user_id').quality_10.mean() >= 200]

user_id
10019    8299.315789
10032     669.235294
10057     259.500000
10062     452.000000
10077    5543.666667
            ...     
24964    7972.611111
24979    1001.750000
24981    2494.750000
24984    2048.500000
24986     332.125000
Name: quality_10, Length: 950, dtype: float64

In [5]:
# 저장
filter_train=train_quality.groupby('user_id').quality_10.mean()>=200

In [6]:
# 인덱스 값
filter_train.index[filter_train==True]

Int64Index([10019, 10032, 10057, 10062, 10077, 10081, 10089, 10090, 10150,
            10166,
            ...
            24892, 24900, 24920, 24922, 24934, 24964, 24979, 24981, 24984,
            24986],
           dtype='int64', name='user_id', length=950)

In [7]:
# 인덱스 리스트 저장
q10train_list=[x for x in filter_train.index[filter_train==True]]

In [8]:
# 넘파이로 200이 넘는 경우 1 과 0 으로 표시
q10train_check = np.zeros((15000,1)) # 15000 아이디에 대해서 0 열 만들기

for idx in range(15000):
    if np.isin(idx+10000, [q10train_list]): # 10000부터 아이디가 시작하므로
        q10train_check[idx] = 1 #인덱스 리스트에 있으면 1 지정

In [9]:
q10train_check.shape

(15000, 1)

In [10]:
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

In [11]:
##시간별 데이터 만드는 코딩
def time_encoding(x):
    # string 타입의 Time column을 datetime 타입으로 변경
    x     = str(x)
    day   = int(x[6:8])
    hour  = int(x[8:10])
    y = day*100 + hour
    for i in range(30):
        if y == (i+1)*100:
            return 0
        if y == (i+1)*100 + 1:
            return 1
        if y == (i+1)*100 + 2:
            return 2
        if y == (i+1)*100 + 3:
            return 3
        if y == (i+1)*100 + 4:
            return 4
        if y == (i+1)*100 + 5:
            return 5
        if y == (i+1)*100 + 6:
            return 6
        if y == (i+1)*100 + 7:
            return 7
        if y == (i+1)*100 + 8:
            return 8
        if y == (i+1)*100 + 9:
            return 9
        if y == (i+1)*100 + 10:
            return 10
        if y == (i+1)*100 + 11:
            return 11
        if y == (i+1)*100 + 12:
            return 12
        if y == (i+1)*100 + 13:
            return 13
        if y == (i+1)*100 + 14:
            return 14
        if y == (i+1)*100 + 15:
            return 15
        if y == (i+1)*100 + 16:
            return 16
        if y == (i+1)*100 + 17:
            return 17
        if y == (i+1)*100 + 18:
            return 18
        if y == (i+1)*100 + 19:
            return 19
        if y == (i+1)*100 + 20:
            return 20
        if y == (i+1)*100 + 21:
            return 21
        if y == (i+1)*100 + 22:
            return 22
        if y == (i+1)*100 + 23:
            return 23
        if day == 31: # 10월 31일 23:59의 경우
            return 0

In [12]:
#적용하기
train_err['timeslot'] = train_err['time'].apply(time_encoding)

In [13]:
# 시간대별로 타임라벨링 하기 넘파이 활용
timeslot_label = train_err[['user_id','timeslot']].values
timeslot_check = np.zeros((train_user_number,len(train_err['timeslot'].unique())))

for person_idx, ch in tqdm(timeslot_label):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    timeslot_check[person_idx - train_user_id_min,ch - 1] += 1
timeslot_check.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:51<00:00, 321565.09it/s]


(15000, 24)

In [14]:
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

for person_idx, err in tqdm(id_error):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error[person_idx - train_user_id_min,err - 1] += 1
error.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:51<00:00, 322434.01it/s]


(15000, 42)

In [15]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# 숫자 값으로 변환하기 위해 LabelEncoder로 먼저 변환한다.
### 모델명에 대해서 라벨 인코딩 진행
encoder = LabelEncoder()
encoder.fit(train_err['model_nm'].astype('|S80')) ## object 변수이므로 str 변수로 변환
labels_model = encoder.transform(train_err['model_nm'].astype('|S80'))
train_err = pd.concat([train_err,pd.Series(labels_model)],axis=1)
train_err.rename(columns = {0 : 'labels_model'}, inplace = True)

In [16]:
model_label = train_err[['user_id','labels_model']].values
model_nm = np.zeros((train_user_number,9))

for person_idx, ch in tqdm(model_label):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    model_nm[person_idx - train_user_id_min,ch - 1] += 1
model_nm_ = (model_nm > 0).astype(int) #0보다 큰 데이터는 그냥 전부 1로 변경
model_nm_.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:51<00:00, 319759.89it/s]


(15000, 9)

In [17]:
# 분석에 활용하도록 넘파이 데이터 결합
x1 = np.concatenate((error, timeslot_check), axis=1) ## 펌웨어 버전 + 날짜 
x2 = np.concatenate((x1, model_nm_), axis=1) ## 펌웨어 버전 + 날짜 +시간 
x3 = np.concatenate((x2, q10train_check), axis=1) ## 펌웨어 버전 + 날짜 +시간 + 에러

In [18]:
# Y 데이터 만들기
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
# error와 동일한 방법으로 person_idx - 10000 위치에 
# person_idx의 problem이 한 번이라도 발생했다면 1
# 없다면 0
problem[train_prob.user_id.unique()-10000] = 1 
problem.shape

(15000,)

In [19]:
# 변수 이름 변경
# x3  -> train_x
# problem-> train_y

train_x = x3
train_y = problem

print(train_x.shape)
print(train_y.shape)

(15000, 76)
(15000,)


In [20]:
# Train
#-------------------------------------------------------------------------------------
# validation auc score를 확인하기 위해 정의
def f_pr_auc(probas_pred, y_true):
    labels=y_true.get_label()
    p, r, _ = precision_recall_curve(labels, probas_pred)
    score=auc(r,p) 
    return "pr_auc", score, True
#-------------------------------------------------------------------------------------
models     = []
recalls    = []
precisions = []
auc_scores   = []
threshold = 0.5
# 파라미터 설정
params =      {
                'task' : 'train',
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'learning_rate' : 0.01,
                'max_bin' : 255,
                'feature_fraction' : 0.8,
                'min_data_in_leaf'     : 10,
                'num_threads'    : 4, 
                'seed': 1015
                }
#-------------------------------------------------------------------------------------
# 10 Kfold cross validation
k_fold = KFold(n_splits=10, shuffle=True, random_state=150)
for train_idx, val_idx in k_fold.split(train_x):

    # split train, validation set
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    #run traning
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 100
                       )
    
    # cal valid prediction
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # cal scores
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # append scores
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 4543, number of negative: 8957
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10787
[LightGBM] [Info] Number of data points in the train set: 13500, number of used features: 75
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.336519 -> initscore=-0.678848
[LightGBM] [Info] Start training from score -0.678848
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.79948	valid_0's pr_auc: 0.7137
[40]	valid_0's auc: 0.803802	valid_0's pr_auc: 0.715163
[60]	valid_0's auc: 0.805251	valid_0's pr_auc: 0.715144
[80]	valid_0's auc: 0.805641	valid_0's pr_auc: 0.717073
[100]	valid_0's auc: 0.806369	valid_0's pr_auc: 0.716956
[120]	valid_0's auc: 0.806915	valid_0's pr_auc: 0.716932
Early stopping, best iteration is:
[33]	valid_0's auc: 0.804153	valid_0's pr_auc: 0.718643
[LightGBM] [Info] Number of positive: 4521, number of negat

In [21]:
print(np.mean(auc_scores))

0.8131972795667093


In [22]:
# 정리
# q10train_check - 퀄리티10 데이터가 평균이 200이 넘는 경우, 데이터프레임
# timeslot_check - 시간대별 데이터
# error - 에러타입별 데이터
# model_nm_ - 모델별 데이터

In [23]:
# train_quality 데이터에 대해서 strin2num 함수 적용 int로 변환
test_quality['quality_10'] = test_quality['quality_10'].apply(string2num)

# quality 10 에 대해서 행값 평균이 200이 넘는 경우 추출
test_quality.groupby('user_id').quality_10.mean()[test_quality.groupby('user_id').quality_10.mean() >= 200]

user_id
30001     854.333333
30005    1580.937500
30008     458.571429
30009    3086.333333
30061     718.500000
            ...     
44949     978.100000
44956    4532.000000
44971     425.513514
44978    2221.000000
44985     355.000000
Name: quality_10, Length: 934, dtype: float64

In [24]:
# 저장
filter_test=test_quality.groupby('user_id').quality_10.mean()>=200
# 인덱스 값
filter_test.index[filter_test==True]
# 인덱스 리스트 저장
q10test_list=[x for x in filter_test.index[filter_test==True]]

In [25]:
# 넘파이로 200이 넘는 경우 1 과 0 으로 표시
q10test_check = np.zeros((14999,1)) # 15000 아이디에 대해서 0 열 만들기

for idx in range(14999):
    if np.isin(idx+30000, [q10test_list]): # 10000부터 아이디가 시작하므로
        q10test_check[idx] = 1 #인덱스 리스트에 있으면 1 지정

In [26]:
q10test_check.shape

(14999, 1)

In [27]:
# 데이터 설명을 확인하면
# test 데이터는 ueser_id가 30000부터 44998까지 총 14999개가 존재.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [28]:
# 시간대 나누는 함수 적용하기
test_err['timeslot'] = test_err['time'].apply(time_encoding)

In [29]:
# 시간대별로 타임라벨링 하기 넘파이 활용
timeslot_label_test = test_err[['user_id','timeslot']].values
timeslot_check_test = np.zeros((test_user_number,len(test_err['timeslot'].unique())))

for person_idx, ch in tqdm(timeslot_label_test):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    timeslot_check_test[person_idx - test_user_id_min,ch - 1] += 1
timeslot_check_test.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:51<00:00, 320185.85it/s]


(14999, 24)

In [30]:
id_error_test = test_err[['user_id','errtype']].values
error_test = np.zeros((test_user_number,42))

for person_idx, err in tqdm(id_error_test):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    error_test[person_idx - test_user_id_min,err - 1] += 1
error_test.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:52<00:00, 317883.31it/s]


(14999, 42)

In [31]:
# 숫자 값으로 변환하기 위해 LabelEncoder로 먼저 변환한다.
### 모델명에 대해서 라벨 인코딩 진행
encoder = LabelEncoder()
encoder.fit(test_err['model_nm'].astype('|S80')) ## object 변수이므로 str 변수로 변환
labels_model_test = encoder.transform(test_err['model_nm'].astype('|S80'))
test_err = pd.concat([test_err,pd.Series(labels_model_test)],axis=1)
test_err.rename(columns = {0 : 'labels_model_test'}, inplace = True)

In [32]:
model_label_test = test_err[['user_id','labels_model_test']].values
model_nm_test = np.zeros((test_user_number,9))

for person_idx, ch in tqdm(model_label_test):
    # person_idx - train_user_id_min 위치에 person_idx, errtype에 해당하는 error값을 +1
    model_nm_test[person_idx - test_user_id_min,ch - 1] += 1
model_nm_test_ = (model_nm_test > 0).astype(int) #0보다 큰 데이터는 그냥 전부 1로 변경
model_nm_test_.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:51<00:00, 322910.64it/s]


(14999, 9)

In [33]:
model_nm_test_ = (model_nm_test > 0).astype(int) #0보다 큰 데이터는 그냥 전부 1로 변경
model_nm_test_.shape

(14999, 9)

In [34]:
# 분석에 활용하도록 넘파이 데이터 결합
x1_test = np.concatenate((error_test, timeslot_check_test), axis=1) ## 펌웨어 버전 + 날짜 
x2_test = np.concatenate((x1_test, model_nm_test_), axis=1) ## 펌웨어 버전 + 날짜 +시간 
x3_test = np.concatenate((x2_test, q10test_check), axis=1) ## 펌웨어 버전 + 날짜 +시간 + 에러

In [35]:
x3_test.shape

(14999, 76)

In [36]:
from pycaret.classification import *

In [37]:
X = pd.DataFrame(x3)
y = pd.DataFrame(problem)

In [38]:
X['problem'] = problem

In [39]:
clf = setup(data = X, target = 'problem') 

Unnamed: 0,Description,Value
0,session_id,5542
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 77)"
5,Missing Values,False
6,Numeric Features,66
7,Categorical Features,10
8,Ordinal Features,False
9,High Cardinality Features,False


In [49]:
#Stacking model 만들기
lightgbm_ = create_model('lightgbm') 
catboost_ = create_model('catboost') 
xgboost_ = create_model('xgboost') 

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.76,0.773,0.5157,0.6882,0.5896,0.4249,0.4337
1,0.7838,0.8008,0.547,0.7385,0.6285,0.4808,0.4915
2,0.78,0.8053,0.5299,0.7381,0.6169,0.4684,0.481
3,0.7571,0.7809,0.5128,0.6818,0.5854,0.4185,0.427
4,0.7752,0.8045,0.5356,0.7203,0.6144,0.4606,0.4706
5,0.7848,0.7935,0.5499,0.7395,0.6307,0.4834,0.494
6,0.7657,0.785,0.5114,0.7087,0.5941,0.4354,0.4468
7,0.7714,0.8017,0.5341,0.7121,0.6104,0.4533,0.4627
8,0.7686,0.7776,0.5,0.7243,0.5916,0.4376,0.4522
9,0.7788,0.7787,0.5185,0.7429,0.6107,0.463,0.4776


NameError: name 'stack_model' is not defined

In [50]:
stacker_2 = stack_models(estimator_list = [lightgbm_, xgboost_], meta_model = catboost_)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7733,0.7905,0.4815,0.7511,0.5868,0.4408,0.4614
1,0.7981,0.8288,0.5214,0.8062,0.6332,0.5026,0.5253
2,0.78,0.8183,0.5128,0.75,0.6091,0.4635,0.4797
3,0.7857,0.8176,0.5185,0.7647,0.618,0.4766,0.494
4,0.7914,0.8158,0.5071,0.7946,0.6191,0.485,0.5082
5,0.7971,0.8119,0.4957,0.8286,0.6203,0.4936,0.5239
6,0.7914,0.8086,0.517,0.7879,0.6244,0.4885,0.5092
7,0.7962,0.8098,0.5341,0.7899,0.6373,0.5028,0.5214
8,0.7857,0.8007,0.483,0.7981,0.6018,0.4671,0.4946
9,0.7884,0.7974,0.4843,0.8057,0.605,0.4724,0.501


In [51]:
stack_pred_holdout = predict_model(stacker_2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.7858,0.8099,0.4929,0.7773,0.6033,0.4665,0.4892


In [62]:
stacker_1 = stack_models(estimator_list = [catboost_], meta_model = lightgbm_)

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.78,0.8032,0.4929,0.7655,0.5997,0.4576,0.4787
1,0.7933,0.8122,0.5442,0.7702,0.6377,0.4991,0.5138
2,0.7924,0.8159,0.5328,0.7759,0.6318,0.4941,0.511
3,0.781,0.8137,0.5128,0.7531,0.6102,0.4654,0.482
4,0.7857,0.8134,0.4986,0.7812,0.6087,0.4709,0.4934
5,0.8048,0.8164,0.5242,0.8288,0.6422,0.5172,0.5428
6,0.7848,0.8006,0.5085,0.7716,0.613,0.4725,0.4922
7,0.7838,0.8056,0.5483,0.7395,0.6297,0.4817,0.4925
8,0.779,0.7933,0.4688,0.7857,0.5872,0.4492,0.4771
9,0.7979,0.7965,0.5071,0.8203,0.6268,0.4986,0.5257


In [63]:
stack_pred_holdout_1 = predict_model(stacker_1)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Stacking Classifier,0.7856,0.8021,0.4943,0.7753,0.6037,0.4664,0.4886


In [64]:
final_model = finalize_model(stacker_1)

In [40]:
#best_5 = compare_models(sort = 'Accuracy', n_select = 5)

In [41]:
best_2 = compare_models(sort = 'Accuracy', n_select = 2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
catboost,CatBoost Classifier,0.7918,0.8142,0.5084,0.7961,0.6202,0.4863,0.5097,8.23
et,Extra Trees Classifier,0.7904,0.8075,0.4987,0.8001,0.614,0.4806,0.5061,0.509
lightgbm,Light Gradient Boosting Machine,0.7849,0.8086,0.5075,0.7719,0.6121,0.472,0.492,0.219
gbc,Gradient Boosting Classifier,0.7831,0.8097,0.4711,0.7985,0.5922,0.4576,0.4874,1.593
rf,Random Forest Classifier,0.7758,0.7991,0.4526,0.7876,0.5744,0.437,0.4681,0.722
xgboost,Extreme Gradient Boosting,0.7726,0.7901,0.5255,0.7194,0.6072,0.4526,0.4637,1.008
ada,Ada Boost Classifier,0.7684,0.7885,0.4708,0.7431,0.5761,0.4277,0.4489,0.351
lda,Linear Discriminant Analysis,0.764,0.7683,0.376,0.8224,0.5154,0.3871,0.4391,0.07
ridge,Ridge Classifier,0.7626,0.0,0.3669,0.8283,0.508,0.3812,0.4362,0.03
qda,Quadratic Discriminant Analysis,0.75,0.7581,0.3934,0.7374,0.5127,0.3649,0.3977,0.041


In [42]:
#blended_5 = blend_models(estimator_list = best_5, fold = 10, method = 'soft')

In [43]:
blended_2 = blend_models(estimator_list = best_2, fold = 10, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.7743,0.7958,0.4644,0.7689,0.579,0.4374,0.4634
1,0.8029,0.8293,0.5185,0.8273,0.6375,0.5117,0.538
2,0.8019,0.8229,0.5214,0.8206,0.6376,0.5105,0.5354
3,0.7943,0.822,0.4986,0.814,0.6184,0.4885,0.516
4,0.7933,0.8271,0.5043,0.8045,0.62,0.4881,0.5132
5,0.8105,0.8214,0.51,0.8689,0.6427,0.5254,0.5599
6,0.7876,0.8152,0.5085,0.7817,0.6162,0.4783,0.4994
7,0.8029,0.8247,0.5341,0.8139,0.6449,0.5165,0.5384
8,0.7914,0.8048,0.4688,0.8376,0.6011,0.4747,0.5114
9,0.7941,0.8043,0.4843,0.8293,0.6115,0.4843,0.5167


In [44]:
pred_holdout = predict_model(blended_2)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7896,0.8116,0.4862,0.798,0.6043,0.4722,0.4991


In [45]:
final_model = finalize_model(blended_2)

In [65]:
X_test = pd.DataFrame(x3_test)

In [66]:
predictions = predict_model(final_model, data = X_test)

#pycaret에서는 score이 label을 맞출 확률이기때문에 output을 제출 양식에 맞게 바꿔줍니다
x = []
for i in range(len(predictions['Score'])):
  if predictions['Label'][i] =='1.0':
    x.append(predictions['Score'][i])
  else:
    x.append(1-predictions['Score'][i])


In [67]:
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
sample_submssion['problem'] = x
sample_submssion.to_csv("best_5.csv", index = False)