In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt
from tqdm import tqdm
import gc
import random
import lightgbm as lgb
import re
import seaborn as sns
from sklearn.metrics import *
from sklearn.model_selection import KFold
import warnings
warnings.filterwarnings(action='ignore')

# Helper function definitions
def make_datetime(x):
    """Turn a ``YYYYMMDDHH...`` timestamp into a ``datetime`` truncated to the hour.

    The value is stringified first, so integers and strings are both accepted.
    Characters from position 10 onward (minutes and seconds) are ignored.
    """
    stamp = str(x)
    # Slice boundaries for year, month, day and hour, in that order.
    field_bounds = ((0, 4), (4, 6), (6, 8), (8, 10))
    return dt.datetime(*(int(stamp[start:stop]) for start, stop in field_bounds))

def string2num(x):
    """Parse a loosely formatted numeric value (e.g. ``"1,234"``) into an ``int``.

    Thousands separators and whitespace are removed. A leading sign is kept
    (``"-1"`` -> ``-1``) and a fractional part is dropped (``3.0`` -> ``3``);
    the previous digits-only approach turned these into ``1`` and ``30``.
    Anything else falls back to the legacy rule: keep only the digits and
    return 0 when none remain (e.g. ``""`` or ``nan``).
    """
    text = re.sub(r"[,\s]+", '', str(x))
    match = re.fullmatch(r"([+-]?)(\d+)(?:\.\d*)?", text)
    if match is None:
        # Legacy behaviour for everything that is not a plain signed number.
        digits = re.sub(r"[^0-9]+", '', text)
        return int(digits) if digits else 0
    value = int(match.group(2))
    return -value if match.group(1) == '-' else value


# Data directory prefix; file names are appended by plain string concatenation,
# so the trailing slash is required.
PATH = './data/'

In [2]:
# Load the training CSVs (error, quality and problem data, per the file names).
train_err  = pd.read_csv(PATH+'train_err_data.csv')
train_quality  = pd.read_csv(PATH+'train_quality_data.csv')
train_problem  = pd.read_csv(PATH+'train_problem_data.csv')

# Load the test CSVs (no problem file is read for the test set).
test_err  = pd.read_csv(PATH+'test_err_data.csv')
test_quality  = pd.read_csv(PATH+'test_quality_data.csv')

In [3]:
# Apply string2num to train_quality['quality_10'] to convert the values to int.
train_quality['quality_10'] = train_quality['quality_10'].apply(string2num)

In [139]:
# Per-user mean of quality_10; users are bucketed by this mean in the following cells.
q10_mean = train_quality.groupby('user_id').quality_10.mean()

In [181]:
# Split users into three buckets by mean quality_10.
# Bounds are inclusive on both sides, so a mean of exactly 1500 or 200 belongs to two buckets.
filter1500_train = q10_mean.loc[q10_mean.ge(1500)]
filter200_train = q10_mean.loc[q10_mean.between(200, 1500)]
filter50_train = q10_mean.loc[q10_mean.between(50, 200)]

In [182]:
# Inspect the user_id index of each bucket.
# Only the last expression of a notebook cell is displayed, so just filter50_train.index is shown.
filter1500_train.index
filter200_train.index
filter50_train.index 

Int64Index([10049, 10056, 10072, 10104, 10110, 10113, 10152, 10154, 10167,
            10236,
            ...
            24786, 24811, 24844, 24845, 24874, 24875, 24888, 24951, 24953,
            24985],
           dtype='int64', name='user_id', length=469)

In [183]:
# Materialise each bucket's user_id index as a plain list.
q10train1500_list = list(filter1500_train.index)
q10train200_list = list(filter200_train.index)
q10train50_list = list(filter50_train.index)

In [275]:
# Encode each training user's quality_10 bucket as a single ordinal column:
#   3 -> mean >= 1500, 2 -> 200..1500, 1 -> 50..200, 0 -> in no bucket.
q10train_check = np.zeros((15000,1)) # one row per training user id, initialised to 0

# Training user ids start at 10000, so row i corresponds to user id i + 10000.
q10train_ids = np.arange(15000) + 10000

# One vectorised membership test per bucket instead of a scalar np.isin call per user.
# Buckets are written from highest to lowest; the bounds overlap at exactly 1500 and 200,
# so the later (lower) bucket wins for those means, as in the original loops.
for bucket_code, bucket_ids in ((3, q10train1500_list),
                                (2, q10train200_list),
                                (1, q10train50_list)):
    q10train_check[np.isin(q10train_ids, bucket_ids)] = bucket_code

q10train_check.shape
# quality_10 is now encoded using the 1500 / 200 / 50 thresholds

In [73]:
## Hour-of-day slot encoding
def time_encoding(x):
    """Return the hour-of-day slot for a ``YYYYMMDDHH...`` timestamp.

    The value is stringified; characters 6-7 are read as the day and 8-9 as the hour.

    Returns:
        * the hour (0-23) when the day is 1-30 and the hour is 0-23;
        * 0 for any timestamp on day 31, whatever its hour (the original code
          added this for the 31 October 23:59 rows);
        * None for any other day/hour combination.

    NOTE(review): folding every day-31 row into slot 0 discards its real hour.
    This is kept as-is so existing features do not change; confirm it is intended.
    """
    x = str(x)
    day = int(x[6:8])
    hour = int(x[8:10])
    # A direct range check replaces 24 copy-pasted comparisons inside a 30-iteration loop.
    if 1 <= day <= 30 and 0 <= hour <= 23:
        return hour
    if day == 31:
        return 0
    return None

In [234]:
## Date encoding
def day_encoding(x):
    """Encode a ``YYYYMMDD...`` timestamp as the integer ``month * 100 + day`` (e.g. 1101)."""
    stamp = str(x)
    month, day = int(stamp[4:6]), int(stamp[6:8])
    return 100 * month + day

In [323]:
def weekday_encoding(x, year=2020):
    """Map a ``month * 100 + day`` code (as produced by ``day_encoding``) to a weekday.

    Returns 0 for Monday through 6 for Sunday.

    The weekday is computed from the calendar rather than from hand-maintained
    date lists. Those lists matched the 2020 calendar but skipped some dates
    (for example 1207, 1208 and 1210), which fell through to 6 (Sunday).

    Args:
        x: integer-like date code such as 1102 for 2 November.
        year: calendar year used to resolve the weekday; defaults to 2020, the
            year the original hard-coded lists correspond to.

    Returns:
        int in 0..6. Values that do not form a valid date return 6, which is
        what the original ``else`` branch returned for any unlisted value.
    """
    try:
        month, day = divmod(int(x), 100)
        return dt.date(year, month, day).weekday()
    except (TypeError, ValueError, OverflowError):
        # Keep the original catch-all result instead of raising inside .apply().
        return 6
    

In [None]:
# Training user id range. Row offsets in the feature matrices below are computed as
# user_id - train_user_id_min, and train_user_number is their row count.
train_user_id_max = 24999
train_user_id_min = 10000
train_user_number = 15000

In [240]:
# Derive time features from the raw 'time' column.
train_err['timeslot'] = train_err['time'].apply(time_encoding)  # hour-of-day slot
train_err['day'] = train_err['time'].apply(day_encoding)  # month * 100 + day
train_err['week'] = train_err['day'].apply(weekday_encoding)  # weekday code derived from 'day'

In [325]:
# Count error rows per user and weekday code in a NumPy matrix.
# Rows: user_id - train_user_id_min. Columns: one per distinct value of train_err['week'].
week_label = train_err[['user_id','week']].values
week_check = np.zeros((train_user_number,len(train_err['week'].unique())))

for person_idx, ch in tqdm(week_label):
    # Add 1 at (row of this user, column ch - 1).
    # NOTE(review): weekday_encoding returns 0..6, so ch - 1 is -1 for code 0 and lands in the
    # last column through negative indexing. The test-set cell uses the same offset; the two
    # must stay in sync or the train and test columns will no longer line up.
    week_check[person_idx - train_user_id_min,ch - 1] += 1
week_check.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:49<00:00, 336854.45it/s]


(15000, 7)

In [75]:
# Count error rows per user and hour-of-day slot in a NumPy matrix.
# Rows: user_id - train_user_id_min. Columns: one per distinct value of train_err['timeslot'].
timeslot_label = train_err[['user_id','timeslot']].values
timeslot_check = np.zeros((train_user_number,len(train_err['timeslot'].unique())))

for person_idx, ch in tqdm(timeslot_label):
    # Add 1 at (row of this user, column ch - 1).
    # NOTE(review): time_encoding returns 0..23, so ch - 1 is -1 for slot 0 and lands in the
    # last column through negative indexing. The test-set cell uses the same offset; keep them in sync.
    timeslot_check[person_idx - train_user_id_min,ch - 1] += 1
timeslot_check.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:50<00:00, 329430.98it/s]


(15000, 24)

In [246]:
# Count error rows per user and errtype in a (train_user_number, 42) matrix.
id_error = train_err[['user_id','errtype']].values
error = np.zeros((train_user_number,42))

for person_idx, err in tqdm(id_error):
    # Add 1 at row person_idx - train_user_id_min, column err - 1.
    # NOTE(review): assumes errtype values lie in 1..42 so that err - 1 is a valid column — confirm against the data.
    error[person_idx - train_user_id_min,err - 1] += 1
error.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:49<00:00, 335207.30it/s]


(15000, 42)

In [265]:
from sklearn.preprocessing import OneHotEncoder, LabelEncoder


# Label-encode model_nm so it can be used as a numeric column index.
encoder = LabelEncoder()
encoder.fit(train_err['model_nm'].astype('|S80')) ## object column, cast to fixed-width bytes before encoding
labels_model = encoder.transform(train_err['model_nm'].astype('|S80'))

# Assign the codes directly as a column. The previous concat + rename({0: ...}) added a second
# 'labels_model' column each time the cell was re-run, which broke the two-column selection in
# the next cell. Direct assignment just overwrites the column.
# Assumes train_err still has the default positional index from read_csv, as concat did.
train_err['labels_model'] = labels_model

In [266]:
# Build a per-user indicator of which model codes appear in that user's error rows.
model_label = train_err[['user_id','labels_model']].values
model_nm = np.zeros((train_user_number,9))

for person_idx, ch in tqdm(model_label):
    # Add 1 at row person_idx - train_user_id_min, column ch - 1.
    # NOTE(review): presumably labels_model is 0-based (LabelEncoder output), so code 0 lands in the
    # last column through negative indexing; also assumes at most 9 distinct models — confirm.
    model_nm[person_idx - train_user_id_min,ch - 1] += 1
model_nm_ = (model_nm > 0).astype(int) # collapse counts to 0/1: any positive count becomes 1
model_nm_.shape

100%|██████████████████████████████████████████████████████████████████| 16554663/16554663 [00:48<00:00, 338516.05it/s]


(15000, 9)

In [328]:
# Assemble the training feature matrix by appending feature groups column-wise.
x1 = np.hstack((error, timeslot_check))  # errors + time slots
x2 = np.hstack((x1, model_nm_))          # + model indicators
x3 = np.hstack((x2, week_check))         # + weekday counts
x4 = np.hstack((x3, q10train_check))     # + quality_10 bucket

In [135]:
# Build the target vector y.
train_prob = pd.read_csv(PATH+'train_problem_data.csv')
problem = np.zeros(15000)
# Same indexing scheme as the error matrix: position user_id - 10000.
# 1 if the user appears at least once in the problem data,
# 0 otherwise.
problem[train_prob.user_id.unique()-10000] = 1 
problem.shape

(15000,)

In [329]:
# Rename for modelling
# x4      -> train_x
# problem -> train_y

train_x = x4
train_y = problem

print(train_x.shape)
print(train_y.shape)

(15000, 83)
(15000,)


In [331]:
# Train
#-------------------------------------------------------------------------------------
# Custom evaluation metric used to monitor the validation score during training.
def f_pr_auc(probas_pred, y_true):
    """Area under the precision-recall curve, returned as ``(name, value, is_higher_better)``.

    probas_pred: predicted scores.
    y_true: object exposing ``get_label()``, which supplies the true labels.
    """
    precision_pts, recall_pts, _ = precision_recall_curve(y_true.get_label(), probas_pred)
    pr_area = auc(recall_pts, precision_pts)
    return "pr_auc", pr_area, True
#-------------------------------------------------------------------------------------
# Per-fold results collected by the cross-validation loop below.
models     = []
recalls    = []
precisions = []
auc_scores   = []
# Probability cut-off used to turn predicted probabilities into 0/1 labels.
threshold = 0.5
# LightGBM parameters
params =      {
                'task' : 'train',
                'boosting_type' : 'gbdt',
                'objective'     : 'binary',
                'metric'        : 'auc',
                'learning_rate' : 0.01,
                'max_bin' : 255,
                'feature_fraction' : 0.8,
                'min_data_in_leaf'     : 10,
                'num_threads'    : 4, 
                'seed': 1015
                }
#-------------------------------------------------------------------------------------
# 10-fold cross validation (shuffled with a fixed random_state; plain KFold, not stratified)
k_fold = KFold(n_splits=10, shuffle=True, random_state=150)
for train_idx, val_idx in k_fold.split(train_x):

    # Split into training and validation subsets for this fold.
    X = train_x[train_idx]
    y = train_y[train_idx]
    valid_x = train_x[val_idx]
    valid_y = train_y[val_idx]

    d_train= lgb.Dataset(X, y)
    d_val  = lgb.Dataset(valid_x, valid_y)
    
    # Run training, evaluating on the validation fold with both 'auc' and the custom pr_auc.
    # NOTE(review): the verbose_eval / early_stopping_rounds keyword arguments were removed from
    # lgb.train in LightGBM 4.0 (replaced by callbacks); presumably this ran on 3.x — confirm the version.
    model = lgb.train(
                        params,
                        train_set       = d_train,
                        num_boost_round = 1000,
                        valid_sets      = d_val,
                        feval           = f_pr_auc,
                        verbose_eval    = 20, 
                        early_stopping_rounds = 100
                       )
    
    # Predict on the validation fold and binarise at `threshold`.
    valid_prob = model.predict(valid_x)
    valid_pred = np.where(valid_prob > threshold, 1, 0)
    
    # Fold scores: recall and precision use the binarised labels, AUC uses the raw probabilities.
    recall    = recall_score(    valid_y, valid_pred)
    precision = precision_score( valid_y, valid_pred)
    auc_score = roc_auc_score(   valid_y, valid_prob)

    # Keep the fold model and its scores.
    # NOTE(review): `models` is not referenced again in the cells visible here.
    models.append(model)
    recalls.append(recall)
    precisions.append(precision)
    auc_scores.append(auc_score)

    print('==========================================================')

[LightGBM] [Info] Number of positive: 4543, number of negative: 8957
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 12574
[LightGBM] [Info] Number of data points in the train set: 13500, number of used features: 82
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.336519 -> initscore=-0.678848
[LightGBM] [Info] Start training from score -0.678848
Training until validation scores don't improve for 100 rounds
[20]	valid_0's auc: 0.798098	valid_0's pr_auc: 0.705138
[40]	valid_0's auc: 0.801451	valid_0's pr_auc: 0.709018
[60]	valid_0's auc: 0.802727	valid_0's pr_auc: 0.709252
[80]	valid_0's auc: 0.803864	valid_0's pr_auc: 0.710068
[100]	valid_0's auc: 0.804924	valid_0's pr_auc: 0.711959
[120]	valid_0's auc: 0.805857	valid_0's pr_auc: 0.715664
[140]	valid_0's auc: 0.806436	valid_0's pr_auc: 0.717272
[160]	valid_0's auc: 0.807257	valid_0's pr_auc: 0.717577
[180]	valid_0's auc: 0.809104	val

In [332]:
# Mean validation ROC-AUC across the folds.
print(np.mean(auc_scores))

0.8148995102960628


In [195]:
# Convert test_quality['quality_10'] to integers with string2num.
test_quality['quality_10'] = test_quality['quality_10'].apply(string2num)

# Per-user mean of quality_10, split into the same three buckets as the training data.
# Bounds are inclusive on both sides, so a mean of exactly 1500 or 200 belongs to two buckets.
q10test_mean = test_quality.groupby('user_id')['quality_10'].mean()
filter1500_test = q10test_mean.loc[q10test_mean.ge(1500)]
filter200_test = q10test_mean.loc[q10test_mean.between(200, 1500)]
filter50_test = q10test_mean.loc[q10test_mean.between(50, 200)]

In [196]:
# Materialise each test bucket's user_id index as a plain list.
# These must come from the *test* filters. The original read filter*_train, whose ids
# (training users) can never match the test ids checked below, so the test feature was always 0.
q10test1500_list = list(filter1500_test.index)
q10test200_list = list(filter200_test.index)
q10test50_list = list(filter50_test.index)

In [299]:
# Encode each test user's quality_10 bucket as a single ordinal column:
#   3 -> mean >= 1500, 2 -> 200..1500, 1 -> 50..200, 0 -> in no bucket.
q10test_check = np.zeros((14999,1)) # one row per test user id, initialised to 0

# Row i corresponds to test user id i + 30000.
q10test_ids = np.arange(14999) + 30000

# One vectorised membership test per bucket instead of a scalar np.isin call per user.
# Buckets are written from highest to lowest; the bounds overlap at exactly 1500 and 200,
# so the later (lower) bucket wins for those means, as in the original loops.
for bucket_code, bucket_ids in ((3, q10test1500_list),
                                (2, q10test200_list),
                                (1, q10test50_list)):
    q10test_check[np.isin(q10test_ids, bucket_ids)] = bucket_code

q10test_check.shape

(14999, 1)

In [300]:
# According to the data description,
# test user_ids run from 30000 to 44998, 14999 users in total.
test_user_id_max = 44998
test_user_id_min = 30000
test_user_number = 14999

In [301]:
# Derive the same time features for the test error log.
test_err['timeslot'] = test_err['time'].apply(time_encoding)  # hour-of-day slot
test_err['day'] = test_err['time'].apply(day_encoding)  # month * 100 + day
test_err['week'] = test_err['day'].apply(weekday_encoding)  # weekday code derived from 'day'

In [334]:
# Count test error rows per user and weekday code in a NumPy matrix.
# Rows: user_id - test_user_id_min. Columns: one per distinct value of test_err['week'].
week_label_test = test_err[['user_id','week']].values
week_check_test = np.zeros((test_user_number,len(test_err['week'].unique())))

for person_idx, ch in tqdm(week_label_test):
    # Add 1 at (row of this user, column ch - 1).
    # NOTE(review): weekday_encoding returns 0..6, so ch - 1 is -1 for code 0 and lands in the
    # last column through negative indexing. The training cell uses the same offset; keep them in sync.
    week_check_test[person_idx - test_user_id_min,ch - 1] += 1
week_check_test.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:49<00:00, 333054.93it/s]


(14999, 7)

In [336]:
# Count test error rows per user and hour-of-day slot in a NumPy matrix.
# Rows: user_id - test_user_id_min. Columns: one per distinct value of test_err['timeslot'].
timeslot_label_test = test_err[['user_id','timeslot']].values
timeslot_check_test = np.zeros((test_user_number,len(test_err['timeslot'].unique())))

for person_idx, ch in tqdm(timeslot_label_test):
    # Add 1 at (row of this user, column ch - 1).
    # NOTE(review): time_encoding returns 0..23, so ch - 1 is -1 for slot 0 and lands in the
    # last column through negative indexing. The training cell uses the same offset; keep them in sync.
    timeslot_check_test[person_idx - test_user_id_min,ch - 1] += 1
timeslot_check_test.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:52<00:00, 314936.05it/s]


(14999, 24)

In [305]:
# Count test error rows per user and errtype in a (test_user_number, 42) matrix.
id_error_test = test_err[['user_id','errtype']].values
error_test = np.zeros((test_user_number,42))

for person_idx, err in tqdm(id_error_test):
    # Add 1 at row person_idx - test_user_id_min, column err - 1.
    # NOTE(review): assumes errtype values lie in 1..42 so that err - 1 is a valid column — confirm against the data.
    error_test[person_idx - test_user_id_min,err - 1] += 1
error_test.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:49<00:00, 332482.13it/s]


(14999, 42)

In [308]:
# Label-encode model_nm for the test error log so it can be used as a numeric column index.
# NOTE(review): the encoder is re-fitted on the test data, so its codes match the training codes
# only if both sets contain exactly the same model_nm values — confirm.
encoder = LabelEncoder()
encoder.fit(test_err['model_nm'].astype('|S80')) ## object column, cast to fixed-width bytes before encoding
labels_model_test = encoder.transform(test_err['model_nm'].astype('|S80'))

# Assign the codes directly as a column. The previous concat + rename({0: ...}) added a second
# 'labels_model_test' column each time the cell was re-run, which broke the two-column selection
# in the next cell. Direct assignment just overwrites the column.
# Assumes test_err still has the default positional index from read_csv, as concat did.
test_err['labels_model_test'] = labels_model_test

In [309]:
# Build a per-user indicator of which model codes appear in that test user's error rows.
model_label_test = test_err[['user_id','labels_model_test']].values
model_nm_test = np.zeros((test_user_number,9))

for person_idx, ch in tqdm(model_label_test):
    # Add 1 at row person_idx - test_user_id_min, column ch - 1.
    # NOTE(review): presumably labels_model_test is 0-based (LabelEncoder output), so code 0 lands in
    # the last column through negative indexing; also assumes at most 9 distinct models — confirm.
    model_nm_test[person_idx - test_user_id_min,ch - 1] += 1
model_nm_test_ = (model_nm_test > 0).astype(int) # collapse counts to 0/1: any positive count becomes 1
model_nm_test_.shape

100%|██████████████████████████████████████████████████████████████████| 16532648/16532648 [00:48<00:00, 342679.79it/s]


(14999, 9)

In [337]:
# Assemble the test feature matrix in the same column order as the training matrix.
x1_test = np.hstack((error_test, timeslot_check_test))  # errors + time slots
x2_test = np.hstack((x1_test, model_nm_test_))          # + model indicators
x3_test = np.hstack((x2_test, week_check_test))         # + weekday counts
x4_test = np.hstack((x3_test, q10test_check))           # + quality_10 bucket

In [340]:
from pycaret.classification import *

In [341]:
# Wrap the NumPy feature matrix and target in DataFrames.
# Note: X and y rebind the names used for the fold subsets in the LightGBM training loop.
X = pd.DataFrame(x4)
y = pd.DataFrame(problem)

In [342]:
# Add the target as a column of X so that it can be referenced by name in setup(target=...).
X['problem'] = problem

In [343]:
# Initialise the PyCaret experiment with 'problem' as the target.
# session_id pins the random seed so that re-runs are reproducible; 5946 is the seed shown in
# this notebook's recorded setup output, which was previously chosen at random on every run.
clf = setup(data = X, target = 'problem', session_id = 5946)

Unnamed: 0,Description,Value
0,session_id,5946
1,Target,problem
2,Target Type,Binary
3,Label Encoded,"0.0: 0, 1.0: 1"
4,Original Data,"(15000, 84)"
5,Missing Values,False
6,Numeric Features,74
7,Categorical Features,9
8,Ordinal Features,False
9,High Cardinality Features,False


In [40]:
#best_5 = compare_models(sort = 'Accuracy', n_select = 5)

In [None]:
# Compare candidate models and keep the top 3 ranked by AUC.
# NOTE(review): the recorded output contains an XGBoost library load failure raised in a worker
# process; check the environment before relying on this comparison.
best_3 = compare_models(sort = 'AUC', n_select = 3)

IntProgress(value=0, description='Processing: ', max=79)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7807,0.8089,0.4766,0.7884,0.5932,0.4556,0.4831,0.649
gbc,Gradient Boosting Classifier,0.7803,0.8044,0.4633,0.8002,0.5855,0.4505,0.4823,2.093
rf,Random Forest Classifier,0.7684,0.7973,0.4333,0.7805,0.5564,0.4169,0.4503,0.972
ada,Ada Boost Classifier,0.7649,0.7866,0.4676,0.7386,0.5718,0.4211,0.4424,0.434
lda,Linear Discriminant Analysis,0.7627,0.7628,0.3738,0.8253,0.5138,0.3854,0.4385,0.07
lr,Logistic Regression,0.7479,0.7566,0.3704,0.7552,0.4962,0.3538,0.3939,1.435
qda,Quadratic Discriminant Analysis,0.7384,0.7244,0.3656,0.7216,0.483,0.3329,0.3682,0.054
nb,Naive Bayes,0.6924,0.7139,0.1699,0.6671,0.2698,0.1553,0.2138,0.026
dt,Decision Tree Classifier,0.671,0.6354,0.5268,0.5105,0.5182,0.2686,0.2689,0.152
knn,K Neighbors Classifier,0.6567,0.6254,0.2928,0.483,0.3644,0.1483,0.1571,0.739


exception calling callback for <Future at 0x1ec96fac520 state=finished raised BrokenProcessPool>
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "D:\Anaconda\lib\site-packages\joblib\externals\loky\process_executor.py", line 404, in _process_worker
    call_item = call_queue.get(block=True, timeout=timeout)
  File "D:\Anaconda\lib\multiprocessing\queues.py", line 116, in get
    return _ForkingPickler.loads(res)
  File "D:\Anaconda\lib\site-packages\xgboost\__init__.py", line 9, in <module>
    from .core import DMatrix, DeviceQuantileDMatrix, Booster
  File "D:\Anaconda\lib\site-packages\xgboost\core.py", line 174, in <module>
    _LIB = _load_lib()
  File "D:\Anaconda\lib\site-packages\xgboost\core.py", line 157, in _load_lib
    raise XGBoostError(
xgboost.core.XGBoostError: XGBoost Library (xgboost.dll) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed (vcomp140.dll or libgomp-1.dll for Windows, libomp.dy

In [42]:
#blended_5 = blend_models(estimator_list = best_5, fold = 10, method = 'soft')

In [345]:
# Blend the three selected models with soft voting, evaluated over 10 folds.
blended_3 = blend_models(estimator_list = best_3, fold = 10, method = 'soft')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,0.779,0.8219,0.4576,0.802,0.5827,0.4473,0.4799
1,0.7876,0.8082,0.4504,0.8457,0.5878,0.4621,0.5037
2,0.8095,0.8312,0.5637,0.8122,0.6656,0.5384,0.5559
3,0.7857,0.8018,0.4873,0.7963,0.6046,0.469,0.4957
4,0.7886,0.8229,0.4958,0.7991,0.6119,0.4773,0.503
5,0.7886,0.8129,0.5014,0.7937,0.6146,0.4789,0.5029
6,0.7705,0.8003,0.4504,0.7718,0.5689,0.4269,0.4556
7,0.779,0.8001,0.4504,0.8071,0.5782,0.4444,0.4791
8,0.7962,0.8293,0.4986,0.8263,0.6219,0.4938,0.5234
9,0.7874,0.8297,0.4448,0.8533,0.5847,0.4603,0.5044


In [346]:
# Score the blended model without passing data (presumably on PyCaret's hold-out split — confirm).
pred_holdout = predict_model(blended_3)

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Voting Classifier,0.7976,0.8169,0.4935,0.8128,0.6141,0.4878,0.5157


In [347]:
# Finalise the blended model before predicting on the test set
# (presumably refits on all of the setup data — confirm in the PyCaret documentation).
final_model = finalize_model(blended_3)

In [348]:
# Wrap the test feature matrix in a DataFrame for predict_model.
X_test = pd.DataFrame(x4_test)

In [349]:
predictions = predict_model(final_model, data = X_test)

# PyCaret's 'Score' is the probability of the *predicted* label, so convert it to the
# probability of the positive class expected by the submission format:
#   predicted label 1 -> Score, predicted label 0 -> 1 - Score.
# Labels are compared numerically so that '1.0', 1 and 1.0 all count as positive; the previous
# string comparison against '1.0' treated every row as negative if the labels were numeric.
predicted_positive = predictions['Label'].astype(float).values == 1.0
predicted_score = predictions['Score'].values
x = np.where(predicted_positive, predicted_score, 1 - predicted_score).tolist()


In [351]:
# Fill the sample submission's 'problem' column with the predicted probabilities and save it.
# (The variable name keeps its original spelling, `sample_submssion`.)
sample_submssion = pd.read_csv(PATH+'sample_submission.csv')
sample_submssion['problem'] = x
sample_submssion.to_csv("best_3.csv", index = False)