### Install / Import 모듈 

In [1]:
# Install the libraries that are not preinstalled on Colab
# !pip install bayesian-optimization

In [2]:
import os
import pandas as pd                         # 데이터 분석 라이브러리
import numpy as np                          # 계산 라이브러리
import matplotlib.pyplot as plt             # * 그래프 이미지

from tqdm import tqdm                       # 진행바

from sklearn.metrics import roc_auc_score   # AUC 스코어 계산
from sklearn.model_selection import KFold   # K-fold CV    
from bayes_opt import BayesianOptimization  # 베이지안 최적화 라이브러리  
from functools import partial               # 함수 변수 고정
import lightgbm as lgb                      # LightGBM 라이브러리
import warnings           
import random                       
warnings.filterwarnings("ignore")           # 경고 문구 미표시

In [3]:
# Paths and run configuration.
# Original note: the data set is large, so loading takes about 5 minutes.
# On Colab the base directory was
#   '/content/drive/My Drive/Colab Notebooks/competition/'
# and the raw training file was
#   <base>/c03_starcraft_prediction/data_raw/train.csv

# Project root on the local machine (trailing slash is required because paths
# are built by plain string concatenation).
dir_base = '/home/yk/0325_Starcraft/competition/'

# Sub-directories relative to dir_base: raw inputs, preprocessed features,
# and generated submission files.
raw = 'c03_starcraft_prediction/data_raw/'
remake = 'c03_starcraft_prediction/data_remake/'
submit = 'c03_starcraft_prediction/data_submit/'

# Bayesian-optimization budget: number of initial random probes and number
# of subsequent optimization steps.
init_points = 15
n_iter = 75

# Naming convention for submissions: WAY-01 = LGBM, WAY-02 = XGBM
filename_submit = f"submission_0414_way02_{init_points}_{n_iter}_final.csv"

### 사용함수의 정의

In [4]:
# Change the working directory to the project root before importing the
# local `_assets` package (presumably so that the package is importable).
os.chdir(dir_base)
# NOTE(review): star imports hide where names come from. Later cells call
# show_ls and XGB_cv, which presumably come from these modules - confirm and
# consider importing them explicitly.
from _assets.modules import *
from _assets.module_data_preps import *
from _assets.module_xgbm_model import *


# Helper module for browsing OS files and inspecting DataFrame info


# Module for preprocessing the data per unique GAME_ID - from the competition baseline


# XGBM_CV model - based on code from the SongDo study group



Using TensorFlow backend.


### Data 전처리 및 저장하기
- 미리 전처리해서 저장해 둔 x_train, y_train 을 불러와 내용을 확인한다.

In [5]:
# Check the file location / directory listing ... helper function
# show_ls('/content')          # check that Drive is mounted!
show_ls(dir_base + remake)      # check the names of the data files to load!

False
DIR_TARGET=/home/yk/0325_Starcraft/competition
/home/yk/0325_Starcraft/competition/c03_starcraft_prediction/data_remake/
----------------------------------------
  01. .ipynb_checkpoints
  02. accumujated_winning_rate_per_time.csv
  03. accumulated_winning_rate_per_time.csv
  04. df_new_rate04_correct.csv
  05. df_win_rate_every_10sec.csv
  06. df_win_rate_every_20sec.csv
  07. df_win_rate_every_40sec.csv
  08. df_xtest_remake.csv
  09. df_xtest_remake_final.csv
  10. df_xtrain_remake.csv
  11. df_xtrain_remake_final.csv
  12. df_ytrain_remake.csv
  13. xtra_remake_xtest.csv
  14. xtra_remake_xtest_final.csv
  15. xtra_remake_xtrain.csv
  16. xtra_remake_xtrain_final.csv


In [6]:
%%time
""" df_xtrain_remake_final.csv / ytrain 값은 변함없음 (동일하게 사용)
# 데이터 전처리 과정 - 약 7 ~ 10분 정도 걸립니다 ... 여기서 세션 리셋 됨!
# x_train, y_train = data_preparation(train, answer=True)           # [ 67,091,776 x 7 ] ... 70%
# show_infoDF_from(x_train) # *** DATA SHAPE = [ 38,872 x 27 ] ... [ game_id ... ]
# show_infoDF_from(y_train) # *** DATA SHAPE = [ 38,872 x 3 ] .. ['Unnamed: 0', 'game_id', 'winner']
"""

# English summary of the note above: the preprocessing step (about 7-10 minutes,
# and it reset the session) is skipped; previously saved CSV files are loaded
# instead, and the y_train values are reused unchanged.
# NOTE(review): the note mentions 'df_xtrain_remake_final.csv' but the file read
# below is 'xtra_remake_xtrain_final.csv' - confirm which one is intended.
x_train = pd.read_csv(dir_base + remake + 'xtra_remake_xtrain_final.csv')  # pandas.DataFrame of training features
ytrain = pd.read_csv(dir_base + remake + 'df_ytrain_remake.csv')  # pandas.DataFrame (read_csv result), holds the 'winner' column
y_train = ytrain['winner'].values                                 # numpy.ndarray of labels, e.g. array([1, 1, 0, ..., 0, 1, 0])

CPU times: user 2.51 s, sys: 168 ms, total: 2.68 s
Wall time: 2.68 s


In [7]:
# Count how many training samples fall into each 'winner' class.
pd.Series(y_train).value_counts()
# 1    19499
# 0    19373            total = 38,872 (y_train is a NumPy array of length 38,872)

1    19499
0    19373
dtype: int64

# XGBM Classifier 모델 적용
 - 송도 스터디그룹에서 유창준님 - XGBM 모델 코드 공유받음!

In [8]:
%%time
# 모델 그 외 변수는 고정
var_fixed = partial(
        XGB_cv, 
        x_data=x_train, 
        y_data=y_train, 
        n_splits=5, 
        output='score'
    )

# 베이지안 최적화 범위 설정
XGBo = BayesianOptimization(
        var_fixed, 
        {
            'max_depth': (8, 512),          # ~ 500
            'learning_rate': (0.1, 0.6),    # 0.6xxx
            'subsample': (0.5, 0.9),        # 0.5 ~ 0.7
            'colsample_bytree': (0.5, 1),   # 0.5 ~ 0.8
            # 'colsample_bynode':(0, 1),
            # 'n_estimators' : (16, 1024),
            'reg_alpha' : (7, 20),          # L1 - 7 ~ 20
            'reg_lambda' : (8, 50),         # L2 - 8 ~ 45
            'max_delta_step' : (1, 25),     # 8.x
            'gamma' : (5, 20),              # 1 ~ 15
        },
        random_state = random.randrange(50000) # 유동 시드 1~50000
    )

# XGBo.maximize(init_points=15, n_iter=60)  # 처음 15회 랜덤값으로 score 계산 후 60회 최적화 = 4/04(토)
XGBo.maximize(init_points, n_iter)

|   iter    |  target   | colsam... |   gamma   | learni... | max_de... | max_depth | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.6919  [0m | [0m 0.608   [0m | [0m 5.726   [0m | [0m 0.1729  [0m | [0m 17.66   [0m | [0m 277.7   [0m | [0m 7.307   [0m | [0m 25.22   [0m | [0m 0.746   [0m |
| [0m 2       [0m | [0m 0.6755  [0m | [0m 0.7656  [0m | [0m 10.9    [0m | [0m 0.1279  [0m | [0m 4.735   [0m | [0m 472.8   [0m | [0m 12.22   [0m | [0m 40.72   [0m | [0m 0.833   [0m |
| [0m 3       [0m | [0m 0.6675  [0m | [0m 0.6242  [0m | [0m 16.92   [0m | [0m 0.3953  [0m | [0m 19.31   [0m | [0m 257.3   [0m | [0m 14.99   [0m | [0m 25.23   [0m | [0m 0.8941  [0m |
| [0m 4       [0m | [0m 0.6761  [0m | [0m 0.6211  [0m | [0m 8.51    [0m | [0m 0.3618  [0m | [0m 19.16   [0m | [0m 19.49   [0m | [0m 15.

| [0m 41      [0m | [0m 0.695   [0m | [0m 0.52    [0m | [0m 5.404   [0m | [0m 0.1424  [0m | [0m 19.45   [0m | [0m 42.79   [0m | [0m 7.411   [0m | [0m 44.06   [0m | [0m 0.8574  [0m |
| [0m 42      [0m | [0m 0.6939  [0m | [0m 0.5418  [0m | [0m 6.113   [0m | [0m 0.1755  [0m | [0m 24.08   [0m | [0m 245.1   [0m | [0m 8.634   [0m | [0m 23.32   [0m | [0m 0.7908  [0m |
| [0m 43      [0m | [0m 0.6917  [0m | [0m 0.6809  [0m | [0m 6.418   [0m | [0m 0.2798  [0m | [0m 24.59   [0m | [0m 69.72   [0m | [0m 7.136   [0m | [0m 48.98   [0m | [0m 0.8201  [0m |
| [0m 44      [0m | [0m 0.6859  [0m | [0m 0.5994  [0m | [0m 5.119   [0m | [0m 0.1404  [0m | [0m 24.4    [0m | [0m 499.0   [0m | [0m 19.72   [0m | [0m 39.0    [0m | [0m 0.7448  [0m |
| [0m 45      [0m | [0m 0.6844  [0m | [0m 0.5596  [0m | [0m 5.525   [0m | [0m 0.3702  [0m | [0m 21.3    [0m | [0m 40.88   [0m | [0m 7.354   [0m | [0m 40.8    [0m | [0m 0.743

| [0m 82      [0m | [0m 0.69    [0m | [0m 0.6197  [0m | [0m 5.256   [0m | [0m 0.2067  [0m | [0m 18.75   [0m | [0m 315.6   [0m | [0m 7.358   [0m | [0m 9.095   [0m | [0m 0.7496  [0m |
| [0m 83      [0m | [0m 0.689   [0m | [0m 0.8333  [0m | [0m 5.148   [0m | [0m 0.2618  [0m | [0m 2.184   [0m | [0m 139.9   [0m | [0m 8.473   [0m | [0m 8.987   [0m | [0m 0.8505  [0m |
| [0m 84      [0m | [0m 0.6939  [0m | [0m 0.7367  [0m | [0m 5.79    [0m | [0m 0.1776  [0m | [0m 24.31   [0m | [0m 93.17   [0m | [0m 8.526   [0m | [0m 49.27   [0m | [0m 0.8587  [0m |
| [0m 85      [0m | [0m 0.6901  [0m | [0m 0.5537  [0m | [0m 7.732   [0m | [0m 0.1115  [0m | [0m 1.216   [0m | [0m 179.5   [0m | [0m 7.814   [0m | [0m 23.16   [0m | [0m 0.834   [0m |
| [0m 86      [0m | [0m 0.6934  [0m | [0m 0.805   [0m | [0m 6.482   [0m | [0m 0.1303  [0m | [0m 3.831   [0m | [0m 302.1   [0m | [0m 11.1    [0m | [0m 8.14    [0m | [0m 0.820

### 3.0 모델 학습 및 검증
- Model Tuning & Evaluation

> 1. AUC가 가장 높은 하이퍼 파라미터를 사용해 최종 모델을 얻습니다.
> 1. 훈련 세트와 같은 방법으로 테스트 세트에서 Feature를 추출합니다.
> 1. 최종 모델을 사용해 예측을 수행합니다.
> 1. 예측 결과를 submission.csv로 저장합니다.

In [9]:
# Retrain with the best hyperparameters found by the Bayesian optimization.
# (An earlier LightGBM variant of this cell used lgbBO.max['params'] with
# lgb_cv in the same way.)
#
# XGBo.max['params'] is a dict keyed by the names given in the search bounds.
# The optimizer evaluated XGB_cv by passing exactly these names as keyword
# arguments, so unpacking the dict reproduces the same argument binding and
# does not depend on the positional order of XGB_cv's signature.
params = XGBo.max['params']
models = XGB_cv(
    **params,
    x_data=x_train,
    y_data=y_train,
    n_splits=5,
    output='model',   # the result is iterated as a collection of fitted models in the prediction cell
)

# TEST 예측을 위해, 테스트 데이터를 학습 데이터와 같은 형태로 전처리 (data_preps)
 1. Train Data set = [ 67,091,776 x 7 ] ... 70%
  - game_id 별로 정리하면 [38,872 x 27]로 압축된다. 
  - 경기당, 평균 1,726개의 이벤트가 존재한다.(압축률 0.057939 %)
 > - x_train set = *** DATA SHAPE = [ 38,872 x 27 ] pandas.DataFrame
 > - y_train set = *** DATA SHAPE = [ 38,872 x 1 ]  np.array
 
 1. Test Data set  = [ 28,714,849 x 6 ] ... 30%
  - game_id 별로 정리하면 [16,787 x 27] 로 압축된다. 
  - 경기당 평균 1711개의 이벤트가 존재한다.(압축률 0.058461 %)
 > - x_test set = *** DATA SHAPE = [16,787 x 27] pandas.DataFrame 
 > - y_test set = *** DATA SHAPE = [16,787 x 1] np.array =  대회주관자가 가지고 있음
 
 1. Submission = y_test set 
  - submission = [16,787 x 1] np.array to [16,787 x 2 ] pandas.DataFrame --> df.csv

In [10]:
"""
# 테스트용 데이터를 전처리 해서 분석 준비 ... (시간소요) 미리 저장한 화일을 불러온다
# 저장했으면 비활성 처리
# x_test, _ = data_preparation(test, answer=False)                 # [ 28,714,849 x 6 ] ... 30%
# x_test.to_csv(dir_base + remake + 'df_xtest_remake.csv')
# x_test = pd.read_csv(dir_base + remake + 'df_xtest_remake.csv')  # [  16,787 x 27 ]
"""
# English summary of the note above: preprocessing the test data is slow, so the
# preprocessed result was saved once and is loaded from disk here; the
# preprocessing calls stay disabled once the file exists.
x_test = pd.read_csv(dir_base + remake + 'xtra_remake_xtest_final.csv')    # per the original note, expected shape is [ 16,787 x 27 ]


# 예측결과 Submission 화일 만들기

In [11]:
# Earlier experiments (disabled):
#   pd.read_csv('data/sample_submission.csv', index_col=0)
#   sample_submission = pd.read_csv(dir_base + sub_base + "sample_submission.csv")

# Probability of class 1 from every cross-validation model, one array per model.
preds = [fold_model.predict_proba(x_test)[:, 1] for fold_model in models]

# Ensemble by taking the element-wise mean over the models.
pred = np.mean(preds, axis=0)

In [12]:
# List the raw data directory to confirm that sample_submission.csv is present.
show_ls(dir_base + raw)

False
DIR_TARGET=/home/yk/0325_Starcraft/competition/c03_starcraft_prediction/data_remake
/home/yk/0325_Starcraft/competition/c03_starcraft_prediction/data_raw/
----------------------------------------
  01. sample_submission.csv
  02. test.csv
  03. train.csv


In [13]:
# Locations of the sample submission (template) and of the file to write.
sample_path = dir_base + raw + "sample_submission.csv"
submit_path = dir_base + submit + filename_submit

# Use the sample submission as a template and fill in the averaged
# class-1 probabilities as the 'winner' column.
submission = pd.read_csv(sample_path)
submission['winner'] = pred

# Write without the DataFrame index.
submission.to_csv(submit_path, index=False)

# 결과 확인하기
- 3월 데이콘 대회 홈페이지 = https://bit.ly/39bqWVg
- 결과 제출하기 = https://dacon.io/competitions/official/235583/mysubmission/

In [14]:
# Alias the submission frame and display it as the cell output.
df_1 = submission
df_1

Unnamed: 0,game_id,winner
0,38872,0.523022
1,38873,0.641441
2,38874,0.426662
3,38875,0.195388
4,38876,0.672514
...,...,...
16782,55654,0.645065
16783,55655,0.210758
16784,55656,0.810839
16785,55657,0.513253
