In [1]:
# 라이브러리 설치
import warnings                                      # 경고 메세지 무시
warnings.filterwarnings('ignore')
import pandas as pd                                  # 데이터 조작, 분석
import numpy as np                                   # 행렬 연산
import random                                        # 난수 생성
random.seed(2020)
random_seed = 2020
import time                                          # 시간 측정
import re                                            # 정규표현식

from sklearn.model_selection import train_test_split # train, validation 데이터 나누기
from sklearn import metrics                          # AUC 측정
!pip install catboost
from catboost import CatBoostClassifier, Pool        # CatBoost 모델링
import lightgbm as lgb                               # lightGBM 모델링
from sklearn.model_selection import KFold            # K-fold CV    
!pip install bayesian-optimization
from bayes_opt import BayesianOptimization           # 베이지안 최적화 라이브러리  
from functools import partial                        # 함수 변수 고정



In [33]:
# 데이터 불러오기
train = pd.read_csv("./data/train.csv")
test = pd.read_csv("./data/test.csv")

In [56]:
# 반응변수 전처리
def preprocess_y(data, exchange_player=False):
    y = data.drop_duplicates(['game_id', 'winner']).winner.reset_index(drop=True)
    if (exchange_player == True):
        y = y.append(-(y - 1)).reset_index(drop=True)
    return y

# 설명변수 전처리
def preprocess_X(data, exchange_player=False):
    print("start")
    # game_id 개수만큼의 index를 가진 DataFrame X 생성
    n = data.game_id.max() + 1
    X = pd.DataFrame(index=range(n)[data.game_id.min():])
    
    # time 변수
    X['time'] = data.drop_duplicates(['game_id'],keep='last').set_index('game_id').time
    X['time'] = (X.time*100//100*60 + X.time*100%100)

    # species 더미 변수
    X = pd.concat([pd.get_dummies(data[data.player == 0].drop_duplicates(['game_id']).set_index('game_id').species).rename(columns={'P':'0_protoss','T':'0_terran','Z':'0_zerg'}),
                pd.get_dummies(data[data.player == 1].drop_duplicates(['game_id']).set_index('game_id').species).rename(columns={'P':'1_protoss','T':'1_terran','Z':'1_zerg'}),
                X],axis=1)
  
    # event 카운트
    contents = data.loc[:,['player','game_id','time']].groupby(['player', 'game_id']).count().unstack(level=0)
    contents.columns = ['0_event', '1_event']
    X['0_event'], X['1_event'] = contents['0_event'], contents['1_event']

    # event 카운트 / time
    X['0_event_per_sec'], X['1_event_per_sec'] = X['0_event'] /X.time, X['1_event'] /X.time

    # event == Ability, AddToControlGroup, Camera, ControlGroup, GetControlGroup, Right Click, Selection, SetControlGroup일 때 각각 카운트
    contents = data.loc[:,['player','event','game_id','time']].groupby(['player', 'event', 'game_id']).count().unstack(level=[0,1]).fillna(0).astype(int)
    contents.columns = ['0_'+x for x in sorted(data.event.unique())] + ['1_'+x for x in sorted(data.event.unique())]
    for i in contents.columns:
        X[i] = contents[i]

    # event == Camera일 때 event_contents의 2차원 좌표 간 euclidean distance sum, min, median, max
    def move_sum(i):
        return sum(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    def move_min(i):
        if len(i) == 1:
            return 0
        return min(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    def move_median(i):
        if len(i) == 1:
            return 0
        return np.median(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 + 
                             np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    def move_max(i):
        if len(i) == 1:
            return 0
        return max(np.sqrt(np.diff(i.map(lambda x: x[4:x.find(',')]).astype(float)) **2 +
                       np.diff(i.map(lambda x: x[x.find(',')+2:len(x)-1]).astype(float)) **2))
    
    contents = (data[data.event == 'Camera'].loc[:,['player','game_id','event_contents']].
              groupby(['player','game_id'])).agg([move_sum,move_min,move_median,move_max]).unstack(level=0)
    contents.columns = [y+x for x in ['sum','min','median','max'] for y in ['0_move_','1_move_']]
    for i in contents.columns:
        X[i] = contents[i].fillna(0)

    # 30초 이내 move_sum
    contents = (data[(data.time < 0.3) & (data.event == 'Camera')].loc[:,['player','game_id','event_contents']].
              groupby(['player','game_id'])).agg(move_sum).unstack(level=0)
    contents.columns = ['0_move_sum_30sec','1_move_sum_30sec']
    for i in contents.columns:
        X[i] = contents[i]

    # event == Ability일 때 event_contents 더미 변수 생성, 카운트
    contents = pd.DataFrame(data.event_contents[(data.event == 'Ability')].map(lambda x: x[x.find('(')+1:x.find(')')]))  # event_contents의 16진수 코드만 추출
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i] = contents_X[i]
        X[i] = X[i].fillna(0).astype(int)

    # event == Ability일 때 event_contents 더미 변수 생성 / time
    for i in contents_X.columns:
        X[i+'_div_time'] = X[i] /X.time

    # event == Selection일 때 event_contents 더미 변수 생성, 카운트
    contents = data[data.event == 'Selection'].event_contents.map(lambda x: re.sub('\s\[.....\]', '', re.sub('\s\[......\]', '', re.sub('\s\[.......\]', '', x))).
                                                                replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
    contents = contents.str.split(',')
    max_num = max(contents.map(lambda x: len(x)))
    t = [0 for x in range(max_num)]
    for i in range(max_num):
        t[i] = pd.DataFrame(contents[contents.map(lambda x: len(x) > i)].map(lambda x: x[i]))
    contents = pd.concat([t[i] for i in range(max_num)])
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i] = contents_X[i]
        X[i] = X[i].fillna(0).astype(int)

    # event == Selection일 때 event_contents 더미 변수 생성 / time
    for i in contents_X.columns:
        X[i+'_div_time'] = X[i] /X.time

    # 30초 이내 event == Selection일 때 event_contents 더미 변수 생성, 카운트
    contents = data[(data.time < 0.3) & (data.event == 'Selection')].event_contents.map(lambda x: re.sub('\s\[.....\]', '', re.sub('\s\[......\]', '', re.sub('\s\[.......\]', '', x))).
                                    replace('[', '').replace(']', '').replace(' ', '').replace('\'', ''))
    contents = contents.str.split(',')
    max_num = max(contents.map(lambda x: len(x)))
    t = [0 for x in range(max_num)]
    for i in range(max_num):
        t[i] = pd.DataFrame(contents[contents.map(lambda x: len(x) > i)].map(lambda x: x[i]))
    contents = pd.concat([t[i] for i in range(max_num)])
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_','1_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i+'_30sec'] = contents_X[i]
        X[i+'_30sec'] = X[i+'_30sec'].fillna(0).astype(int)

    # event == Right Click일 때 Target 이름 더미 변수 생성, 카운트
    contents = pd.DataFrame(data.event_contents[(data.event == 'Right Click') & (data.event_contents.map(lambda x: str(x)[:6]) == 'Target')].map(lambda x: x[x.find(':')+2:x.find(' [')]))  # event_contents의 Target만 추출
    contents['game_id'], contents['player'], contents['count'] = data.game_id, data.player, 1
    contents_X = pd.DataFrame(columns=[x+y for x in ['0_Target_','1_Target_'] for y in contents.event_contents.unique()])
    contents = contents.groupby(['player','event_contents','game_id']).count().unstack(level=[0,1])
    contents.columns = contents.columns.map(lambda x: str(x[1])+'_Target_'+x[2])
    contents_X = pd.concat([contents_X, contents])
    for i in contents_X.columns:
        X[i] = contents_X[i]
        X[i] = X[i].fillna(0).astype(int)

    # 컬럼 이름 순서로 정렬
    X = X[sorted(X.columns)]

    # player 0,1 자리 바꾼 X1생성, X와 행 병합해 데이터 2배로 만들기
    if (exchange_player == True):
        c = X.shape[1]//2
        X1 = X.copy()
        X1.columns = list(X.columns[c:2*c])+list(X.columns[:c])+['time']
        X1.index = [x+n for x in range(n)]
        X = pd.concat([X, X1])

    return X

In [60]:
# train, test 전처리 수행, y(반응변수), X,test_X(설명변수) 생성
y = preprocess_y(train, True)
X = preprocess_X(train, True)
test_X = preprocess_X(test, False)

# X, test_X에만 있는 컬럼 삭제
X.drop(set(X.columns) - set(test_X.columns), axis=1, inplace=True)
test_X.drop(set(test_X.columns) - set(X.columns), axis=1, inplace=True)

start
start


In [11]:
X = X.dropna(subset=['time'])
X = X.fillna(0)
X.to_csv('./data/train/rank2_X.csv',index=False)

In [8]:
X = pd.read_csv('./data/train/rank2_X.csv')

In [12]:
test_X = test_X.dropna(subset=['time'])
test_X = test_X.fillna(0)
test_X.to_csv('./data/train/rank2_test_X.csv', index = False)

In [9]:
test_X = pd.read_csv('./data/train/rank2_test_X.csv')

In [13]:
y.to_csv('./data/train/rank2_y.csv',index=False)

In [38]:
y = pd.read_csv('./data/train/rank2_y.csv')
y = y.T.squeeze()

In [39]:
# CatBoost 모델링
def catboost_modeling(x_train, y_train, x_test, grow_policy, depth, learning_rate, l2_leaf_reg, random_seed, n):
  
    # 빈 Series인 test_pred 생성
    test_pred = pd.Series([0 for x in range(len(x_test))], index=x_test.index)
    
    # 10-fold 모델링을 n회 반복할 것
    for i in range(n):
        kf = KFold(n_splits=10, random_state=random_seed+i)
        for train_index, valid_index in kf.split(x_train):
            train_X, train_y = x_train.iloc[train_index], y_train[train_index]
            valid_X, valid_y = x_train.iloc[valid_index], y_train[valid_index]
      
            # catBoost(grow_policy='Depthwise')
            model = CatBoostClassifier(eval_metric = 'AUC',              # AUC로 성능 측정
                                 iterations = 25000,               # 반복횟수 최대 25000
                                 metric_period = 25000,            # 중간결과 출력X
                                 early_stopping_rounds = 1000,     # 1000iteration 동안 AUC 증가 없으면 학습 중단
                                 task_type = 'GPU',                # GPU 사용
                                 grow_policy = grow_policy,        # 트리 노드 생성 방식
                                                                   # 1) Depthwise(지정한 depth에 이를 때까지 level 순으로 노드 분할)
                                                                   # 2) Lossguide(loss 변화가 큰 순으로 노드 분할)
                                 depth = depth,                    # 트리 깊이
                                 learning_rate = learning_rate,    # 러닝레이트
                                 l2_leaf_reg = l2_leaf_reg,        # L2 정규화
                                 random_seed = random_seed+i,      # 랜덤시드 고정
                                 )
            # 모델 학습
            model.fit(train_X, train_y, eval_set=(valid_X, valid_y))
    
            # 모델 적용
            test_pred += model.predict_proba(x_test)[:,1] /(10*n)
    return test_pred

In [40]:
data1 = catboost_modeling(X, y, test_X, 'Depthwise', 10, 0.02423, 20.35, 2014, 2)

0:	learn: 0.7109942	test: 0.6092780	best: 0.6092780 (0)	total: 202ms	remaining: 1h 24m 2s
bestTest = 0.7345825434
bestIteration = 3248
Shrink model to first 3249 iterations.
0:	learn: 0.7148042	test: 0.6211064	best: 0.6211064 (0)	total: 181ms	remaining: 1h 15m 31s
bestTest = 0.7419700623
bestIteration = 4905
Shrink model to first 4906 iterations.
0:	learn: 0.7188430	test: 0.6168532	best: 0.6168532 (0)	total: 184ms	remaining: 1h 16m 41s
bestTest = 0.7567687035
bestIteration = 4442
Shrink model to first 4443 iterations.
0:	learn: 0.7153936	test: 0.6184645	best: 0.6184645 (0)	total: 198ms	remaining: 1h 22m 17s
bestTest = 0.7444160581
bestIteration = 4822
Shrink model to first 4823 iterations.
0:	learn: 0.7172568	test: 0.6201050	best: 0.6201050 (0)	total: 184ms	remaining: 1h 16m 33s
bestTest = 0.7487422824
bestIteration = 4673
Shrink model to first 4674 iterations.
0:	learn: 0.7117703	test: 0.6017268	best: 0.6017268 (0)	total: 174ms	remaining: 1h 12m 27s
bestTest = 0.7341803312
bestIterati

In [None]:
data2 = catboost_modeling(X, y, test_X, 'Lossguide', 8, 0.01063, 5.127, 2014, 2)

In [None]:
data3 = catboost_modeling(X, y, test_X, 'Depthwise', 12, 0.01564, 49.99, 2022, 2)

In [None]:
data4 = catboost_modeling(X, y, test_X, 'Lossguide', 16, 0.01213, 5.027, 2022, 2)

In [41]:
# 최종 모델 앙상블
sample_submission = pd.read_csv('./data/sample_submission.csv')
data_final = pd.DataFrame(data1) # (data1+data2)/2 *1/3 + (data3+data4)/2 *2/3
sample_submission.iloc[:,1:] = data_final[0]
sample_submission.to_csv('./result/data_final.csv', index =False)
sample_submission

Unnamed: 0,game_id,winner
0,38872,0.302120
1,38873,0.728112
2,38874,0.560615
3,38875,0.060094
4,38876,0.725650
...,...,...
16782,55654,0.678958
16783,55655,0.079206
16784,55656,0.891928
16785,55657,0.680666
