### K-means 클러스터링 : 시작위치 분류

- 각 게임의 첫번째 카메라 이벤트를 이용하여 스타팅포인트와 맵을 분류하였습니다.
- 맵에 따라 종족간 승률에 차이가 존재합니다.

| <h4> 2020.03.13 15:30 </h4> | <h5> - kmeans를 이용한 스타팅포인트,맵 클러스터링  [[ 데이콘 링크 ]](https://dacon.io/competitions/official/235583/codeshare/743) </h5>|
|:-------|:---------:
|view    | 279
|language| Python
|by Eunil| 댓글 12


### 1.0 Drive 연결 및 기본설정

In [1]:
"""
# 기본 DIR 구조를 입력한다. - _assets 폴더 제외!
# /content/drive/My Drive/Colab Notebooks/
# dir_base = '/content/drive/My Drive/Colab Notebooks/competition/''
"""
# Root of the competition workspace (trailing slash is relied on below).
dir_base = '/home/yk/0325_Starcraft/competition/'

# Data folders, all relative to dir_base and all living under one project dir.
_project_dir = 'c03_starcraft_prediction/'
raw = _project_dir + 'data_raw/'
remake = _project_dir + 'data_remake/'
submit = _project_dir + 'data_submit/'

# Absolute path of the shared helper-module folder.
assets = dir_base + '_assets'

In [2]:
"""
# 모듈 import를 위한 SYS.PATH 설정!
"""
import os
import sys

# Put the project root at the front of sys.path (once) so that modules
# located under it can be imported.
if dir_base not in sys.path:
    sys.path.insert(0, dir_base)
    print(f"***'{dir_base}' has set in SYS.PATH! ***")
    print()

# List the resulting search path, numbered from 01.
for position, entry in enumerate(sys.path, start=1):
    print(f"{position:02}.{entry}")

***'/home/yk/0325_Starcraft/competition/' has set in SYS.PATH! ***

01./home/yk/0325_Starcraft/competition/
02./home/yk/0325_Starcraft/competition/c03_starcraft_prediction
03./home/yk/anaconda3/lib/python37.zip
04./home/yk/anaconda3/lib/python3.7
05./home/yk/anaconda3/lib/python3.7/lib-dynload
06.
07./home/yk/anaconda3/lib/python3.7/site-packages
08./home/yk/anaconda3/lib/python3.7/site-packages/IPython/extensions
09./home/yk/.ipython


In [3]:
from _assets.modules import ( __doc__, 
                                show_ls, 
                                show_infoDF_from,
                                histit,
                                hist_it,
                                plot_it,
                                get_random_n_array,
                                get_basic_df,
                                get_counts_dict_fromDF,
                            )

# from _assets.module_data_preps import ( __doc__,
#                                 tqdm,  
#                                 species_converter, 
#                                 data_preparation,
#                             )


# OS 화일 및 DF 정보조회를 위한 탐색 모듈



In [4]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.cluster import KMeans

In [5]:
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides pandas/seaborn deprecation warnings.
warnings.filterwarnings("ignore")

### 2.0 Data Read / 기본정보탐색

In [6]:
# Load the raw training log into df_train.
df_train = pd.read_csv(f"{dir_base}{raw}train.csv")

In [7]:
df_train.head(3)

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']


In [8]:
# Collect the distinct game_id values (one row per game, in order of first
# appearance) into df_temp.
df_temp = df_train[['game_id']].drop_duplicates().reset_index(drop=True)
df_temp.head(3)

Unnamed: 0,game_id
0,0
1,1
2,2


In [9]:
# Use game_id as the index while still keeping it as a column.
df_temp = df_temp.set_index('game_id', drop=False)
df_temp.head(3)

Unnamed: 0_level_0,game_id
game_id,Unnamed: 1_level_1
0,0
1,1
2,2


In [10]:
# Rebuild df_temp with one row per distinct game_id, this time using the
# game_id values themselves as the (unnamed) index.
unique_game_ids = df_train['game_id'].unique()
df_temp = pd.DataFrame({'game_id': unique_game_ids}, index=unique_game_ids)
df_temp.shape

(38872, 1)

In [11]:

# Remove the game_id column; the values stay available through the index.
df_temp = df_temp.drop(columns=['game_id'])

In [12]:
df_train.head()

Unnamed: 0,game_id,winner,time,player,species,event,event_contents
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]']
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)"


In [13]:
# Record the first camera coordinate logged for player 0 in every game and
# attach it to every row of that game as 'player0_starting'.
#
# Columns are selected by name (not by position), and the first row per game
# is picked with drop_duplicates, so the result does not depend on column
# order or on each game's rows being stored contiguously.
first_camera_p0 = (
    df_train.loc[
        (df_train.event == 'Camera') & (df_train.player == 0),
        ['game_id', 'event_contents'],
    ]
    .drop_duplicates(subset='game_id', keep='first')
    .rename(columns={'event_contents': 'player0_starting'})
)
# Left merge: games without any player-0 camera event get NaN.
df_train = pd.merge(df_train, first_camera_p0, on='game_id', how='left')

del first_camera_p0

In [14]:
# Record the first camera coordinate logged for player 1 in every game and
# attach it to every row of that game as 'player1_starting'.
#
# Columns are selected by name (not by position), and the first row per game
# is picked with drop_duplicates, so the result does not depend on column
# order or on each game's rows being stored contiguously.
first_camera_p1 = (
    df_train.loc[
        (df_train.event == 'Camera') & (df_train.player == 1),
        ['game_id', 'event_contents'],
    ]
    .drop_duplicates(subset='game_id', keep='first')
    .rename(columns={'event_contents': 'player1_starting'})
)
# Left merge: games without any player-1 camera event get NaN.
df_train = pd.merge(df_train, first_camera_p1, on='game_id', how='left')

del first_camera_p1

In [15]:
# Preview: every row now carries the first camera coordinate of both players
# for its game.
df_train.head()

Unnamed: 0,game_id,winner,time,player,species,event,event_contents,player0_starting,player1_starting
0,0,1,0.0,0,T,Camera,"at (145.25, 21.5078125)","at (145.25, 21.5078125)","at (22.75, 147.0078125)"
1,0,1,0.0,1,T,Camera,"at (22.75, 147.0078125)","at (145.25, 21.5078125)","at (22.75, 147.0078125)"
2,0,1,0.02,0,T,Selection,['OrbitalCommand [3080001]'],"at (145.25, 21.5078125)","at (22.75, 147.0078125)"
3,0,1,0.02,0,T,Ability,(1360) - TrainSCV,"at (145.25, 21.5078125)","at (22.75, 147.0078125)"
4,0,1,0.14,0,T,Camera,"at (142.99609375, 24.50390625)","at (145.25, 21.5078125)","at (22.75, 147.0078125)"


In [None]:
# Split the "at (x, y)" camera text into numeric x / y columns.
# For each player the *_starting column is overwritten with the bare
# "x, y" text found between the parentheses, then *_x and *_y are added.
for side in ('player0', 'player1'):
    start_col = side + '_starting'
    # Text after the first '(' and before the following ')'.
    inner = df_train[start_col].str.split('(').str[1].str.split(')').str[0]
    df_train[start_col] = inner
    xy_parts = inner.str.split(',')
    df_train[side + '_x'] = xy_parts.str[0].astype('float')
    df_train[side + '_y'] = xy_parts.str[1].astype('float')
del side, start_col, inner, xy_parts

In [None]:
df_train.head()

In [None]:
# Stack both players' starting coordinates into one two-column frame.
xy_names = ['location_x', 'location_y']

location_p0 = df_train[['player0_x', 'player0_y']].copy()
location_p0.columns = xy_names

location_p1 = df_train[['player1_x', 'player1_y']].copy()
location_p1.columns = xy_names
# Shift player 1's index past player 0's last label so that the index of the
# concatenated frame stays unique.
location_p1.index = location_p1.index + (location_p0.index[-1] + 1)

# Rows without coordinates are discarded.
location = pd.concat([location_p0, location_p1]).dropna()
del location_p0, location_p1, xy_names

In [None]:
location.head()

In [None]:
# Scatter plot of every starting coordinate.
# x / y are passed as keywords: newer seaborn releases no longer accept them
# positionally, while the keyword form also works on older releases.
sns.lmplot(x='location_x', y='location_y', data=location, fit_reg=False)
plt.title('starting point')
plt.show()

In [None]:
# Counting the values of the starting column shows (per the author) that
# 15 coordinates occur far more often than the rest,
# i.e. there are 15 starting points.

df_train.player0_starting.value_counts().head(20)

In [None]:
# Cluster the starting coordinates into 15 groups with k-means.
# - Only the two coordinate columns are used for fitting, so re-running this
#   cell after 'starting' (or 'distance') has been added to `location` does
#   not feed those columns back into the clustering.
# - A fixed random_state makes the cluster ids reproducible between runs.
kmeans_clst = KMeans(n_clusters=15, random_state=0).fit(
    location[['location_x', 'location_y']]
)
# Store 1-based cluster ids; the value 0 is used later in this notebook to
# mark an unknown starting point.
location['starting'] = kmeans_clst.labels_ + 1

In [None]:
location.head()

In [None]:
# Scatter plot of the clustering result, coloured by cluster id.
# x / y are passed as keywords: newer seaborn releases no longer accept them
# positionally, while the keyword form also works on older releases.
sns.lmplot(x='location_x', y='location_y', data=location, fit_reg=False, hue="starting")
plt.title('starting point')
plt.show()

# Author's observation: games whose early camera events are missing end up
# assigned to an unrelated cluster.

In [None]:
# Distance from every point to the centre of the k-means cluster it was
# assigned to. The ids stored in `starting` are the k-means labels shifted
# by +1, hence enumerate(..., start=1).
for label, (center_x, center_y) in enumerate(kmeans_clst.cluster_centers_, start=1):
    members = location[location.starting == label]
    squared = np.square(members.location_x - center_x) + np.square(members.location_y - center_y)
    # Write back by index label so only this cluster's rows are touched.
    location.loc[members.index, 'distance'] = np.sqrt(squared)
    del members, squared

In [None]:
location.head()

In [None]:
# Points lying more than 5 units from their cluster centre are treated as
# "starting point unknown" and get the id 0.
location.loc[location.distance > 5, 'starting'] = 0

In [None]:
# Plot again after thresholding: games with an unknown starting point have
# id 0 (shown in pink according to the author), and the 15 starting points
# can be identified.
# x / y are passed as keywords: newer seaborn releases no longer accept them
# positionally, while the keyword form also works on older releases.
sns.lmplot(x='location_x', y='location_y', data=location, fit_reg=False, hue="starting")
plt.title('starting point')
plt.show()

In [None]:
# Write the cluster ids back onto df_train.
# `location` holds player-0 rows under df_train's own index labels and
# player-1 rows under the same labels shifted by (last label + 1). Rows that
# were removed by the earlier dropna() are absent from `location`, so
# reindex() is used: it yields NaN for missing labels, whereas .loc with a
# list containing missing labels raises KeyError on pandas >= 1.0.
row_offset = df_train.index[-1] + 1
df_train['player0_starting'] = location['starting'].reindex(df_train.index)
# Undo the shift so that player-1 rows line up with df_train's index.
location.index = location.index - row_offset
df_train['player1_starting'] = location['starting'].reindex(df_train.index)
del location, row_offset

# Drop the helper coordinate columns.
df_train = df_train.drop(columns=['player0_x', 'player0_y', 'player1_x', 'player1_y'])
# Missing values (including starting points that could not be looked up)
# become 0; note that this fills NaN in every column of df_train.
df_train = df_train.fillna(0)

In [None]:
df_train.head()

In [None]:
# Group starting points into maps by looking at which opposing starting
# points each one is paired with.
#
# For every starting point (ids 1..15) the opponent ids are counted. The most
# frequent opponent always belongs to the same map; the runner-up is accepted
# as a third starting point of that map only if it occurs in at least 100
# rows, otherwise the placeholder 999 is stored.
map_list = []
for point in range(1, 16):
    opponents = df_train.loc[
        df_train.player0_starting == point, 'player1_starting'
    ].value_counts()
    # 0 means "unknown starting point" and must never be taken for a real one.
    opponents = opponents[opponents.index != 0]
    if opponents.empty:
        # No usable pairing information for this starting point.
        continue
    # iloc is positional, so this cannot be confused with a label lookup and
    # it is guarded against there being only one distinct opponent.
    if len(opponents) < 2 or opponents.iloc[1] < 100:
        map_list.append([point, opponents.index[0], 999])
    else:
        map_list.append([point, opponents.index[0], opponents.index[1]])
# Sort ids inside each row, then drop duplicate rows: one row per map.
map_list = np.sort(map_list, axis=1)
map_list = np.unique(map_list, axis=0)

In [None]:
# Author's reading of the result: 6 two-player maps and 1 three-player map.

# Author's side notes (not verifiable from this notebook):
# - The only three-player map ever used on the StarCraft II: Legacy of the
#   Void ladder was 'Catallena', which would place these games between
#   2017-07-20 and 2017-11-16.
# - The other six maps would be 'Abyssal Reef', 'Ascension to Aiur',
#   'Acolyte', 'Interloper', 'Odyssey' and 'Mech Depot'.
# - The map names themselves are irrelevant for predicting the winner.
map_list

In [None]:
# Count the rows where at least one player's starting point is unknown (0).
# NOTE(review): df_train appears to hold one row per event, so this is a
# row count rather than a game count - confirm.
unknown_rows = (df_train.player0_starting == 0) | (df_train.player1_starting == 0)
int(unknown_rows.sum())
258

In [None]:
# Infer unknown starting points (0) from the opponent's starting point and
# the map it belongs to.
# For each map row (a, b, c) and for each player in turn:
#   opponent at a or c -> the unknown player is placed at b
#   opponent at b or c -> the unknown player is placed at a
# Player 0 is resolved before player 1 within every map, and the second rule
# only sees rows that the first rule left at 0.
column_pairs = (
    ('player0_starting', 'player1_starting'),
    ('player1_starting', 'player0_starting'),
)
for m in map_list:
    first, second, third = m
    rules = (((first, third), second), ((second, third), first))
    for unknown_col, known_col in column_pairs:
        for opponent_spots, inferred in rules:
            mask = (df_train[unknown_col] == 0) & df_train[known_col].isin(opponent_spots)
            df_train.loc[mask, unknown_col] = inferred
            del mask

In [None]:
# Check: according to the author every starting point has now been resolved,
# so this selection of rows with an unknown (0) starting point should be empty.
df_train[(df_train.player0_starting == 0)|(df_train.player1_starting == 0)].head()	

In [None]:
# Add a 'map' column: the row number in map_list of the map that contains
# player 0's starting point.
for map_num, spots in enumerate(map_list):
    df_train.loc[df_train.player0_starting.isin(spots), 'map'] = map_num
del map_list

In [None]:
# End of the starting-point / map clustering; preview the final frame.
df_train.head()