<a href="https://colab.research.google.com/github/crossrobot/2021-Bigcontest-SportTech-Sector-YonseiDSL-/blob/main/%EC%95%BC%EA%B5%AC%EC%84%B1%EC%A0%81%EC%98%88%EC%B8%A1%EB%AA%A8%EB%8D%B8_Seq2Seq_%EC%B5%9C%EC%A2%85%EB%B3%B8.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Requirements

In [24]:
# 데이터 분석 모듈

import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import tensorflow as tf 
import os
import random
import datetime
from tqdm import tqdm

In [25]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [26]:
# for reproducibility 
def seed_everything(seed = 2021):
    random.seed(seed)
    np.random.seed(seed)
    os.environ["PYTHONHASHSEED"] = str(seed)
    tf.random.set_seed(seed)       # global random seed. operational seed는 셀안에서 한번 더 설정한다.  

seed_everything()

# LSTM

## Data Load & Preprocessing

In [27]:
path =  '/content/drive/Shareddrives/Big Contest - BaseBall/빅콘테스트-야구/Final_Data/선수_경기별_데이터'
강백호_경기별 = pd.read_csv(os.path.join(path, '강백호경기별데이터.csv'), encoding = 'CP949')
로맥_경기별 = pd.read_csv(os.path.join(path, '로맥경기별데이터.csv'), encoding = 'CP949')
최정_경기별 = pd.read_csv(os.path.join(path, '최정경기별데이터.csv'), encoding = 'CP949')
이정후_경기별 = pd.read_csv(os.path.join(path, '이정후경기별데이터.csv'), encoding = 'CP949')
채은성_경기별 = pd.read_csv(os.path.join(path, '채은성경기별데이터.csv'), encoding = 'CP949')
양의지_경기별 = pd.read_csv(os.path.join(path, '양의지경기별데이터.csv'), encoding = 'CP949')
박건우_경기별 = pd.read_csv(os.path.join(path, '박건우경기별데이터.csv'), encoding = 'CP949')
김현수_경기별 = pd.read_csv(os.path.join(path, '김현수경기별데이터.csv'), encoding = 'CP949')
김재환_경기별 = pd.read_csv(os.path.join(path, '김재환경기별데이터.csv'), encoding = 'CP949')
전준우_경기별 = pd.read_csv(os.path.join(path, '전준우경기별데이터.csv'), encoding = 'CP949')

In [28]:
# 날짜 정보를 생성하는 함수 
def date_generator(dataset) : 
  dataset['date'] = dataset.apply(lambda x: str(x['연도'])+'년 '+x['날짜'], axis=1)
  dataset['date'] = dataset['date'].apply(lambda x : datetime.datetime.strptime(x, '%Y년 %m월 %d일').date())
  return dataset

In [29]:
# 'date' 열 생성
강백호_경기별 = date_generator(강백호_경기별)
로맥_경기별 = date_generator(로맥_경기별)
최정_경기별 = date_generator(최정_경기별)
이정후_경기별 = date_generator(이정후_경기별)
채은성_경기별 = date_generator(채은성_경기별)
양의지_경기별 = date_generator(양의지_경기별)
박건우_경기별 = date_generator(박건우_경기별)
김현수_경기별 = date_generator(김현수_경기별)
김재환_경기별 = date_generator(김재환_경기별)
전준우_경기별 = date_generator(전준우_경기별)

In [30]:
# 정규시즌 데이터 필터링 함수
def season_selector(df) : 
  startdate2021 = datetime.datetime.strptime('2021-04-03', "%Y-%m-%d").date()
  enddate2021 = datetime.datetime.strptime('2021-10-08', "%Y-%m-%d").date()
  startdate2020 = datetime.datetime.strptime('2020-05-05', "%Y-%m-%d").date()
  enddate2020 = datetime.datetime.strptime('2020-10-31', "%Y-%m-%d").date()
  startdate2019 = datetime.datetime.strptime('2019-03-23', "%Y-%m-%d").date()
  enddate2019 = datetime.datetime.strptime('2019-10-01', "%Y-%m-%d").date()
  startdate2018 = datetime.datetime.strptime('2018-03-24', "%Y-%m-%d").date()
  enddate2018 = datetime.datetime.strptime('2018-09-30', "%Y-%m-%d").date()

  filtered = df[(((df['date'] >= startdate2021) & (df['date'] <= enddate2021)) | ((df['date'] >= startdate2020) & (df['date'] <=  enddate2020)) | ((df['date'] >= startdate2019) & (df['date'] <= enddate2019)) | ((df['date'] >= startdate2018) & (df['date'] <= enddate2018)) )]
  return filtered


In [31]:
# 필터링 된 데이터 
강백호_경기별=season_selector(강백호_경기별)
로맥_경기별=season_selector(로맥_경기별)
최정_경기별=season_selector(최정_경기별)
이정후_경기별=season_selector(이정후_경기별)
채은성_경기별=season_selector(채은성_경기별)
양의지_경기별=season_selector(양의지_경기별)
박건우_경기별=season_selector(박건우_경기별)
김현수_경기별=season_selector(김현수_경기별)
김재환_경기별=season_selector(김재환_경기별)
전준우_경기별=season_selector(전준우_경기별)

In [32]:
# 아웃라이어 필터링 함수 
def outlier_filter(df) : 
  filtered_df = df[(df['OPS']>=0.75) & (df['OPS']<=1.2)]
  return filtered_df

In [33]:
# 필터링 된 데이터 
강백호_경기별=outlier_filter(강백호_경기별)
로맥_경기별=outlier_filter(로맥_경기별)
최정_경기별=outlier_filter(최정_경기별)
이정후_경기별=outlier_filter(이정후_경기별)
채은성_경기별=outlier_filter(채은성_경기별)
양의지_경기별=outlier_filter(양의지_경기별)
박건우_경기별=outlier_filter(박건우_경기별)
김현수_경기별=outlier_filter(김현수_경기별)
김재환_경기별=outlier_filter(김재환_경기별)
전준우_경기별=outlier_filter(전준우_경기별)

In [34]:
# 타구와 관계가 없다고 판단되는  열들은 제거
강백호_경기별 = 강백호_경기별.drop(['연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date'] , axis=1)
로맥_경기별 = 로맥_경기별.drop(['연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date'] , axis=1)
최정_경기별 = 최정_경기별.drop([ '연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date'] , axis=1)   # '결과','타순','P', '투구', 'avLI', 'RE24', 'WPA' 
이정후_경기별 = 이정후_경기별.drop([ '연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date'] , axis=1)
채은성_경기별 = 채은성_경기별.drop(['연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date' ] , axis=1)
양의지_경기별 = 양의지_경기별.drop([ '연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date'] , axis=1)
박건우_경기별 = 박건우_경기별.drop([ '연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date'] , axis=1)
김현수_경기별 = 김현수_경기별.drop([ '연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date' ] , axis=1)
김재환_경기별 = 김재환_경기별.drop([ '연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date' ] , axis=1)
전준우_경기별 = 전준우_경기별.drop([ '연도','날짜','상대','선발','결과','타순','P', '투구', 'avLI', 'RE24', 'WPA','date' ] , axis=1)

In [35]:
train_강백호_df, predictX_강백호_df = 강백호_경기별[0:-90].copy(), 강백호_경기별[-90:].copy()
train_로맥_df,predictX_로맥_df = 로맥_경기별[0:-90].copy(), 로맥_경기별[-90:].copy()
train_최정_df,predictX_최정_df = 최정_경기별[0:-90].copy(), 최정_경기별[-90:].copy()
train_이정후_df,predictX_이정후_df = 이정후_경기별[0:-90].copy(), 이정후_경기별[-90:].copy()
train_채은성_df,predictX_채은성_df = 채은성_경기별[0:-90].copy(), 채은성_경기별[-90:].copy()
train_양의지_df,predictX_양의지_df = 양의지_경기별[0:-90].copy(), 양의지_경기별[-90:].copy()
train_박건우_df,predictX_박건우_df = 박건우_경기별[0:-90].copy(), 박건우_경기별[-90:].copy()
train_김현수_df,predictX_김현수_df = 김현수_경기별[0:-90].copy(), 김현수_경기별[-90:].copy()
train_김재환_df,predictX_김재환_df = 김재환_경기별[0:-90].copy(), 김재환_경기별[-90:].copy()
train_전준우_df,predictX_전준우_df = 전준우_경기별[0:-90].copy(), 전준우_경기별[-90:].copy()

#훈련이터 
train_list = [train_강백호_df,train_로맥_df,train_최정_df, train_이정후_df,train_채은성_df, train_양의지_df,train_박건우_df,train_김현수_df,train_김재환_df,train_전준우_df]
predictX_list = [predictX_강백호_df, predictX_로맥_df,predictX_최정_df,predictX_이정후_df,predictX_채은성_df,predictX_양의지_df,predictX_박건우_df,predictX_김현수_df,predictX_김재환_df,predictX_전준우_df]

## Modeling

In [36]:
# hyper parameters
n_past = 90
n_future = 21 
n_features = 21# E1D1

In [37]:
## CPU 이용 시 ###
tf.random.set_seed(2021)           # operation seed 
# n_features ==> # of features at each timestep in the data.
#
encoder_inputs = tf.keras.layers.Input(shape=(n_past, n_features))
encoder_l1 = tf.keras.layers.LSTM(100, return_state=True)
encoder_outputs1 = encoder_l1(encoder_inputs)

encoder_states1 = encoder_outputs1[1:]

#
decoder_inputs = tf.keras.layers.RepeatVector(n_future)(encoder_outputs1[0])

#
decoder_l1 = tf.keras.layers.LSTM(100, return_sequences=True)(decoder_inputs,initial_state = encoder_states1)
decoder_outputs1 = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(n_features))(decoder_l1)

#
model_e1d1 = tf.keras.models.Model(encoder_inputs,decoder_outputs1)

#
model_e1d1.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 90, 21)]     0           []                               
                                                                                                  
 lstm_2 (LSTM)                  [(None, 100),        48800       ['input_2[0][0]']                
                                 (None, 100),                                                     
                                 (None, 100)]                                                     
                                                                                                  
 repeat_vector_1 (RepeatVector)  (None, 21, 100)     0           ['lstm_2[0][0]']                 
                                                                                            

In [38]:
# 아래 코드는 슬라이딩 윈도우 역할을 하는 함수를 만들어 샘플데이터를 X와 y로 적당한 길이로 나누어 학습 데이터셋을 만들어주는 함수이다. 
def split_series(series, n_past, n_future):
  #
  # n_past ==> no of past observations
  #
  # n_future ==> no of future observations 
  #
  X, y = list(), list()
  for window_start in range(len(series)):
    past_end = window_start + n_past
    future_end = past_end + n_future
    if future_end > len(series):
      break
    # slicing the past and future parts of the window
    past, future = series[window_start:past_end, :], series[past_end:future_end, :]   
    X.append(past)
    y.append(future)
  return np.array(X), np.array(y)

## Model Training 

In [39]:
#seq2seq 모델은 한 선수의 시계열 데이터로 미래 21경기의 결과를 예측하는 모델이다. 
# for문을 통해 seq2seq모델의 훈련과 예측과정을 자동화 한다.

tf.random.set_seed(2021)           # operation seed 
player_indicator = 0
pred_all = pd.DataFrame(columns = train_강백호_df.columns)
histories = []

for train_df, test_df in tqdm(zip(train_list,predictX_list)) : 
  train = train_df.copy()
  scalers={}
  for i in train_df.columns:
      scaler = MinMaxScaler(feature_range=(-1,1))                 # scaling
      s_s = scaler.fit_transform(train[i].values.reshape(-1,1))   # output dimension : (데이터길이 , 1)
      s_s=np.reshape(s_s,len(s_s))                                # output dimension : (데이터길이, )
      scalers['scaler_'+ i] = scaler
      train[i]=s_s
  test = test_df.copy()
  for i in train_df.columns:
      scaler = scalers['scaler_'+i]
      s_s = scaler.transform(test[i].values.reshape(-1,1))
      s_s=np.reshape(s_s,len(s_s))                                # s_s는 원래 데이터 프레임의 한 열이 스케일링된 것
      scalers['scaler_'+i] = scaler
      test[i]=s_s


  X_train, y_train = split_series(train.values, n_past, n_future)
  X_train = X_train.reshape((X_train.shape[0], X_train.shape[1],n_features))       # X_train.reshape()  : (246, 60, 21)   (끊어진 학습 데이터셋 개수 , 윈도우 길이, 특성 수)
  y_train = y_train.reshape((y_train.shape[0], y_train.shape[1], n_features))      
  X_test = np.array(test.values)
  X_test = X_test.reshape(-1, n_past, n_features)

  # 모델 훈련 
  reduce_lr = tf.keras.callbacks.LearningRateScheduler(lambda x: 1e-3 * 0.90 ** x)
  model_e1d1.compile(optimizer=tf.keras.optimizers.Adam(), loss=tf.keras.losses.Huber())  ## huber loss MSE, MAE loss 보다 더 좋은 loss function 
  history_e1d1=model_e1d1.fit(X_train,y_train,epochs=70, batch_size=32,verbose=0,callbacks=[reduce_lr])
  histories.append(history_e1d1)

  # 모델 예측
  pred_e1d1=model_e1d1.predict(X_test)

  # 예측값을 원본 스케일 값으로 바꿔준다. 
  for index,i in enumerate(train_df.columns):
    scaler = scalers['scaler_'+i]
    pred_e1d1[:,:,index]= scaler.inverse_transform(pred_e1d1[:,:,index])

  # dataframe에 예측결과 추가 
  pred_e1d1_df = pd.DataFrame(pred_e1d1.reshape(n_future,n_features), columns = train_강백호_df.columns)
  pred_e1d1_df['player'] = str(player_indicator)
  pred_all = pred_all.append(pred_e1d1_df)


  # 훈련 및 예측 후 모델 리셋 
  model_e1d1.reset_states()
  player_indicator += 1


10it [07:40, 46.00s/it]


In [40]:
pred_all

Unnamed: 0,타수,득점,안타,2타,3타,홈런,루타,타점,도루,도실,볼넷,사구,고4,삼진,병살,희타,희비,타율,출루,장타,OPS,player
0,3.702025,0.556914,0.778819,0.326866,-0.014464,0.155620,1.483302,0.396323,0.021276,0.060160,0.458893,0.041700,0.079483,0.715205,0.104818,-0.003259,0.058703,0.338435,0.404441,0.612385,1.010519,0
1,3.713151,0.568534,0.906567,0.270305,-0.015823,0.092714,1.666712,0.458406,-0.006663,0.043458,0.468349,0.022751,0.046156,0.692861,0.076423,-0.000527,0.045986,0.328396,0.404709,0.614569,1.020471,0
2,3.658180,0.574607,0.970052,0.260470,-0.016869,0.076979,1.702420,0.478691,-0.005402,0.038309,0.446738,0.014510,0.035444,0.694793,0.043256,0.008063,0.036124,0.322727,0.402400,0.615319,1.021723,0
3,3.632169,0.581331,1.011572,0.265887,-0.013785,0.083919,1.696800,0.507141,0.005856,0.037399,0.439109,0.015583,0.034231,0.704212,0.029573,0.013420,0.031306,0.320692,0.401728,0.615252,1.021668,0
4,3.624277,0.586445,1.033018,0.271802,-0.010471,0.093600,1.684267,0.534150,0.016415,0.036493,0.434364,0.017892,0.035033,0.712699,0.026217,0.015276,0.029050,0.320171,0.401614,0.615197,1.021971,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16,3.967613,0.414239,1.008794,0.256732,-0.010242,0.118600,1.608701,0.443032,-0.007782,0.007599,0.013683,0.015246,0.013337,0.502293,0.144300,-0.008473,0.034859,0.329879,0.378870,0.537188,0.925869,9
17,3.969402,0.415031,1.009913,0.256769,-0.010767,0.118515,1.609815,0.442104,-0.008130,0.007371,0.013074,0.015149,0.013068,0.502884,0.143375,-0.008197,0.034930,0.329985,0.379033,0.537331,0.926293,9
18,3.970931,0.415753,1.010944,0.256766,-0.011210,0.118477,1.611040,0.441386,-0.008407,0.007183,0.012526,0.015073,0.012823,0.503427,0.142570,-0.007961,0.034988,0.330075,0.379174,0.537436,0.926639,9
19,3.972242,0.416405,1.011889,0.256737,-0.011584,0.118472,1.612313,0.440837,-0.008628,0.007028,0.012036,0.015013,0.012601,0.503926,0.141870,-0.007759,0.035037,0.330152,0.379296,0.537513,0.926924,9


In [41]:
# 선수별로 미래 21경기에 대한 예측값을 평균하여 대회 평가기간동안의 OPS 예측값을 생성한다.
final_prediction = pred_all.groupby(['player']).mean()
final_prediction['player_name'] = ['강백호', '로맥','최정', '이정후', '채은성', '양의지', '박건우', '김현수', '김재환', '전준우']
final_prediction['pred_ops'] = final_prediction['출루'] + final_prediction['장타']
final_prediction.head()

Unnamed: 0_level_0,타수,득점,안타,2타,3타,홈런,루타,타점,도루,도실,볼넷,사구,고4,삼진,병살,희타,희비,타율,출루,장타,OPS,player_name,pred_ops
player,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
0,3.635217,0.589094,1.026448,0.280802,-0.005565,0.114667,1.708369,0.58715,0.031625,0.031965,0.438087,0.024081,0.038271,0.712994,0.040624,0.009968,0.03071,0.322262,0.4026,0.613306,1.021202,강백호,1.015906
1,3.658573,0.870102,1.204984,0.182064,-0.010712,0.291051,2.368837,0.862962,0.023809,-6.5e-05,0.573526,0.024391,0.015758,0.723758,0.037545,0.008404,0.009426,0.26895,0.371578,0.495632,0.861155,로맥,0.867211
2,3.72331,0.580181,1.004801,0.177313,-0.003696,0.154991,1.748806,0.391351,0.108602,0.018024,0.445059,0.064889,-0.00015,0.680042,0.10043,-0.001274,-0.003095,0.274066,0.400678,0.514839,0.912121,최정,0.915517
3,4.084004,0.63046,1.266792,0.263312,0.020036,0.10488,2.175042,0.682331,0.064329,0.007284,0.293122,0.014687,0.014604,0.325152,0.076763,0.010225,0.035608,0.364452,0.425439,0.604494,1.031758,이정후,1.029933
4,3.79662,0.431292,1.109131,0.231331,0.011112,0.028881,1.512316,0.611626,0.017108,0.027059,0.217346,0.023526,0.015862,0.715861,0.120818,-0.018883,0.02961,0.317235,0.364697,0.483651,0.846473,채은성,0.848348


final_prediction의 'OPS'열은 OPS를 예측한 것이고 'pred_ops'열은 '예측출루율'+ '예측장타율' 로 계산한 OPS 예측값이다.    
제출용 예측은 pred_OPS를 사용한다. 

In [42]:
final_prediction = final_prediction[['player_name', '출루', '장타', 'pred_ops']]

In [43]:
final_prediction = final_prediction.reset_index(drop = True)

In [44]:
final_prediction

Unnamed: 0,player_name,출루,장타,pred_ops
0,강백호,0.4026,0.613306,1.015906
1,로맥,0.371578,0.495632,0.867211
2,최정,0.400678,0.514839,0.915517
3,이정후,0.425439,0.604494,1.029933
4,채은성,0.364697,0.483651,0.848348
5,양의지,0.439294,0.597644,1.036939
6,박건우,0.386094,0.453635,0.839729
7,김현수,0.408229,0.524141,0.93237
8,김재환,0.377992,0.493374,0.871366
9,전준우,0.377742,0.528726,0.906468
