In [1]:
import pandas as pd
import os
from tqdm import tqdm
from glob import glob
import numpy as np

In [2]:
# cufflinks is what provides the DataFrame.iplot() call used for the correlation
# heatmap in this notebook.
import cufflinks as cf
# NOTE(review): connected=True presumably makes the notebook load plotly.js from
# the online CDN instead of embedding it -- confirm against the cufflinks docs.
cf.go_offline(connected=True)

In [None]:
# One CSV per season, 2015 through 2019.
fileNames = ['baseball_' + str(x) + '.csv' for x in range(2015, 2020)]

# DataFrame.append was removed in pandas 2.0 (and re-copied the accumulated frame
# on every iteration). Read each season once and concatenate in a single call.
# As before, each file keeps its own row index (no ignore_index), so index labels
# repeat across seasons.
taza = pd.concat([pd.read_csv(fileName) for fileName in tqdm(fileNames)])

In [55]:
# Correlate only numeric/bool columns: pandas >= 2.0 no longer drops string columns
# (G_ID, T_ID, ...) silently inside corr() and raises instead. select_dtypes works
# on old and new pandas alike.
taza.select_dtypes(include=['number', 'bool']).corr().iplot(kind='heatmap',colorscale="Blues")

In [None]:
# Identifier columns that are always kept, in this order. GetX relies on the
# features starting at column position 5.
idCols = ['G_ID','GDAY_DS','T_ID','VS_T_ID','TB_SC']

# Correlation of every numeric column with 'win', computed once. Non-numeric
# columns are excluded explicitly because pandas >= 2.0 raises on them in corr().
winCorr = taza.select_dtypes(include=['number', 'bool']).corr()['win']

# Keep the columns whose correlation with 'win' exceeds 0.3 ('win' itself included).
colnames = idCols + winCorr[winCorr > 0.3].index.tolist()

# .copy() so that adding YEAR writes to an independent frame rather than to a
# slice of the original (avoids SettingWithCopyWarning).
taza = taza[colnames].copy()

# The first four characters of G_ID are used as the season label.
taza['YEAR'] = taza['G_ID'].str[:4]
years = [str(x) for x in range(2015, 2020)]

In [None]:
# Show which columns were kept: the five identifier columns followed by the
# features selected by their correlation with 'win'.
print(colnames)

In [None]:
def weightedMean(arr):
    """Return the average of ``arr`` with linearly increasing weights.

    The weights are 0, 1, ..., len(arr) - 1, so later elements count more and the
    very first element has weight 0. The weights sum to zero for a sequence of
    length 0 or 1, in which case ``np.average`` raises ``ZeroDivisionError``.
    """
    ramp = np.arange(len(arr))
    return np.average(arr, weights=ramp)

In [4]:
class GetX():
    """Builds the pre-game feature row for a single game.

    Parameters
    ----------
    gi : str
        Game id. Characters 0-7 are read as the game date, 8-9 as the first team
        and 10-11 as the second team.
    N : int
        Number of previous games of a team that are averaged.
    taza : pandas.DataFrame
        Game log. Columns 0-4 are expected to be the identifier columns
        (G_ID, GDAY_DS, T_ID, VS_T_ID, TB_SC), the last column the ``win`` flag,
        and everything in between the numeric features.
    """
    def __init__(self,gi,N, taza):
        self.gi = gi
        self.N = N
        self.taza =taza
        self.date = gi[:8]
        self.team1 = gi[8:10]
        self.team2 = gi[10:12]

    def makeR(self,team):
        '''
        Weighted averages over the N games ``team`` played before this game.

        df1 holds the team's own rows (its offensive / batting data);
        df2 holds the rows of its opponents (its defensive / runs-allowed data)
        and contributes the same features with a ``_VS`` suffix.

        Returns a pandas Series (features, ``*_VS`` features, ``WIN_RATIO``), or
        an empty list when no features can be built: the team has no row on the
        game date, fewer than N earlier games exist, or fewer than two feature
        columns are available.
        '''
        df1 = self.taza[self.taza['T_ID']==team].reset_index(drop=True)
        df2 = self.taza[self.taza['VS_T_ID']==team].reset_index(drop=True)

        # Position of this game inside the team's own game list. Without a row on
        # the game date there is nothing to anchor the window on, so report
        # "no features" the same way as for too little history.
        sameDay = df1[df1['GDAY_DS']==int(self.date)]
        if sameDay.empty:
            return []
        dateIdx = sameDay.index[0]

        first = dateIdx-self.N
        if first < 0:
            return []

        # The window covers the N games strictly before this one.
        df1 = df1.iloc[first:dateIdx]
        sr1 = df1.iloc[:,5:-1].apply(weightedMean,axis=0)

        # NOTE(review): the same positions are applied to the opponent rows, which
        # assumes both frames list the team's games in the same order -- confirm.
        df2 = df2.iloc[first:dateIdx]
        sr2 = df2.iloc[:,5:-1].apply(weightedMean,axis=0)
        sr2.index = [x+'_VS' for x in sr2.index]

        if len(sr1) < 2:
            return []

        temp = pd.Series({"WIN_RATIO":weightedMean(df1['win'])})
        # Series.append was removed in pandas 2.0; concat is the replacement.
        return pd.concat([sr1, sr2, temp])


    def makeDf(self):
        """Assemble the full feature dictionary for this game.

        Features of the second team get an ``_ENE`` suffix. ``RELATIVE_WIN`` is
        the mean ``win`` of the last (up to) ten earlier rows with team1 as T_ID
        and team2 as VS_T_ID, or 0.5 when there are none. ``WIN`` is the last
        column of the first row carrying this game id.

        Returns a dict, or an empty list when either team lacks features.
        """
        result1 = self.makeR(self.team1) # away team's information
        result2 = self.makeR(self.team2) # home (opponent) team's information

        if (len(result1) <= 1) or (len(result2) <= 1):
            return []

        result2.index = [x+'_ENE' for x in result2.index]
        result = pd.concat([result1, result2])

        tempDf = self.taza[self.taza['GDAY_DS']<int(self.date)]
        tempDf = tempDf[(tempDf['T_ID']==self.team1)&(tempDf['VS_T_ID']==self.team2)]
        if len(tempDf) == 0:
            result['RELATIVE_WIN'] = 0.5
        else:
            result['RELATIVE_WIN'] = tempDf['win'].iloc[-10:].mean()

        result['G_ID'] = self.gi
        # Use this object's own game id. The previous code read a global ``gi``
        # and therefore only worked while a notebook loop variable happened to
        # hold the same value.
        gameRows = self.taza[self.taza['G_ID']==self.gi]
        result['WIN'] = gameRows.iloc[0,-1]

        return result.to_dict()
    

In [56]:
def gapLeftRight(df):
    """Replace the two halves of the feature columns by their difference.

    All columns except the last three are treated as features. They are split
    into a left and a right half of equal width, the right half is subtracted
    from the left one column by column, and the result is named after the left
    column plus ``_GAP``. The last three columns of ``df`` are appended unchanged.
    """
    trailing = df.iloc[:, -3:]
    features = df.iloc[:, :-3].copy()

    half = features.shape[1] // 2
    left = features.iloc[:, :half]
    right = features.iloc[:, half:]

    # Give the right half the left half's labels so the subtraction pairs the
    # columns by position.
    leftNames = left.columns
    right.columns = leftNames

    gaps = left - right
    gaps.columns = [name + '_GAP' for name in leftNames]

    return pd.concat([gaps, trailing], axis=1)

In [57]:
def gapLeftRightRandom(df, random_state=None):
    """Like ``gapLeftRight``, but with the orientation of a random half of the rows flipped.

    The feature columns (all but the last three) are split into a left and a
    right half and replaced by ``left - right`` under the name ``<left>_GAP``.
    The rows are then shuffled and, for the first ``len // 2`` shuffled rows, the
    gaps are negated and the third-from-last and last columns are replaced by
    ``1 - value``. The result is returned sorted by ``G_ID``.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature frame whose last three columns are kept as-is apart from the flip.
        The callers in this notebook pass RELATIVE_WIN, G_ID, WIN in those positions.
    random_state : optional
        Forwarded to ``DataFrame.sample``. The default ``None`` keeps the previous
        unseeded behaviour; pass a fixed value to get a reproducible dataset.
    """
    features = df.iloc[:,:-3].copy()

    half = features.shape[1]//2

    left = features.iloc[:,:half]
    right = features.iloc[:,half:]

    # Give the right half the left half's labels so the subtraction pairs the
    # columns by position.
    leftNames = left.columns
    right.columns = leftNames

    dfResult = left-right
    dfResult.columns = [x+'_GAP' for x in leftNames]
    dfResult = pd.concat([dfResult, df.iloc[:,-3:]], axis=1)

    # Shuffle so that the rows chosen for flipping are a random half.
    dfResult = dfResult.sample(len(dfResult), random_state=random_state)

    nFlip = len(dfResult)//2

    # Swap the roles of the two teams: negate every gap ...
    dfResult.iloc[:nFlip,:-3] = -1 * dfResult.iloc[:nFlip,:-3]
    # ... and mirror the two 0..1 columns (third-from-last and last).
    dfResult.iloc[:nFlip,[-3,-1]] = 1-dfResult.iloc[:nFlip,[-3,-1]]

    return dfResult.sort_values("G_ID")

In [79]:
# For each window size N, rebuild the feature table and record the mean absolute
# correlation between the gap features and WIN.
resultByN = []

for N in tqdm(range(30,61,5)):
    yearFrames = []
    for year in years:
        # Windows never cross a season boundary: each season is processed alone.
        taza_ = taza[taza['YEAR']==year]
        tazaNoYear = taza_.iloc[:,:-1]  # drop the trailing YEAR column

        X = []
        # Every second row: one feature row is built per game id.
        for gi in taza_['G_ID'].iloc[::2]:
            temp = GetX(gi,N, tazaNoYear).makeDf()
            # makeDf returns an empty list when the game has too little history.
            if not isinstance(temp, list):
                X.append(temp)

        yearFrames.append(pd.DataFrame(X))

    # DataFrame.append was removed in pandas 2.0; concatenate once per N instead.
    dfX = pd.concat(yearFrames)

    dfResult = gapLeftRight(dfX)
    # Correlate numeric columns only (G_ID is a string; pandas >= 2.0 raises on it).
    winCorr = dfResult.select_dtypes(include=['number', 'bool']).corr()['WIN']
    # The last entry is WIN's correlation with itself, so it is left out.
    resultByN.append((N,np.mean(abs(winCorr.iloc[:-1]))))

100%|██████████| 7/7 [05:36<00:00, 48.05s/it]


In [63]:
# (N, mean absolute correlation with WIN) pairs collected by the sweep over N.
resultByN

[(30, 0.05929887904989841),
 (35, 0.061628717301157786),
 (40, 0.06613596149713964),
 (45, 0.06892468208994179),
 (50, 0.07306065471820053),
 (55, 0.0759796420306262),
 (60, 0.07694160802570202)]

In [27]:
# Build the training table with a window of 50 games, one season at a time.
yearFrames = []
for year in years:
    taza_ = taza[taza['YEAR']==year]
    tazaNoYear = taza_.iloc[:,:-1]  # drop the trailing YEAR column

    X = []
    # Every second row: one feature row is built per game id.
    for gi in taza_['G_ID'].iloc[::2]:
        temp = GetX(gi,50, tazaNoYear).makeDf()
        # makeDf returns an empty list when the game has too little history.
        if not isinstance(temp, list):
            X.append(temp)

    yearFrames.append(pd.DataFrame(X))

# DataFrame.append was removed in pandas 2.0; concatenate the seasons once.
dfX = pd.concat(yearFrames)

gapLeftRight(dfX).to_csv("kbo_train.csv",index=False)

In [52]:
# Build the test table from the 2020 season with the same columns and the same
# window (50) as the training table.
taza_ = pd.read_csv("baseball_2020.csv")
# .copy() so that adding YEAR writes to an independent frame rather than to a
# slice (avoids SettingWithCopyWarning).
taza_ = taza_[colnames].copy()
taza_['YEAR']=taza_['G_ID'].str[:4]

tazaNoYear = taza_.iloc[:,:-1]  # drop the trailing YEAR column

X = []
# Every second row: one feature row is built per game id.
for gi in taza_['G_ID'].iloc[::2]:
    temp = GetX(gi,50, tazaNoYear).makeDf()
    # makeDf returns an empty list when the game has too little history.
    if not isinstance(temp, list):
        X.append(temp)

df = pd.DataFrame(X)
gapLeftRight(df).to_csv("kbo_test.csv",index=False)

### Home, Base 랜덤화

In [54]:
# Same sweep over the window size N, but on the table whose row orientation has
# been randomly flipped by gapLeftRightRandom.
resultByN = []

for N in tqdm(range(30,61,5)):
    yearFrames = []
    for year in years:
        # Windows never cross a season boundary: each season is processed alone.
        taza_ = taza[taza['YEAR']==year]
        tazaNoYear = taza_.iloc[:,:-1]  # drop the trailing YEAR column

        X = []
        # Every second row: one feature row is built per game id.
        for gi in taza_['G_ID'].iloc[::2]:
            temp = GetX(gi,N, tazaNoYear).makeDf()
            # makeDf returns an empty list when the game has too little history.
            if not isinstance(temp, list):
                X.append(temp)

        yearFrames.append(pd.DataFrame(X))

    # DataFrame.append was removed in pandas 2.0; concatenate once per N instead.
    dfX = pd.concat(yearFrames)

    dfResult = gapLeftRightRandom(dfX)
    # Correlate numeric columns only (G_ID is a string; pandas >= 2.0 raises on it).
    winCorr = dfResult.select_dtypes(include=['number', 'bool']).corr()['WIN']
    # The last entry is WIN's correlation with itself, so it is left out.
    resultByN.append((N,np.mean(abs(winCorr.iloc[:-1]))))

100%|██████████| 7/7 [04:54<00:00, 42.03s/it]


In [35]:
# (N, mean absolute correlation with WIN) pairs for the randomized table.
# NOTE(review): the output stored under this cell lists N = 5..70, which does not
# match range(30, 61, 5) used by the sweep in this section -- it appears to come
# from an earlier run and should be regenerated.
resultByN

[(5, 0.03723224320135221),
 (10, 0.04594175369920553),
 (15, 0.05865468636571103),
 (20, 0.06637832747032983),
 (25, 0.06733263277966778),
 (30, 0.06514828368565719),
 (35, 0.0674561382185336),
 (40, 0.07241724564036403),
 (45, 0.07553985361549503),
 (50, 0.08123362999955502),
 (55, 0.08375370013316849),
 (60, 0.08468626309608845),
 (65, 0.08453288841164124),
 (70, 0.09280611204366868)]

In [59]:
# Build the randomized training table with a window of 50 games, one season at a time.
yearFrames = []
for year in years:
    taza_ = taza[taza['YEAR']==year]
    tazaNoYear = taza_.iloc[:,:-1]  # drop the trailing YEAR column

    X = []
    # Every second row: one feature row is built per game id.
    for gi in taza_['G_ID'].iloc[::2]:
        temp = GetX(gi,50, tazaNoYear).makeDf()
        # makeDf returns an empty list when the game has too little history.
        if not isinstance(temp, list):
            X.append(temp)

    yearFrames.append(pd.DataFrame(X))

# DataFrame.append was removed in pandas 2.0; concatenate the seasons once.
dfX = pd.concat(yearFrames)

# NOTE(review): the shuffle inside gapLeftRightRandom is unseeded, so the written
# file differs from run to run.
gapLeftRightRandom(dfX).to_csv("kbo_train_random.csv",index=False)

In [60]:
# Build the randomized test table from the 2020 season.
taza_ = pd.read_csv("baseball_2020.csv")
# .copy() so that adding YEAR writes to an independent frame rather than to a
# slice (avoids SettingWithCopyWarning).
taza_ = taza_[colnames].copy()
taza_['YEAR']=taza_['G_ID'].str[:4]

tazaNoYear = taza_.iloc[:,:-1]  # drop the trailing YEAR column

X = []
# Every second row: one feature row is built per game id.
for gi in taza_['G_ID'].iloc[::2]:
    # Window of 50, matching the window used when kbo_train_random.csv is built.
    # This cell previously used 60, so train and test features were averaged over
    # different numbers of games.
    temp = GetX(gi,50, tazaNoYear).makeDf()
    # makeDf returns an empty list when the game has too little history.
    if not isinstance(temp, list):
        X.append(temp)

dfX = pd.DataFrame(X)
gapLeftRightRandom(dfX).to_csv("kbo_test_random.csv",index=False)

In [37]:
# Keep only the rows whose YEAR label equals the chosen season.
year = '2020'
taza_ = taza.loc[taza['YEAR'] == year]



In [None]:
#trash
# Scratch preprocessing kept for reference. The statements used to sit at mixed
# indentation levels, which made the cell an IndentationError. They are wrapped in
# a function (never called by the notebook) so the cell is valid and inert.
def labelWinnersAndRates(temp):
    """Label the winner of every game and add two rate columns.

    Works on a copy of ``temp`` and returns the new frame:

    * ``win`` starts at 0.5 for every row. For each ``G_ID`` the first two rows
      are compared on ``RUN``; the higher one gets 1, the other 0, and ties stay
      at 0.5.
    * ``OBP`` = (HIT + BB + HP) / (AB + BB + HP + SF) and ``OOO`` = HIT / AB
      (presumably on-base percentage and batting average -- confirm).
    * ``win`` is moved to the last column, rows are sorted by ``G_ID`` ascending
      and ``TB_SC`` descending, and rows whose columns ``PA`` .. ``P_HIT_CN`` sum
      to zero are dropped.

    Raises ``IndexError`` when a game id has fewer than two rows.
    """
    temp = temp.copy()
    temp['win'] = 0.5

    for gi in temp['G_ID'].unique():
        gameRows = temp.index[(temp['G_ID'] == gi).to_numpy()]
        fidx = gameRows[0]
        sidx = gameRows[1]

        if(temp.loc[fidx,'RUN'] > temp.loc[sidx,'RUN']):
            temp.loc[fidx,'win'] = 1
            temp.loc[sidx,'win'] = 0
        elif(temp.loc[fidx,'RUN'] < temp.loc[sidx,'RUN']):
            temp.loc[fidx,'win'] = 0
            temp.loc[sidx,'win'] = 1

    # Take 'win' out by name (not by position) and re-append it after the new
    # columns, so it ends up last.
    y = temp.pop('win')

    temp['OBP'] = (temp['HIT']+temp['BB']+temp['HP'])/(temp['AB']+temp['BB']+temp['HP']+temp['SF'])

    temp['OOO'] = temp['HIT']/temp['AB']

    temp['win'] = y
    temp = temp.sort_values(["G_ID","TB_SC"],ascending = [True,False])
    temp = temp[temp.loc[:,'PA':'P_HIT_CN'].sum(axis=1)!=0].reset_index(drop=True)
    return temp