In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
%matplotlib inline
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn import metrics
import statsmodels.api as sm
from sklearn.metrics import accuracy_score
import lightgbm
from lightgbm import LGBMClassifier
from sklearn import svm
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

  import pandas.util.testing as tm


In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
file = 'drive/MyDrive/data/Retrosheet_2010_2019/2010_to_2019_seasons.csv'
game_df = pd.read_csv(file)

game_col_del = '''VisitorGDP,VisitorCI,HomeGDP,HomeCI, DoubleHeader, DayOfWeek, VisitingTeamLeague, HomeTeamLeague, DayNight, CompletionInfo,ForfeitInfo, ProtestInfo,Duration, VisitorLineScore, HomeLineScore, UmpireHID, UmpireHName, Umpire1BID, Umpire1BName, Umpire2BID,Umpire2BName, Umpire3BID, Umpire3BName, UmpireLFID, UmpireLFName, UmpireRFID,UmpireRFName, VisitorManagerID, VisitorManagerName, HomeManagerID, HomeManagerName,WinningPitcherID, WinningPitcherName, LosingPitcherID, LosingPitcherNAme,SavingPitcherID, SavingPitcherName, GameWinningRBIID, GameWinningRBIName, VisitorBatting1Position, VisitorBatting2Position, VisitorBatting3Position,VisitorBatting4Position, VisitorBatting5Position, VisitorBatting6Position,VisitorBatting7Position, VisitorBatting8Position, VisitorBatting9Position,HomeBatting1Position, HomeBatting2Position, HomeBatting3Position,HomeBatting4Position, HomeBatting5Position, HomeBatting6Position,HomeBatting7Position, HomeBatting8Position, HomeBatting9Position,AdditionalInfo, AcquisitionInfo'''
game_col_del = game_col_del.replace(" ", "")
game_col_del = game_col_del.split(",")

game_df.drop(game_col_del,axis=1,inplace=True)

my_attendence={'BOS':37755, 'LAA':45517, 'CWS':40615, 'KC':37903, 'OAK':56782, 'TEX':40300, 'ARI':48686, 'ATL':41084, 'CIN':42319,
       'HOU':41168, 'MIL':41900, 'NYM':41922, 'PIT':38747, 'WAS':41339, 'TB':42735, 'BAL':44970, 'DET':41083, 'COL':50144,
       'MIA':37442, 'SF':41265, 'CLE':34830, 'MIN':38544, 'SEA':47929, 'TOR':49282, 'CHC':41649, 'PHI':42792, 'SD':40209, 'STL':45494,
       'NYY':54251, 'LAD':56000}
# Had an extra team due to the Marlins having two different labels. Updating their team name to MIA
game_df.replace(to_replace='FLO', value='MIA', inplace=True)

# Updating game_df so team abbreviations match. Will be leveraged when aggregating teams stats
game_df.replace(to_replace={"NYA":"NYY", "SDN":"SD", "CHN":"CHC", "SLN":"STL", "SFN":"SF", "LAN":"LAD", "TBA":"TB", "KCA":"KC", "CHA":"CWS", "ANA":"LAA", "NYN":"NYM"}, inplace=True)

for key, value in my_attendence.items():
  game_df.loc[game_df['HomeTeam']==key,'Attendence']=game_df.loc[game_df['HomeTeam']==key,'Attendence']/value


file4 = 'drive/MyDrive/data/park_factors.csv'
parkfactor_df = pd.read_csv(file4, encoding='cp949')
parkfactor_df

# Had an extra team due to the Marlins having two different labels. Updating their team name to MIA
parkfactor_df.replace(to_replace='FLO', value='MIA', inplace=True)

# Updating game_df so team abbreviations match. Will be leveraged when aggregating teams stats
parkfactor_df.replace(to_replace={"NYA":"NYY", "SDN":"SD", "CHN":"CHC", "SLN":"STL", "SFN":"SF", "LAN":"LAD", "TBA":"TB", "KCA":"KC", "CHA":"CWS", "ANA":"LAA", "NYN":"NYM"}, inplace=True)


game_df = pd.merge(game_df, parkfactor_df, on="HomeTeam")


file2 = 'drive/MyDrive/data/cleaned_piching_df.csv'
pitching_df = pd.read_csv(file2)
pitching_df.drop('Unnamed: 0', axis=1, inplace=True)
# Rearrange the pitching_df so the columns have a better flow
pitching_df = pitching_df.reindex(columns=['year','playerName','teamAbbrev','winningPercentage','runsScoredPer9','hitsPer9','strikeoutsPer9','baseOnBallsPer9','homeRunsPer9','era','whip','ops','gidp','avg'])
# Columns to be left out from the reindexing --- 'winPercentage', 'strikeoutsPer9Inn', 'walksPer9Inn', 'hitsPer9Inn'
# Marlins have the team labels: MIA & FLO. Standardizing to MIA
pitching_df.replace(to_replace='FLA', value='MIA', inplace=True)


col_del = ['VisitorBatting1PlayerID', 'VisitorBatting1Name', 'VisitorBatting2PlayerID', 'VisitorBatting2Name', 'VisitorBatting3PlayerID', 'VisitorBatting3Name', 'VisitorBatting4PlayerID', 'VisitorBatting4Name', 'VisitorBatting5PlayerID', 'VisitorBatting5Name', 'VisitorBatting6PlayerID', 'VisitorBatting6Name', 'VisitorBatting7PlayerID', 'VisitorBatting7Name', 'VisitorBatting8PlayerID', 'VisitorBatting8Name', 'VisitorBatting9PlayerID', 'VisitorBatting9Name', 'HomeBatting1PlayerID', 'HomeBatting1Name', 'HomeBatting2PlayerID', 'HomeBatting2Name', 'HomeBatting3PlayerID', 'HomeBatting3Name', 'HomeBatting4PlayerID', 'HomeBatting4Name', 'HomeBatting5PlayerID', 'HomeBatting5Name', 'HomeBatting6PlayerID', 'HomeBatting6Name', 'HomeBatting7PlayerID', 'HomeBatting7Name', 'HomeBatting8PlayerID', 'HomeBatting8Name', 'HomeBatting9PlayerID', 'HomeBatting9Name']
game_df.drop(col_del,axis=1,inplace=True)
game_df = game_df.rename(columns={"VisitingTeam":"VisitorTeam","VisitingTeamGameNumber":"VisitorTeamGameNumber"})

In [4]:
game_df['Date'] = pd.to_datetime(game_df['Date'].astype(str), format='%Y%m%d')
game_df['current_year'] = game_df['Date'].dt.year

game_df['prior_year']=game_df['current_year']-1
print(game_df.Attendence)
trend=1 #몇일 롤링
game_df[f'{trend}d_home_attendance'] = game_df.groupby(['current_year', 'HomeTeam'])['Attendence'].transform(lambda x: round(x.rolling(trend).mean().shift(periods=1, axis=0), 3))
print(game_df[f'{trend}d_home_attendance'])

0        0.991657
1        1.006489
2        1.012793
3        0.982228
4        0.980585
           ...   
24292    0.868982
24293    0.861661
24294    0.959000
24295    0.905446
24296    0.856214
Name: Attendence, Length: 24297, dtype: float64
0          NaN
1        0.992
2        1.006
3        1.013
4        0.982
         ...  
24292    0.934
24293    0.869
24294    0.862
24295    0.959
24296    0.905
Name: 1d_home_attendance, Length: 24297, dtype: float64


In [5]:

pitching_df['era'] = pd.to_numeric(pitching_df['era'],errors = 'coerce')
pitching_df['whip'] = pd.to_numeric(pitching_df['whip'],errors = 'coerce')
print(pitching_df.isnull().sum())
pitching_df.fillna(value = -1, inplace=True)
print(pitching_df.isnull().sum())

year                  0
playerName            0
teamAbbrev            0
winningPercentage     0
runsScoredPer9        0
hitsPer9              0
strikeoutsPer9        0
baseOnBallsPer9       0
homeRunsPer9          0
era                  10
whip                 10
ops                   0
gidp                  0
avg                   0
dtype: int64
year                 0
playerName           0
teamAbbrev           0
winningPercentage    0
runsScoredPer9       0
hitsPer9             0
strikeoutsPer9       0
baseOnBallsPer9      0
homeRunsPer9         0
era                  0
whip                 0
ops                  0
gidp                 0
avg                  0
dtype: int64


In [6]:
away_inning = game_df['LengthInOuts']//2
home_inning = game_df['LengthInOuts']- away_inning


game_df['VisitorOffInn'] = away_inning
game_df['VisitorDifInn'] = home_inning
game_df['HomeOffInn'] = home_inning
game_df['HomeDifInn'] = away_inning


game_df = game_df.merge(pitching_df, how='left', left_on=['prior_year','VisitorStartingPitcherName'], right_on=['year', 'playerName'])
game_df = game_df.merge(pitching_df, how='left', left_on=['prior_year','HomeStartingPitcherName'], right_on=['year', 'playerName'])

game_df['Home_team_won?'] = game_df['HomeRunsScore'] > game_df['VisitorRunsScored']
game_df['Visitor_team_won?'] = game_df['HomeRunsScore'] < game_df['VisitorRunsScored']

In [59]:
game_df.to_csv("test.csv")

In [7]:
bat_stat = [ 'TeamGameNumber','Team','AB','H','D','T','HR','RBI','SH','SF',
'HBP','BB','IBB','K','SB','CS','LOB','Pitchers',
'ER','TER','WP','Balks','PO','A','E','Passed','DB','TP','OffInn','DifInn','_team_won?']
pit_stat = ['H', 'D', 'T', 'HR', 'RBI', 'SH', 'SF', 'HBP', 'BB', 'IBB', 'K', 'LOB', 'ER','DB','AB']

In [8]:
select_stat = ['Date']
select_stat += ['Home'+i for i in bat_stat]+['Visitor'+i for i in pit_stat]
rename1 = {'Visitor'+i:'Pit'+i for i in pit_stat}
rename2 = {'Home'+i:i for i in bat_stat}

In [9]:
## Home 시각 
select_stat = ['Date']
select_stat += ['Home'+i for i in bat_stat]+['Visitor'+i for i in pit_stat]
rename1 = {'Visitor'+i:'Pit'+i for i in pit_stat}
rename2 = {'Home'+i:i for i in bat_stat}
Home = game_df.copy()
Home = Home[select_stat].rename(columns=rename1)
Home = Home.rename(columns=rename2)
# away 시각
select_stat = ['Date']
select_stat += ['Visitor'+i for i in bat_stat]+['Home'+i for i in pit_stat]
rename1 = {'Home'+i:'Pit'+i for i in pit_stat}
rename2 = {'Visitor'+i:i for i in bat_stat}
Visitor = game_df.copy()
Visitor = Visitor[select_stat].rename(columns=rename1)
Visitor = Visitor.rename(columns=rename2)

In [10]:
sep_team = pd.concat([Home,Visitor])
sep_team = sep_team.rename(columns={'PitDB':'DP'})
sep_team

Unnamed: 0,Date,TeamGameNumber,Team,AB,H,D,T,HR,RBI,SH,...,PitSH,PitSF,PitHBP,PitBB,PitIBB,PitK,PitLOB,PitER,DP,PitAB
0,2010-04-04,1,BOS,34,12,3,1,1,8,0,...,0,0,0,6,0,2,9,8,1,37
1,2010-04-06,2,BOS,35,9,3,0,1,4,0,...,0,1,2,5,0,7,10,3,1,35
2,2010-04-07,3,BOS,35,7,2,0,0,1,0,...,0,0,1,5,0,7,7,1,2,34
3,2010-04-16,10,BOS,41,6,2,0,1,1,0,...,0,0,0,3,0,11,5,1,1,41
4,2010-04-17,11,BOS,38,11,3,0,3,5,0,...,0,0,0,4,0,11,5,5,0,34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24295,2019-09-17,152,TB,33,6,1,0,2,5,0,...,0,0,1,3,0,8,4,5,1,31
24296,2019-09-18,153,TB,44,14,3,0,1,8,0,...,0,1,0,2,1,13,7,7,1,44
24297,2019-09-20,154,COL,35,10,1,0,4,5,0,...,2,1,3,4,0,7,7,5,2,33
24298,2019-09-21,155,COL,32,7,0,0,2,4,0,...,0,0,0,1,0,12,5,4,1,33


In [11]:
col = ['AB', 'H', 'D', 'T', 'HR', 'RBI','DP',
       'SF', 'BB', 'K', 'SB', 'CS', 'LOB', 'Pitchers', 'ER', 'TER', 'WP','PitAB',
       'Balks', 'PO', 'A', 'E', 'Passed', 'DB', 'TP', 'OffInn', 'DifInn',
       'PitH', 'PitD', 'PitT', 'PitHR', 'PitRBI', 'PitSF',
       'PitBB', 'PitK', 'PitLOB', 'PitER']

In [12]:
a = sep_team.copy()
sep_team['year']= sep_team.Date.dt.year
sep_team[col] = sep_team.groupby(['year','Team'])[col].transform(lambda x: x.expanding(1).sum())
sep_team[col] = sep_team[col].subtract(a[col])
sep_team = sep_team[sep_team['TeamGameNumber']>=10] #최소 10경기 이상
sep_team

Unnamed: 0,Date,TeamGameNumber,Team,AB,H,D,T,HR,RBI,SH,...,PitSF,PitHBP,PitBB,PitIBB,PitK,PitLOB,PitER,DP,PitAB,year
3,2010-04-16,10,BOS,104.0,28.0,8.0,1.0,2.0,13.0,0,...,1.0,0,16.0,0,16.0,26.0,12.0,4.0,106.0,2010
4,2010-04-17,11,BOS,145.0,34.0,10.0,1.0,3.0,14.0,0,...,1.0,0,19.0,0,27.0,31.0,13.0,5.0,147.0,2010
5,2010-04-18,12,BOS,183.0,45.0,13.0,1.0,6.0,19.0,0,...,1.0,0,23.0,0,38.0,36.0,18.0,5.0,181.0,2010
6,2010-04-19,13,BOS,210.0,50.0,15.0,1.0,6.0,20.0,1,...,1.0,0,29.0,0,44.0,41.0,19.0,8.0,213.0,2010
7,2010-04-20,14,BOS,240.0,55.0,16.0,1.0,7.0,22.0,1,...,2.0,1,32.0,1,49.0,46.0,21.0,9.0,248.0,2010
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24295,2019-09-17,152,TB,5551.0,1407.0,287.0,29.0,214.0,717.0,0,...,27.0,1,448.0,0,1600.0,996.0,714.0,131.0,5456.0,2019
24296,2019-09-18,153,TB,5584.0,1413.0,288.0,29.0,216.0,722.0,0,...,27.0,0,451.0,1,1608.0,1000.0,719.0,132.0,5487.0,2019
24297,2019-09-20,154,COL,5561.0,1477.0,320.0,41.0,216.0,790.0,0,...,42.0,3,579.0,0,1238.0,1099.0,761.0,125.0,5592.0,2019
24298,2019-09-21,155,COL,5596.0,1487.0,321.0,41.0,220.0,795.0,0,...,43.0,0,583.0,0,1245.0,1106.0,766.0,127.0,5625.0,2019


In [13]:
select_feature = ['Date', 'TeamGameNumber', 'Team','_team_won?']
# feature selection
select_feature += ['AB', 'H', 'D', 'T', 'HR', 'RBI','BB', 'K','DP', 'HBP','IBB','SF','SH',#bat stat
                    'ER','OffInn','DifInn' # game info
                    ,'PitAB','PitH', 'PitD', 'PitT', 'PitHR', 'PitRBI', 'PitSF', 'PitBB',"PitIBB","PitHBP" ,'PitK','PitLOB', 'PitER'
                    ]


In [14]:
print(sep_team.columns)
sep = sep_team[select_feature] 


Index(['Date', 'TeamGameNumber', 'Team', 'AB', 'H', 'D', 'T', 'HR', 'RBI',
       'SH', 'SF', 'HBP', 'BB', 'IBB', 'K', 'SB', 'CS', 'LOB', 'Pitchers',
       'ER', 'TER', 'WP', 'Balks', 'PO', 'A', 'E', 'Passed', 'DB', 'TP',
       'OffInn', 'DifInn', '_team_won?', 'PitH', 'PitD', 'PitT', 'PitHR',
       'PitRBI', 'PitSH', 'PitSF', 'PitHBP', 'PitBB', 'PitIBB', 'PitK',
       'PitLOB', 'PitER', 'DP', 'PitAB', 'year'],
      dtype='object')


In [15]:
# H, OBP, SLG, ISO, Babip, RC, wOBA X, K, BB, RBI, K/B, HR   
# 타자 가공
sep['RC'] = ((sep['H']+sep['BB']+sep['HBP']-sep['DP'])
        *(sep['H']+2*sep['D']+3*sep['T']+4*sep['HR']+0.52*(sep['SF']+sep['SH'])+0.26*(sep['BB']+sep['HBP']-sep['IBB']))
        )/(sep['AB']*sep['AB']) #득점 생산


#사구,사사구, 고의사구 통합
sep['BB'] += sep['HBP']+sep['IBB']
sep['PitBB'] += sep['PitHBP']+sep['PitIBB']
#del sep['HBP'],sep['IBB'],sep['PitHBP'],sep['PitIBB']
# 희생번트, 희생플라이 통합
sep['PitSF'] += sep_team['PitSH']
sep['SF'] += sep['SH']
#del sep['SH'],sep['PitSH']

sep['K/B'] = sep['K']/(sep['BB']+0.5)
sep['PA'] = sep['AB'] - sep['BB'] - sep['SF'] # 타수
sep['Babip'] = (sep['H']+sep['D']+sep['T']) / (sep['PA']-sep['K']-sep['HR']-sep['SF']) # 인플레이 타구
sep['SLG'] = (sep['H']+2*sep['D']+3*sep['T']+4*sep['HR'])/sep['PA'] # 장타율
sep['H'] = sep['H']/sep['PA'] # 안타율
sep['D'] = sep['D']/sep['PA'] # 안타율
sep['T'] = sep['T']/sep['PA'] # 안타율
sep['HR'] = sep['HR']/sep['PA'] # 안타율
sep['BB'] = sep['BB']/sep['PA'] # 볼넷
sep['RBI'] = sep['RBI']/sep['TeamGameNumber'] # 득점

sep['OBP'] = (sep['H']+sep['BB']) #출루율
sep['OPS'] = sep['SLG']+sep['OBP'] #OPS
sep['GPA'] = (1.8*sep['OBP']+sep['SLG'])/4#GPA(Gross Production Average) - park factor를 적용해야함
sep['ISO'] = sep['SLG'] - sep['H']


In [16]:
# 투수 가공
# PitH, PitOBP, PitSLG, PitOPS, PitHR, EAR, PitB, PitK, PitK/B, WHIP, kwERA
sep['ERA'] = sep['PitER']/sep['DifInn']
sep['TotalH'] = sep['PitH']+sep['PitD']+sep['PitT']+sep['PitHR']
sep['TotalBB'] = sep['PitBB']+sep['PitHBP']
sep['PitPA'] = sep['PitAB'] - sep['TotalBB'] - sep['PitSF']
sep['kwERA'] = (5.40-12*(sep['PitK'] - sep['TotalBB']))/sep['PitPA']
sep['WHIP'] = (sep['TotalH']+sep['TotalBB'])/sep['DifInn']
sep['PitK'] = sep['PitK']/sep['DifInn']
sep['PitB'] = sep['TotalBB']/sep['DifInn']
sep['PitK/B'] = sep['PitK']/(sep['PitB']+0.5)


sep['PitSLG'] = (sep['PitH']+2*sep['PitD']+3*sep['PitT']+4*sep['PitHR'])/sep['PitPA'] # 피장타율
sep['PitH'] = sep['PitH']/sep['PitPA'] # 피안타율
sep['PitOBP'] = (sep['TotalH']+sep['TotalBB']) / sep['PitAB'] #피출루율
sep['PitOPS'] = sep['PitSLG']+sep['PitOBP'] #피OPS
sep['PitHR'] /= sep['PitPA']

In [17]:
game_df.Attendence

0        0.991657
1        1.006489
2        1.012793
3        0.982228
4        0.980585
           ...   
24295    0.868982
24296    0.861661
24297    0.959000
24298    0.905446
24299    0.856214
Name: Attendence, Length: 24300, dtype: float64

In [18]:

#temp = sep[['Date', 'TeamGameNumber', 'Team', '_team_won?','GPA', 'OPS','OBP','H','SLG','Babip','RC']]
temp = sep

#game_df['Attendence'] = game_df.groupby(['Date', 'HomeTeam'])['Attendence'].transform(lambda x: round(x.rolling(1).mean().shift(periods=1, axis=0), 3)) ##x.rolling() 사이에 숫자만 변경해주면 됨

print(game_df.Attendence)
game_log = game_df[['Date','VisitorTeam','HomeTeam','VisitorTeamGameNumber','HomeTeamGameNumber',f'{trend}d_home_attendance','era_x','whip_x','era_y','whip_y']]


game_log = game_log[game_log.VisitorTeamGameNumber>=10]
game_log = game_log[game_log.HomeTeamGameNumber>=10]



game_log = pd.merge(game_log,temp,left_on = ['Date','VisitorTeam'], right_on = ['Date','Team'],how='left')
game_log = pd.merge(game_log,temp,left_on = ['Date','HomeTeam'], right_on = ['Date','Team'],how='left')

game_log['year'] = game_log.Date.dt.year

#game_log = game_log.apply(pd.to_numeric, errors='coerce')





train = game_log[game_log.year!=2019]
test = game_log[game_log.year==2019]

0        0.991657
1        1.006489
2        1.012793
3        0.982228
4        0.980585
           ...   
24295    0.868982
24296    0.861661
24297    0.959000
24298    0.905446
24299    0.856214
Name: Attendence, Length: 24300, dtype: float64


In [19]:

train.dropna()
test.dropna()

Unnamed: 0,Date,VisitorTeam,HomeTeam,VisitorTeamGameNumber,HomeTeamGameNumber,1d_home_attendance,era_x,whip_x,era_y,whip_y,...,TotalBB_y,PitPA_y,kwERA_y,WHIP_y,PitB_y,PitK/B_y,PitSLG_y,PitOBP_y,PitOPS_y,year
787,2019-04-11,TOR,BOS,13,13,0.958,4.89,1.56,3.81,1.13,...,4.0,30.0,-1.820000,0.592593,0.148148,0.514286,0.466667,0.457143,0.923810,2019
788,2019-04-12,BAL,BOS,14,14,0.967,4.88,1.38,3.82,1.26,...,7.0,58.0,-2.389655,0.566038,0.132075,0.567164,0.586207,0.454545,1.040752,2019
789,2019-04-13,BAL,BOS,15,15,0.892,5.29,1.58,4.28,1.18,...,9.0,87.0,-2.972414,0.500000,0.115385,0.645833,0.551724,0.397959,0.949683,2019
790,2019-04-14,BAL,BOS,16,16,0.949,13.50,1.80,3.58,1.14,...,17.0,119.0,-2.173109,0.600000,0.161905,0.561151,0.563025,0.459854,1.022879,2019
791,2019-04-15,BAL,BOS,17,17,0.954,4.12,1.30,3.18,1.45,...,19.0,149.0,-2.379866,0.553846,0.146154,0.583333,0.510067,0.426036,0.936103,2019
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24473,2019-08-25,NYY,LAD,132,132,0.961,5.57,1.33,2.73,1.04,...,134.0,2179.0,-2.927306,0.450000,0.074444,0.644101,0.469481,0.348537,0.818018,2019
24477,2019-09-06,SF,LAD,141,143,0.817,6.25,1.63,2.73,1.04,...,142.0,2309.0,-2.939194,0.458991,0.074658,0.647758,0.483326,0.354446,0.837772,2019
24479,2019-09-08,SF,LAD,143,145,0.962,2.81,1.13,0.00,0.25,...,153.0,2364.0,-2.931726,0.461145,0.078221,0.646331,0.482234,0.356663,0.838896,2019
24480,2019-09-17,TB,LAD,152,152,0.934,1.89,0.97,3.49,1.12,...,154.0,2394.0,-2.945113,0.457850,0.077739,0.648318,0.477861,0.354297,0.832158,2019


In [20]:
train.isnull().sum()

Date                     0
VisitorTeam              0
HomeTeam                 0
VisitorTeamGameNumber    0
HomeTeamGameNumber       0
                        ..
PitK/B_y                 1
PitSLG_y                 1
PitOBP_y                 1
PitOPS_y                 1
year                     0
Length: 115, dtype: int64

In [55]:
# 'H','OBP','SLG','ISO','Babip','RC','K','BB','RBI','K/B','HR','PitH','PitOBP','PitSLG','PitOPS','PitHR','EAR','PitB','PitK','PitK/B','WHIP'

In [21]:
y_cols = ['_team_won?_x']


select = ['H','OBP','SLG','ISO','Babip','RC','K','BB','RBI','K/B','HR','PitH','PitOBP','PitSLG','PitOPS','PitHR','ERA','PitB','PitK','PitK/B','WHIP','era','whip']
#select = ['H', 'D', 'T', 'HR', 'RBI', 'BB']
x_cols = [i+'_x' for i in select]
x_cols += [i+'_y' for i in select]
x_cols += [f'{trend}d_home_attendance']





In [81]:
# y_train = train[y_cols]*1
# y_test = test[y_cols]*1

# X_train = train[x_cols]
# X_test = test[x_cols]
# x_ = [i+'_x' for i in select]
# y_ = [i+'_y' for i in select]
# '''
# diff
# X_train = X_train[x_].to_numpy() - X_train[y_].to_numpy()
# X_test = X_test[x_].to_numpy() - X_test[y_].to_numpy()
# '''

# '''
# div
# X_train = np.log(X_train[x_].to_numpy()) - np.log(X_train[y_].to_numpy()+1e-6)
# X_test = np.log(X_test[x_].to_numpy()) - np.log(X_test[y_].to_numpy()+1e-6)
'''

'\ndiv\nX_train = np.log(X_train[x_].to_numpy()) - np.log(X_train[y_].to_numpy()+1e-6)\nX_test = np.log(X_test[x_].to_numpy()) - np.log(X_test[y_].to_numpy()+1e-6)\n'

In [24]:
'''
    Z-score normalization
'''
scaler = StandardScaler()
scaler.fit(train[x_cols])
X_train = scaler.transform(train[x_cols])
X_test = scaler.transform(test[x_cols])



ValueError: ignored

In [31]:
X_train = train[x_cols]
y_train = train[y_cols]*1


print(X_train)
X_test = test[x_cols]
y_test = test[y_cols]*1





X_train.replace([np.inf, -np.inf], np.nan)
X_test.replace([np.inf, -np.inf], np.nan)
y_train.replace([np.inf, -np.inf], np.nan)
y_test.replace([np.inf, -np.inf],np.nan)

X_train.fillna(value = -1, inplace=True)
X_test.fillna(value = -1, inplace=True)
y_train.fillna(value = -1, inplace=True)
y_test.fillna(value = -1, inplace=True)
print(X_train.isnull().sum())
print()
print(y_train.isnull().sum())



            H_x     OBP_x     SLG_x     ISO_x   Babip_x      RC_x     K_x  \
0      0.280959  0.427430  0.554816  0.273857  0.504551  0.179534   613.0   
1      0.278481  0.423832  0.550415  0.271934  0.500000  0.176896   624.0   
2      0.277155  0.422414  0.550000  0.272845  0.500000  0.176261   635.0   
3      0.277067  0.423274  0.552003  0.274936  0.500000  0.176786   641.0   
4      0.322424  0.429899  0.598384  0.275960  0.498421  0.202908   453.0   
...         ...       ...       ...       ...       ...       ...     ...   
24406  0.285483  0.387325  0.592833  0.307350  0.521844  0.181896  1375.0   
24407  0.259783  0.355460  0.509732  0.249949  0.483417  0.144400  1490.0   
24408  0.259825  0.355936  0.510283  0.250458  0.483510  0.144853  1499.0   
24409  0.259551  0.354760  0.509400  0.249848  0.482556  0.144393  1510.0   
24410  0.284507  0.385915  0.589940  0.305433  0.520852  0.180511  1391.0   

           BB_x      RBI_x     K/B_x  ...  PitOPS_y   PitHR_y     ERA_y  \


In [27]:
X_train.to_csv("1.csv")
y_train.to_csv("2.csv")

In [32]:

model = sm.Logit(y_train,X_train)
results = model.fit()
print(results.summary())

y_hat = results.predict( X_train)
y_hat = list(map(round,y_hat))
print('Train accuracy = ', accuracy_score(y_train, y_hat))

y_hat = results.predict( X_test)
y_hat = list(map(round,y_hat))
print('Test accuracy = ', accuracy_score(y_test, y_hat))

MissingDataError: ignored

In [69]:
num_thread = 8
# num of cpu 

In [70]:
param ={'num_leaves':[4,6,8,10,20],'max_depth':[-1,3,6,9,12,15]}
lgb =LGBMClassifier()
lgb_clf = GridSearchCV(lgb,param,scoring='accuracy',n_jobs=num_thread)
lgb_results = lgb_clf.fit(X_train, y_train.values.ravel(), eval_metric=['logloss'],eval_set=[(X_test, y_test.values.ravel())],callbacks=[lightgbm.early_stopping(10, verbose=0)])

train_score = lgb_results.score(X_train, y_train)
test_score = lgb_results.score(X_test, y_test)
print(lgb_results.best_params_)
print(test_score)

[1]	valid_0's binary_logloss: 0.68885
[2]	valid_0's binary_logloss: 0.687465
[3]	valid_0's binary_logloss: 0.686324
[4]	valid_0's binary_logloss: 0.684882
[5]	valid_0's binary_logloss: 0.684421
[6]	valid_0's binary_logloss: 0.683175
[7]	valid_0's binary_logloss: 0.681801
[8]	valid_0's binary_logloss: 0.681543
[9]	valid_0's binary_logloss: 0.680826
[10]	valid_0's binary_logloss: 0.680015
[11]	valid_0's binary_logloss: 0.679845
[12]	valid_0's binary_logloss: 0.67939
[13]	valid_0's binary_logloss: 0.678595
[14]	valid_0's binary_logloss: 0.678052
[15]	valid_0's binary_logloss: 0.67753
[16]	valid_0's binary_logloss: 0.677074
[17]	valid_0's binary_logloss: 0.67678
[18]	valid_0's binary_logloss: 0.676194
[19]	valid_0's binary_logloss: 0.676129
[20]	valid_0's binary_logloss: 0.676092
[21]	valid_0's binary_logloss: 0.67578
[22]	valid_0's binary_logloss: 0.675307
[23]	valid_0's binary_logloss: 0.675057
[24]	valid_0's binary_logloss: 0.674774
[25]	valid_0's binary_logloss: 0.674545
[26]	valid_0's

In [None]:
rf_param ={'min_samples_split':[4,6,8,10,20],'max_depth':[-1,3,6,9,12,15]}
rf =RandomForestClassifier()
rf_clf = GridSearchCV(rf,rf_param,scoring='accuracy',n_jobs=num_thread)
rf_results = rf_clf.fit(X_train,  y_train.values.ravel())


train_score = rf_results.score(X_train, y_train)
test_score = rf_results.score(X_test, y_test)
print(rf_results.best_params_)
print(test_score)

{'max_depth': 6, 'min_samples_split': 20}
0.5819573097060008


In [71]:
param_range = [0.001, 0.01, 0.1, 1.0, 10.0, 100.0] 
svm_param  = [{'C': param_range, 'kernel': ['linear']}, {'C': param_range, 'gamma': param_range, 'kernel': ['rbf']}]
svm_model =svm.SVC()
svm_clf = GridSearchCV(estimator=svm_model,param_grid=svm_param,scoring='accuracy',n_jobs=num_thread,verbose=0)
svm_results = svm_clf.fit(X_train, y_train.values.ravel())

train_score = svm_results.score(X_train, y_train)
test_score = svm_results.score(X_test, y_test)
print(svm_results.best_params_)
print(test_score)

KeyboardInterrupt: ignored