In [250]:
import numpy as np
import os
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from patsy import dmatrices


# Model Player Salaries

In [251]:
df = pd.read_csv(os.path.join("data", "db", "ModelInput.csv"))
df.head()


Unnamed: 0,Adjusted Salary,Adjusted Team Payroll,Batting_Career_Num_Seasons,Batting_Career_G,Batting_Career_SB,Batting_Career_RBI,Batting_Career_AVG,Batting_Career_PSN,Batting_Career_SLG,Batting_Career_H,...,Num_Post_Season_Appearances,Num_All_Star_Appearances,0.0,1B,2B,3B,C,MULTIPLE,P,SS
0,0.296,0.221,0.522,0.297,0.008,0.158,0.227,0.028,0.148,0.197,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1,0.266,0.221,0.348,0.257,0.176,0.086,0.264,0.07,0.129,0.174,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
2,0.399,0.221,0.478,0.464,0.032,0.28,0.251,0.096,0.155,0.343,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.296,0.221,0.522,0.341,0.024,0.271,0.234,0.079,0.203,0.231,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.123,0.221,0.0,0.012,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [252]:
df.rename(columns={'0.0' : 'NO_POSITION',
                   '1B' : 'FIRST_BASE',
                   '2B' : 'SECOND_BASE',
                   '3B' : 'THIRD_BASE',
                   'C' : 'CATCHER',
                   'P' : 'PITCHER',
                   'SS' : 'SHORT_STOP',
                   'Adjusted Salary' : 'Adjusted_Salary',
                   'Adjusted Team Payroll' : 'Adjusted_Team_Payroll'}, inplace=True)
predictor_vars = df.columns.values[1:]
response_variable = df.columns.values[0]
formula = "{} ~ {}".format(response_variable, ' + '.join(predictor_vars))
print(formula)

Adjusted_Salary ~ Adjusted_Team_Payroll + Batting_Career_Num_Seasons + Batting_Career_G + Batting_Career_SB + Batting_Career_RBI + Batting_Career_AVG + Batting_Career_PSN + Batting_Career_SLG + Batting_Career_H + Batting_Career_2B + Batting_Career_3B + Batting_Career_HR + Batting_Career_TB + Batting_Career_R + Batting_Career_OBP + Pitching_Career_SO + Pitching_Career_W + Pitching_Career_L + Pitching_Career_Num_Seasons + Pitching_Career_ERA + Pitching_Career_IP + Pitching_Career_ER + Pitching_Career_GS + Fielding_Num_Seasons + Fielding_G + Fielding_Career_A + Fielding_Career_PO + Fielding_Career_E + Fielding_Career_G + Fielding_Career_FPCT + Num_Post_Season_Appearances + Num_All_Star_Appearances + NO_POSITION + FIRST_BASE + SECOND_BASE + THIRD_BASE + CATCHER + MULTIPLE + PITCHER + SHORT_STOP


In [253]:
y, X = dmatrices('Adjusted_Salary ~ Batting_Career_TB'\
                 '+ Pitching_Career_IP + Pitching_Career_SO '\
                 '+ Num_All_Star_Appearances '\
                 '+ NO_POSITION + FIRST_BASE + SECOND_BASE', data=df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()  
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:        Adjusted_Salary   R-squared:                       0.641
Model:                            OLS   Adj. R-squared:                  0.640
Method:                 Least Squares   F-statistic:                     642.3
Date:                Thu, 27 Oct 2016   Prob (F-statistic):               0.00
Time:                        17:20:23   Log-Likelihood:                -2876.7
No. Observations:                2525   AIC:                             5769.
Df Residuals:                    2517   BIC:                             5816.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [95.0% Conf. Int.]
--------------------------------------------------------------------------------------------
Intercept               

## Model Team Wins

In [254]:
teams_df = pd.read_csv(os.path.join("data", "lahman", "baseballdatabank-master", "core", "Teams.csv"))
teams_df['WIN_PCT'] = teams_df['W'] / (teams_df['W'] + teams_df['L'])
teams_df.head()

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,WIN_PCT
0,1871,,BS1,BNA,,3,31,,20,10,...,0.83,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1,0.666667
1,1871,,CH1,CNA,,2,28,,19,9,...,0.82,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1,0.678571
2,1871,,CL1,CFC,,8,29,,10,19,...,0.81,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1,0.344828
3,1871,,FW1,KEK,,7,19,,7,12,...,0.8,Fort Wayne Kekiongas,Hamilton Field,,101,107,KEK,FW1,FW1,0.368421
4,1871,,NY2,NNA,,5,33,,16,17,...,0.83,New York Mutuals,Union Grounds (Brooklyn),,90,88,NYU,NY2,NY2,0.484848


In [293]:
# Scale values from 0 to 1
print(teams_df.columns[14:39])
for column in teams_df.columns[14:39]:
    print(column)
    teams_df[column] = (teams_df[column] - teams_df[column].min()) / (teams_df[column].max() - teams_df[column].min()) 
teams_df = teams_df.round(3)
teams_df = teams_df.fillna(0.0)
teams_df.head()

Index(['R', 'AB', 'H', 'SECOND_BASE_HITS', 'THIRD_BASE_HITS', 'HR', 'BB', 'SO',
       'SB', 'CS', 'HBP', 'SF', 'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts',
       'HA', 'HRA', 'BBA', 'SOA', 'E', 'DP'],
      dtype='object')
R
AB
H
SECOND_BASE_HITS
THIRD_BASE_HITS
HR
BB
SO
SB
CS
HBP
SF
RA
ER
ERA
CG
SHO
SV
IPouts
HA
HRA
BBA
SOA
E
DP


Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro,WIN_PCT
0,1871,0,BS1,BNA,0,3,31,0.0,20,10,...,0.303,Boston Red Stockings,South End Grounds I,0.0,103,98,BOS,BS1,BS1,0.667
1,1871,0,CH1,CNA,0,2,28,0.0,19,9,...,0.26,Chicago White Stockings,Union Base-Ball Grounds,0.0,104,102,CHI,CH1,CH1,0.679
2,1871,0,CL1,CFC,0,8,29,0.0,10,19,...,0.216,Cleveland Forest Citys,National Association Grounds,0.0,96,100,CLE,CL1,CL1,0.345
3,1871,0,FW1,KEK,0,7,19,0.0,7,12,...,0.173,Fort Wayne Kekiongas,Hamilton Field,0.0,101,107,KEK,FW1,FW1,0.368
4,1871,0,NY2,NNA,0,5,33,0.0,16,17,...,0.303,New York Mutuals,Union Grounds (Brooklyn),0.0,90,88,NYU,NY2,NY2,0.485


In [255]:
teams_df.rename(columns={'2B' : 'SECOND_BASE_HITS', '3B' : 'THIRD_BASE_HITS'}, inplace=True)
teams_df.columns

Index(['yearID', 'lgID', 'teamID', 'franchID', 'divID', 'Rank', 'G', 'Ghome',
       'W', 'L', 'DivWin', 'WCWin', 'LgWin', 'WSWin', 'R', 'AB', 'H',
       'SECOND_BASE_HITS', 'THIRD_BASE_HITS', 'HR', 'BB', 'SO', 'SB', 'CS',
       'HBP', 'SF', 'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts', 'HA',
       'HRA', 'BBA', 'SOA', 'E', 'DP', 'FP', 'name', 'park', 'attendance',
       'BPF', 'PPF', 'teamIDBR', 'teamIDlahman45', 'teamIDretro', 'WIN_PCT'],
      dtype='object')

In [256]:
predictor_vars = teams_df.columns.values[1:]
response_variable = teams_df.columns.values[0]
formula = "{} ~ {}".format(response_variable, ' + '.join(predictor_vars))
print(formula)

yearID ~ lgID + teamID + franchID + divID + Rank + G + Ghome + W + L + DivWin + WCWin + LgWin + WSWin + R + AB + H + SECOND_BASE_HITS + THIRD_BASE_HITS + HR + BB + SO + SB + CS + HBP + SF + RA + ER + ERA + CG + SHO + SV + IPouts + HA + HRA + BBA + SOA + E + DP + FP + name + park + attendance + BPF + PPF + teamIDBR + teamIDlahman45 + teamIDretro + WIN_PCT


In [299]:
y, X = dmatrices('WIN_PCT ~ R + AB + THIRD_BASE_HITS + HR + BB + SB + SF + RA + ER + ERA + CG + SHO + SV + E + FP + BPF + PPF', data=teams_df, return_type='dataframe')
mod = sm.OLS(y, X)
res = mod.fit()  
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                WIN_PCT   R-squared:                       0.900
Model:                            OLS   Adj. R-squared:                  0.900
Method:                 Least Squares   F-statistic:                     1483.
Date:                Fri, 28 Oct 2016   Prob (F-statistic):               0.00
Time:                        07:02:16   Log-Likelihood:                 5847.4
No. Observations:                2805   AIC:                        -1.166e+04
Df Residuals:                    2787   BIC:                        -1.155e+04
Df Model:                          17                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [95.0% Conf. Int.]
-----------------------------------------------------------------------------------
Intercept           0.4496      0.019     