In [1]:
import pandas as pd 

## Loading Teams.csv into a DataFrame and previewing its first five rows
teams_csv = 'Teams.csv'
teams = pd.read_csv(teams_csv)
teams.head(n=5)

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1871,,BS1,BNA,,3,31,,20,10,...,24,0.834,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1
1,1871,,CH1,CNA,,2,28,,19,9,...,16,0.829,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1
2,1871,,CL1,CFC,,8,29,,10,19,...,15,0.818,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1
3,1871,,FW1,KEK,,7,19,,7,12,...,8,0.803,Fort Wayne Kekiongas,Hamilton Field,,101,107,KEK,FW1,FW1
4,1871,,NY2,NNA,,5,33,,16,17,...,14,0.84,New York Mutuals,Union Grounds (Brooklyn),,90,88,NYU,NY2,NY2


In [2]:
## Selecting observations and variables of interest: rows with yearID > 2000,
## restricted to the columns used by the models below.
## The explicit .copy() makes my_teams an independent DataFrame, so the columns
## added to it in later cells are never treated as writes into a slice of `teams`.
columns_of_interest = ['yearID', 'teamID', 'lgID', 'G', 'W', 'L', 'R', 'RA']
my_teams = teams.loc[teams['yearID'] > 2000, columns_of_interest].copy()
my_teams.head()

Unnamed: 0,yearID,teamID,lgID,G,W,L,R,RA
2355,2001,ANA,AL,162,75,87,691,730
2356,2001,ARI,NL,162,92,70,818,677
2357,2001,ATL,NL,162,88,74,729,643
2358,2001,BAL,AL,162,63,98,687,829
2359,2001,BOS,AL,161,82,79,772,745


In [3]:
import numpy as np

## Deriving three columns in one chained step:
##   RD     = R - RA
##   Wpct   = W / (W + L)
##   League = 0 where lgID is 'NL', 1 for every other lgID value
my_teams = my_teams.assign(
    RD=lambda df: df['R'] - df['RA'],
    Wpct=lambda df: df['W'] / (df['W'] + df['L']),
    League=lambda df: np.where(df['lgID'] == 'NL', 0, 1),
)
my_teams.head()

Unnamed: 0,yearID,teamID,lgID,G,W,L,R,RA,RD,Wpct,League
2355,2001,ANA,AL,162,75,87,691,730,-39,0.462963,1
2356,2001,ARI,NL,162,92,70,818,677,141,0.567901,0
2357,2001,ATL,NL,162,88,74,729,643,86,0.54321,0
2358,2001,BAL,AL,162,63,98,687,829,-142,0.391304,1
2359,2001,BOS,AL,161,82,79,772,745,27,0.509317,1


In [4]:
import statsmodels.formula.api as smf

## Fitting an ordinary-least-squares regression of Wpct on RD and League
wpct_formula = 'Wpct ~ RD + League'
lm_md = smf.ols(formula = wpct_formula, data = my_teams).fit()

## Displaying the fitted model's summary tables
lm_md.summary()

0,1,2,3
Dep. Variable:,Wpct,R-squared:,0.856
Model:,OLS,Adj. R-squared:,0.855
Method:,Least Squares,F-statistic:,1769.0
Date:,"Wed, 10 Nov 2021",Prob (F-statistic):,1.31e-251
Time:,19:10:54,Log-Likelihood:,1289.4
No. Observations:,600,AIC:,-2573.0
Df Residuals:,597,BIC:,-2560.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.5005,0.002,312.372,0.000,0.497,0.504
RD,0.0006,1.06e-05,59.426,0.000,0.001,0.001
League,-0.0010,0.002,-0.449,0.653,-0.006,0.004

0,1,2,3
Omnibus:,36.176,Durbin-Watson:,2.198
Prob(Omnibus):,0.0,Jarque-Bera (JB):,98.154
Skew:,0.254,Prob(JB):,4.85e-22
Kurtosis:,4.915,Cond. No.,248.0


In [5]:
## Refitting the regression with RD as the only predictor
## (this rebinds lm_md, replacing the earlier fit that also included League)
lm_md = smf.ols(formula = 'Wpct ~ RD', data = my_teams).fit()

## Root-mean-squared error of the fitted values against the observed Wpct
lm_sq_err = (lm_md.fittedvalues - my_teams['Wpct'])**2
RMSE_lm = np.sqrt(np.mean(lm_sq_err))
round(RMSE_lm, 4)

0.0282

In [6]:
## Pythagorean estimate of Wpct: R^k / (R^k + RA^k) with exponent k = 1.85
runs_scored_k = my_teams['R']**1.85
runs_allowed_k = my_teams['RA']**1.85
my_teams['Wpct_pyt'] = runs_scored_k / (runs_scored_k + runs_allowed_k)
my_teams.head()

Unnamed: 0,yearID,teamID,lgID,G,W,L,R,RA,RD,Wpct,League,Wpct_pyt
2355,2001,ANA,AL,162,75,87,691,730,-39,0.462963,1,0.474628
2356,2001,ARI,NL,162,92,70,818,677,141,0.567901,0,0.586618
2357,2001,ATL,NL,162,88,74,729,643,86,0.54321,0,0.557798
2358,2001,BAL,AL,162,63,98,687,829,-142,0.391304,1,0.413967
2359,2001,BOS,AL,161,82,79,772,745,27,0.509317,1,0.516459


In [7]:
## Root-mean-squared error of the Pythagorean estimates against the observed Wpct
pyt_sq_err = (my_teams['Wpct_pyt'] - my_teams['Wpct'])**2
RMSE_pyt = np.sqrt(np.mean(pyt_sq_err))
round(RMSE_pyt, 4)

0.0257

In [8]:
## We would use the Pythagorean model to predict Wpct because, on the same seasons, it has the smaller RMSE (0.0257 vs. 0.0282 for the linear model).

In [9]:
def pythagorean_wpct(runs_scored, runs_allowed, k=1.85):
    """Return the Pythagorean winning-percentage estimate.

    Evaluates runs_scored**k / (runs_scored**k + runs_allowed**k), the same
    formula this notebook applies to the R and RA columns.

    Parameters
    ----------
    runs_scored : number
        Value used in the numerator (the R term of the formula).
    runs_allowed : number
        Value used only in the denominator (the RA term of the formula).
    k : float, default 1.85
        Exponent applied to both terms.

    Returns
    -------
    float
        The estimated winning percentage.

    Raises
    ------
    ZeroDivisionError
        If both inputs are plain Python numbers equal to 0.
    """
    scored_k = runs_scored**k
    return scored_k / (scored_k + runs_allowed**k)


## Estimating the Wpct for R = 730 and RA = 750 with the default exponent
Wpct = pythagorean_wpct(730, 750)
round(Wpct, 4)

0.4875