In [43]:
import numpy as np
import pandas as pd
import os

teams_df = pd.read_csv(os.path.join("data", "lahman", "baseballdatabank-master", "core", "Teams.csv"))
teams_df.head()

Unnamed: 0,yearID,lgID,teamID,franchID,divID,Rank,G,Ghome,W,L,...,DP,FP,name,park,attendance,BPF,PPF,teamIDBR,teamIDlahman45,teamIDretro
0,1871,,BS1,BNA,,3,31,,20,10,...,,0.83,Boston Red Stockings,South End Grounds I,,103,98,BOS,BS1,BS1
1,1871,,CH1,CNA,,2,28,,19,9,...,,0.82,Chicago White Stockings,Union Base-Ball Grounds,,104,102,CHI,CH1,CH1
2,1871,,CL1,CFC,,8,29,,10,19,...,,0.81,Cleveland Forest Citys,National Association Grounds,,96,100,CLE,CL1,CL1
3,1871,,FW1,KEK,,7,19,,7,12,...,,0.8,Fort Wayne Kekiongas,Hamilton Field,,101,107,KEK,FW1,FW1
4,1871,,NY2,NNA,,5,33,,16,17,...,,0.83,New York Mutuals,Union Grounds (Brooklyn),,90,88,NYU,NY2,NY2


## Subset

In [44]:
subset_columns = ['G', 'W', 'L']
subset_columns.extend(teams_df.columns[14:-3].values)
subset_columns.remove('name')
subset_columns.remove('park')
subset_columns.remove('attendance')
teams_df = pd.DataFrame(teams_df, columns=subset_columns)
teams_df.head()

Unnamed: 0,G,W,L,R,AB,H,2B,3B,HR,BB,...,IPouts,HA,HRA,BBA,SOA,E,DP,FP,BPF,PPF
0,31,20,10,401,1372,426,70,37,3,60,...,828,367,2,42,23,225,,0.83,103,98
1,28,19,9,302,1196,323,52,21,10,60,...,753,308,6,28,22,218,,0.82,104,102
2,29,10,19,249,1186,328,35,40,7,26,...,762,346,13,53,34,223,,0.81,96,100
3,19,7,12,137,746,178,19,8,2,33,...,507,261,5,21,17,163,,0.8,101,107
4,33,16,17,302,1404,403,43,21,1,33,...,879,373,7,42,22,227,,0.83,90,88


In [45]:
teams_df['Winning Percentage'] = teams_df['W'] / (teams_df['W'] + teams_df['L'])
teams_df.head()

Unnamed: 0,G,W,L,R,AB,H,2B,3B,HR,BB,...,HA,HRA,BBA,SOA,E,DP,FP,BPF,PPF,Winning Percentage
0,31,20,10,401,1372,426,70,37,3,60,...,367,2,42,23,225,,0.83,103,98,0.666667
1,28,19,9,302,1196,323,52,21,10,60,...,308,6,28,22,218,,0.82,104,102,0.678571
2,29,10,19,249,1186,328,35,40,7,26,...,346,13,53,34,223,,0.81,96,100,0.344828
3,19,7,12,137,746,178,19,8,2,33,...,261,5,21,17,163,,0.8,101,107,0.368421
4,33,16,17,302,1404,403,43,21,1,33,...,373,7,42,22,227,,0.83,90,88,0.484848


## Normalization

In [47]:
for column in teams_df.columns:
     teams_df[column] = (teams_df[column] - teams_df[column].min()) / (teams_df[column].max() - teams_df[column].min())
teams_df = teams_df.round(3)
teams_df = teams_df.fillna(0.0)
teams_df.head()
teams_df.head()

Unnamed: 0,G,W,L,R,AB,H,2B,3B,HR,BB,...,HA,HRA,BBA,SOA,E,DP,FP,BPF,PPF,Winning Percentage
0,0.157,0.172,0.046,0.315,0.208,0.225,0.18,0.247,0.011,0.072,...,0.164,0.008,0.051,0.016,0.301,0.0,0.303,0.623,0.469,0.742
1,0.138,0.164,0.038,0.232,0.177,0.166,0.131,0.14,0.038,0.072,...,0.133,0.025,0.034,0.015,0.289,0.0,0.26,0.638,0.519,0.755
2,0.145,0.086,0.115,0.188,0.175,0.169,0.086,0.267,0.027,0.031,...,0.153,0.054,0.064,0.023,0.297,0.0,0.216,0.522,0.494,0.384
3,0.082,0.06,0.062,0.094,0.096,0.083,0.043,0.053,0.008,0.04,...,0.109,0.021,0.025,0.012,0.196,0.0,0.173,0.594,0.58,0.41
4,0.17,0.138,0.1,0.232,0.214,0.211,0.107,0.14,0.004,0.04,...,0.167,0.029,0.051,0.015,0.304,0.0,0.303,0.435,0.346,0.539


## Regression Model

In [48]:
y = np.asarray(teams_df['Winning Percentage'])
teams_df.drop(['Winning Percentage', 'G', 'W', 'L'],inplace=True,axis=1)
X = np.asarray(teams_df)


In [None]:
y

In [49]:
from sklearn import linear_model
from sklearn import cross_validation

regr = linear_model.LinearRegression()
x_train, x_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.3, random_state=55)
regr.fit(x_train, y_train)
score = regr.score(x_test, y_test)
print(score)

0.915994428193


In [50]:
regr.coef_

array([ 0.88166259, -0.39066642,  0.00736825, -0.01692215, -0.02933152,
       -0.05329997, -0.04784067,  0.00515573, -0.03742941, -0.00466127,
        0.006104  , -0.01342295, -0.67345721,  0.39531509, -0.34139359,
        0.07038364,  0.0437575 ,  0.11423094,  0.04358548, -0.04932805,
       -0.00136819, -0.00936989, -0.02105822,  0.0622386 ,  0.00195577,
        0.21093697,  0.52711669, -0.60848766])

In [51]:
teams_df.head()

Unnamed: 0,R,AB,H,2B,3B,HR,BB,SO,SB,CS,...,IPouts,HA,HRA,BBA,SOA,E,DP,FP,BPF,PPF
0,0.315,0.208,0.225,0.18,0.247,0.011,0.072,0.012,0.126,0.0,...,0.153,0.164,0.008,0.051,0.016,0.301,0.0,0.303,0.623,0.469
1,0.232,0.177,0.166,0.131,0.14,0.038,0.072,0.014,0.119,0.0,...,0.136,0.133,0.025,0.034,0.015,0.289,0.0,0.26,0.638,0.519
2,0.188,0.175,0.169,0.086,0.267,0.027,0.031,0.016,0.031,0.0,...,0.138,0.153,0.054,0.064,0.023,0.297,0.0,0.216,0.522,0.494
3,0.094,0.096,0.083,0.043,0.053,0.008,0.04,0.006,0.028,0.0,...,0.079,0.109,0.021,0.025,0.012,0.196,0.0,0.173,0.594,0.58
4,0.232,0.214,0.211,0.107,0.14,0.004,0.04,0.01,0.079,0.0,...,0.165,0.167,0.029,0.051,0.015,0.304,0.0,0.303,0.435,0.346


In [54]:
teams_df.columns

Index(['R', 'AB', 'H', '2B', '3B', 'HR', 'BB', 'SO', 'SB', 'CS', 'HBP', 'SF',
       'RA', 'ER', 'ERA', 'CG', 'SHO', 'SV', 'IPouts', 'HA', 'HRA', 'BBA',
       'SOA', 'E', 'DP', 'FP', 'BPF', 'PPF'],
      dtype='object')