Will use a generalized linear model in this notebook.

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import statsmodels.api as sm

from sklearn.preprocessing import StandardScaler

# Batter

In [4]:
# read in batter training data
batter_train = pd.read_csv('../Modeling_Data/batter_data_modeling.csv')
# read in batter testing data
batter_test = pd.read_csv('../Modeling_Data/batter_test_data.csv')

In [5]:
# set up X and y from training data
X = batter_train.drop(columns=['Name', 'Team', 'POS', 'PTS', 'FPPG'])
y = batter_train['FPPG']

In [6]:
# add constant to X
X = sm.add_constant(X)

In [7]:
# scale data
sc = StandardScaler()
# fit and transform X
X_sc = sc.fit_transform(X)

In [8]:
# GLM
glm_batter = sm.GLM(y, X_sc, family=sm.families.Poisson(link=sm.families.links.log())).fit()

In [9]:
results_summary = glm_batter.summary()

In [10]:
glm_batter.summary()

0,1,2,3
Dep. Variable:,FPPG,No. Observations:,635.0
Model:,GLM,Df Residuals:,620.0
Model Family:,Poisson,Df Model:,14.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-4191.5
Date:,"Wed, 12 May 2021",Deviance:,6050.3
Time:,09:56:19,Pearson chi2:,70100.0
No. Iterations:,100,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0,0,,,0,0
x1,-0.4131,0.116,-3.562,0.000,-0.640,-0.186
x2,4.8106,0.224,21.475,0.000,4.372,5.250
x3,0.2101,0.117,1.802,0.072,-0.018,0.439
x4,-2.2638,0.138,-16.399,0.000,-2.534,-1.993
x5,-0.1267,0.065,-1.942,0.052,-0.255,0.001
x6,-0.0595,0.024,-2.443,0.015,-0.107,-0.012
x7,-0.3109,0.080,-3.887,0.000,-0.468,-0.154
x8,0.1884,0.092,2.045,0.041,0.008,0.369


In [11]:
# create data frame of coefficients
# save as html
results_as_html = results_summary.tables[1].as_html()
# read in html
results_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
# set index from X columns
results_df['Features'] = X.columns
results_df.set_index('Features', inplace=True)

In [12]:
# sort values
results_df.sort_values(by='coef')

Unnamed: 0_level_0,coef,std err,z,P>|z|,[0.025,0.975]
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
H,-2.2638,0.138,-16.399,0.0,-2.534,-1.993
TB,-1.3953,0.072,-19.333,0.0,-1.537,-1.254
BB,-0.6248,0.063,-9.908,0.0,-0.748,-0.501
GMS,-0.4131,0.116,-3.562,0.0,-0.64,-0.186
HR,-0.3109,0.08,-3.887,0.0,-0.468,-0.154
2B,-0.1267,0.065,-1.942,0.052,-0.255,0.001
AVG,-0.0731,0.08,-0.918,0.359,-0.229,0.083
CS,-0.0605,0.034,-1.804,0.071,-0.126,0.005
3B,-0.0595,0.024,-2.443,0.015,-0.107,-0.012
const,0.0,0.0,,,0.0,0.0


# Interpert Coefficient Values

---

# Pitcher

In [13]:
# read in pitcher training data
pitcher_train = pd.read_csv('../Modeling_Data/pitcher_data_modeling.csv')
# read in batter testing data
pitcher_test = pd.read_csv('../Modeling_Data/pitcher_test_data.csv')

In [14]:
# set up X and y from training data
X = pitcher_train.drop(columns=['Name', 'Team', 'POS', 'PTS', 'AVG', 'FPPG'])
y = pitcher_train['FPPG']

In [15]:
# add constant
X = sm.add_constant(X)

In [16]:
# scale data
sc = StandardScaler()
# fit and transform X
X_sc = sc.fit_transform(X)

In [17]:
# GLM
glm_pitcher = sm.GLM(y, X_sc, family=sm.families.Poisson(link=sm.families.links.log())).fit()

In [18]:
results_summary = glm_pitcher.summary()

In [19]:
glm_pitcher.summary()

0,1,2,3
Dep. Variable:,FPPG,No. Observations:,760.0
Model:,GLM,Df Residuals:,746.0
Model Family:,Poisson,Df Model:,13.0
Link Function:,log,Scale:,1.0
Method:,IRLS,Log-Likelihood:,-7819.2
Date:,"Wed, 12 May 2021",Deviance:,12667.0
Time:,09:56:19,Pearson chi2:,157000.0
No. Iterations:,9,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,0,0,,,0,0
x1,-0.0308,0.033,-0.942,0.346,-0.095,0.033
x2,-0.0480,0.033,-1.475,0.140,-0.112,0.016
x3,-1.9378,0.053,-36.836,0.000,-2.041,-1.835
x4,1.0503,0.081,13.012,0.000,0.892,1.209
x5,0.5868,0.038,15.491,0.000,0.513,0.661
x6,-0.1359,0.125,-1.090,0.276,-0.380,0.108
x7,-0.1360,0.109,-1.251,0.211,-0.349,0.077
x8,-0.0454,0.159,-0.285,0.776,-0.358,0.267


In [20]:
# create data frame of coefficients
# save as html
results_as_html = results_summary.tables[1].as_html()
# read in html
results_df = pd.read_html(results_as_html, header=0, index_col=0)[0]
# set index from X columns
results_df['Features'] = X.columns
results_df.set_index('Features', inplace=True)

In [21]:
# sort values
results_df.sort_values(by='coef')

Unnamed: 0_level_0,coef,std err,z,P>|z|,[0.025,0.975]
Features,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
WHIP,-9.5005,2.198,-4.322,0.0,-13.809,-5.192
GMS,-1.9378,0.053,-36.836,0.0,-2.041,-1.835
ERA,-0.7475,0.041,-18.185,0.0,-0.828,-0.667
H,-0.136,0.109,-1.251,0.211,-0.349,0.077
IP,-0.1359,0.125,-1.09,0.276,-0.38,0.108
L,-0.048,0.033,-1.475,0.14,-0.112,0.016
R,-0.0454,0.159,-0.285,0.776,-0.358,0.267
W,-0.0308,0.033,-0.942,0.346,-0.095,0.033
const,0.0,0.0,,,0.0,0.0
HR,0.0233,0.036,0.644,0.52,-0.048,0.094


# Interpert

The coefficients provided in the GLM are very useful in this project and can be used provide insights in to the strategy of building a solid line up.