In [2]:
# y = 3.2 + 1.87*x + 2.1*x^2
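# 1. Interpretation of 2.1: For one unit of increase in x, on average, y will increase by 2.1. Similarly, for
# two units of increase in x, on average, y will increase by 8.4.
# NOTE(review): these figures are the contribution of the squared term alone when starting from x = 0
# (2.1*1^2 = 2.1 and 2.1*2^2 = 8.4). The total change in y also includes the linear term, and the
# marginal effect of x in this model is 1.87 + 4.2*x, so it depends on the current value of x.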
# 2. R^2 = 91%. Interpretation: 91% of the variability in y can be explained by a quadratic model of x.
# 3. x = 3.2 => y = 3.2 + 1.87*3.2 + 2.1*3.2**2 = 30.688

In [4]:
import pandas as pd 
import numpy as np

import statsmodels.formula.api as smf

# Load the batting records from the CSV file in the working directory,
# then preview the first five rows.
batting = pd.read_csv(filepath_or_buffer='Batting.csv')
batting.head(5)

Unnamed: 0,playerID,yearID,stint,teamID,lgID,G,AB,R,H,2B,...,RBI,SB,CS,BB,SO,IBB,HBP,SH,SF,GIDP
0,abercda01,1871,1,TRO,,1,4,0,0,0,...,0.0,0.0,0.0,0,0.0,,,,,0.0
1,addybo01,1871,1,RC1,,25,118,30,32,6,...,13.0,8.0,1.0,4,0.0,,,,,0.0
2,allisar01,1871,1,CL1,,29,137,28,40,4,...,19.0,3.0,1.0,2,5.0,,,,,1.0
3,allisdo01,1871,1,WS3,,27,133,28,44,10,...,27.0,1.0,1.0,0,2.0,,,,,0.0
4,ansonca01,1871,1,RC1,,25,120,29,39,11,...,16.0,6.0,2.0,2,1.0,,,,,0.0


In [8]:
# Per-player totals: sum the AB, HR and SO columns over all rows sharing a playerID,
# then give the summed columns a `tot_` prefix.
batting_agg = (
    batting
    .groupby('playerID')[['AB', 'HR', 'SO']]
    .sum()
    .reset_index()
    .rename(columns={'AB': 'tot_AB', 'HR': 'tot_HR', 'SO': 'tot_SO'})
)
batting_agg

Unnamed: 0,playerID,tot_AB,tot_HR,tot_SO
0,aardsda01,4,0,2.0
1,aaronha01,12364,755,1383.0
2,aaronto01,944,13,145.0
3,aasedo01,5,0,3.0
4,abadan01,21,0,5.0
...,...,...,...,...
19893,zupofr01,18,0,6.0
19894,zuvelpa01,491,2,50.0
19895,zuverge01,142,0,39.0
19896,zwilldu01,1280,30,155.0


In [9]:
# Keep only the players whose tot_AB is strictly greater than 5000,
# and renumber the remaining rows from 0.
batting_agg = (
    batting_agg
    .loc[lambda totals: totals['tot_AB'] > 5000]
    .reset_index(drop=True)
)
batting_agg

Unnamed: 0,playerID,tot_AB,tot_HR,tot_SO
0,aaronha01,12364,755,1383.0
1,abreubo01,8480,288,1840.0
2,adamssp01,5557,9,223.0
3,adcocjo01,6606,336,1059.0
4,alfoned01,5385,146,617.0
...,...,...,...,...
801,zeileto01,7573,253,1279.0
802,zimmehe01,5304,58,432.0
803,zimmery01,6399,270,1307.0
804,ziskri01,5144,207,910.0


In [10]:
# Express tot_HR and tot_SO as fractions of tot_AB.
batting_agg = batting_agg.assign(
    HR_rate=lambda totals: totals['tot_HR'] / totals['tot_AB'],
    SO_rate=lambda totals: totals['tot_SO'] / totals['tot_AB'],
)
batting_agg

Unnamed: 0,playerID,tot_AB,tot_HR,tot_SO,HR_rate,SO_rate
0,aaronha01,12364,755,1383.0,0.061064,0.111857
1,abreubo01,8480,288,1840.0,0.033962,0.216981
2,adamssp01,5557,9,223.0,0.001620,0.040130
3,adcocjo01,6606,336,1059.0,0.050863,0.160309
4,alfoned01,5385,146,617.0,0.027112,0.114578
...,...,...,...,...,...,...
801,zeileto01,7573,253,1279.0,0.033408,0.168889
802,zimmehe01,5304,58,432.0,0.010935,0.081448
803,zimmery01,6399,270,1307.0,0.042194,0.204251
804,ziskri01,5144,207,910.0,0.040241,0.176905


In [12]:
## Fit SO_rate on HR_rate and its square by ordinary least squares
quad_md = smf.ols('SO_rate ~ HR_rate + I(HR_rate**2)', data=batting_agg).fit()

## Show the summary table of the fitted model
quad_md.summary()

0,1,2,3
Dep. Variable:,SO_rate,R-squared:,0.532
Model:,OLS,Adj. R-squared:,0.53
Method:,Least Squares,F-statistic:,455.7
Date:,"Wed, 01 Nov 2023",Prob (F-statistic):,5.54e-133
Time:,11:46:00,Log-Likelihood:,1404.8
No. Observations:,806,AIC:,-2804.0
Df Residuals:,803,BIC:,-2789.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.0503,0.004,13.415,0.000,0.043,0.058
HR_rate,3.6501,0.278,13.136,0.000,3.105,4.196
I(HR_rate ** 2),-18.5281,4.236,-4.374,0.000,-26.843,-10.214

0,1,2,3
Omnibus:,15.195,Durbin-Watson:,2.036
Prob(Omnibus):,0.001,Jarque-Bera (JB):,18.033
Skew:,0.247,Prob(JB):,0.000121
Kurtosis:,3.541,Cond. No.,2840.0


In [14]:
# Single new observation (HR_rate = 0.05) to score with the fitted model.
new_obs = pd.DataFrame([[0.05]], columns=['HR_rate'])
new_obs

Unnamed: 0,HR_rate
0,0.05


In [15]:
# Use the fitted quadratic model to predict SO_rate for the new observation
quad_md.predict(exog=new_obs)

0    0.1865
dtype: float64

In [None]:
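# The quadratic term is statistically significant: the p-value reported for I(HR_rate ** 2) in the
# model summary is 0.000, which is below 0.05.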