### Loading in libraries, preparing for analysis

In [1]:
# importing needed libraries for two stage least squares and ols
import pandas as pd
from linearmodels.iv import IV2SLS
from linearmodels.panel import PanelOLS
from linearmodels.panel import PooledOLS
import statsmodels.api as sm


# reading in the data
df = pd.read_csv("~/projects/Graduate Thesis/Data/FinalCSV.csv", header=0)

# creating a new column that will represent revenue in thousands of dollars
df['ResponsiveRevThous'] = df['ResponsiveRev']/1000

# making the year a categorical variable
df.Year.astype('category')

# setting the index to program and year. This makes python read the data as a panel
df = df.set_index(['Program','Year'])

# dropping old columns
var = df
var = var.drop(columns=['ResponsiveRev','srs','new_coach'])

In [2]:
# checking columns
var.columns

Index(['TwoStars', 'ThreeStars', 'FourStars', 'FiveStars', 'lagged_srs',
       'stateunemp', 'stadium_cap', 'stadium_years_open',
       'stadium_years_rennovation', 'num_homegames', 'coach_experience',
       'conf_coy', 'nat_coy', 'coach_years_at_school', 'pop/d1prog',
       'hs_grad_pct', 'pro_team_per_million', 'spring_practice', 'num_jcs',
       'power', 'ResponsiveRevThous'],
      dtype='object')

In [3]:
# making dataframes for Power 5 and Non Power 5
p5 = var[var['power'] == 1]
nonp5 =var[var['power'] == 0]

### Summary Statistics

In [5]:
# printing a table with summary statistics for all teams
des = var.describe()
d = des.reset_index()

d.T

Unnamed: 0,0,1,2,3,4,5,6,7
index,count,mean,std,min,25%,50%,75%,max
TwoStars,378,33.3783,25.2668,0,12,25,58,96
ThreeStars,378,44.1376,17.6661,6,29,46,59,86
FourStars,378,14.1481,16.1375,0,1,9,21,62
FiveStars,378,1.25661,2.51895,0,0,0,1,16
lagged_srs,378,1.794,10.2467,-23.53,-5.345,2.17,8.7575,24.51
stateunemp,378,7.93989,1.92523,3.27,6.68,7.82,9.26,13.5
stadium_cap,378,56468,22394.3,20000,36096.8,55000,72000,109901
stadium_years_open,378,61.1693,27.0236,0,43,64,86,101
stadium_years_rennovation,378,7.0582,5.76432,0,3,6,10,27


In [6]:
# printing a table with summary statistics for Power 5 teams
desp5 = p5.describe()
dp5 = desp5.reset_index()

dp5.T

Unnamed: 0,0,1,2,3,4,5,6,7
index,count,mean,std,min,25%,50%,75%,max
TwoStars,244,17.4836,12.9133,0,8,15,23,68
ThreeStars,244,52.9508,12.8971,20,44,54,61.25,86
FourStars,244,21.373,15.9525,0,9,16,33,62
FiveStars,244,1.93443,2.92073,0,0,1,3,16
lagged_srs,244,6.19985,8.27634,-14.87,0.635,6.055,12.27,24.51
stateunemp,244,7.82201,1.937,3.27,6.4575,7.76,9.2825,12.58
stadium_cap,244,68544.7,17710.1,32740,55000,64150.5,82112,109901
stadium_years_open,244,69.9959,26.0137,1,53,83,89,101
stadium_years_rennovation,244,6.43852,4.97156,0,2,6,10,25


In [8]:
# printing a table with summary statistics for Non Power 5 teams
desnonp5 = nonp5.describe()
dnonp5 = desnonp5.reset_index()

dnonp5.T

Unnamed: 0,0,1,2,3,4,5,6,7
index,count,mean,std,min,25%,50%,75%,max
TwoStars,134,62.3209,14.0534,17,54,66,71,96
ThreeStars,134,28.0896,13.3706,6,19,25,35,69
FourStars,134,0.992537,1.75789,0,0,0,1,9
FiveStars,134,0.0223881,0.148497,0,0,0,0,1
lagged_srs,134,-6.22858,8.48596,-23.53,-11.7175,-6.155,-0.3025,18.69
stateunemp,134,8.15455,1.89193,4.13,6.805,7.83,9.0975,13.5
stadium_cap,134,34477.7,9681.36,20000,30000,30850,38019,65857
stadium_years_open,134,45.097,20.7469,0,39,45,59.5,82
stadium_years_rennovation,134,8.18657,6.85858,0,3,6,12,27


### Box Cox test to check for power tranformation

In [9]:
# importing stats library
from scipy import stats

# making a dataframe with only the dependent variable
transform = var['ResponsiveRevThous']

# running a Box Cox test on the dependent variable
x = stats.boxcox(transform)

# printing the Box Cox lambda
x[1]

0.2143413854988545

Because the Box Cox lambda is close to 0, I will natural log my dependent variable.

In [10]:
# making an array with my dependent variable natural logged
# lmbda = 0 sets the transformation to natural log
xx = stats.boxcox(transform, lmbda = 0)

### Shapiro-Wilk test for normality

In [11]:
# running a Shapiro-Wilk test
shapiro = stats.shapiro(xx)

# printing the p value
shapiro[1]

1.1499772245571194e-09

I must reject the null hypothesis that my dependent variable is normally distributed.

In [14]:
# creating column with logged dependent variable
var['lnResponsiveRevThous'] = xx

### Two Stage Least Squares

In [18]:
# defining dependent variable, exogenous variables, endogenous variables, and instruments
dependent = var.lnResponsiveRevThous
exog = var[['lagged_srs','stateunemp','num_homegames','stadium_cap','stadium_years_open','stadium_years_rennovation','coach_years_at_school','coach_experience','conf_coy','nat_coy']]
endog = var[['ThreeStars','FourStars','FiveStars']]
instruments = var[['pop/d1prog','num_jcs','spring_practice','hs_grad_pct','pro_team_per_million']]

# building model, running regression, returning results
mod = IV2SLS(dependent, exog, endog, instruments)
res = mod.fit()
res

0,1,2,3
Dep. Variable:,lnResponsiveRevThous,R-squared:,0.9255
Estimator:,IV-2SLS,Adj. R-squared:,0.9228
No. Observations:,378,F-statistic:,8019.1
Date:,"Fri, Oct 05 2018",P-value (F-stat),0.0000
Time:,12:25:54,Distribution:,chi2(13)
Cov. Estimator:,robust,,
,,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
lagged_srs,0.0500,0.0315,1.5895,0.1119,-0.0117,0.1118
stateunemp,0.1991,0.0870,2.2886,0.0221,0.0286,0.3696
num_homegames,0.0051,0.2994,0.0169,0.9865,-0.5817,0.5918
stadium_cap,0.0002,2.854e-05,5.5077,0.0000,0.0001,0.0002
stadium_years_open,0.0088,0.0047,1.8475,0.0647,-0.0005,0.0180
stadium_years_rennovation,-0.0681,0.0238,-2.8648,0.0042,-0.1147,-0.0215
coach_years_at_school,0.0641,0.0502,1.2748,0.2024,-0.0344,0.1625
coach_experience,-0.0281,0.0413,-0.6800,0.4965,-0.1089,0.0528
conf_coy,0.0148,0.3027,0.0489,0.9610,-0.5784,0.6080


### Fixed Effects Model

In [19]:
panelx = var[['lagged_srs','stateunemp','num_homegames','stadium_cap','stadium_years_open','stadium_years_rennovation','coach_years_at_school','coach_experience','conf_coy','nat_coy','ThreeStars','FourStars','FiveStars']]
panelx = sm.add_constant(panelx)
panely = var['lnResponsiveRevThous']

mod = PanelOLS(panely, panelx, entity_effects=True)
fixed_effects = mod.fit()
print(fixed_effects)

                           PanelOLS Estimation Summary                            
Dep. Variable:     lnResponsiveRevThous   R-squared:                        0.4350
Estimator:                     PanelOLS   R-squared (Between):             -0.5065
No. Observations:                   378   R-squared (Within):               0.4350
Date:                  Fri, Oct 05 2018   R-squared (Overall):             -0.4337
Time:                          12:29:05   Log-likelihood                   -49.385
Cov. Estimator:              Unadjusted                                           
                                          F-statistic:                      17.054
Entities:                            77   P-value                           0.0000
Avg Obs:                         4.9091   Distribution:                  F(13,288)
Min Obs:                         1.0000                                           
Max Obs:                         5.0000   F-statistic (robust):             17.054
    