### Import Libraries

In [1]:


import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf

# pretty matplotlib plots
# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8*' and
# 3.8 removed the old names, so use whichever name this installation provides.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')



### Read Dataset

In [16]:
# Load the data set; the first CSV column is used as the row index.
star = pd.read_csv(filepath_or_buffer="Star.csv", index_col=0)
star.head()  # evaluated but not rendered: only the last expression of a cell is displayed
# Sorted distinct values of the race column.
np.unique(star['race'].to_numpy())

array(['black', 'other', 'white'], dtype=object)

In [3]:
# Per-column flag: True if the column contains at least one missing value.
star.isnull().any()

tmathssk    False
treadssk    False
classk      False
totexpk     False
sex         False
freelunk    False
race        False
schidkn     False
dtype: bool

#### only analyze small and regular size data

In [4]:
# Keep only the small and regular classes (drop 'regular.with.aide').
# The mask has a descriptive name so the builtin `filter` is not shadowed for
# the rest of the notebook, and .copy() makes `star` an independent frame so
# that adding columns to it does not raise SettingWithCopyWarning.
keep_rows = star['classk'] != 'regular.with.aide'
star = star[keep_rows].copy()
star.head()

Unnamed: 0,tmathssk,treadssk,classk,totexpk,sex,freelunk,race,schidkn
2,473,447,small.class,7,girl,no,white,63
3,536,450,small.class,21,girl,no,black,20
11,559,448,regular,16,boy,no,white,69
12,489,447,small.class,5,boy,yes,white,79
13,454,431,regular,8,boy,yes,white,5


#### add total score

In [5]:
# total_score is the sum of the tmathssk and treadssk columns; those two
# columns and schidkn are then dropped.
# Written as one chained expression that returns a new frame: no inplace
# mutation, and no SettingWithCopyWarning from assigning a column into a
# frame that was produced by boolean filtering.
star = (
    star
    .assign(total_score=star['tmathssk'] + star['treadssk'])
    .drop(columns=['tmathssk', 'treadssk', 'schidkn'])
)

#### create indicator variables

In [6]:
# One-hot encode the categorical columns, dropping the first level of each so
# the remaining indicators are not collinear with the regression intercept.
# dtype=int keeps the indicators as 0/1 integers: pandas >= 2.0 returns bool
# columns by default, which describe() does not summarise as numbers and which
# turn the design matrix into object dtype when it is passed to sm.OLS.
star = (
    pd.get_dummies(data=star, prefix_sep='-', drop_first=True, dtype=int)
    .rename(columns={'classk-small.class': 'small'})  # shorter name for the treatment indicator
)
star.head()


Unnamed: 0,totexpk,total_score,small,sex-girl,freelunk-yes,race-other,race-white
2,7,920,1,1,0,0,1
3,21,986,1,1,0,0,0
11,16,1007,0,0,0,0,1
12,5,936,1,0,1,0,1
13,8,885,0,0,1,0,1


#### get summary stats for small= 0 and small = 1

#### summary stats across all data

In [7]:
# Descriptive statistics (count, mean, std, quartiles, min/max) for the
# numeric columns of the pooled sample, i.e. both class types together.
star.describe()

Unnamed: 0,totexpk,total_score,small,sex-girl,freelunk-yes,race-other,race-white
count,3733.0,3733.0,3733.0,3733.0,3733.0,3733.0,3733.0
mean,9.037503,924.49183,0.464238,0.485936,0.473882,0.005358,0.679614
std,5.726875,75.012373,0.498786,0.499869,0.499384,0.073009,0.466687
min,0.0,635.0,0.0,0.0,0.0,0.0,0.0
25%,4.0,871.0,0.0,0.0,0.0,0.0,0.0
50%,8.0,918.0,0.0,0.0,0.0,0.0,1.0
75%,13.0,969.0,1.0,1.0,1.0,0.0,1.0
max,27.0,1253.0,1.0,1.0,1.0,1.0,1.0


#### run a linear regression to check random assignment to small classes
One check for random assignment: regress small on the other independent variables and check whether any coefficients are significant. If assignment is random, there should be no significant coefficients.
Here 'small' is the treatment indicator and total_score is the outcome (dependent) variable of the main analysis; in this check, small itself is the dependent variable.

In [8]:
# Randomisation check: the treatment indicator is the response and every
# remaining covariate (everything except the outcome) is a predictor.
y_true = star['small']
x_train = star.drop(columns=['small', 'total_score'])

# statsmodels does not add an intercept on its own, so prepend a constant column
ols_sm = sm.OLS(endog=y_true, exog=sm.add_constant(x_train)).fit()

# coefficient table and fit statistics
print(ols_sm.summary())

                            OLS Regression Results                            
Dep. Variable:                  small   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.001
Method:                 Least Squares   F-statistic:                    0.1100
Date:                Wed, 14 Oct 2020   Prob (F-statistic):              0.990
Time:                        11:16:11   Log-Likelihood:                -2699.5
No. Observations:                3733   AIC:                             5411.
Df Residuals:                    3727   BIC:                             5448.
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const            0.4635      0.025     18.404   

In [9]:
# Summary statistics of total_score within each class-size group (small = 0 / 1).
star.groupby("small")[['total_score']].describe()

Unnamed: 0_level_0,total_score,total_score,total_score,total_score,total_score,total_score,total_score,total_score
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
small,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,2000.0,917.942,73.153389,635.0,866.0,912.0,961.0,1229.0
1,1733.0,932.050779,76.42836,747.0,878.0,924.0,981.0,1253.0


### Run a Linear Regression 
total_score = b0 + b1 * small + e

In [10]:
# Difference estimator: outcome regressed on the treatment indicator alone.
y_true = star['total_score']
x_train = star['small']

# add the intercept column explicitly, then fit by ordinary least squares
ols_sm = sm.OLS(endog=y_true, exog=sm.add_constant(x_train)).fit()

# coefficient table and fit statistics
print(ols_sm.summary())

                            OLS Regression Results                            
Dep. Variable:            total_score   R-squared:                       0.009
Model:                            OLS   Adj. R-squared:                  0.009
Method:                 Least Squares   F-statistic:                     33.13
Date:                Wed, 14 Oct 2020   Prob (F-statistic):           9.32e-09
Time:                        11:16:11   Log-Likelihood:                -21398.
No. Observations:                3733   AIC:                         4.280e+04
Df Residuals:                    3731   BIC:                         4.281e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        917.9420      1.670    549.615      0.0

***from above linear regression***

***average for regular-size class = 917.94, same as what we found with the describe method***

***average for small-size class = 917.94 + 14.11 = 932.05, which also matches the describe output***

***difference estimator = b1 = 14.1088***

### Add teacher experience to the model
total_score = b0 + b1 * small + b2 * totexpk + e, where totexpk is teacher experience

In [11]:
# Outcome regressed on the treatment indicator plus teacher experience.
y_true = star['total_score']
x_train = star.loc[:, ['small', 'totexpk']]

# add the intercept column explicitly, then fit by ordinary least squares
ols_sm = sm.OLS(endog=y_true, exog=sm.add_constant(x_train)).fit()

# coefficient table and fit statistics
print(ols_sm.summary())

                            OLS Regression Results                            
Dep. Variable:            total_score   R-squared:                       0.017
Model:                            OLS   Adj. R-squared:                  0.016
Method:                 Least Squares   F-statistic:                     31.52
Date:                Wed, 14 Oct 2020   Prob (F-statistic):           2.68e-14
Time:                        11:16:12   Log-Likelihood:                -21383.
No. Observations:                3733   AIC:                         4.277e+04
Df Residuals:                    3730   BIC:                         4.279e+04
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        907.4293      2.549    356.034      0.0

***holding class size fixed, each additional year of teacher experience is associated with a 1.15-point higher total score***

### Use all school variables for regression

In [12]:
# Outcome regressed on the treatment indicator and all remaining covariates.
covariates = [
    'small',
    'totexpk',
    'sex-girl',
    'freelunk-yes',
    'race-other',
    'race-white',
]
y_true = star['total_score']
x_train = star[covariates]

# add the intercept column explicitly, then fit by ordinary least squares
ols_sm = sm.OLS(endog=y_true, exog=sm.add_constant(x_train)).fit()

# coefficient table and fit statistics
print(ols_sm.summary())

                            OLS Regression Results                            
Dep. Variable:            total_score   R-squared:                       0.096
Model:                            OLS   Adj. R-squared:                  0.095
Method:                 Least Squares   F-statistic:                     66.19
Date:                Wed, 14 Oct 2020   Prob (F-statistic):           1.87e-78
Time:                        11:18:10   Log-Likelihood:                -21225.
No. Observations:                3733   AIC:                         4.246e+04
Df Residuals:                    3726   BIC:                         4.251e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                   coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------
const          912.6822      3.761    242.661   