### Import Libraries

In [35]:


import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

import statsmodels.api as sm
import statsmodels.formula.api as smf

# Pretty matplotlib plots. The bare 'seaborn' style name was deprecated in
# matplotlib 3.6 and later removed; the same style sheet now ships as
# 'seaborn-v0_8'. Fall back to the old name on older matplotlib installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')



### Read Dataset

In [36]:
# Load the STAR data set; the first CSV column is used as the row index.
star = pd.read_csv(
    filepath_or_buffer="Star.csv",
    index_col=0,
)
# Preview the first five rows.
star.head(n=5)


Unnamed: 0,tmathssk,treadssk,classk,totexpk,sex,freelunk,race,schidkn
2,473,447,small.class,7,girl,no,white,63
3,536,450,small.class,21,girl,no,black,20
5,463,439,regular.with.aide,0,boy,yes,black,19
11,559,448,regular,16,boy,no,white,69
12,489,447,small.class,5,boy,yes,white,79


In [37]:
# Column-wise missing-value check: True for any column holding at least one NaN.
star.isnull().any(axis=0)

tmathssk    False
treadssk    False
classk      False
totexpk     False
sex         False
freelunk    False
race        False
schidkn     False
dtype: bool

#### Only analyze the small and regular class sizes (drop `regular.with.aide`)

In [38]:
# Boolean row mask: True for every class type except 'regular.with.aide'.
# NOTE(review): the name `filter` shadows Python's builtin; it is kept as-is
# because cells outside this view may still reference it.
filter = star['classk'] != 'regular.with.aide'
# Take an explicit copy so that columns added to `star` afterwards are written
# to an independent frame rather than to a slice of the original
# (this avoids pandas' SettingWithCopyWarning on the later column assignment).
star = star.loc[filter].copy()
star.head()

Unnamed: 0,tmathssk,treadssk,classk,totexpk,sex,freelunk,race,schidkn
2,473,447,small.class,7,girl,no,white,63
3,536,450,small.class,21,girl,no,black,20
11,559,448,regular,16,boy,no,white,69
12,489,447,small.class,5,boy,yes,white,79
13,454,431,regular,8,boy,yes,white,5


#### add total score

In [39]:
# Combined score: element-wise sum of the math and reading scores.
star['total_score'] = star['tmathssk'].add(star['treadssk'])

#### create indicator variables

In [40]:
# One-hot encode the categorical columns, keeping k-1 indicators per variable
# (drop_first=True). dtype=int pins the indicators to 0/1 integers: newer
# pandas versions default to bool, which would not match the 0/1 values shown
# below and would drop these columns from the numeric summaries.
star = pd.get_dummies(data=star, prefix_sep='-', drop_first=True, dtype=int)
# Shorter name for the class-size indicator (1 = small class, 0 = regular).
# Reassign instead of inplace=True so the step reads as a plain transformation.
star = star.rename(columns={'classk-small.class': 'small'})
star.head()


Unnamed: 0,tmathssk,treadssk,totexpk,schidkn,total_score,small,sex-girl,freelunk-yes,race-other,race-white
2,473,447,7,63,920,1,1,0,0,1
3,536,450,21,20,986,1,1,0,0,0
11,559,448,16,69,1007,0,0,0,0,1
12,489,447,5,79,936,1,0,1,0,1
13,454,431,8,5,885,0,0,1,0,1


#### get summary stats for small = 0 and small = 1

In [41]:
# Descriptive statistics of total_score, computed separately for each value of
# the `small` indicator.
star.groupby("small")[['total_score']].describe()

Unnamed: 0_level_0,total_score,total_score,total_score,total_score,total_score,total_score,total_score,total_score
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max
small,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
0,2000.0,917.942,73.153389,635.0,866.0,912.0,961.0,1229.0
1,1733.0,932.050779,76.42836,747.0,878.0,924.0,981.0,1253.0


#### summary stats across all data

In [42]:
# Descriptive statistics for every numeric column, with the quartiles
# (pandas' default percentiles) spelled out explicitly.
star.describe(percentiles=[0.25, 0.5, 0.75])

Unnamed: 0,tmathssk,treadssk,totexpk,schidkn,total_score,small,sex-girl,freelunk-yes,race-other,race-white
count,3733.0,3733.0,3733.0,3733.0,3733.0,3733.0,3733.0,3733.0,3733.0,3733.0
mean,487.07206,437.41977,9.037503,40.263059,924.49183,0.464238,0.485936,0.473882,0.005358,0.679614
std,48.590568,31.836505,5.726875,22.949801,75.012373,0.498786,0.499869,0.499384,0.073009,0.466687
min,320.0,315.0,0.0,1.0,635.0,0.0,0.0,0.0,0.0,0.0
25%,454.0,414.0,4.0,21.0,871.0,0.0,0.0,0.0,0.0,0.0
50%,484.0,434.0,8.0,40.0,918.0,0.0,0.0,0.0,0.0,1.0
75%,520.0,455.0,13.0,60.0,969.0,1.0,1.0,1.0,0.0,1.0
max,626.0,627.0,27.0,80.0,1253.0,1.0,1.0,1.0,1.0,1.0


#### run a linear regression to understand the case of small = 0