In [135]:
import numpy as np
import pandas as pd
import scipy as sp
from linearmodels import PanelOLS
import statsmodels as sm
import statsmodels.formula.api as smf
import statsmodels.api as sm_api
import statsmodels.regression.linear_model as sm 

In [96]:
df = pd.read_stata("AEJ2018_child_mortality.dta") 
df.count()
df = df[df.datedeath_cmc<1333]
df.count()

branchid              221
villageid             221
treatment             221
hhid                  221
childID               221
female                221
datebirth_m           221
datebirth_yr          221
datebirth_cmc         221
died                  221
datedeath_m           221
datedeath_yr          221
datedeath_cmc         221
aad                   221
dateofinterview_m     221
dateofinterview_yr    221
dtype: int64

In [97]:
df['death_u5'] = np.where((df['died']==1) & (df['aad']<60), 1, 0) 
df['death_u1'] = np.where((df['died']==1) & (df['aad']<12), 1, 0)
df['death_u1m'] = np.where((df['died'] == 1) & (df['aad']<1), 1,0)

In [98]:
df = df.groupby(['branchid', 'villageid', 'treatment'], as_index = False)[['death_u5', 
'death_u1','death_u1m']].sum().reset_index()
df.count() #in Stata collapse (sum) reduces # of obs from 11342 to 214, here after groupby only 123 obs left (?) 

index        123
branchid     123
villageid    123
treatment    123
death_u5     123
death_u1     123
death_u1m    123
dtype: int64

In [70]:
df.describe()

Unnamed: 0,branchid,villageid,treatment,hhid,childID,female,datebirth_m,datebirth_yr,datebirth_cmc,died,datedeath_m,datedeath_yr,datedeath_cmc,aad,dateofinterview_m,dateofinterview_yr,death_u5,death_u1,death_u1m
count,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0,123.0
mean,15.707317,1414.886179,0.512195,252806.04065,2.0,0.764228,13.056911,3609.869873,2365.642334,1.796748,13.504066,3610.325195,2371.552734,5.910569,17.894308,3616.85376,1.796748,1.601626,0.97561
std,14.987827,238.781957,0.501896,146474.818855,1.361051,0.769088,9.427289,2073.143555,1357.780273,1.031943,9.69314,2073.362061,1360.626465,7.986681,10.371629,2077.30249,1.031943,1.022211,0.909543
min,1.0,1001.0,0.0,100115.0,1.0,0.0,1.0,2008.0,1304.0,1.0,1.0,2008.0,1304.0,0.0,9.0,2013.0,1.0,0.0,0.0
25%,5.0,1306.5,0.0,141278.0,1.0,0.0,7.0,2009.0,1316.0,1.0,7.5,2009.0,1320.0,0.0,10.0,2013.0,1.0,1.0,0.0
50%,5.0,1347.0,1.0,202269.0,2.0,1.0,11.0,4016.0,2611.0,2.0,11.0,4016.0,2616.0,2.0,18.0,4026.0,2.0,1.0,1.0
75%,31.0,1629.0,1.0,327338.5,2.0,1.0,17.0,4019.0,2645.5,2.0,16.5,4020.0,2653.5,9.0,20.0,4026.0,2.0,2.0,1.5
max,42.0,1906.0,1.0,811137.0,8.0,3.0,66.0,12054.0,7914.0,6.0,66.0,12055.0,7926.0,39.0,66.0,12078.0,6.0,6.0,4.0


In [99]:
print(df.columns)
df = df.set_index(['branchid','villageid'])

Index(['index', 'branchid', 'villageid', 'treatment', 'death_u5', 'death_u1',
       'death_u1m'],
      dtype='object')


In [100]:
from linearmodels import PanelOLS
model = PanelOLS(df['death_u5'],df['treatment'], entity_effects = True)
res = model.fit(cov_type='robust', cluster_entity=True)
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:               death_u5   R-squared:                        0.0014
Estimator:                   PanelOLS   R-squared (Between):             -0.0442
No. Observations:                 123   R-squared (Within):               0.0014
Date:                Mon, May 04 2020   R-squared (Overall):             -0.0341
Time:                        21:15:42   Log-likelihood                   -171.98
Cov. Estimator:                Robust                                           
                                        F-statistic:                      0.1583
Entities:                          12   P-value                           0.6915
Avg Obs:                       10.250   Distribution:                   F(1,110)
Min Obs:                       3.0000                                           
Max Obs:                       37.000   F-statistic (robust):             0.1716
                            

In [101]:
model = PanelOLS(df['death_u1'], df['treatment'], entity_effects = True)
res = model.fit(cov_type = 'robust', cluster_entity = True)
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:               death_u1   R-squared:                        0.0024
Estimator:                   PanelOLS   R-squared (Between):              0.0613
No. Observations:                 123   R-squared (Within):               0.0024
Date:                Mon, May 04 2020   R-squared (Overall):              0.0466
Time:                        21:15:52   Log-likelihood                   -171.47
Cov. Estimator:                Robust                                           
                                        F-statistic:                      0.2695
Entities:                          12   P-value                           0.6047
Avg Obs:                       10.250   Distribution:                   F(1,110)
Min Obs:                       3.0000                                           
Max Obs:                       37.000   F-statistic (robust):             0.2936
                            

In [102]:
model = PanelOLS(df['death_u1m'], df['treatment'], entity_effects = True)
res = model.fit(cov_type = 'robust', cluster_entity = True)
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:              death_u1m   R-squared:                        0.0079
Estimator:                   PanelOLS   R-squared (Between):              0.1643
No. Observations:                 123   R-squared (Within):               0.0079
Date:                Mon, May 04 2020   R-squared (Overall):              0.0916
Time:                        21:15:58   Log-likelihood                   -153.75
Cov. Estimator:                Robust                                           
                                        F-statistic:                      0.8810
Entities:                          12   P-value                           0.3500
Avg Obs:                       10.250   Distribution:                   F(1,110)
Min Obs:                       3.0000                                           
Max Obs:                       37.000   F-statistic (robust):             0.9166
                            

In [123]:
data_2 = pd.read_stata("AEJ2018_child_mortality_computation.dta") 
data_2.describe()
#print(data_2)
#data_2.count()

Unnamed: 0,branchid,villageid,treatment,hhid,childID,female,datebirth_m,datebirth_yr,datebirth_cmc,died,...,dateofinterview_yr,count_month_u5,death_under5,date12m_cmc,aabeginning,aaend,count_month_u1,death_under1,count_month_u1m,death_under1m
count,11342.0,11342.0,11342.0,11342.0,11342.0,11342.0,11342.0,11342.0,11342.0,11342.0,...,11342.0,11342.0,11342.0,11342.0,11342.0,11342.0,8808.0,11342.0,6499.0,6499.0
mean,16.427438,1431.536942,0.535796,143175.980515,1.436078,0.495944,6.603157,2010.788208,1336.248047,0.034297,...,2012.734985,24.36096,0.034297,1348.248047,-3.748016,30.256922,8.947604,0.025921,1.0,0.031389
std,15.45245,249.285194,0.498739,24927.929774,0.676039,0.499998,3.355596,1.401922,16.562719,0.182013,...,0.265033,11.191686,0.182013,16.562719,16.562727,16.509613,3.912847,0.158906,0.0,0.174389
min,1.0,1001.0,0.0,100102.0,1.0,0.0,1.0,2008.0,1304.0,0.0,...,2013.0,0.0,0.0,1316.0,-35.5,0.5,0.0,0.0,1.0,0.0
25%,1.0,1308.0,0.0,130820.5,1.0,0.0,4.0,2010.0,1322.0,0.0,...,2013.0,15.5,0.0,1334.0,-17.5,16.5,6.5,0.0,1.0,0.0
50%,5.0,1412.0,1.0,141247.5,1.0,0.0,7.0,2011.0,1337.0,0.0,...,2013.0,29.5,0.0,1349.0,-4.5,30.5,11.5,0.0,1.0,0.0
75%,32.0,1633.0,1.0,163320.75,2.0,1.0,9.0,2012.0,1350.0,0.0,...,2013.0,34.0,0.0,1362.0,10.5,44.5,12.0,0.0,1.0,0.0
max,42.0,1910.0,1.0,191042.0,6.0,1.0,12.0,2013.0,1368.0,1.0,...,2013.0,36.0,1.0,1380.0,28.5,61.5,12.0,1.0,1.0,1.0


In [124]:
data_2 = data_2.groupby(['villageid', 'branchid', 'treatment'], as_index = False)[['death_under5','count_month_u5',
'death_under1', 'count_month_u1','death_under1m','count_month_u1m']].sum().reset_index()
data_2.count()


index              214
villageid          214
branchid           214
treatment          214
death_under5       214
count_month_u5     214
death_under1       214
count_month_u1     214
death_under1m      214
count_month_u1m    214
dtype: int64

In [125]:
data_2['count_month_u5'] = data_2.apply(lambda row: row.count_month_u5/12, axis = 1)
data_2['count_month_u1'] = data_2.apply(lambda row: row.count_month_u1/12, axis = 1)
data_2['mrate_u5'] = (data_2['death_under5']/data_2['count_month_u5'])*1000
data_2['mrate_u1'] = (data_2['death_under1']/data_2['count_month_u1'])*1000
data_2['mrate_u1m'] = (data_2['death_under1m']/data_2['count_month_u1m'])*1000
data_2.describe()

Unnamed: 0,index,villageid,branchid,treatment,death_under5,count_month_u5,death_under1,count_month_u1,death_under1m,count_month_u1m,mrate_u5,mrate_u1,mrate_u1m
count,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0,214.0
mean,106.5,1427.602804,16.691589,0.537383,1.817757,107.594237,1.373832,30.689447,0.953271,30.36916,16.611985,43.855541,29.433989
std,61.920648,255.135766,15.47763,0.49977,1.846062,23.746709,1.535333,6.610219,1.224807,7.223409,15.68016,45.691946,35.54031
min,0.0,1001.0,1.0,0.0,0.0,38.208333,0.0,8.708333,0.0,7.0,0.0,0.0,0.0
25%,53.25,1306.25,1.0,0.0,1.0,92.1875,0.0,27.052083,0.0,27.0,6.793686,0.0,0.0
50%,106.5,1411.5,5.0,1.0,1.0,108.6875,1.0,31.270833,1.0,31.0,12.357873,32.345072,25.978408
75%,159.75,1632.75,31.0,1.0,2.75,123.479167,2.0,34.65625,1.0,35.0,24.308293,67.346003,43.47826
max,213.0,1910.0,42.0,1.0,14.0,170.75,11.0,51.208333,7.0,51.0,92.383833,242.424242,161.290314


In [126]:
data_2 = data_2.set_index(['villageid', 'branchid'])

In [136]:
model = sm.OLS(data_2['mrate_u5'], sm.add_constant(data_2['treatment']))
res = model.fit()
print(res)

AttributeError: module 'statsmodels.regression.linear_model' has no attribute 'add_constant'