In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm

In [2]:
# Import colonial_legacy.dta
df = pd.read_stata('colonial_legacy.dta')

# Display the first 5 rows of the data
df.head()

Unnamed: 0,hv007,hv025,refused_any_blood_test,Times_Prospected,LATNUM,LONGNUM,land_suit,elev,malaria_ecology,mean_temp,mean_rain,tsi_grid_tsi,atlantic_all_years,dist_missions,relative_suitability,b4,vaccination_index,child_age_cont,child_age_cont2,cluster_id
0,2011,urban,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,0.208876,0.0,0.0,-1.540202,female,0.3,0.5,0.25,2.0
1,2011,urban,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,0.208876,0.0,0.0,-1.540202,male,0.9,0.833333,0.694444,2.0
2,2011,urban,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,0.208876,0.0,0.0,-1.540202,female,0.8,0.75,0.5625,2.0
3,2011,urban,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,0.208876,0.0,0.0,-1.540202,female,0.3,0.916667,0.840278,2.0
4,2011,urban,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,0.208876,0.0,0.0,-1.540202,male,0.0,0.916667,0.840278,2.0


*We will also use cluster-robust standard errors at the survey cluster level. For the sake of the problem set you don’t need to worry about this for your answers.*

# Data Preparation

In [3]:
# Check data types
df.dtypes

# Convert b4 and hv025 to dummy variables
df = pd.get_dummies(df, columns=['b4', 'hv025'])

# Convert all boolean variables to int
for col in df.columns:
    if df[col].dtype == 'bool':
        df[col] = df[col].astype(int)

# Define list of variables to all regressions child_age_cont child_age_cont2 b4 hv007 hv025 elev LATNUM LONGNUM mean_temp mean_rain land_suit malaria_ecology tsi_grid_tsi atlantic_all_years dist_missions
# b4 and hv025 are replaced by the dummy columns b4_male, b4_female, hv025_urban, hv025_rural
variables = ['child_age_cont', 'child_age_cont2', 'b4_male', 'b4_female', 'hv007', 'hv025_urban', 'hv025_rural', 'elev', 'LATNUM', 'LONGNUM', 'mean_temp', 'mean_rain', 'land_suit', 'malaria_ecology', 'tsi_grid_tsi', 'atlantic_all_years', 'dist_missions']

# Display the first 5 rows of the data
df.head()

Unnamed: 0,hv007,refused_any_blood_test,Times_Prospected,LATNUM,LONGNUM,land_suit,elev,malaria_ecology,mean_temp,mean_rain,...,dist_missions,relative_suitability,vaccination_index,child_age_cont,child_age_cont2,cluster_id,b4_male,b4_female,hv025_urban,hv025_rural
0,2011,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,...,0.0,-1.540202,0.3,0.5,0.25,2.0,0,1,1,0
1,2011,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,...,0.0,-1.540202,0.9,0.833333,0.694444,2.0,1,0,1,0
2,2011,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,...,0.0,-1.540202,0.8,0.75,0.5625,2.0,0,1,1,0
3,2011,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,...,0.0,-1.540202,0.3,0.916667,0.840278,2.0,0,1,1,0
4,2011,0.0,0.0,10.34002,15.266488,0.548376,326.56076,15.434868,28.075,63.083333,...,0.0,-1.540202,0.0,0.916667,0.840278,2.0,1,0,1,0


In [None]:
df.dtypes

# #11

In [5]:
# Estimate the naïve OLS model: 𝑉𝑎𝑐𝑐𝑖𝑛𝑎𝑡𝑖𝑜𝑛 𝑖𝑛𝑑𝑒𝑥𝑖 = 𝛽0 + 𝛽1𝑇𝑖𝑚𝑒𝑠 𝑉𝑖𝑠𝑖𝑡 𝑃𝑟𝑜𝑠𝑝𝑒𝑐𝑡𝑖 + 𝑿𝒓𝒕𝒊′𝑩 + 𝒖𝒓𝒕𝒊
# Where 𝑿𝒓𝒕𝒊′𝑩 is a vector of the control variables we defined above. Interpret 𝛽1.

# Add a constant to the independent variable
X = sm.add_constant(df[['Times_Prospected'] + variables])

# Estimate the model
model = sm.OLS(df['vaccination_index'], X).fit()

# Print the results
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:      vaccination_index   R-squared:                       0.360
Model:                            OLS   Adj. R-squared:                  0.360
Method:                 Least Squares   F-statistic:                     427.1
Date:                Tue, 14 Nov 2023   Prob (F-statistic):               0.00
Time:                        19:12:21   Log-Likelihood:                -3840.4
No. Observations:               12139   AIC:                             7715.
Df Residuals:                   12122   BIC:                             7841.
Df Model:                          16                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
const                 -4.2580      1