---
[Philipp Schreiber](https://github.com/pcschreiber1)

# Simulation Study of spatial Average Treatment Effect (ATE) estimation
## For the replication of Henderson, Storeygard, Deichmann (2017)

---

In [1]:
%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.ticker
import statsmodels.formula.api as smf
import seaborn as sns

#For spatial analysis
import geopandas as gpd
import shapely.geometry as geom
import libpysal as lp #For spatial weights

from pysal.viz import splot #exploratory analysis
#from splot.esda import plot_moran #exploratory analysis
from pysal.explore import esda #exploratory analysis
from pysal.model import spreg #For spatial regression

import os #for environmental variable

pd.options.display.float_format = "{:,.2f}".format

In [2]:
from auxiliary.data_import import *
from auxiliary.plots import *
from auxiliary.simulations import *

**Four simulation studies:**

 - SLX specification: $Y = WD\gamma +  X \beta + u $ where $W$ is the weight matrix and $\gamma=0.25$, $\beta = 0.5$
 
 - Spatial Lag specification: $Y = WY \rho +   D\gamma +  X \beta + u $ where $W$ is the weight matrix and $\gamma=0.25$, $\beta = 0.5$, $\rho = 0.05$.
 
 - SDM specification: $Y = WY \rho +   WD\gamma +  X \beta + u $ where $W$ is the weight matrix and $\gamma=0.25$, $\beta = 0.5$, $\rho = 0.05$.
 
 - Backdoor specification: $Y = WY \rho +   D\gamma +  X \beta + u $ and $D = WD$ where $W$ is the weight matrix and $\gamma=0.25$, $\beta = 0.5$, $\rho = 0.05$.
 
We compare the true ATE, the non-spatial estimate and the spatial estimate for three simulation set-ups:
   - Simple: One simulation with $n=100$
   - Small: $100$ simulations with $n=100$
   - Large: $100$ simulations with $n=2{,}500$ (for SLX $100$ simulations with $n=10{,}000$)
   
Further details: the weight matrix $W$ is generated using a k-nearest-neighbour approach with $k=10$.

<span style="color:orange">**NOTE**:</span> To enable continuous integration, the CI run uses a reduced number of simulations and a smaller sample size. The results of the full run have been stored in the data folder.

---

**SLX Simulation**

In [94]:
# SLX design: for every simulated sample, record the true ATE together with the
# coefficient at position 2 of a non-spatial OLS and of an OLS that also
# controls for the spatial lag of the treatment (WD).
np.random.seed(2021)  # seeds NumPy's global RNG (presumably what simulate_SLX_sample draws from)

# define the number of simulations and number of observations to be simulated
if os.environ.get("CI") == "true":
    # reduced run for continuous integration: first design only
    n_sims = [1]#, 100, 100]
    n_obs = [100]#, 100, 10000]
else:
    n_sims = [1, 100, 100]
    n_obs = [100, 100, 10000]

# initialize the results table: one column per design, one row per quantity
columns = ["Sim1", "Sim2", "Sim3"]
df = pd.DataFrame(columns=columns, index=["ATE", "Non-spatial", "spatial"])

# Walk through the designs with zip rather than an index stored in `_`:
# `_` is IPython's "last output" variable and should not be overwritten.
# zip stops at the shortest list, so the CI run fills only "Sim1".
for column, n, size in zip(columns, n_sims, n_obs):

    # initialize containers, one slot per simulation run
    ATE = np.empty((n, 1))
    Nspat = np.empty((n, 1))
    Spat = np.empty((n, 1))

    for j in range(n):
        data, w = simulate_SLX_sample(size)

        # true ATE: mean of Y_1 - Y_0 (presumably the two potential outcomes)
        ate_true = data["Y_1"].sub(data["Y_0"]).mean()

        # Explicit positional access: `params[2]` on a label-indexed Series is
        # deprecated in pandas >= 2.1. With numeric X and D the order should be
        # Intercept, X, D, i.e. position 2 is the coefficient on D.
        nonspatial_ols = smf.ols("Y ~ X + D", data=data).fit().params.iloc[2]
        spatial_ols = smf.ols("Y ~ X + D + WD", data=data).fit().params.iloc[2]

        # store in container
        ATE[j] = ate_true
        Nspat[j] = nonspatial_ols
        Spat[j] = spatial_ols

    # save the averages over all runs of this design in the dataframe
    df.loc[:, column] = [np.mean(ATE), np.mean(Nspat), np.mean(Spat)]


In [95]:
# Display the current results table `df`; it has only three rows
# (ATE, Non-spatial, spatial), so head() shows it in full.
df.head()

Unnamed: 0,Sim1,Sim2,Sim3
ATE,0.25,0.25,0.25
Non-spatial,0.55,0.24,0.24
spatial,0.25,0.25,0.25


In [41]:
# df.to_csv("data/SLX_sim.csv") #current file is with 10000 simulations

**Spatial Lag Simulation**

In [12]:
# Spatial lag model: for every simulated sample, record the true ATE, the
# non-spatial OLS coefficient and the spatial two-stage (GM_Lag) coefficient.

np.random.seed(123)  # seeds NumPy's global RNG (presumably what simulate_SpatialLag_sample draws from)

# number of simulations and number of observations per simulated sample
if os.environ.get("CI") == "true":
    # reduced run for continuous integration: first design only
    n_sims = [1]#, 100, 100]
    n_obs = [100]#, 100, 2500]
else:
    n_sims = [1, 100, 100]
    n_obs = [100, 100, 2500]

# results table: one column per design, one row per quantity
columns = ["Sim1", "Sim2", "Sim3"]
df = pd.DataFrame(columns=columns, index=["ATE", "Non-spatial", "spatial"])

# Walk through the designs with zip rather than an index stored in `_`:
# `_` is IPython's "last output" variable and should not be overwritten.
# zip stops at the shortest list, so the CI run fills only "Sim1".
for column, n, size in zip(columns, n_sims, n_obs):

    # initialize containers, one slot per simulation run
    ATE = np.empty((n, 1))
    Nspat = np.empty((n, 1))
    Spat = np.empty((n, 1))

    for j in range(n):
        data, w = simulate_SpatialLag_sample(size)

        # true ATE: mean of Y_1 - Y_0 (presumably the two potential outcomes)
        ate_true = data["Y_1"].sub(data["Y_0"]).mean()

        # Explicit positional access: `params[2]` on a label-indexed Series is
        # deprecated in pandas >= 2.1. With numeric X and D the order should be
        # Intercept, X, D, i.e. position 2 is the coefficient on D.
        nonspatial_ols = smf.ols("Y ~ X + D", data=data).fit().params.iloc[2]

        # spatial 2 stage
        # preparing data for pysal spreg: y as an (n_obs, 1) column vector,
        # regressors as an (n_obs, 2) matrix with columns X and D
        y = data["Y"].to_numpy().reshape(-1, 1)
        X = np.array([data["X"].to_numpy(), data["D"].to_numpy()]).T

        # row standardize matrix
        w.transform = 'r'

        # two-stage regression
        # (a maximum-likelihood alternative would be spreg.ML_Lag)
        reg = spreg.GM_Lag(y, X, w=w, w_lags=1, name_y='Y', name_x=['X', 'D'])
        # row 2 of betas should be the D coefficient, assuming spreg prepends
        # the constant to the regressors -- TODO confirm against reg.summary
        spatial_2stage = reg.betas[2][0]

        # store in container
        ATE[j] = ate_true
        Nspat[j] = nonspatial_ols
        Spat[j] = spatial_2stage

    # save the averages over all runs of this design in the dataframe
    df.loc[:, column] = [np.mean(ATE), np.mean(Nspat), np.mean(Spat)]

In [14]:
# df.to_csv("data/Spatial_Lag_sim.csv")

In [13]:
# Display the current results table `df`; it has only three rows
# (ATE, Non-spatial, spatial), so head() shows it in full.
df.head()

Unnamed: 0,Sim1,Sim2,Sim3
ATE,0.25,0.25,0.25
Non-spatial,0.27,0.26,0.26
spatial,0.25,0.25,0.25


**SDM Simulation**

In [58]:
# SDM design: for every simulated sample, record the true ATE, the non-spatial
# OLS coefficient and the spatial two-stage (GM_Lag) coefficient, where the
# spatial model also includes the spatially lagged treatment WD as a regressor.
np.random.seed(123)  # seeds NumPy's global RNG (presumably what simulate_SDM_sample draws from)

# number of simulations and number of observations per simulated sample
if os.environ.get("CI") == "true":
    # reduced run for continuous integration: first design only
    n_sims = [1]#, 100, 100]
    n_obs = [100]#, 100, 2500]
else:
    n_sims = [1, 100, 100]
    n_obs = [100, 100, 2500]

# results table: one column per design, one row per quantity
columns = ["Sim1", "Sim2", "Sim3"]
df = pd.DataFrame(columns=columns, index=["ATE", "Non-spatial", "spatial"])

# Walk through the designs with zip rather than an index stored in `_`:
# `_` is IPython's "last output" variable and should not be overwritten.
# zip stops at the shortest list, so the CI run fills only "Sim1".
for column, n, size in zip(columns, n_sims, n_obs):

    # initialize containers, one slot per simulation run
    ATE = np.empty((n, 1))
    Nspat = np.empty((n, 1))
    Spat = np.empty((n, 1))

    for j in range(n):
        data, w = simulate_SDM_sample(size)

        # true ATE: mean of Y_1 - Y_0 (presumably the two potential outcomes)
        ate_true = data["Y_1"].sub(data["Y_0"]).mean()

        # Explicit positional access: `params[2]` on a label-indexed Series is
        # deprecated in pandas >= 2.1. With numeric X and D the order should be
        # Intercept, X, D, i.e. position 2 is the coefficient on D.
        nonspatial_ols = smf.ols("Y ~ X + D", data=data).fit().params.iloc[2]

        # spatial 2 stage
        # preparing data for pysal spreg: y as an (n_obs, 1) column vector,
        # regressors as an (n_obs, 3) matrix with columns X, D and WD
        y = data["Y"].to_numpy().reshape(-1, 1)
        X = np.array(
            [data["X"].to_numpy(), data["D"].to_numpy(), data["WD"].to_numpy()]
        ).T

        # row standardize matrix
        w.transform = 'r'

        # two-stage regression
        reg = spreg.GM_Lag(y, X, w=w, w_lags=1, name_x=['X', 'D', 'WD'], name_y='Y', name_ds='simulation')
        # row 2 of betas should be the D coefficient, assuming spreg prepends
        # the constant to the regressors -- TODO confirm against reg.summary
        spatial_2stage = reg.betas[2][0]

        # store in container
        ATE[j] = ate_true
        Nspat[j] = nonspatial_ols
        Spat[j] = spatial_2stage

    # save the averages over all runs of this design in the dataframe
    df.loc[:, column] = [np.mean(ATE), np.mean(Nspat), np.mean(Spat)]

In [59]:
# Display the current results table `df`; it has only three rows
# (ATE, Non-spatial, spatial), so head() shows it in full.
df.head()

Unnamed: 0,Sim1,Sim2,Sim3
ATE,0.25,0.25,0.25
Non-spatial,0.36,0.34,0.41
spatial,0.25,0.25,0.25


In [60]:
# df.to_csv("data/SDM_sim.csv")

**Backdoor Specification**

In [140]:
# Backdoor design: for every simulated sample, record the true ATE, the
# non-spatial OLS coefficient and the spatial two-stage (GM_Lag) coefficient,
# where the spatial model also includes the spatially lagged treatment WD.
np.random.seed(123)  # seeds NumPy's global RNG (presumably what simulate_backdoor_sample draws from)

# number of simulations and number of observations per simulated sample
if os.environ.get("CI") == "true":
    # reduced run for continuous integration: first design only
    n_sims = [1]#, 100, 100]
    n_obs = [100]#, 100, 2500]
else:
    n_sims = [1, 100, 100]
    n_obs = [100, 100, 2500]

# results table: one column per design, one row per quantity
columns = ["Sim1", "Sim2", "Sim3"]
df = pd.DataFrame(columns=columns, index=["ATE", "Non-spatial", "spatial"])

# Walk through the designs with zip rather than an index stored in `_`:
# `_` is IPython's "last output" variable and should not be overwritten.
# zip stops at the shortest list, so the CI run fills only "Sim1".
for column, n, size in zip(columns, n_sims, n_obs):

    # initialize containers, one slot per simulation run
    ATE = np.empty((n, 1))
    Nspat = np.empty((n, 1))
    Spat = np.empty((n, 1))

    for j in range(n):
        data, w = simulate_backdoor_sample(size)

        # true ATE: mean of Y_1 - Y_0 (presumably the two potential outcomes)
        ate_true = data["Y_1"].sub(data["Y_0"]).mean()

        # Explicit positional access: `params[2]` on a label-indexed Series is
        # deprecated in pandas >= 2.1. With numeric X and D the order should be
        # Intercept, X, D, i.e. position 2 is the coefficient on D.
        nonspatial_ols = smf.ols("Y ~ X + D", data=data).fit().params.iloc[2]

        # spatial 2 stage
        # preparing data for pysal spreg: y as an (n_obs, 1) column vector,
        # regressors as an (n_obs, 3) matrix with columns X, D and WD
        y = data["Y"].to_numpy().reshape(-1, 1)
        X = np.array(
            [data["X"].to_numpy(), data["D"].to_numpy(), data["WD"].to_numpy()]
        ).T

        # row standardize matrix
        w.transform = 'r'

        # two-stage regression
        reg = spreg.GM_Lag(y, X, w=w, w_lags=1, name_x=['X', 'D', 'WD'], name_y='Y', name_ds='simulation')
        # row 2 of betas should be the D coefficient, assuming spreg prepends
        # the constant to the regressors -- TODO confirm against reg.summary
        spatial_2stage = reg.betas[2][0]

        # store in container
        ATE[j] = ate_true
        Nspat[j] = nonspatial_ols
        Spat[j] = spatial_2stage

    # save the averages over all runs of this design in the dataframe
    df.loc[:, column] = [np.mean(ATE), np.mean(Nspat), np.mean(Spat)]

In [141]:
# Display the current results table `df`; it has only three rows
# (ATE, Non-spatial, spatial), so head() shows it in full.
df.head()

Unnamed: 0,Sim1,Sim2,Sim3
ATE,0.25,0.25,0.25
Non-spatial,0.3,0.41,0.41
spatial,0.25,0.25,0.25


In [142]:
# df.to_csv("data/backdoor_sim.csv")