# Bounds Example

In [1]:
import pandas as pd
import numpy as np
import spillover_effects as spef

### Read data

The data come from Beuermann et al. (2015), who studied the short-term impacts of the "One Laptop per Child" program, which aims to promote self-empowered learning by providing personal laptops to children in developing countries.

- Beuermann, Diether W, Julian Cristia, Santiago Cueto, Ofer Malamud, and Yyannu Cruz-Aguayo, “One laptop per child at home: Short-term impacts from a randomized experiment in Peru,” American Economic Journal: Applied Economics, 2015, 7 (2), 53–80.

In [2]:
# Base URL of the example data; the same prefix is reused for every CSV
path_data = 'https://raw.githubusercontent.com/pabloestradac/spillover-effects/main/data/'
# Friendship table: one row per student with columns friend1..friend4
edges_url = path_data + 'edges_students.csv'
edges = pd.read_csv(edges_url)
edges.head(10)

Unnamed: 0,student,friend1,friend2,friend3,friend4
0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
1,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
2,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
3,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
4,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
5,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
6,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
7,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,
8,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0
9,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0,2083300000000.0


In [3]:
# Student-level table (outcomes, lottery assignment, covariates)
data_url = path_data + 'data_students.csv'
data = pd.read_csv(data_url)
data.head(10)

Unnamed: 0,school,grade,section,classroom,student,computer_use,won_lottery,male,age,n_siblings,...,activity_6,activity_7,activity_8,activity_10,past_computer_use,past_week_home,past_week_school,past_week_cafe,past_week_friend_house,past_week_other
0,208330,3,1,208330_3_1,2083300301002,544.0,0,1.0,7.0,2.0,...,1.0,1.0,1.0,0.0,380.0,1.0,1.0,1.0,0.0,0.0
1,208330,3,1,208330_3_1,2083300301003,0.0,0,1.0,8.0,1.0,...,1.0,1.0,1.0,0.0,102.0,0.0,1.0,0.0,0.0,0.0
2,208330,3,1,208330_3_1,2083300301005,116.0,1,0.0,8.0,2.0,...,1.0,1.0,1.0,1.0,280.0,0.0,1.0,0.0,0.0,0.0
3,208330,3,1,208330_3_1,2083300301006,380.0,0,1.0,9.0,6.0,...,1.0,0.0,1.0,1.0,140.0,0.0,1.0,0.0,0.0,0.0
4,208330,3,1,208330_3_1,2083300301007,102.0,0,1.0,9.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
5,208330,3,1,208330_3_1,2083300301008,360.0,0,0.0,8.0,3.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
6,208330,3,1,208330_3_1,2083300301009,120.0,0,1.0,8.0,3.0,...,1.0,1.0,1.0,1.0,502.0,1.0,1.0,1.0,1.0,1.0
7,208330,3,1,208330_3_1,2083300301010,450.0,0,0.0,8.0,2.0,...,1.0,1.0,1.0,1.0,441.0,1.0,1.0,0.0,0.0,1.0
8,208330,3,1,208330_3_1,2083300301011,240.0,1,0.0,8.0,4.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
9,208330,3,1,208330_3_1,2083300301013,60.0,0,0.0,8.0,2.0,...,1.0,1.0,1.0,0.0,120.0,1.0,1.0,0.0,0.0,0.0


### Spillover Effects

We are interested in estimating the spillover effects of having at least one friend treated. In order to use the WLS and Bounds estimators, we need to calculate the kernel weight matrix, exposure treatment, and propensity score. This is broken down into four steps.

In [4]:
# 1. Build the adjacency matrix together with the array giving the node order
student_ids = data['student'].values
A, nodes = spef.utils.adjacency_matrix(edges, nodes=student_ids, directed=True)
(A, nodes)

(<3085x3085 sparse matrix of type '<class 'numpy.uint32'>'
 	with 7378 stored elements in COOrdinate format>,
 array([ 2083300301002,  2083300301003,  2083300301005, ...,
        10449990603024, 10449990603025, 10449990603029]))

In [5]:
# 2. Put the rows of `data` in the same order as `nodes`
data_by_student = data.set_index('student')
# .loc raises if a node id is absent from the data, so no student is silently dropped
data = data_by_student.loc[nodes].reset_index()
data.head(10)

Unnamed: 0,student,school,grade,section,classroom,computer_use,won_lottery,male,age,n_siblings,...,activity_6,activity_7,activity_8,activity_10,past_computer_use,past_week_home,past_week_school,past_week_cafe,past_week_friend_house,past_week_other
0,2083300301002,208330,3,1,208330_3_1,544.0,0,1.0,7.0,2.0,...,1.0,1.0,1.0,0.0,380.0,1.0,1.0,1.0,0.0,0.0
1,2083300301003,208330,3,1,208330_3_1,0.0,0,1.0,8.0,1.0,...,1.0,1.0,1.0,0.0,102.0,0.0,1.0,0.0,0.0,0.0
2,2083300301005,208330,3,1,208330_3_1,116.0,1,0.0,8.0,2.0,...,1.0,1.0,1.0,1.0,280.0,0.0,1.0,0.0,0.0,0.0
3,2083300301006,208330,3,1,208330_3_1,380.0,0,1.0,9.0,6.0,...,1.0,0.0,1.0,1.0,140.0,0.0,1.0,0.0,0.0,0.0
4,2083300301007,208330,3,1,208330_3_1,102.0,0,1.0,9.0,1.0,...,1.0,1.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0
5,2083300301008,208330,3,1,208330_3_1,360.0,0,0.0,8.0,3.0,...,1.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0,0.0,1.0
6,2083300301009,208330,3,1,208330_3_1,120.0,0,1.0,8.0,3.0,...,1.0,1.0,1.0,1.0,502.0,1.0,1.0,1.0,1.0,1.0
7,2083300301010,208330,3,1,208330_3_1,450.0,0,0.0,8.0,2.0,...,1.0,1.0,1.0,1.0,441.0,1.0,1.0,0.0,0.0,1.0
8,2083300301011,208330,3,1,208330_3_1,240.0,1,0.0,8.0,4.0,...,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0
9,2083300301013,208330,3,1,208330_3_1,60.0,0,0.0,8.0,2.0,...,1.0,1.0,1.0,0.0,120.0,1.0,1.0,0.0,0.0,0.0


In [6]:
# 3. Create exposure treatments and propensity score
# pscore: probability of having at least one neighbor treated
# exposure: 1 if at least one neighbor is treated, 0 otherwise
# NOTE(review): n_treated = 4 is presumably the number of lottery winners per
# classroom block -- confirm against the study design / spillover_pscore docs
n_treated = 4
data['pscore'] = spef.utils.spillover_pscore(A, n_treated, blocks=data['classroom'])
data['exposure'] = spef.utils.spillover_treatment(data['won_lottery'], A)
# Summary statistics of the two new columns next to the own-treatment indicator
data[['pscore', 'exposure', 'won_lottery']].describe().round(3)

Unnamed: 0,pscore,exposure,won_lottery
count,3085.0,3085.0,3085.0
mean,0.431,0.42,0.216
std,0.199,0.494,0.412
min,0.0,0.0,0.0
25%,0.324,0.0,0.0
50%,0.453,0.0,0.0
75%,0.562,1.0,0.0
max,1.0,1.0,1.0


In [7]:
# 4. Create kernel matrix for HAC standard errors
# Element i,j is 1 if the length of the shortest path between i and j is less than or equal to optimal bandwidth
distances, bandwidth = spef.utils.kernel(A)
print('Optimal bandwidth:', bandwidth)

Optimal bandwidth: 2


## Selection: Conditional on Positive Effects

We want to estimate the effect of having at least one friend treated on computer use at the intensive margin. 

At the intensive margin, all outcomes are positive and we can take logarithms.

We will focus on the subgroup of students who won the lottery.

In [8]:
# COP: conditional on positive
# Restrict the estimation sample to lottery winners
subgroup = data['won_lottery'].eq(1)
# Selection indicator: 1 when computer use is at least 1, 0 otherwise
data['selection'] = data['computer_use'].ge(1) * 1
selection = data['selection'].eq(1).values
# log(use) is defined only for selected observations; all others stay NaN
data['log_use'] = np.log(data['computer_use'].where(selection))
n_missing = (~selection).sum()
print('{} observations with missing outcomes for log(use)'.format(n_missing))

788 observations with missing outcomes for log(use)


If we assume outcomes are missing at random (or that there is no effect at the extensive margin), we can use WLS.

In [9]:
# No covariates: WLS of log(use) on the exposure indicator within the subgroup
res_wls = spef.WLS(
    name_y='log_use',
    name_z='exposure',
    name_pscore='pscore',
    data=data,
    subsample=subgroup,
    kernel_weights=distances,
)
res_wls.summary.round(2)



Unnamed: 0,coef,se,t-val,p-val,ci-low,ci-up
spillover,0.1,0.09,1.16,0.25,-0.07,0.27
exposure0,4.71,0.06,80.96,0.0,4.6,4.83
exposure1,4.81,0.07,74.0,0.0,4.68,4.94


In [10]:
# Row 0 / column 0 of the summary is the spillover coefficient. The outcome is
# log_use, so coefficient x 100 is read as an approximate percent change.
spillover_coef = res_wls.summary.iloc[0, 0]
print('Having at least one treated friend increases computer usage by ' +
      '{:.0f}%'.format(spillover_coef * 100))

Having at least one treated friend increases computer usage by 10%


In [11]:
# Include covariates
name_covariates = [
    'male', 'age', 'n_siblings', 'n_young_siblings',
    'father_lives_home', 'father_works_home', 'mother_works_home',
    'home_phone', 'home_power', 'home_car', 'home_moto',
    'past_computer_use',
    'past_week_home', 'past_week_school', 'past_week_cafe',
    'past_week_friend_house', 'past_week_other',
]
res_wls = spef.WLS(
    name_y='log_use',
    name_z='exposure',
    name_pscore='pscore',
    name_x=name_covariates,
    interaction=False,
    data=data,
    subsample=subgroup,
    kernel_weights=distances,
)
# Show only the first three rows (spillover, exposure0, exposure1)
res_wls.summary.head(3).round(2)



Unnamed: 0,coef,se,t-val,p-val,ci-low,ci-up
spillover,0.15,0.09,1.61,0.11,-0.03,0.33
exposure0,4.69,0.06,78.3,0.0,4.57,4.8
exposure1,4.83,0.07,68.61,0.0,4.7,4.97


In [12]:
# Spillover coefficient (row 0, column 0) on the log outcome, reported as an
# approximate percent change
spillover_coef = res_wls.summary.iloc[0, 0]
print('Having at least one treated friend increases computer usage by ' +
      '{:.0f}%'.format(spillover_coef * 100))

Having at least one treated friend increases computer usage by 15%


## Bounds

In [13]:
# Bounds on the spillover effect without covariates, same sample and weights as WLS
res_bounds = spef.Bounds(
    name_y='log_use',
    name_z='exposure',
    name_pscore='pscore',
    data=data,
    subsample=subgroup,
    kernel_weights=distances,
)
res_bounds.summary.round(2)



Unnamed: 0,lower-bound,upper-bound,ci-low,ci-up
spillover,0.06,0.16,-0.1,0.32


In [14]:
# Columns 0 and 1 of the summary are the lower and upper bound (in log points)
lower_pct = res_bounds.summary.iloc[0, 0] * 100
upper_pct = res_bounds.summary.iloc[0, 1] * 100
print('Having at least one treated friend increases computer usage from ' +
      '{:.0f}% to {:.0f}%'.format(lower_pct, upper_pct))

Having at least one treated friend increases computer usage from 6% to 16%


In [15]:
# Covariate-assisted bounds with the parametric method; the verbose log reports
# a logit for the probabilities and quantile regression for the quantiles
res_pmt = spef.BoundsML(
    name_y='log_use',
    name_z='exposure',
    name_pscore='pscore',
    name_x=name_covariates,
    data=data,
    subsample=subgroup,
    kernel_weights=distances,
    method='parametric',
    n_splits=1,
    verbose=True,
)
res_pmt.summary.round(2)

Predicted probabilities with method: logit
accuracy=0.82
Conditional quantiles with method: quantile-reg
Q 0.01) R2 = -6.52
Q 0.99) R2 = -3.02


Unnamed: 0,lower-bound,upper-bound,ci-low,ci-up
spillover,0.35,0.41,0.29,0.48


In [16]:
# Columns 0 and 1 of the summary are the lower and upper bound (in log points)
lower_pct = res_pmt.summary.iloc[0, 0] * 100
upper_pct = res_pmt.summary.iloc[0, 1] * 100
print('Having at least one treated friend increases computer usage from ' +
      '{:.0f}% to {:.0f}%'.format(lower_pct, upper_pct))

Having at least one treated friend increases computer usage from 35% to 41%


In [17]:
# Covariate-assisted bounds with the lasso method, fixed seed for reproducibility
# NOTE(review): lambdas_proba / lambdas_quant look like candidate penalty grids
# for the probability and quantile models -- confirm in the BoundsML docs
res_las = spef.BoundsML(
    name_y='log_use',
    name_z='exposure',
    name_pscore='pscore',
    name_x=name_covariates,
    data=data,
    subsample=subgroup,
    kernel_weights=distances,
    method='lasso',
    n_splits=10,
    n_cvs=10,
    lambdas_proba=np.linspace(1, 10, 100),
    lambdas_quant=np.linspace(0.01, 1, 100),
    seed=42,
    verbose=True,
)
res_las.summary.round(2)

Predicted probabilities with method: postlasso-logit
14 (72.2%) selected variables; lambda=3.00; accuracy=0.81
14 (77.8%) selected variables; lambda=3.00; accuracy=0.79
10 (50.0%) selected variables; lambda=3.00; accuracy=0.81
14 (72.2%) selected variables; lambda=3.00; accuracy=0.83
12 (61.1%) selected variables; lambda=3.00; accuracy=0.83
14 (72.2%) selected variables; lambda=3.00; accuracy=0.83
12 (61.1%) selected variables; lambda=3.00; accuracy=0.83
12 (61.1%) selected variables; lambda=3.00; accuracy=0.83
13 (66.7%) selected variables; lambda=3.00; accuracy=0.83
14 (77.8%) selected variables; lambda=3.00; accuracy=0.80
Conditional quantiles with method: postlasso-qr
Q 0.01) 1 (0.0%) selected variables; lambda = 0.04; R2 = -3.64
Q 0.10) 4 (16.7%) selected variables; lambda = 0.04; R2 = -0.39
Q 0.90) 2 (5.6%) selected variables; lambda = 0.04; R2 = -1.92
Q 0.99) 1 (0.0%) selected variables; lambda = 0.04; R2 = -5.85
Q 0.01) 1 (0.0%) selected variables; lambda = 0.04; R2 = -2.62
Q 0

Unnamed: 0,lower-bound,upper-bound,ci-low,ci-up
spillover,0.25,0.36,0.2,0.44


In [18]:
# Columns 0 and 1 of the summary are the lower and upper bound (in log points)
lower_pct = res_las.summary.iloc[0, 0] * 100
upper_pct = res_las.summary.iloc[0, 1] * 100
print('Having at least one treated friend increases computer usage from ' +
      '{:.0f}% to {:.0f}%'.format(lower_pct, upper_pct))

Having at least one treated friend increases computer usage from 25% to 36%


In [19]:
# Covariate-assisted bounds with the automl method, fixed seed for reproducibility
res_aml = spef.BoundsML(
    name_y='log_use',
    name_z='exposure',
    name_pscore='pscore',
    name_x=name_covariates,
    data=data,
    subsample=subgroup,
    kernel_weights=distances,
    method='automl',
    n_splits=10,
    n_cvs=10,
    seed=42,
    verbose=True,
)
res_aml.summary.round(2)

Predicted probabilities with method: automl
xgb_limitdepth; accuracy=0.81
xgb_limitdepth; accuracy=0.81
xgb_limitdepth; accuracy=0.81
xgb_limitdepth; accuracy=0.83
xgb_limitdepth; accuracy=0.83
xgb_limitdepth; accuracy=0.83
xgb_limitdepth; accuracy=0.83
xgb_limitdepth; accuracy=0.83
xgb_limitdepth; accuracy=0.83
xgb_limitdepth; accuracy=0.78
Conditional quantiles with method: gradient-boosting
Q 0.01) R2 = -4.27
Q 0.10) R2 = -0.56
Q 0.75) R2 = -0.57
Q 0.90) R2 = -1.64
Q 0.99) R2 = -3.80
Q 0.01) R2 = -1.51
Q 0.10) R2 = -0.62
Q 0.75) R2 = -0.23
Q 0.90) R2 = -0.90
Q 0.99) R2 = -2.13
Q 0.01) R2 = -6.05
Q 0.10) R2 = -2.22
Q 0.75) R2 = 0.02
Q 0.90) R2 = -0.61
Q 0.99) R2 = -2.37
Q 0.01) R2 = -4.98
Q 0.10) R2 = -0.94
Q 0.75) R2 = -0.80
Q 0.90) R2 = -2.34
Q 0.99) R2 = -4.63
Q 0.01) R2 = -5.25
Q 0.10) R2 = -1.51
Q 0.75) R2 = -0.60
Q 0.90) R2 = -2.11
Q 0.99) R2 = -4.66
Q 0.01) R2 = -4.85
Q 0.10) R2 = -1.12
Q 0.75) R2 = -0.33
Q 0.90) R2 = -1.24
Q 0.99) R2 = -3.78
Q 0.01) R2 = -9.08
Q 0.10) R2 = -2

Unnamed: 0,lower-bound,upper-bound,ci-low,ci-up
spillover,0.31,0.55,0.25,0.66


In [20]:
# Columns 0 and 1 of the summary are the lower and upper bound (in log points).
# NOTE(review): at this magnitude the x100 reading is a loose approximation of
# the percent change; exp(b) - 1 would be the exact conversion.
lower_pct = res_aml.summary.iloc[0, 0] * 100
upper_pct = res_aml.summary.iloc[0, 1] * 100
print('Having at least one treated friend increases computer usage from ' +
      '{:.0f}% to {:.0f}%'.format(lower_pct, upper_pct))

Having at least one treated friend increases computer usage from 31% to 55%
