# RCT data

The data comes from the paper: 
- [Beuermann et al. "One Laptop per Child at Home: Short-Term Impacts from a Randomized Experiment in Peru", American Economic Journal: Applied Economics](https://www.aeaweb.org/articles?id=10.1257/app.20130267)

It is publicly available here: https://www.openicpsr.org/openicpsr/project/113587/version/V1/view

Specifically, we will use:
- `listas_final.dta`: Dataset containing student status with respect to lottery participation and whether an XO laptop was obtained.
- `cestudiante_g3-6_p2_r1.dta`: Raw dataset of the second section of the student questionnaire before the experiment.
- `cestudiante_g3-6_p1_r2.dta`: Raw dataset of the first section of the student questionnaire after the experiment.
- `cestudiante_g3-6_p2_r2.dta`: Raw dataset of the second section of the student questionnaire after the experiment.

In [1]:
import numpy as np
import pandas as pd

## Individual Data

In [2]:
# Lottery lists: student status with respect to lottery participation and laptop receipt
path_data = "../data/"
lottery_lists = pd.read_stata(path_data + "listas_final.dta")

# A student is eligible when the product of the two indicators equals 1,
# i.e. the student participated in the lottery and is in a treated school
is_eligible = (lottery_lists['participated_in_lottery'] * lottery_lists['treatment_school']) == 1
eligible = lottery_lists.loc[is_eligible, 'codest'].values

# Keep every row whose student id is among the eligible ids, restricted to the
# id and the two lottery/treatment columns, all cast to integers
kept_columns = ['codest', 'won_lottery', 'received_laptop']
df_RCT = lottery_lists.loc[lottery_lists['codest'].isin(eligible), kept_columns].astype(int)
df_RCT

Unnamed: 0,codest,won_lottery,received_laptop
0,2083300101001,0,0
2,2083300101003,0,0
3,2083300101004,0,0
4,2083300101005,0,0
5,2083300101006,0,0
...,...,...,...
9813,10449990603020,0,0
9814,10449990603021,0,0
9817,10449990603024,0,0
9818,10449990603025,0,0


In [3]:
# Part 2 has outcome data
df_p2 = pd.read_stata(path_data + "cestudiante_g3-6_p2_r2.dta", convert_categoricals=False)

# Recode the -9 / -8 codes (treated as missing elsewhere in this notebook) to NaN
# BEFORE doing arithmetic on the hour (P4_A<k>) and minute (P4_B<k>) items.
# Otherwise a code such as -9 would be added up as "-9 hours" or "-9 minutes"
# and silently distort the totals below.
# NOTE(review): totals that previously absorbed such a code now become NaN
# (except for "other", which is still filled with 0) — confirm this is the
# desired treatment of partially answered items.
time_cols = ['P4_' + part + str(k) for k in range(1, 6) for part in ('A', 'B')]
df_p2[time_cols] = df_p2[time_cols].replace([-9, -8], np.nan)

# Outcomes: computer use (minutes yesterday) = hours * 60 + minutes, per location
df_p2['use_home'] = (df_p2['P4_A2'] * 60) + df_p2['P4_B2']
df_p2['use_school'] = (df_p2['P4_A1'] * 60) + df_p2['P4_B1']
df_p2['use_cafe'] = (df_p2['P4_A3'] * 60) + df_p2['P4_B3']
df_p2['use_friend_house'] = (df_p2['P4_A4'] * 60) + df_p2['P4_B4']
# "Other" is the only location whose missing answers are counted as zero minutes
df_p2['use_other'] = (df_p2['P4_A5'].fillna(0) * 60) + df_p2['P4_B5'].fillna(0)

# Total over all locations; NaN in any of the first four locations propagates to the total
df_p2['use_all'] = df_p2['use_home'] + df_p2['use_school'] + df_p2['use_cafe'] + df_p2['use_friend_house'] + df_p2['use_other']

# Keep only the identifiers and the total
df_p2 = df_p2[['CODMOD', 'GRADO', 'SECCION', 'CODEST', 'use_all']]
df_p2

Unnamed: 0,CODMOD,GRADO,SECCION,CODEST,use_all
0,208330.0,3.0,1.0,2.083300e+12,510.0
1,208330.0,3.0,1.0,2.083300e+12,544.0
2,208330.0,3.0,1.0,2.083300e+12,0.0
3,208330.0,3.0,1.0,2.083300e+12,116.0
4,208330.0,3.0,1.0,2.083300e+12,380.0
...,...,...,...,...,...
6036,1258649.0,6.0,1.0,1.258649e+13,110.0
6037,1258649.0,6.0,1.0,1.258649e+13,0.0
6038,1258649.0,6.0,1.0,1.258649e+13,75.0
6039,1258649.0,6.0,1.0,1.258649e+13,210.0


In [4]:
# Part 1 of the student questionnaire: source of the baseline covariates
df_p1 = pd.read_stata(path_data + "cestudiante_g3-6_p1_r2.dta", convert_categoricals=False)

# Questionnaire item -> covariate name (student id, items P1-P7, then the four P8_A items)
covariate_names = {
    'CODEST': 'student',
    'P1': 'male',
    'P2': 'age',
    'P3': 'n_siblings',
    'P4': 'n_young_siblings',
    'P5': 'father_lives_home',
    'P6': 'father_works_home',
    'P7': 'mother_works_home',
    'P8_A1': 'home_phone',
    'P8_A2': 'home_power',
    'P8_A3': 'home_car',
    'P8_A4': 'home_moto',
}

# Keep only those items, turn the -9 / -8 codes into missing values, then rename
df_cov = df_p1[list(covariate_names.keys())].replace([-9, -8], np.nan)
df_cov.columns = list(covariate_names.values())

# Indicator columns: everything except the id, the age and the two sibling counts.
# A raw value of 2 is recoded to 0; every other value (including NaN) is left as is.
non_dummy = {'student', 'age', 'n_siblings', 'n_young_siblings'}
dummy_cols = [name for name in df_cov.columns if name not in non_dummy]
is_two = df_cov[dummy_cols] == 2
df_cov[dummy_cols] = np.where(is_two, 0, df_cov[dummy_cols])
df_cov

Unnamed: 0,student,male,age,n_siblings,n_young_siblings,father_lives_home,father_works_home,mother_works_home,home_phone,home_power,home_car,home_moto
0,2.083300e+12,1.0,9.0,3.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,0.0
1,2.083300e+12,1.0,7.0,2.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
2,2.083300e+12,1.0,8.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0
3,2.083300e+12,0.0,8.0,2.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,1.0
4,2.083300e+12,1.0,9.0,6.0,0.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...
6003,1.258649e+13,0.0,11.0,2.0,2.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0
6004,1.258649e+13,0.0,13.0,3.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0
6005,1.258649e+13,1.0,11.0,0.0,0.0,1.0,1.0,0.0,,1.0,,
6006,1.258649e+13,0.0,11.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,0.0,0.0


In [5]:
# Round 1 has baseline outcome data
df_r1 = pd.read_stata(path_data + "cestudiante_g3-6_p2_r1.dta", convert_categoricals=False)

# Recode the -8 / -9 missing-value codes to NaN BEFORE any arithmetic.
# Doing it only on the final frame cannot work for the minutes total: once a
# code has been multiplied by 60 and summed with other answers, the result is
# no longer equal to -8 or -9 and would be kept as if it were a real duration.
# NOTE(review): totals that previously absorbed such a code now become NaN —
# confirm this is the desired treatment of partially answered items.
time_cols = ['P5_' + part + str(i) for i in range(1, 6) for part in ('A', 'B')]
week_cols = ['P6_A' + str(i) for i in range(1, 6)]
df_r1[time_cols + week_cols] = df_r1[time_cols + week_cols].replace([-8, -9], np.nan)

# Baseline Outcome: computer use (minutes yesterday) = hours * 60 + minutes, per location
df_r1['past_use_home'] = (df_r1['P5_A2'] * 60) + df_r1['P5_B2']
df_r1['past_use_school'] = (df_r1['P5_A1'] * 60) + df_r1['P5_B1']
df_r1['past_use_cafe'] = (df_r1['P5_A3'] * 60) + df_r1['P5_B3']
df_r1['past_use_friend_house'] = (df_r1['P5_A4'] * 60) + df_r1['P5_B4']
df_r1['past_use_other'] = (df_r1['P5_A5'] * 60) + df_r1['P5_B5']
# Total over all locations; a NaN in any location propagates to the total
df_r1['past_use_all'] =  df_r1['past_use_home'] + df_r1['past_use_school'] + df_r1['past_use_cafe'] + df_r1['past_use_friend_house'] + df_r1['past_use_other']

# Data with baseline outcome and other covariates.
# An explicit copy is taken so the column assignment below never acts on a slice of df_r1.
df_past_outcome = df_r1[['CODEST', 'past_use_all'] + week_cols].copy()
df_past_outcome.columns = ['student', 'past_use_all',
                           'past_week_home', 'past_week_school', 'past_week_cafe', 'past_week_friend_house', 'past_week_other']
dummy_cols = ['past_week_home', 'past_week_school', 'past_week_cafe', 'past_week_friend_house', 'past_week_other']

# Other covariates: recode a raw value of 2 to 0 so the indicators are 0/1 (NaN stays NaN)
df_past_outcome[dummy_cols] = np.where(df_past_outcome[dummy_cols] == 2, 0, df_past_outcome[dummy_cols])
df_past_outcome

Unnamed: 0,student,past_use_all,past_week_home,past_week_school,past_week_cafe,past_week_friend_house,past_week_other
0,2.083300e+12,0.0,1.0,1.0,1.0,1.0,
1,2.083300e+12,380.0,1.0,1.0,1.0,0.0,0.0
2,2.083300e+12,102.0,0.0,1.0,0.0,0.0,0.0
3,2.083300e+12,0.0,1.0,0.0,1.0,1.0,1.0
4,2.083300e+12,280.0,0.0,1.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...
6116,1.258649e+13,90.0,0.0,0.0,1.0,1.0,1.0
6117,1.258649e+13,120.0,0.0,0.0,1.0,0.0,1.0
6118,1.258649e+13,,,,,,
6119,1.258649e+13,85.0,0.0,0.0,1.0,0.0,0.0


In [6]:
# Attach the lottery status to the outcome data (the inner join keeps only students
# present in both frames), drop the duplicated id column, give the identifiers
# readable names and cast them to integers
id_names = {'CODMOD': 'school', 'GRADO': 'grade', 'SECCION': 'section', 'CODEST': 'student'}
data = (
    df_p2.merge(df_RCT, left_on='CODEST', right_on='codest', how='inner')
    .drop(columns=['codest'])
    .rename(columns=id_names)
    .astype({'school': int, 'grade': int, 'section': int, 'student': int})
)

# Left-join the baseline covariates first, then the baseline outcome, on the student id;
# students without a match keep NaN in the added columns
for right_frame in (df_cov, df_past_outcome):
    data = data.merge(right_frame, on='student', how='left')

# Classroom identifier of the form "<school>_<grade>_<section>"
school_str = data['school'].astype(str)
grade_str = data['grade'].astype(str)
section_str = data['section'].astype(str)
data['classroom'] = school_str + '_' + grade_str + '_' + section_str

data

Unnamed: 0,school,grade,section,student,use_all,won_lottery,received_laptop,male,age,n_siblings,...,home_power,home_car,home_moto,past_use_all,past_week_home,past_week_school,past_week_cafe,past_week_friend_house,past_week_other,classroom
0,208330,3,1,2083300301002,544.0,0,0,1.0,7.0,2.0,...,1.0,0.0,0.0,380.0,1.0,1.0,1.0,0.0,0.0,208330_3_1
1,208330,3,1,2083300301003,0.0,0,0,1.0,8.0,1.0,...,1.0,0.0,0.0,102.0,0.0,1.0,0.0,0.0,0.0,208330_3_1
2,208330,3,1,2083300301005,116.0,1,1,0.0,8.0,2.0,...,1.0,0.0,1.0,280.0,0.0,1.0,0.0,0.0,0.0,208330_3_1
3,208330,3,1,2083300301006,380.0,0,0,1.0,9.0,6.0,...,1.0,1.0,1.0,140.0,0.0,1.0,0.0,0.0,0.0,208330_3_1
4,208330,3,1,2083300301007,102.0,0,0,1.0,9.0,1.0,...,1.0,0.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,208330_3_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3080,1044999,6,3,10449990603020,30.0,0,0,1.0,10.0,0.0,...,1.0,0.0,0.0,,,,,,,1044999_6_3
3081,1044999,6,3,10449990603021,135.0,0,0,0.0,13.0,2.0,...,1.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0,1044999_6_3
3082,1044999,6,3,10449990603024,390.0,0,0,0.0,12.0,5.0,...,1.0,0.0,1.0,,,,,,,1044999_6_3
3083,1044999,6,3,10449990603025,240.0,0,0,1.0,11.0,2.0,...,1.0,1.0,0.0,300.0,0.0,1.0,0.0,0.0,0.0,1044999_6_3


In [7]:
# Columns written to disk, in output order ('received_laptop' is not exported)
id_cols = ['school', 'grade', 'section', 'classroom', 'student']
outcome_and_treatment = ['use_all', 'won_lottery']
covariate_cols = ['male', 'age', 'n_siblings', 'n_young_siblings', 'father_lives_home',
                  'father_works_home', 'mother_works_home', 'home_phone', 'home_power',
                  'home_car', 'home_moto']
baseline_cols = ['past_use_all', 'past_week_home', 'past_week_school', 'past_week_cafe',
                 'past_week_friend_house', 'past_week_other']
export_cols = id_cols + outcome_and_treatment + covariate_cols + baseline_cols

# Select the columns, give the two usage variables their final names and save as CSV
final_names = {'use_all': 'computer_use', 'past_use_all': 'past_computer_use'}
data = data[export_cols].rename(columns=final_names)
data.to_csv(path_data + "data.csv", index=False)

## Network Data

In [8]:
# First five columns of the part-1 questionnaire whose name contains 'CODEST':
# the student's own id followed by four further id columns, exported as friend1..friend4
id_like_cols = [name for name in df_p1.columns if 'CODEST' in name][:5]

# Restrict to the students that made it into the final individual-level data
in_final_sample = df_p1['CODEST'].isin(data['student'].values)
df_edges = df_p1.loc[in_final_sample, id_like_cols]

df_edges.columns = ['student', 'friend1', 'friend2', 'friend3', 'friend4']
df_edges.to_csv(path_data + "edges.csv", index=False)