In [1]:
import pandas as pd

## Files

In [2]:
# Folder that holds the raw ENAHO Stata files.
path = '../../data/raw/ENAHO/'

# Dwelling files, one per survey year (2014-2018).
(enaho_dwelling_2014_file,
 enaho_dwelling_2015_file,
 enaho_dwelling_2016_file,
 enaho_dwelling_2017_file,
 enaho_dwelling_2018_file) = [
    path + file_name
    for file_name in (
        'enaho01-2014-100.dta',
        'enaho01-2015-100.dta',
        'enaho01-2016-100.dta',
        'enaho01-2017-100.dta',
        'enaho01-2018-100.dta',
    )
]

# Summary ("sumaria") files, one per survey year (2014-2018).
(enaho_summary_2014_file,
 enaho_summary_2015_file,
 enaho_summary_2016_file,
 enaho_summary_2017_file,
 enaho_summary_2018_file) = [
    path + file_name
    for file_name in (
        'sumaria-2014.dta',
        'sumaria-2015.dta',
        'sumaria-2016.dta',
        'sumaria-2017.dta',
        'sumaria-2018.dta',
    )
]

## Dwelling

In [3]:
# Columns read from each yearly dwelling file.
cols_dwelling = [
    'aÑo', 'conglome', 'vivienda', 'hogar',
    'ubigeo', 'result', 'p110', 'factor07',
]

# Load the five yearly dwelling files, keeping only the columns listed above.
(enaho_dwelling_2014,
 enaho_dwelling_2015,
 enaho_dwelling_2016,
 enaho_dwelling_2017,
 enaho_dwelling_2018) = [
    pd.read_stata(stata_file, columns=cols_dwelling)
    for stata_file in (
        enaho_dwelling_2014_file,
        enaho_dwelling_2015_file,
        enaho_dwelling_2016_file,
        enaho_dwelling_2017_file,
        enaho_dwelling_2018_file,
    )
]

In [4]:
# Stack the yearly dwelling frames into a single frame with a fresh 0..n-1 index.
enaho_dwelling = pd.concat(
    [
        enaho_dwelling_2014,
        enaho_dwelling_2015,
        enaho_dwelling_2016,
        enaho_dwelling_2017,
        enaho_dwelling_2018,
    ],
    ignore_index=True,
)

In [5]:
# Map the ENAHO variable names to descriptive English column names.
# 'result' is not in the mapping, so it keeps its original name.
dwelling_names = dict([
    ('aÑo', 'year'),
    ('conglome', 'conglomerate'),
    ('vivienda', 'house'),
    ('hogar', 'household'),
    ('ubigeo', 'IDDIST'),
    ('p110', 'access to water'),
    ('factor07', 'household weight'),
])
enaho_dwelling = enaho_dwelling.rename(columns=dwelling_names)

In [6]:
# Keep only the districts whose IDDIST code starts with '1501' or '07'
# (presumably Lima province and Callao -- confirm against the zones file).
enaho_dwelling = enaho_dwelling.loc[
    enaho_dwelling['IDDIST'].apply(lambda ubigeo: ubigeo.startswith(('1501', '07')))
]
# Drop the households that have no recorded water source.
enaho_dwelling = enaho_dwelling.loc[enaho_dwelling['access to water'].notna()]
enaho_dwelling.head(n=5)

Unnamed: 0,year,conglomerate,house,household,IDDIST,result,access to water,household weight
4179,2014,1650,18,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004
4180,2014,1650,31,11,70101,completa,"red pública, dentro de la vivienda",313.410004
4181,2014,1650,44,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004
4183,2014,1650,81,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004
4185,2014,1652,3,11,70101,completa,"red pública, dentro de la vivienda",290.570007


## Summary

In [7]:
# Columns read from each yearly summary file.
cols_summary = [
    'aÑo', 'conglome', 'vivienda',
    'hogar', 'ubigeo', 'mieperho',
]

# Load the five yearly summary files, keeping only the columns listed above.
(enaho_summary_2014,
 enaho_summary_2015,
 enaho_summary_2016,
 enaho_summary_2017,
 enaho_summary_2018) = [
    pd.read_stata(stata_file, columns=cols_summary)
    for stata_file in (
        enaho_summary_2014_file,
        enaho_summary_2015_file,
        enaho_summary_2016_file,
        enaho_summary_2017_file,
        enaho_summary_2018_file,
    )
]

In [8]:
# Stack the yearly summary frames into a single frame with a fresh 0..n-1 index.
enaho_summary = pd.concat(
    [
        enaho_summary_2014,
        enaho_summary_2015,
        enaho_summary_2016,
        enaho_summary_2017,
        enaho_summary_2018,
    ],
    ignore_index=True,
)

In [9]:
# Map the ENAHO variable names to descriptive English column names,
# using the same key names as the dwelling frame.
summary_names = dict([
    ('aÑo', 'year'),
    ('conglome', 'conglomerate'),
    ('vivienda', 'house'),
    ('hogar', 'household'),
    ('ubigeo', 'IDDIST'),
    ('mieperho', 'num hh members'),
])
enaho_summary = enaho_summary.rename(columns=summary_names)

In [10]:
# Keep only the districts whose IDDIST code starts with '1501' or '07'
# (presumably Lima province and Callao -- confirm against the zones file).
enaho_summary = enaho_summary.loc[
    enaho_summary['IDDIST'].apply(lambda ubigeo: ubigeo.startswith(('1501', '07')))
]
enaho_summary.head(n=5)

Unnamed: 0,year,conglomerate,house,household,IDDIST,num hh members
3243,2014,1650,18,11,70101,3
3244,2014,1650,31,11,70101,4
3245,2014,1650,44,11,70101,4
3246,2014,1650,81,11,70101,5
3247,2014,1652,3,11,70101,2


## Merging

In [11]:
# Attach the household size from the summary frame to every dwelling record.
# A left join keeps dwelling rows even when no summary row matches.
enaho = enaho_dwelling.merge(
    enaho_summary,
    how='left',
    on=['year', 'conglomerate', 'house', 'household', 'IDDIST'],
)
enaho.head(n=5)

Unnamed: 0,year,conglomerate,house,household,IDDIST,result,access to water,household weight,num hh members
0,2014,1650,18,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,3
1,2014,1650,31,11,70101,completa,"red pública, dentro de la vivienda",313.410004,4
2,2014,1650,44,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,4
3,2014,1650,81,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,5
4,2014,1652,3,11,70101,completa,"red pública, dentro de la vivienda",290.570007,2


In [12]:
# Person-level weight: the household weight scaled by the number of household members.
enaho['hh members weight'] = enaho['household weight'].mul(enaho['num hh members'])

In [13]:
# Indicator column: 1.0 when the household's water source is the public network
# inside the house, 0.0 otherwise.
# A single comparison replaces the pair of complementary `==` / `!=` assignments,
# so the category string is written once and no row can be left unassigned.
# The cast to float keeps the 1.0 / 0.0 values the two-step assignment produced.
enaho['1-piped water inside the house'] = (
    enaho['access to water']
    .eq('red pública, dentro de la vivienda')
    .astype(float)
)

In [14]:
# Indicator column: 1.0 when the household's water source is the public network
# outside the house but inside the building, 0.0 otherwise.
# A single comparison replaces the pair of complementary `==` / `!=` assignments,
# so the category string is written once and no row can be left unassigned.
# The cast to float keeps the 1.0 / 0.0 values the two-step assignment produced.
enaho['2-piped water outside the house, inside the property'] = (
    enaho['access to water']
    .eq('red pública, fuera de la vivienda pero dentro del edificio')
    .astype(float)
)

In [15]:
# Indicator column: 1.0 when the household's water source is a tanker truck or a
# public standpipe, 0.0 otherwise.
# Two spellings of the standpipe category are listed, exactly as in the original
# masks (presumably the label changed between survey years -- confirm).
# One membership test replaces the complementary `==` / `!=` masks, so the list of
# categories exists in a single place and the two halves cannot drift apart.
# The cast to float keeps the 1.0 / 0.0 values the two-step assignment produced.
enaho['3-tanker truck or public reservoir'] = (
    enaho['access to water']
    .isin([
        'camión - cisterna u otro similar',
        'pilón o pileta de uso público',
        'pilón de uso público',
    ])
    .astype(float)
)

In [16]:
# Indicator column: 1.0 when the household's water source is a well, a river /
# ditch / spring, or "other", 0.0 otherwise.
# Two spellings of the well category are listed, exactly as in the original
# masks (presumably the label changed between survey years -- confirm).
# One membership test replaces the complementary `==` / `!=` masks, so the list of
# categories exists in a single place and the two halves cannot drift apart.
# The cast to float keeps the 1.0 / 0.0 values the two-step assignment produced.
enaho['4-water well, river or other'] = (
    enaho['access to water']
    .isin([
        'pozo',
        'pozo (agua subterranea)',
        'río, acequia, manantial o similar',
        'otra',
    ])
    .astype(float)
)

In [17]:
# Preview the first rows, including the new weight and indicator columns.
enaho.head(n=5)

Unnamed: 0,year,conglomerate,house,household,IDDIST,result,access to water,household weight,num hh members,hh members weight,1-piped water inside the house,"2-piped water outside the house, inside the property",3-tanker truck or public reservoir,"4-water well, river or other"
0,2014,1650,18,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,3,940.22998,1.0,0.0,0.0,0.0
1,2014,1650,31,11,70101,completa,"red pública, dentro de la vivienda",313.410004,4,1253.640015,1.0,0.0,0.0,0.0
2,2014,1650,44,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,4,1253.640015,1.0,0.0,0.0,0.0
3,2014,1650,81,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,5,1567.050049,1.0,0.0,0.0,0.0
4,2014,1652,3,11,70101,completa,"red pública, dentro de la vivienda",290.570007,2,581.140015,1.0,0.0,0.0,0.0


## Adding zones

In [18]:
# District-to-zone lookup table. Every column is read as text so that IDDIST
# stays a string, like the IDDIST key it is later merged on.
zones_lima_file = '../../data/clean/iddist_zone_lima.csv'
zones_lima = pd.read_csv(filepath_or_buffer=zones_lima_file, dtype='str')

In [19]:
# Preview the district-to-zone lookup table.
zones_lima.head(n=5)

Unnamed: 0,IDDIST,zone
0,150119,South Lima
1,150102,North Lima
2,150131,High income Lima
3,150120,High income Lima
4,150130,High income Lima


In [20]:
# Label every household with the zone of its district.
# An inner join drops households whose IDDIST has no entry in the zones table.
enaho = enaho.merge(zones_lima, how='inner', on=['IDDIST'])
enaho.head(n=5)

Unnamed: 0,year,conglomerate,house,household,IDDIST,result,access to water,household weight,num hh members,hh members weight,1-piped water inside the house,"2-piped water outside the house, inside the property",3-tanker truck or public reservoir,"4-water well, river or other",zone
0,2014,1650,18,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,3,940.22998,1.0,0.0,0.0,0.0,Port
1,2014,1650,31,11,70101,completa,"red pública, dentro de la vivienda",313.410004,4,1253.640015,1.0,0.0,0.0,0.0,Port
2,2014,1650,44,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,4,1253.640015,1.0,0.0,0.0,0.0,Port
3,2014,1650,81,11,70101,incompleta,"red pública, dentro de la vivienda",313.410004,5,1567.050049,1.0,0.0,0.0,0.0,Port
4,2014,1652,3,11,70101,completa,"red pública, dentro de la vivienda",290.570007,2,581.140015,1.0,0.0,0.0,0.0,Port


## Collapsing

In [21]:
def weighted_mean_function_generator(col, weights):
    """Build a function that computes the weighted mean of ``col``, in percent.

    Parameters
    ----------
    col : str
        Name of the column to average.
    weights : str
        Name of the column that holds the weight of each row.

    Returns
    -------
    callable
        ``weighted_mean(df)``, which returns
        ``100 * sum(df[col] * df[weights]) / sum(df[weights])`` computed over
        the rows where both ``col`` and ``weights`` are present, or NaN when
        those rows carry no weight at all.
    """

    def weighted_mean(df):
        values = df[col]
        row_weights = df[weights]

        # Use the same rows in the numerator and in the denominator: a row
        # with a missing value must not add its weight to the total, otherwise
        # the mean is biased towards zero.
        usable = values.notna() & row_weights.notna()
        total_weight = row_weights[usable].sum()

        if total_weight == 0:
            # Empty group or zero total weight: the mean is undefined.
            return float('nan')

        return (values[usable] * row_weights[usable]).sum() / total_weight * 100

    return weighted_mean

In [22]:
# One weighted-mean function per water-access indicator, all of them weighted
# by 'hh members weight'.
(weighted_mean_water1,
 weighted_mean_water2,
 weighted_mean_water3,
 weighted_mean_water4) = [
    weighted_mean_function_generator(indicator, 'hh members weight')
    for indicator in (
        '1-piped water inside the house',
        '2-piped water outside the house, inside the property',
        '3-tanker truck or public reservoir',
        '4-water well, river or other',
    )
]

In [23]:
# Keys that define one output row: a survey year within a zone.
groupby_cols = [
    'year',
    'zone',
]

In [24]:
# For each indicator, compute its weighted share per (year, zone) and give the
# resulting value column (named 0 after reset_index) its output name.
(zone_year_water1,
 zone_year_water2,
 zone_year_water3,
 zone_year_water4) = [
    enaho.groupby(groupby_cols)
         .apply(share_function)
         .reset_index()
         .rename(columns={0: share_name})
    for share_function, share_name in (
        (weighted_mean_water1, '1-piped water inside the house'),
        (weighted_mean_water2, '2-piped water in the property'),
        (weighted_mean_water3, '3-tanker truck/public reservoir'),
        (weighted_mean_water4, '4-water well, river or other'),
    )
]

In [25]:
# Join the four share tables side by side on (year, zone).
enaho_year_zones_access_water = (
    zone_year_water1
    .merge(zone_year_water2, how='inner', on=['year', 'zone'])
    .merge(zone_year_water3, how='inner', on=['year', 'zone'])
    .merge(zone_year_water4, how='inner', on=['year', 'zone'])
)

In [26]:
# Preview the year-by-zone shares (values are percentages).
enaho_year_zones_access_water.head(n=5)

Unnamed: 0,year,zone,1-piped water inside the house,2-piped water in the property,3-tanker truck/public reservoir,"4-water well, river or other"
0,2014,East Lima,85.43379,0.781328,7.731766,6.053123
1,2014,High income Lima,91.680318,2.495587,4.100707,1.72339
2,2014,North Lima,90.775166,2.156598,4.953599,2.114641
3,2014,Old town,90.234172,5.572719,1.401075,2.792039
4,2014,Port,88.98644,1.762187,7.430813,1.820563


In [27]:
# Save the year-by-zone shares; the positional index is not written to the file.
enaho_year_zones_access_water.to_csv(
    path_or_buf='../../data/clean/access to water by zones_2014-2018.csv',
    index=False,
)