In [1]:
import pandas as pd

## Labor

In [2]:
path = '../data/raw/ENAHO/'
enaho_labor_2018_file = path + 'enaho01a-2018-500.dta'

In [3]:
cols_labor = ['aÑo', 'ubigeo', 'conglome', 'vivienda', 'hogar', 'codperso', 'p501', 'p558a1', 'p558a2', 'p558a3','fac500a']
enaho_labor_2018 = pd.read_stata(enaho_labor_2018_file, columns=cols_labor)

In [4]:
labor_names = {'aÑo': 'year',
               'conglome': 'conglomerate',
               'vivienda': 'house',
               'hogar': 'household',
               'ubigeo': 'IDDIST',
               'codperso': 'person',
               'p501': 'has a job',
               'p558a1': 'private retirement fund',
               'p558a2': 'public retirement fund 1',
               'p558a3': 'public retirement fund 2',
               'fac500a': 'person weight'}
enaho_labor_2018 = enaho_labor_2018.rename(columns = labor_names)

In [5]:
enaho_labor_2018 = enaho_labor_2018[enaho_labor_2018['has a job'] == 'si']
enaho_labor_2018 = enaho_labor_2018[enaho_labor_2018['IDDIST'].apply(lambda x: x[:4] == '1501' or x[:2] == '07')]
enaho_labor_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,has a job,private retirement fund,public retirement fund 1,public retirement fund 2,person weight
17083,2018,70106,6023,3,11,1,si,pase,pase,pase,326.991455
17088,2018,70106,6023,14,11,5,si,pase,pase,pase,288.977936
17089,2018,70106,6023,36,11,1,si,sistema privado de pensiones (afp),pase,pase,334.1474
17090,2018,70106,6023,36,11,2,si,pase,pase,pase,326.991455
17091,2018,70106,6023,57,11,1,si,pase,pase,pase,237.115311


## Health

In [6]:
enaho_health_2018_file = path + 'enaho01a-2018-400.dta'

In [7]:
cols_health = ['aÑo', 'ubigeo', 'conglome', 'vivienda', 'hogar', 'codperso', 'p4191', 'p4192', 'p4193', 'p4194']
enaho_health_2018 = pd.read_stata(enaho_health_2018_file, columns=cols_health)

In [8]:
health_names = {'aÑo': 'year',
                'conglome': 'conglomerate',
                'vivienda': 'house',
                'hogar': 'household',
                'ubigeo': 'IDDIST',
                'codperso': 'person',
                'p4191': 'general workers health provider',
                'p4192': 'private health provider 1',
                'p4193': 'private health provider 2',
                'p4194': 'police/military health provider'}
enaho_health_2018 = enaho_health_2018.rename(columns = health_names)
enaho_health_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,general workers health provider,private health provider 1,private health provider 2,police/military health provider
0,2018,10101,5002,34,11,1,essalud,no,no,seguro ff.aa./policiales
1,2018,10101,5002,34,11,2,essalud,no,no,no
2,2018,10101,5002,34,11,3,no,no,no,no
3,2018,10101,5002,34,11,4,no,no,no,no
4,2018,10101,5002,83,11,1,essalud,no,no,no


## Merging

In [9]:
enaho_2018 = pd.merge(enaho_labor_2018, enaho_health_2018, how='left', on=['year', 'IDDIST', 'conglomerate', 'house', 'household', 'person'])

In [10]:
recode = {'si': '1',
          'sistema privado de pensiones (afp)': '1',
          'sistema nacional de pensiones: ley 19990': '1',
          'sistema nacional de pensiones ley 20530 (cédula viva)': '1',
          'essalud': '1',
          'seguro privado de salud': '1',
          'entidad prestadora de salud': '1',
          'seguro ff.aa./policiales': '1',
          'pase': '0',
          'no': '0'}
recode_cols = ['has a job',
               'private retirement fund',
               'public retirement fund 1',
               'public retirement fund 2',
               'general workers health provider', 
               'private health provider 1',
               'private health provider 2',
               'police/military health provider']

for col in recode_cols:
    enaho_2018[col] = pd.to_numeric(enaho_2018[col].map(recode))

enaho_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,has a job,private retirement fund,public retirement fund 1,public retirement fund 2,person weight,general workers health provider,private health provider 1,private health provider 2,police/military health provider
0,2018,70106,6023,3,11,1,1,0,0,0,326.991455,1.0,0.0,0.0,0.0
1,2018,70106,6023,14,11,5,1,0,0,0,288.977936,0.0,0.0,0.0,0.0
2,2018,70106,6023,36,11,1,1,1,0,0,334.1474,1.0,0.0,0.0,0.0
3,2018,70106,6023,36,11,2,1,0,0,0,326.991455,0.0,0.0,0.0,0.0
4,2018,70106,6023,57,11,1,1,0,0,0,237.115311,0.0,0.0,0.0,0.0


In [11]:
enaho_2018.loc[(enaho_2018['private retirement fund'] == 1) | 
               (enaho_2018['public retirement fund 1'] == 1) |
               (enaho_2018['public retirement fund 2'] == 1), 'retirement fund'] = 1
enaho_2018.loc[(enaho_2018['private retirement fund'] == 0) &
               (enaho_2018['public retirement fund 1'] == 0) &
               (enaho_2018['public retirement fund 2'] == 0), 'retirement fund'] = 0

In [12]:
enaho_2018.loc[(enaho_2018['general workers health provider'] == 1) | 
               (enaho_2018['private health provider 1'] == 1) |
               (enaho_2018['private health provider 2'] == 1) |
               (enaho_2018['police/military health provider'] == 1), 'health provision'] = 1
enaho_2018.loc[(enaho_2018['general workers health provider'] == 0) &
               (enaho_2018['private health provider 1'] == 0) &
               (enaho_2018['private health provider 2'] == 0) &
               (enaho_2018['police/military health provider'] == 0), 'health provision'] = 0

In [13]:
enaho_2018.loc[(enaho_2018['retirement fund'] == 1) & 
               (enaho_2018['health provision'] == 1), 'formal job'] = 1
enaho_2018.loc[(enaho_2018['retirement fund'] == 0) |
               (enaho_2018['health provision'] == 0), 'formal job'] = 0

In [14]:
enaho_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,has a job,private retirement fund,public retirement fund 1,public retirement fund 2,person weight,general workers health provider,private health provider 1,private health provider 2,police/military health provider,retirement fund,health provision,formal job
0,2018,70106,6023,3,11,1,1,0,0,0,326.991455,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2018,70106,6023,14,11,5,1,0,0,0,288.977936,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,2018,70106,6023,36,11,1,1,1,0,0,334.1474,1.0,0.0,0.0,0.0,1.0,1.0,1.0
3,2018,70106,6023,36,11,2,1,0,0,0,326.991455,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018,70106,6023,57,11,1,1,0,0,0,237.115311,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Merging with zones of Lima

In [15]:
zones_lima_file = '../data/clean/iddist_zone_lima.csv'
zones_lima = pd.read_csv(zones_lima_file, dtype='str')

In [16]:
enaho_2018 = pd.merge(enaho_2018, zones_lima, how='inner', on=['IDDIST'])

In [17]:
enaho_2018['zone'].unique()

array(['Port', 'North Lima', 'Old town', 'San Juan de Lurigancho',
       'East Lima', 'High income Lima', 'South Lima', 'South beach'],
      dtype=object)

In [18]:
for zone in enaho_2018['zone'].unique():
    enaho_2018.loc[enaho_2018['zone'] == zone, zone] = 1
    enaho_2018.loc[enaho_2018['zone'] != zone, zone] = 0

## Collapsing

In [19]:
def weighted_mean_function_generator(col, weights):

    def weighted_mean(df):

        col_times_weight = df[col] * df[weights]
        weight = df[weights]

        return col_times_weight.sum() / weight.sum()

    return weighted_mean

In [20]:
weighted_mean_zone1 = weighted_mean_function_generator('Port', 'person weight')
weighted_mean_zone2 = weighted_mean_function_generator('North Lima', 'person weight')
weighted_mean_zone3 = weighted_mean_function_generator('Old town', 'person weight')
weighted_mean_zone4 = weighted_mean_function_generator('San Juan de Lurigancho', 'person weight')
weighted_mean_zone5 = weighted_mean_function_generator('East Lima', 'person weight')
weighted_mean_zone6 = weighted_mean_function_generator('High income Lima', 'person weight')
weighted_mean_zone7 = weighted_mean_function_generator('South Lima', 'person weight')
weighted_mean_zone8 = weighted_mean_function_generator('South beach', 'person weight')

In [21]:
groupby_cols = ['year']

In [22]:
zone_formal_jobs1 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone1).reset_index().rename(columns={0: 'Port'})
zone_formal_jobs2 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone2).reset_index().rename(columns={0: 'North Lima'})
zone_formal_jobs3 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone3).reset_index().rename(columns={0: 'Old town'})
zone_formal_jobs4 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone4).reset_index().rename(columns={0: 'San Juan de Lurigancho'})
zone_formal_jobs5 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone5).reset_index().rename(columns={0: 'East Lima'})
zone_formal_jobs6 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone6).reset_index().rename(columns={0: 'High income Lima'})
zone_formal_jobs7 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone7).reset_index().rename(columns={0: 'South Lima'})
zone_formal_jobs8 = enaho_2018.groupby(groupby_cols).apply(weighted_mean_zone8).reset_index().rename(columns={0: 'South beach'})

In [23]:
formal_jobs_by_zone = pd.merge(zone_formal_jobs1, zone_formal_jobs2, how='inner', on='year')
formal_jobs_by_zone = pd.merge(formal_jobs_by_zone, zone_formal_jobs3, how='inner', on='year')
formal_jobs_by_zone = pd.merge(formal_jobs_by_zone, zone_formal_jobs4, how='inner', on='year')
formal_jobs_by_zone = pd.merge(formal_jobs_by_zone, zone_formal_jobs5, how='inner', on='year')
formal_jobs_by_zone = pd.merge(formal_jobs_by_zone, zone_formal_jobs6, how='inner', on='year')
formal_jobs_by_zone = pd.merge(formal_jobs_by_zone, zone_formal_jobs7, how='inner', on='year')
formal_jobs_by_zone = pd.merge(formal_jobs_by_zone, zone_formal_jobs8, how='inner', on='year')

In [24]:
formal_jobs_by_zone

Unnamed: 0,year,Port,North Lima,Old town,San Juan de Lurigancho,East Lima,High income Lima,South Lima,South beach
0,2018,0.098413,0.2065,0.161169,0.101136,0.106715,0.164586,0.153533,0.007948


In [25]:
formal_jobs_by_zone.to_csv('../data/clean/percentage formal jobs by zone_lima_2018.csv', index=False)