In [1]:
import pandas as pd

## Labor

In [2]:
# Folder with the raw ENAHO files; `path` is reused by later cells.
path = '../../data/raw/ENAHO/'
# Location of the 2018 Stata file that is loaded below as the labor data.
enaho_labor_2018_file = f'{path}enaho01a-2018-500.dta'

In [3]:
# Columns to pull from the labor file: record identifiers first, then the
# survey answers and the person weight that later cells rename and use.
cols_labor = [
    'aÑo', 'ubigeo', 'conglome', 'vivienda', 'hogar', 'codperso',
    'p501',
    'p558a1', 'p558a2', 'p558a3',
    'fac500a',
]
# Read only the listed columns from the Stata file.
enaho_labor_2018 = pd.read_stata(
    enaho_labor_2018_file,
    columns=cols_labor,
)

In [4]:
# Original ENAHO column name -> readable label used in the rest of the notebook.
labor_names = {
    'aÑo': 'year',
    'conglome': 'conglomerate',
    'vivienda': 'house',
    'hogar': 'household',
    'ubigeo': 'IDDIST',
    'codperso': 'person',
    'p501': 'has a job',
    'p558a1': 'private retirement fund',
    'p558a2': 'public retirement fund 1',
    'p558a3': 'public retirement fund 2',
    'fac500a': 'person weight',
}
# Rename the columns and add a constant flag: every row of this table is
# counted as one working-age person when the weighted totals are computed.
enaho_labor_2018 = (
    enaho_labor_2018
    .rename(columns=labor_names)
    .assign(**{'working age': 1})
)

In [5]:
# Keep only districts whose IDDIST code starts with '1501' or '07'
# (presumably Lima province and Callao -- confirm against the ubigeo catalogue).
# Note: an earlier, now disabled, variant also restricted rows to 'has a job' == 'si'.
is_lima_or_callao = enaho_labor_2018['IDDIST'].apply(
    lambda ubigeo: ubigeo.startswith(('1501', '07'))
)
enaho_labor_2018 = enaho_labor_2018[is_lima_or_callao]
enaho_labor_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,has a job,private retirement fund,public retirement fund 1,public retirement fund 2,person weight,working age
17083,2018,70106,6023,3,11,1,si,pase,pase,pase,326.991455,1
17084,2018,70106,6023,3,11,2,no,pase,pase,pase,288.977936,1
17085,2018,70106,6023,14,11,1,no,pase,sistema nacional de pensiones: ley 19990,pase,334.1474,1
17086,2018,70106,6023,14,11,2,no,pase,pase,pase,326.991455,1
17087,2018,70106,6023,14,11,4,no,pase,pase,pase,237.115311,1


## Health

In [6]:
# Location of the 2018 Stata file loaded below as the health data;
# it lives in the same raw-data folder as the labor file.
enaho_health_2018_file = f'{path}enaho01a-2018-400.dta'

In [7]:
# Columns to pull from the health file: the same record identifiers as the
# labor table, followed by the four health-provider answers.
cols_health = [
    'aÑo', 'ubigeo', 'conglome', 'vivienda', 'hogar', 'codperso',
    'p4191', 'p4192', 'p4193', 'p4194',
]
# Read only the listed columns from the Stata file.
enaho_health_2018 = pd.read_stata(
    enaho_health_2018_file,
    columns=cols_health,
)

In [8]:
# Original ENAHO column name -> readable label. The identifier columns get the
# same labels as in the labor table so the two tables can be merged on them.
health_names = {
    'aÑo': 'year',
    'conglome': 'conglomerate',
    'vivienda': 'house',
    'hogar': 'household',
    'ubigeo': 'IDDIST',
    'codperso': 'person',
    'p4191': 'general workers health provider',
    'p4192': 'private health provider 1',
    'p4193': 'private health provider 2',
    'p4194': 'police/military health provider',
}
enaho_health_2018 = enaho_health_2018.rename(columns=health_names)
enaho_health_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,general workers health provider,private health provider 1,private health provider 2,police/military health provider
0,2018,10101,5002,34,11,1,essalud,no,no,seguro ff.aa./policiales
1,2018,10101,5002,34,11,2,essalud,no,no,no
2,2018,10101,5002,34,11,3,no,no,no,no
3,2018,10101,5002,34,11,4,no,no,no,no
4,2018,10101,5002,83,11,1,essalud,no,no,no


## Merging

In [9]:
# Attach each person's health answers to their labor record. A left join keeps
# every labor row; persons without a matching health row get NaN in the
# health columns.
enaho_2018 = enaho_labor_2018.merge(
    enaho_health_2018,
    how='left',
    on=['year', 'IDDIST', 'conglomerate', 'house', 'household', 'person'],
)

In [10]:
# Answers that count as "yes / covered" are recoded to '1' ...
affirmative_answers = [
    'si',
    'sistema privado de pensiones (afp)',
    'sistema nacional de pensiones: ley 19990',
    'sistema nacional de pensiones ley 20530 (cédula viva)',
    'essalud',
    'seguro privado de salud',
    'entidad prestadora de salud',
    'seguro ff.aa./policiales',
]
# ... and these two answers are recoded to '0'.
negative_answers = ['pase', 'no']

recode = {
    **dict.fromkeys(affirmative_answers, '1'),
    **dict.fromkeys(negative_answers, '0'),
}

# Columns holding the text answers to convert into numeric 0/1 flags.
recode_cols = [
    'has a job',
    'private retirement fund',
    'public retirement fund 1',
    'public retirement fund 2',
    'general workers health provider',
    'private health provider 1',
    'private health provider 2',
    'police/military health provider',
]

# Any answer that is not a key of `recode` is left as NaN by `.map`.
for answer_col in recode_cols:
    recoded_answers = enaho_2018[answer_col].map(recode)
    enaho_2018[answer_col] = pd.to_numeric(recoded_answers)

enaho_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,has a job,private retirement fund,public retirement fund 1,public retirement fund 2,person weight,working age,general workers health provider,private health provider 1,private health provider 2,police/military health provider
0,2018,70106,6023,3,11,1,1.0,0.0,0.0,0.0,326.991455,1,1.0,0.0,0.0,0.0
1,2018,70106,6023,3,11,2,0.0,0.0,0.0,0.0,288.977936,1,1.0,0.0,0.0,0.0
2,2018,70106,6023,14,11,1,0.0,0.0,1.0,0.0,334.1474,1,0.0,0.0,0.0,0.0
3,2018,70106,6023,14,11,2,0.0,0.0,0.0,0.0,326.991455,1,0.0,0.0,0.0,0.0
4,2018,70106,6023,14,11,4,0.0,0.0,0.0,0.0,237.115311,1,0.0,0.0,0.0,0.0


In [11]:
# 'retirement fund' = 1 when the person is in at least one pension scheme,
# 0 when all three answers are 0, and stays NaN otherwise (no 1 and at least
# one missing answer).
pension_answers = enaho_2018[['private retirement fund',
                              'public retirement fund 1',
                              'public retirement fund 2']]
in_any_fund = pension_answers.eq(1).any(axis=1)
in_no_fund = pension_answers.eq(0).all(axis=1)

enaho_2018.loc[in_any_fund, 'retirement fund'] = 1
enaho_2018.loc[in_no_fund, 'retirement fund'] = 0

In [12]:
# 'health provision' = 1 when at least one of the four provider answers is 1,
# 0 when all four are 0, and stays NaN otherwise (no 1 and at least one
# missing answer).
provider_answers = enaho_2018[['general workers health provider',
                               'private health provider 1',
                               'private health provider 2',
                               'police/military health provider']]
has_any_provider = provider_answers.eq(1).any(axis=1)
has_no_provider = provider_answers.eq(0).all(axis=1)

enaho_2018.loc[has_any_provider, 'health provision'] = 1
enaho_2018.loc[has_no_provider, 'health provision'] = 0

In [13]:
# A job is flagged formal (1) only when the person has a job AND a retirement
# fund AND health provision. It is flagged 0 as soon as any of the three is 0.
# Rows with no 0 and at least one missing criterion stay NaN.
formal_criteria = enaho_2018[['retirement fund', 'health provision', 'has a job']]
meets_all_criteria = formal_criteria.eq(1).all(axis=1)
fails_any_criterion = formal_criteria.eq(0).any(axis=1)

enaho_2018.loc[meets_all_criteria, 'formal job'] = 1
enaho_2018.loc[fails_any_criterion, 'formal job'] = 0

In [14]:
# Preview the first rows of the merged table, including the indicator columns
# added by the cells above.
enaho_2018.head()

Unnamed: 0,year,IDDIST,conglomerate,house,household,person,has a job,private retirement fund,public retirement fund 1,public retirement fund 2,person weight,working age,general workers health provider,private health provider 1,private health provider 2,police/military health provider,retirement fund,health provision,formal job
0,2018,70106,6023,3,11,1,1.0,0.0,0.0,0.0,326.991455,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2018,70106,6023,3,11,2,0.0,0.0,0.0,0.0,288.977936,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
2,2018,70106,6023,14,11,1,0.0,0.0,1.0,0.0,334.1474,1,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,2018,70106,6023,14,11,2,0.0,0.0,0.0,0.0,326.991455,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,2018,70106,6023,14,11,4,0.0,0.0,0.0,0.0,237.115311,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Merging with zones of Lima

In [15]:
# Lookup table that assigns each district code (IDDIST) to a zone.
zones_lima_file = '../../data/clean/iddist_zone_lima.csv'
# Every column is read as text (dtype='str').
# NOTE(review): presumably so district codes keep their leading zeros -- confirm.
zones_lima = pd.read_csv(
    zones_lima_file,
    dtype='str',
)

In [16]:
# Add the zone of each district. The inner join drops persons whose district
# code does not appear in the zone lookup table.
enaho_2018 = enaho_2018.merge(
    zones_lima,
    how='inner',
    on=['IDDIST'],
)

In [17]:
# List the distinct zone labels present after the merge.
enaho_2018['zone'].unique()

array(['Port', 'North Lima', 'Old town', 'San Juan de Lurigancho',
       'East Lima', 'High income Lima', 'South Lima', 'South beach'],
      dtype=object)

In [18]:
# One indicator column per zone, named after the zone: 1.0 for rows in that
# zone and 0.0 for every other row.
for zone in enaho_2018['zone'].unique():
    enaho_2018[zone] = (enaho_2018['zone'] == zone).astype(float)

## Collapsing

In [19]:
# Show the column labels available before the table is collapsed by zone.
enaho_2018.columns

Index(['year', 'IDDIST', 'conglomerate', 'house', 'household', 'person',
       'has a job', 'private retirement fund', 'public retirement fund 1',
       'public retirement fund 2', 'person weight', 'working age',
       'general workers health provider', 'private health provider 1',
       'private health provider 2', 'police/military health provider',
       'retirement fund', 'health provision', 'formal job', 'zone', 'Port',
       'North Lima', 'Old town', 'San Juan de Lurigancho', 'East Lima',
       'High income Lima', 'South Lima', 'South beach'],
      dtype='object')

In [20]:
def weighted_mean_function_generator(col, weights):
    """Build a reducer that computes the weighted mean of one column.

    Parameters
    ----------
    col : hashable
        Label of the column whose values are averaged.
    weights : hashable
        Label of the column holding the weight of each row.

    Returns
    -------
    callable
        ``weighted_mean(df)`` returning a single number, usable directly or
        through ``DataFrame.groupby(...).apply``.
    """

    def weighted_mean(df):
        """Return sum(col * weights) / sum(weights) over the usable rows of ``df``.

        A row is usable when both its value and its weight are present. Rows
        with a missing value are excluded from the numerator AND from the
        denominator; otherwise their weight would still be counted in the
        denominator and pull the mean toward zero. NaN is returned when the
        usable weights sum to zero (for example an empty or all-missing group).
        """
        values = df[col]
        row_weights = df[weights]

        # Use the same set of rows for both sums so the ratio is a true mean.
        usable = values.notna() & row_weights.notna()
        total_weight = row_weights[usable].sum()

        if total_weight == 0:
            return float('nan')

        return (values[usable] * row_weights[usable]).sum() / total_weight

    return weighted_mean

In [21]:
def absolute_number_function_generator(col, weights):
    """Build a reducer that computes the rounded weighted total of one column.

    Parameters
    ----------
    col : hashable
        Label of the column whose values are totalled.
    weights : hashable
        Label of the column holding the weight of each row.

    Returns
    -------
    callable
        ``absolute_number(df)`` returning ``round(sum(col * weights))``.
    """

    def absolute_number(df):
        """Return the sum of ``col`` times ``weights`` over ``df``, rounded."""
        weighted_total = (df[col] * df[weights]).sum()
        return round(weighted_total)

    return absolute_number

In [22]:
# One weighted-mean reducer per indicator, all weighted by 'person weight'.
weighted_mean_working_age, weighted_mean_job, weighted_mean_formal_job = (
    weighted_mean_function_generator(indicator, 'person weight')
    for indicator in ('working age', 'has a job', 'formal job')
)

In [23]:
# One weighted-total reducer per indicator, all weighted by 'person weight'.
absolute_n_working_age, absolute_n_job, absolute_n_formal_job = (
    absolute_number_function_generator(indicator, 'person weight')
    for indicator in ('working age', 'has a job', 'formal job')
)

In [24]:
# Keys used both to group the person-level table and to join the resulting
# summary tables back together.
groupby_cols = ['year','zone']

In [25]:
# Group once by the keys in `groupby_cols`. Each apply call reduces every
# group to a single weighted rate, which lands in column 0 and is then renamed.
rate_groups = enaho_2018.groupby(groupby_cols)

rate_working_age = (
    rate_groups.apply(weighted_mean_working_age)
    .reset_index()
    .rename(columns={0: 'Rate working age'})
)
rate_jobs = (
    rate_groups.apply(weighted_mean_job)
    .reset_index()
    .rename(columns={0: 'Rate employed'})
)
rate_formal_jobs = (
    rate_groups.apply(weighted_mean_formal_job)
    .reset_index()
    .rename(columns={0: 'Rate formal job'})
)

In [26]:
# Group once by the keys in `groupby_cols`. Each apply call reduces every
# group to a rounded weighted total, which lands in column 0 and is then renamed.
count_groups = enaho_2018.groupby(groupby_cols)

abs_working_age = (
    count_groups.apply(absolute_n_working_age)
    .reset_index()
    .rename(columns={0: 'N Working age'})
)
abs_jobs = (
    count_groups.apply(absolute_n_job)
    .reset_index()
    .rename(columns={0: 'N Employed'})
)
abs_formal_jobs = (
    count_groups.apply(absolute_n_formal_job)
    .reset_index()
    .rename(columns={0: 'N Formal job'})
)

In [27]:
# Join the six summary tables side by side on the grouping keys, starting from
# the working-age rate and adding one table per step in the listed order.
formal_jobs_by_zone = rate_working_age
for summary_table in (rate_jobs, rate_formal_jobs, abs_working_age, abs_jobs, abs_formal_jobs):
    formal_jobs_by_zone = pd.merge(
        formal_jobs_by_zone,
        summary_table,
        how='inner',
        on=groupby_cols,
    )

In [28]:
# Display the combined summary: one row per year and zone, with the rates and
# the weighted totals.
formal_jobs_by_zone

Unnamed: 0,year,zone,Rate working age,Rate employed,Rate formal job,N Working age,N Employed,N Formal job
0,2018,East Lima,1.0,0.647278,0.190162,848632.0,549301.0,161377.0
1,2018,High income Lima,1.0,0.596687,0.324168,1419815.0,847185.0,460259.0
2,2018,North Lima,1.0,0.629057,0.204191,1689719.0,1062930.0,345026.0
3,2018,Old town,1.0,0.630107,0.236875,1316593.0,829595.0,311868.0
4,2018,Port,1.0,0.600821,0.235895,843129.0,506570.0,198890.0
5,2018,San Juan de Lurigancho,1.0,0.602862,0.186603,863519.0,520583.0,161135.0
6,2018,South Lima,1.0,0.640311,0.206142,1234228.0,790290.0,254426.0
7,2018,South beach,1.0,0.676153,0.057569,60506.0,40911.0,3483.0


In [29]:
# Save the per-zone summary; index=False leaves the row index out of the file.
formal_jobs_by_zone.to_csv(
    '../../data/clean/percentage formal jobs by zone_lima_2018.csv',
    index=False,
)