## Libraries

In [1]:
import pandas as pd

## Files

In [2]:
# Raw ENAHO "sumaria" Stata files, one per survey year, all in the same folder.
path = '../../data/raw/ENAHO/'
(enaho_summary_2016_file,
 enaho_summary_2017_file,
 enaho_summary_2018_file) = (path + file_name
                             for file_name in ('sumaria-2016.dta',
                                               'sumaria-2017.dta',
                                               'sumaria-2018.dta'))

In [3]:
# Only these columns are read from each yearly file; they are renamed to
# English labels in a later cell.
cols_summary = ['aÑo', 'conglome', 'vivienda', 'hogar', 'ubigeo', 'mieperho', 'pobreza', 'factor07']
(enaho_summary_2016,
 enaho_summary_2017,
 enaho_summary_2018) = (pd.read_stata(stata_file, columns=cols_summary)
                        for stata_file in (enaho_summary_2016_file,
                                           enaho_summary_2017_file,
                                           enaho_summary_2018_file))

In [4]:
# Stack the three yearly extracts into one frame with a fresh 0..n-1 index.
enaho_summary = pd.concat(
    [enaho_summary_2016, enaho_summary_2017, enaho_summary_2018],
    ignore_index=True,
)

In [5]:
# Mapping from the original ENAHO column names to the labels used in the
# rest of the notebook.
summary_names = {
    'aÑo': 'year',
    'conglome': 'conglomerate',
    'vivienda': 'house',
    'hogar': 'household',
    'ubigeo': 'IDDIST',
    'mieperho': 'num hh members',
    'pobreza': 'poverty classification',
    'factor07': 'hh weight',
}
enaho_summary = enaho_summary.rename(summary_names, axis='columns')

In [6]:
# Keep only rows whose district code (IDDIST) starts with '1501' or '07'
# (presumably Lima province and Callao -- TODO confirm against the ubigeo
# catalogue) and that carry a poverty classification.
# The vectorised .str.startswith replaces the row-wise lambda; na=False treats
# a missing code as "not in the study area" instead of raising on slicing.
in_study_area = (enaho_summary['IDDIST'].str.startswith('1501', na=False)
                 | enaho_summary['IDDIST'].str.startswith('07', na=False))
has_classification = enaho_summary['poverty classification'].notna()
# .copy() makes the result an independent frame, so the column assignments in
# the following cells never write into a slice of the concatenated data.
enaho_summary = enaho_summary[in_study_area & has_classification].copy()

In [7]:
# Person-level weight: the household weight scaled by the number of household
# members, so the aggregates below are weighted by people rather than households.
enaho_summary['hh members weight'] = enaho_summary['hh weight'].mul(enaho_summary['num hh members'])

In [8]:
# Binary poverty flag: 1 for any "pobre" label, 0 for 'no pobre'.
# 'pobreno extremo' is a spelling variant that appears as its own label in the
# data (see the value_counts output below), so it is listed explicitly.
enaho_summary.loc[
    enaho_summary['poverty classification'].isin(
        ['pobre no extremo', 'pobreno extremo', 'pobre extremo']),
    'poverty'] = 1
enaho_summary.loc[
    enaho_summary['poverty classification'].eq('no pobre'),
    'poverty'] = 0

In [9]:
# Binary extreme-poverty flag: 1 only for 'pobre extremo', 0 for every other row.
enaho_summary.loc[
    enaho_summary['poverty classification'].eq('pobre extremo'),
    'extreme poverty'] = 1
enaho_summary.loc[
    enaho_summary['poverty classification'].ne('pobre extremo'),
    'extreme poverty'] = 0

In [10]:
# Preview the first rows to check the renamed columns and the derived
# weight / poverty flag columns.
enaho_summary.head()

Unnamed: 0,year,conglomerate,house,household,IDDIST,num hh members,poverty classification,hh weight,hh members weight,poverty,extreme poverty
457,2016,1652,3,11,70101,2,no pobre,330.041718,660.083435,0.0,0.0
458,2016,1652,42,11,70101,6,no pobre,330.041718,1980.250244,0.0,0.0
459,2016,1652,64,11,70101,4,no pobre,330.041718,1320.16687,0.0,0.0
460,2016,1652,103,11,70101,5,no pobre,330.041718,1650.208618,0.0,0.0
461,2016,1661,11,11,70101,4,no pobre,330.041718,1320.16687,0.0,0.0


In [11]:
# Row counts per classification label; this is where label variants such as
# 'pobreno extremo' become visible.
enaho_summary['poverty classification'].value_counts()

no pobre            11298
pobre no extremo      658
pobreno extremo       270
pobre extremo          21
Name: poverty classification, dtype: int64

## Collapsing by district

In [12]:
def weighted_mean_function_generator(col, weights):
    """Build a function that computes the weighted mean of ``col`` as a percentage.

    Parameters
    ----------
    col : str
        Name of the column holding the values to average (e.g. a 0/1 flag).
    weights : str
        Name of the column holding each row's weight.

    Returns
    -------
    callable
        ``weighted_mean(df)`` returning
        ``sum(df[col] * df[weights]) / sum(df[weights]) * 100`` over the rows
        where both the value and the weight are present, or NaN when those
        rows have no weight. It takes a DataFrame and returns a scalar, so it
        can be passed to ``DataFrame.groupby(...).apply``.
    """

    def weighted_mean(df):

        values = df[col]
        row_weights = df[weights]

        # sum() skips NaN products, so a row with a missing value would vanish
        # from the numerator while its weight stayed in the denominator,
        # biasing the mean downward. Use the same rows on both sides.
        usable = values.notna() & row_weights.notna()
        total_weight = row_weights[usable].sum()

        # No usable weight: the mean is undefined rather than a 0/0 warning.
        if total_weight == 0:
            return float('nan')

        return (values[usable] * row_weights[usable]).sum() / total_weight * 100

    return weighted_mean

In [13]:
def absolute_number_function_generator(col, weights, n_years=3):
    """Build a function that computes the per-year weighted total of ``col``.

    Parameters
    ----------
    col : str
        Name of the column holding the values to total (e.g. a 0/1 flag).
    weights : str
        Name of the column holding each row's expansion weight.
    n_years : int or float, default 3
        Number of survey years pooled in the data. The weighted total is
        divided by it to give a per-year average; the default matches the
        three pooled years used in this notebook.

    Returns
    -------
    callable
        ``absolute_number(df)`` returning
        ``round(sum(df[col] * df[weights] / n_years))``. It takes a DataFrame
        and returns a scalar, so it can be passed to
        ``DataFrame.groupby(...).apply``.

    Raises
    ------
    ValueError
        If ``n_years`` is not strictly positive.
    """
    if n_years <= 0:
        raise ValueError('n_years must be a positive number')

    def absolute_number(df):

        # Divide by the number of pooled years so the result is an average
        # yearly count, not the sum over all years.
        yearly_weighted = df[col] * df[weights] / n_years

        return round(yearly_weighted.sum())

    return absolute_number

In [14]:
# Person-weighted rate aggregators (in percent) for the two poverty flags.
weighted_mean_poverty = weighted_mean_function_generator(
    col='poverty', weights='hh members weight')
weighted_mean_extreme_poverty = weighted_mean_function_generator(
    col='extreme poverty', weights='hh members weight')

In [15]:
# Person-weighted head-count aggregators for the two poverty flags.
absolute_poverty = absolute_number_function_generator(
    col='poverty', weights='hh members weight')
absolute_extreme_poverty = absolute_number_function_generator(
    col='extreme poverty', weights='hh members weight')

In [16]:
# Aggregation key shared by every groupby and merge below: one output row per
# IDDIST value (the renamed 'ubigeo' column).
groupby_cols = ['IDDIST']

In [17]:
# Weighted poverty and extreme-poverty rates per group; apply() returns an
# unnamed Series, so its column is labelled 0 until renamed.
district_poverty = (
    enaho_summary
    .groupby(groupby_cols)
    .apply(weighted_mean_poverty)
    .reset_index()
    .rename(columns={0: 'poverty rate'})
)
district_extreme_poverty = (
    enaho_summary
    .groupby(groupby_cols)
    .apply(weighted_mean_extreme_poverty)
    .reset_index()
    .rename(columns={0: 'extreme poverty rate'})
)

In [18]:
# Weighted head-counts per group for the poverty and extreme-poverty flags.
district_abs_poverty = (
    enaho_summary
    .groupby(groupby_cols)
    .apply(absolute_poverty)
    .reset_index()
    .rename(columns={0: 'absolute poverty'})
)
district_abs_extreme_poverty = (
    enaho_summary
    .groupby(groupby_cols)
    .apply(absolute_extreme_poverty)
    .reset_index()
    .rename(columns={0: 'absolute extreme poverty'})
)

In [19]:
# Join the four per-district tables on the grouping key into one wide table.
distric_poverty_rates_avg2016_2018 = (
    district_poverty
    .merge(district_extreme_poverty, how='inner', on=groupby_cols)
    .merge(district_abs_poverty, how='inner', on=groupby_cols)
    .merge(district_abs_extreme_poverty, how='inner', on=groupby_cols)
)

In [20]:
# Preview the merged table: one row per IDDIST with both rates and both
# head-counts.
distric_poverty_rates_avg2016_2018.head()

Unnamed: 0,IDDIST,poverty rate,extreme poverty rate,absolute poverty,absolute extreme poverty
0,70101,10.580007,0.0,51465.0,0.0
1,70102,3.558496,1.005258,1961.0,554.0
2,70103,9.063425,0.0,4345.0,0.0
3,70104,4.566707,0.0,2296.0,0.0
4,70105,0.0,0.0,0.0,0.0


In [21]:
# Write the merged table to the clean-data folder, without the row index.
distric_poverty_rates_avg2016_2018.to_csv(
    path_or_buf='../../data/clean/poverty by district_2016-2018 average.csv',
    index=False,
)