# clean_hh_2005

This notebook cleans the household-level DataFrame.

## Inputs
1. ii_in.dta : household non labor income
2. ii_inr.dta : household rural income
3. ii_portad.dta : household location

## Outputs
1. folio : household id (renamed to `hhid`)
2. hh_no_savings : hh has no savings
3. hh_has_savings : hh has savings
4. hh_no_debts : hh has no debts
5. hh_has_debts : hh has debts
6. hh_has_liquid : hh has savings or debts

In [23]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

import os 

import sys; sys.path.append('/home/mitch/util/python')
sys.path.append('/home/mitch/school/mfl/')

from src import json_utils

In [24]:
# Data directories: `raw` is where the Stata files are read from and
# `interim` is where the cleaned CSV is written at the end of the notebook.
# NOTE(review): these are absolute, machine-specific paths; they must be
# edited before the notebook can run on another machine.
raw = '/home/mitch/Dropbox/data/mexico_fls/raw/2005/'
interim = '/home/mitch/Dropbox/data/mexico_fls/interim2/2005/'

# Project directories: `dicts` holds the JSON column-rename mapping;
# `figs` is not referenced by any other cell visible in this notebook.
figs = '/home/mitch/school/mfl/fig/'
dicts= '/home/mitch/school/mfl/dicts/'

In [25]:
# Load the {raw column name -> clean column name} mapping from the project's
# JSON dictionary folder, then echo it so the mapping is visible in the output.
rename_path = dicts + 'rename_hh_2005.json'
rename = json_utils.load_json(rename_path)
rename

{'folio': 'hhid',
 'ah03a': 'own_house',
 'ah03b': 'own_otherhouse',
 'ah03c': 'own_bicycles',
 'ah03d': 'own_vehicle',
 'ah03e': 'own_electronics',
 'ah03f': 'own_washmachinestove',
 'ah03g': 'own_domesticappliance',
 'ah03h': 'own_financialassets',
 'ah03i': 'own_machinary',
 'ah03j': 'own_bullcow',
 'ah03k': 'own_horsesmules',
 'ah03l': 'own_pigsgoats',
 'ah03m': 'own_poultry',
 'ah03n': 'own_otherassets',
 'ah04a_1': 'knowvalue_house',
 'ah04b_1': 'knowvalue_otherhouse',
 'ah04c_1': 'knowvalue_bicycles',
 'ah04d_1': 'knowvalue_vehicle',
 'ah04e_1': 'knowvalue_electronics',
 'ah04f_1': 'knowvalue_washmachinestove',
 'ah04g_1': 'knowvalue_domesticappliance',
 'ah04h_1': 'knowvalue_financialassets',
 'ah04i_1': 'knowvalue_machinary',
 'ah04j_1': 'knowvalue_bullcow',
 'ah04k_1': 'knowvalue_horsesmules',
 'ah04l_1': 'knowvalue_pigsgoats',
 'ah04m_1': 'knowvalue_poultry',
 'ah04n_1': 'knowvalue_otherassets',
 'ah04a_2': 'value_house',
 'ah04b_2': 'value_otherhouse',
 'ah04c_2': 'value_bi

In [26]:
# NOTE(review): `keep` is not referenced by any later cell visible in this
# notebook; the final column selection uses the rename mapping instead.
keep = ['folio']

In [27]:
# Read every raw Stata file through an explicit path instead of os.chdir().
# Changing the working directory is process-wide hidden state: any later
# relative read or write would silently depend on which cell ran last.
book_ii_dir = os.path.join(raw, 'book II', 'data')
book_i_dir = os.path.join(raw, 'book I', 'data')
book_c_dir = os.path.join(raw, 'book C', 'data')

# Book II files.
crh = pd.read_stata(os.path.join(book_ii_dir, 'ii_crh.dta'))
ii_in = pd.read_stata(os.path.join(book_ii_dir, 'ii_in.dta'))
ii_inr = pd.read_stata(os.path.join(book_ii_dir, 'ii_inr.dta'))
ii_portad = pd.read_stata(os.path.join(book_ii_dir, 'ii_portad.dta'))
# The `ls` column of ii_ah is dropped immediately after loading.
ii_ah = pd.read_stata(os.path.join(book_ii_dir, 'ii_ah.dta')).drop(columns=['ls'])
# `weights` is the left-most frame of the merge performed further down.
weights = pd.read_stata(os.path.join(book_ii_dir, 'hh05w_b2.dta'))

# Book I files.
i_cs = pd.read_stata(os.path.join(book_i_dir, 'i_cs.dta'))
i_cs1 = pd.read_stata(os.path.join(book_i_dir, 'i_cs1.dta'))

# Book C files.
c_cv = pd.read_stata(os.path.join(book_c_dir, 'c_cv.dta'))


In [28]:
# Summarise the weights frame: row count, column names, non-null counts, dtypes.
weights.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8289 entries, 0 to 8288
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   folio     8289 non-null   object
 1   fac_lib2  8289 non-null   int32 
dtypes: int32(1), object(1)
memory usage: 161.9+ KB


In [29]:
# NOTE: the conversion below is disabled (fully commented out), so `folio`
# keeps whatever dtype each file was loaded with when the frames are merged.
#for dfset in [weights, crh, ii_in, ii_inr, ii_portad, i_cs, i_cs1, ii_ah, c_cv]:
    # make folio (household id) numeric for merging
    #dfset['folio'] = dfset['folio'].astype('float64')

In [30]:
# Left-join each household table onto the weights frame, one at a time and in
# a fixed order, always matching on `folio`. Every row of `weights` is kept
# even when a table has no matching household.
df = weights
for table in (crh, ii_in, ii_inr, ii_portad, i_cs, i_cs1, ii_ah, c_cv):
    df = df.merge(table, on=['folio'], how='left')

In [31]:
# Apply the rename mapping, then keep only the mapped columns, in the order
# the mapping lists them.
clean_names = list(rename.values())
df = df.rename(columns=rename)[clean_names]

In [32]:
# Inspect the merged, renamed frame: kept columns, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8289 entries, 0 to 8288
Data columns (total 55 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   hhid                               8289 non-null   object 
 1   own_house                          8021 non-null   float32
 2   own_otherhouse                     8021 non-null   float32
 3   own_bicycles                       8021 non-null   float32
 4   own_vehicle                        8021 non-null   float32
 5   own_electronics                    8020 non-null   float32
 6   own_washmachinestove               8020 non-null   float32
 7   own_domesticappliance              8019 non-null   float32
 8   own_financialassets                8020 non-null   float32
 9   own_machinary                      8020 non-null   float32
 10  own_bullcow                        8021 non-null   float32
 11  own_horsesmules                    8021 non-null   float

In [33]:
# Tabulate the answer codes stored in own_house (missing values are not counted).
df.own_house.value_counts()

1.0    6358
3.0    1663
Name: own_house, dtype: int64

In [34]:
# Asset names used as column suffixes: for every entry the cells below address
# the columns 'own_<name>', 'knowvalue_<name>' and 'value_<name>'.
# The spellings (e.g. 'machinary') must match the names in the rename mapping
# exactly, so they must not be corrected here alone.
illiquid_assets = [
    'house', 
    'otherhouse',
    'bicycles',
    'vehicle',
    'electronics',
    'washmachinestove',
    'domesticappliance',
    'financialassets',
    'machinary',
    'bullcow',
    'horsesmules',
    'pigsgoats',
    'poultry',
    'otherassets'
]

In [35]:
# Tabulate own_house again; this repeats the check made earlier in the notebook.
df.own_house.value_counts()

1.0    6358
3.0    1663
Name: own_house, dtype: int64

In [36]:
# Recode the answer codes of every asset to 0/1 indicators:
#   own_<asset>:       1.0 -> 1, 3.0 -> 0
#   knowvalue_<asset>: 1.0 -> 1, 8.0 -> 0
# Any other value (including missing) passes through unchanged.
own_recode = {1.0:1, 3.0:0}
knowvalue_recode = {1.0:1, 8.0:0}
for name in illiquid_assets:
    own_col = 'own_' + name
    know_col = 'knowvalue_' + name
    df[own_col] = df[own_col].replace(own_recode)
    df[know_col] = df[know_col].replace(knowvalue_recode)

In [37]:
# Fill value_<asset> for the two cases identified by the recoded indicators:
#   owned (own == 1) with knowvalue == 0 -> -1.0 (sentinel)
#   not owned (own == 0)                 ->  0.0
# Rows matching neither mask keep their existing value. The -1.0 assignment
# runs first, exactly as before.
for name in illiquid_assets:
    value_col = 'value_' + name
    owns = df['own_' + name].eq(1)
    not_owned = df['own_' + name].eq(0)
    value_unknown = df['knowvalue_' + name].eq(0)

    df.loc[owns & value_unknown, value_col] = -1.0
    df.loc[not_owned, value_col] = 0.0

In [38]:
# Each savings column has one option code that is replaced by 1.0 so the
# column can be used as a flag. Other values, including missing, are untouched.
# hh_no_savings maps 1.0 to 1.0, i.e. its values are left as they are.
savings_option_codes = {
    'hh_no_savings': 1.0,
    'hh_has_savings_bank': 2.0,
    'hh_has_savings_cooperative': 3.0,
    'hh_has_savings_credit_institution': 4.0,
    'hh_has_savings_batch': 5.0,
    'hh_has_savings_person_not_hhm': 7.0,
    'hh_has_savings_afores': 8.0,
    'hh_has_savings_solidarity': 9.0,
    'hh_has_savings_at_house': 10.0,
    'hh_has_savings_other': 11.0,
}
for column, option_code in savings_option_codes.items():
    df[column] = df[column].replace({option_code: 1.0})

In [39]:
# Collect the per-type savings flag columns and treat a missing flag as 0.
# Match on the 'hh_has_savings_' prefix (with trailing underscore) rather than
# the substring 'hh_has_savings': the aggregate column 'hh_has_savings' created
# in a later cell would otherwise be picked up when this cell is re-run, and
# would then be included in its own sum.
savings_prefix = 'hh_has_savings_'
has_savings_columns = [x for x in df.columns if x.startswith(savings_prefix)]
df[has_savings_columns] = df[has_savings_columns].fillna(0)
has_savings_columns

['hh_has_savings_bank',
 'hh_has_savings_cooperative',
 'hh_has_savings_credit_institution',
 'hh_has_savings_batch',
 'hh_has_savings_person_not_hhm',
 'hh_has_savings_afores',
 'hh_has_savings_solidarity',
 'hh_has_savings_at_house',
 'hh_has_savings_other']

In [40]:
# A household has savings when the row-wise sum of its savings flag columns is
# positive. The flags were filled with 0 where missing, so households with no
# recorded flag come out as False.
savings_flag_sum = df[has_savings_columns].sum(axis=1)
df['hh_has_savings'] = savings_flag_sum.gt(0)
df['hh_has_savings'].value_counts()

False    7238
True     1051
Name: hh_has_savings, dtype: int64

In [41]:
# Tabulate the answer codes stored in hh_has_debts_12mth (missing values are not counted).
df['hh_has_debts_12mth'].value_counts()

2.0    5900
1.0    2026
8.0      93
Name: hh_has_debts_12mth, dtype: int64

In [42]:
# Boolean masks over the hh_has_debts_12mth answer codes 1.0, 2.0 and 8.0.
# `has_debt` is defined but not used by the assignments below.
debt_answer = df['hh_has_debts_12mth']
has_debt = debt_answer.eq(1.0)
no_debt = debt_answer.eq(2.0)
dk_debt = debt_answer.eq(8.0)

# Set the debt amount to 0.0 for code 2.0 and to the sentinel -1.0 for
# code 8.0; all other rows keep their existing amount.
df.loc[no_debt, 'hh_debts_12mth'] = 0.0
df.loc[dk_debt, 'hh_debts_12mth'] = -1.0

In [43]:
# Display the debt amount column after the fill above (pandas truncates the display).
df.hh_debts_12mth

0           0.0
1       40000.0
2           0.0
3       20000.0
4           0.0
         ...   
8284        0.0
8285        0.0
8286        0.0
8287     8000.0
8288        0.0
Name: hh_debts_12mth, Length: 8289, dtype: float32

In [44]:
# Final check of the cleaned frame before it is written to disk.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8289 entries, 0 to 8288
Data columns (total 56 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   hhid                               8289 non-null   object 
 1   own_house                          8021 non-null   float32
 2   own_otherhouse                     8021 non-null   float32
 3   own_bicycles                       8021 non-null   float32
 4   own_vehicle                        8021 non-null   float32
 5   own_electronics                    8020 non-null   float32
 6   own_washmachinestove               8020 non-null   float32
 7   own_domesticappliance              8019 non-null   float32
 8   own_financialassets                8020 non-null   float32
 9   own_machinary                      8020 non-null   float32
 10  own_bullcow                        8021 non-null   float32
 11  own_horsesmules                    8021 non-null   float

In [45]:
# Write the cleaned household frame to the interim directory through an
# explicit path. The working directory is no longer changed, so the output
# location does not depend on, or affect, any other cell.
df.to_csv(os.path.join(interim, 'hh_2005.csv'), index=False)
