In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
pd.options.display.max_columns = None

In [4]:
# Read the Stata file 'dataprep2.dta' (path is relative to the current
# working directory) into a DataFrame.
df = pd.read_stata('dataprep2.dta')

In [118]:
# Prepare the data for use in MATLAB. First pass:
# the data has not been checked and could be prepared more carefully.

# Names of the dummy columns: cohort1..cohort5 and time1..time7.
cohort_list = ['cohort' + str(x) for x in range(1, 6)]
time_list = ['time' + str(x) for x in range(1, 8)]

# Asset data for 1994 is underreported and hence not used:
# keep only rows with realyear > 94, restricted to the columns needed below.
# .copy() makes df2 independent of df so later column assignments are safe.
df2 = df.loc[
    df['realyear'] > 94,
    ['HHID', 'age', 'male', 'PI', 'heal', 'assets'] + time_list + cohort_list
].copy()

# Household identifiers present in the retained sample.
list_ind = df2['HHID'].unique()
# No filter on list_ind is applied: it is built from df2['HHID'] itself, so
# `df2['HHID'].isin(list_ind)` is true for every row and filtering on it would
# only copy the frame.
# TODO: if the sample should be restricted to a subset of households, build
# that subset here and filter df2 on it.

def get_dum(row, col_names):
    """Decode a set of one-hot dummy columns into a single integer code.

    Parameters
    ----------
    row : mapping indexable by column name (e.g. a DataFrame row passed by
        ``DataFrame.apply(..., axis=1)``).
    col_names : iterable of str
        Dummy column names that end in an integer, e.g. ``'time3'``.
        They are checked in the order given.

    Returns
    -------
    int or None
        The integer suffix of the first column in ``col_names`` whose value
        equals 1, or ``None`` if no column equals 1.

    Raises
    ------
    ValueError
        If the matching column name does not end in a digit.
    """
    for c in col_names:
        if row[c] == 1:
            # Parse the whole trailing run of digits rather than only the
            # last character, so that 'time10' decodes to 10 and not 0.
            stem = c.rstrip('0123456789')
            return int(c[len(stem):])
    return None
# Collapse the one-hot dummy columns into single integer-coded columns
# (rows with no dummy set get a missing value), then discard the dummies.
df2['time'] = df2.loc[:, time_list].apply(get_dum, args=(time_list,), axis=1)
df2['cohort'] = df2.loc[:, cohort_list].apply(get_dum, args=(cohort_list,), axis=1)
df2 = df2.drop(columns=time_list + cohort_list)

# Replace PI by its within-household mean so it is constant for each HHID,
# then switch to the short column names used in the exported file.
df2['PI'] = df2.groupby('HHID')['PI'].transform('mean')
df2 = df2.rename(columns={'male': 'g', 'heal': 'h', 'assets': 'a', 'PI': 'I'})

# Five equal-width bins on [0, 1], labelled 1..5.
# include_lowest=True closes the first bin on the left, so I == 0 lands in
# bin 1 instead of becoming NaN and being silently removed by dropna() below.
# NOTE(review): equal-width bins are true quintiles only if I is a percentile
# rank on [0, 1] -- confirm.
bins = np.linspace(0, 1, num=6, endpoint=True)
names = list(range(1, 6))
df2['quintile'] = pd.cut(df2['I'], bins, labels=names, include_lowest=True)

# Group the last cohort category (5) with category 4, then shift every code
# down by one.
# NOTE(review): with cohorts 1..5 in the data the result is 0..3; it starts
# at 1 only if cohort 1 is absent from the retained sample -- confirm.
df2['cohort'] = df2['cohort'].replace({5: 4}) - 1

# Drop every row that has a missing value in any remaining column.
df2 = df2.dropna()

# One row per household: the observation with the smallest `time`.
# A stable sort keeps the original row order among equal `time` values, so the
# selected row is reproducible. GroupBy.first() takes the first non-null value
# per column; after dropna() that is simply the first row of each group.
df3 = df2.sort_values('time', kind='mergesort').groupby('HHID').first()

# Fix the column order of the exported file.
df3 = df3[['g', 'I', 'h', 'a', 'age', 'time', 'quintile', 'cohort']]

# Prepare data for use in MATLAB: no header row; the HHID index is written as
# the first column, followed by the columns in the order selected above.
df3.to_csv('assets.csv', header=False)