In [1]:
import sys
import os
import math
import random
import array
from datetime import datetime

import numpy as np
import pandas as pd 

from scipy.stats import lognorm
from scipy.stats import beta
from scipy.stats import binom
from scipy.stats import norm
from scipy.stats import multivariate_normal
from scipy.stats import expon
from scipy.stats import pareto
import scipy.sparse as sp
import scipy.linalg as la
from scipy.stats import dirichlet
 

import matplotlib.pyplot as plt 
from mpl_toolkits.mplot3d import Axes3D
import seaborn as sns

from itertools import combinations
from itertools import combinations_with_replacement
from itertools import permutations
from itertools import product


from tqdm import trange, notebook

# Notebook display setting: passing None removes the cap on how many columns
# pandas shows when a DataFrame is rendered.
pd.set_option('display.max_columns', None)


In [2]:
# Add the parent of the current working directory to the import search path so
# that the `config` import below can be resolved.
# NOTE(review): presumably config.py lives in that parent directory - confirm.
current_directory = os.getcwd()
parent_directory = os.path.dirname(current_directory)
sys.path.append(parent_directory)
from config import directory_path
# Switch the working directory to the one named by config; the relative paths
# used by the later cells ('pyramid/HHIncome/', 'proc_data...') resolve against it.
# NOTE(review): this cell is not idempotent - re-running it after the chdir
# computes a different parent_directory and appends that to sys.path as well.
os.chdir(directory_path)
# Last expression of the cell: displays the new working directory.
os.getcwd()



'D:\\Dropbox\\Dropbox\\uchicago_fourth\\uncertaintyInequality'

## Creating a wide panel dataset

In [24]:
# Directory (relative to the working directory) holding one zipped CSV per month.
HH_Income_dir = 'pyramid/HHIncome/'

# os.listdir() returns entries in arbitrary order, but the following cells pick
# files by position (zipFiles[initalFile], zipFiles[a:b]). The file names embed
# a YYYYMMDD date, so a plain lexicographic sort puts them in chronological
# order and makes those positions deterministic. Non-zip entries are skipped so
# that stray files cannot shift the positions or break read_csv(compression='zip').
zipFiles = sorted(f for f in os.listdir(HH_Income_dir) if f.endswith('.zip'))
zipFiles

['household_income_20140131_MS_rev_csv.zip',
 'household_income_20140228_MS_rev_csv.zip',
 'household_income_20140331_MS_rev_csv.zip',
 'household_income_20140430_MS_rev_csv.zip',
 'household_income_20140531_MS_rev_csv.zip',
 'household_income_20140630_MS_rev_csv.zip',
 'household_income_20140731_MS_rev_csv.zip',
 'household_income_20140831_MS_rev_csv.zip',
 'household_income_20140930_MS_rev_csv.zip',
 'household_income_20141031_MS_rev_csv.zip',
 'household_income_20141130_MS_rev_csv.zip',
 'household_income_20141231_MS_rev_csv.zip',
 'household_income_20150131_MS_rev_csv.zip',
 'household_income_20150228_MS_rev_csv.zip',
 'household_income_20150331_MS_rev_csv.zip',
 'household_income_20150430_MS_rev_csv.zip',
 'household_income_20150531_MS_rev_csv.zip',
 'household_income_20150630_MS_rev_csv.zip',
 'household_income_20150731_MS_rev_csv.zip',
 'household_income_20150831_MS_rev_csv.zip',
 'household_income_20150930_MS_rev_csv.zip',
 'household_income_20151031_MS_rev_csv.zip',
 'househol

In [25]:
# Position in zipFiles of the first monthly file that enters the panel, and the
# exclusive end of the slice of files merged onto it.
initalFile = 12
lastFile = 71

# Column every monthly frame is merged on; it is the only column left unsuffixed.
mergeKey = 'HH_ID'

# Columns removed from every monthly file before merging (in addition to every
# column whose name contains 'WEIGHT').
dropcols = ['STATE','HR', 'DISTRICT','REGION_TYPE','STRATUM','PSU_ID',
            'MONTH_SLOT','MONTH','RESPONSE_STATUS','HH_NON_RESPONSE_MS',
            'HH_NON_RESPONSE_FOR_COUNTRY_MS','HH_NON_RESPONSE_FOR_STATE_MS',
            'AGE_GROUP','OCCUPATION_GROUP','EDUCATION_GROUP','GENDER_GROUP','SIZE_GROUP']


def _load_wide_month(path):
    """Read one monthly zip and return it prepared for the wide merge.

    Steps applied to the file at ``path``:
      * drop the columns in ``dropcols`` and every column containing 'WEIGHT';
      * add ``FAMILY_SHIFTED = 'N'`` only when the file itself lacks that column;
      * append ``_<date>`` (the text between '_20' and '_MS' in the file name,
        e.g. ``_20150131``) to every column except the merge key.

    Parameters
    ----------
    path : str
        Path of the zipped CSV to read.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame with date-suffixed column names.
    """
    month = pd.read_csv(path, compression='zip')

    # The columns to drop are determined per file, so a month whose set of
    # columns differs from the first file does not raise a KeyError.
    weight_cols = [c for c in month.columns if 'WEIGHT' in c]
    month = month.drop(columns=weight_cols + dropcols, errors='ignore')

    # Test THIS file's columns. Testing the accumulated panel instead is always
    # true (its columns are already suffixed) and would overwrite real values.
    if 'FAMILY_SHIFTED' not in month.columns:
        month['FAMILY_SHIFTED'] = 'N'

    base = os.path.basename(path)
    suffix = "_" + base[base.find('_20') + 1:base.find('_MS')]
    # Exact comparison with the key: a substring test would also leave columns
    # such as 'ID' or 'HH' unsuffixed and make them collide across months.
    month.columns = [c if c == mergeKey else '{}{}'.format(c, suffix)
                     for c in month.columns]
    return month


fileName = HH_Income_dir + zipFiles[initalFile]
df = _load_wide_month(fileName)

for f in notebook.tqdm(zipFiles[initalFile+1:lastFile], desc='File', leave=True):
    fileName = HH_Income_dir + f
    df_toMerge = _load_wide_month(fileName)
    # Outer join keeps households missing from some months; validate guards
    # against duplicated household ids on either side.
    df = df.merge(df_toMerge, on=[mergeKey], how='outer', validate="1:1")

    

File:   0%|          | 0/58 [00:00<?, ?it/s]

### Finishing up and saving the wide HH panel

In [27]:
# Write the wide panel to disk. The path is built with os.path.join rather than
# a literal backslash: '\H' is an invalid escape sequence (a warning today, an
# error in future Python versions), and on non-Windows systems the backslash is
# not a path separator at all.
df.to_feather(os.path.join('proc_data', 'HH_Income_Panel.feather'))


# Creating a long panel dataset

In [4]:
# Directory (relative to the working directory) holding one zipped CSV per month.
HH_Income_dir = 'pyramid/HHIncome/'

# os.listdir() returns entries in arbitrary order, but the next cell picks files
# by position (zipFiles[initalFile], zipFiles[a:b]). The file names embed a
# YYYYMMDD date, so a lexicographic sort yields chronological order and makes
# those positions deterministic. Non-zip entries are skipped so that stray
# files cannot shift the positions or break read_csv(compression='zip').
zipFiles = sorted(f for f in os.listdir(HH_Income_dir) if f.endswith('.zip'))
zipFiles

['household_income_20140131_MS_rev_csv.zip',
 'household_income_20140228_MS_rev_csv.zip',
 'household_income_20140331_MS_rev_csv.zip',
 'household_income_20140430_MS_rev_csv.zip',
 'household_income_20140531_MS_rev_csv.zip',
 'household_income_20140630_MS_rev_csv.zip',
 'household_income_20140731_MS_rev_csv.zip',
 'household_income_20140831_MS_rev_csv.zip',
 'household_income_20140930_MS_rev_csv.zip',
 'household_income_20141031_MS_rev_csv.zip',
 'household_income_20141130_MS_rev_csv.zip',
 'household_income_20141231_MS_rev_csv.zip',
 'household_income_20150131_MS_rev_csv.zip',
 'household_income_20150228_MS_rev_csv.zip',
 'household_income_20150331_MS_rev_csv.zip',
 'household_income_20150430_MS_rev_csv.zip',
 'household_income_20150531_MS_rev_csv.zip',
 'household_income_20150630_MS_rev_csv.zip',
 'household_income_20150731_MS_rev_csv.zip',
 'household_income_20150831_MS_rev_csv.zip',
 'household_income_20150930_MS_rev_csv.zip',
 'household_income_20151031_MS_rev_csv.zip',
 'househol

In [5]:
# Position in zipFiles of the first monthly file that enters the long panel.
initalFile = 12


def _load_long_month(path):
    """Read one monthly zip, adding FAMILY_SHIFTED='N' if the file lacks it.

    Parameters
    ----------
    path : str
        Path of the zipped CSV to read.

    Returns
    -------
    pandas.DataFrame
        The file's contents, guaranteed to have a FAMILY_SHIFTED column.
    """
    month = pd.read_csv(path, compression='zip')
    # Test THIS file's columns. The accumulated frame always has the column
    # after the first file, so testing it would never fill later files and
    # they would end up with NaN after concatenation.
    if 'FAMILY_SHIFTED' not in month.columns:
        month['FAMILY_SHIFTED'] = 'N'
    return month


fileName = HH_Income_dir + zipFiles[initalFile]
monthly_frames = [_load_long_month(fileName)]

# NOTE(review): the wide-panel cell slices up to 71 while this one uses 72;
# the bound is kept as written - confirm which one is intended.
for f in notebook.tqdm(zipFiles[initalFile+1:72], desc='File', leave=True):
    fileName = HH_Income_dir + f
    monthly_frames.append(_load_long_month(fileName))

# Stack all months once; concatenating inside the loop re-copies the growing
# frame on every iteration.
df = pd.concat(monthly_frames, axis=0)



File:   0%|          | 0/58 [00:00<?, ?it/s]

### Finishing up!

In [45]:
# The stacked monthly frames repeat their original row labels; replace them
# with a fresh 0..n-1 index before writing to feather.
df = df.reset_index(drop=True)
# Path built with os.path.join rather than a literal backslash: '\H' is an
# invalid escape sequence and the backslash is only a separator on Windows.
df.to_feather(os.path.join('proc_data', 'HH_Income_Panel_long.feather'))
