In [42]:
from matplotlib import pyplot as plt
import zipfile
import tempfile
import requests
import os
import numpy as np
import pandas as pd

In [170]:
def get_puf_data(first_week, last_week):
    '''
    Download the PUF files for weeks ``first_week`` through ``last_week``
    (both inclusive) and concatenate them into a single DataFrame.

    first_week, last_week: integer week numbers used to build each file
        name and download URL.

    Returns the row-wise concatenation of the per-week frames returned by
    ``download_url`` (original row indexes are kept). An empty week range
    returns an empty DataFrame.
    '''
    weekly_frames = []
    for week in range(first_week, last_week + 1):
        # Weeks before 22 are filed under 2020, week 22 onward under 2021.
        # NOTE(review): no other year is ever produced here -- confirm this
        # still holds before requesting weeks far beyond 22.
        year = 2021 if week >= 22 else 2020
        file_str = "pulse{}_puf_{}.csv".format(year, week)
        url_str = "https://www2.census.gov/programs-surveys/demo/datasets/hhp/{yr}/wk{w}/HPS_Week{w}_PUF_CSV.zip".format(yr=year, w=week)
        weekly_frames.append(download_url(url_str, file_str, year, week))

    # pd.concat raises on an empty list, so keep the "no weeks" case explicit
    if not weekly_frames:
        return pd.DataFrame()

    # Concatenate once instead of re-copying the accumulated frame every week
    return pd.concat(weekly_frames)


def download_url(url_str,file_str, year, week, chunk_size=128):
    '''
    Download one weekly PUF zip archive and return its two CSV members
    merged into a single DataFrame.

    The archive is streamed into a file inside a temporary directory that is
    deleted before this function returns, so nothing is left on disk.

    url_str: URL of the zip archive
    file_str: name of the main CSV member inside the archive
    year, week: used to build the second member name
        ("pulse{year}_repwgt_puf_{week}.csv"); week also names the temp file
    chunk_size: number of bytes written to disk per chunk

    Returns the inner merge of the two CSVs on the SCRAM and WEEK columns.
    Raises requests.HTTPError when the server answers with an error status.
    '''
    # stream=True lets iter_content pull the body chunk by chunk instead of
    # loading the whole archive into memory first
    with requests.get(url_str, stream=True) as r:
        print(r)
        # fail here on 4xx/5xx rather than later with an obscure BadZipFile
        r.raise_for_status()
        #temp directory rather than file to hopefully accommodate windows users
        with tempfile.TemporaryDirectory() as td:
            f_name = os.path.join(td, 'week_{}'.format(week))
            with open(f_name, 'wb') as fh:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    fh.write(chunk)
            # context managers guarantee the archive and its members are
            # closed (even on error) before the temp directory is removed
            with zipfile.ZipFile(f_name) as zf:
                with zf.open(file_str) as main_csv:
                    df = pd.read_csv(main_csv)
                with zf.open("pulse{}_repwgt_puf_{}.csv".format(year,week)) as repwgt_csv:
                    df1 = pd.read_csv(repwgt_csv)

    return pd.merge(df,df1,on=["SCRAM","WEEK"],how="inner")


def get_std_err(df, weight):
    '''
    Row-wise standard error of the ``weight`` estimate from its replicate
    weight columns.

    Replicate columns are every column whose name contains ``weight`` but is
    not exactly ``weight``. Returns a 1-d numpy array with one value per row
    of ``df``: sqrt(4/80 * sum((replicate - estimate)^2)).
    '''
    # every column that contains the weight name, except the weight itself
    replicate_cols = [name for name in df.columns if weight in name and name != weight]

    # column vector of the point estimates so it broadcasts across replicates
    estimate = df[weight].to_numpy().reshape(len(df), 1)
    replicates = df[replicate_cols].to_numpy()

    squared_dev = np.square(replicates - estimate)
    return np.sqrt(np.sum(squared_dev, axis=1) * (4/80))


def freq_crosstab(df, col_list, weight, critical_val=1):
    '''
    Weighted frequency table for every combination of the ``col_list`` values.

    Rows where any ``col_list`` column equals -99 or -88 are dropped first
    (presumably the survey's missing-response codes -- TODO confirm).

    df: respondent-level DataFrame
    col_list: list of column names to group by
    weight: name of the weight column; every column whose name contains this
        string is summed per group and used for the standard error
    critical_val: multiplier applied to the standard error to get the margin
        of error

    Returns a DataFrame indexed by ``col_list`` with columns
    [weight, 'std_err', 'mrgn_err'].
    '''
    # `~` is the supported boolean negation (unary minus on booleans is not)
    has_flagged_value = df[col_list].isin([-99,-88]).any(axis=1)
    kept = df[~has_flagged_value]

    weight_cols = [i for i in kept.columns if weight in i]
    pt_estimates = kept.groupby(col_list, as_index=True)[weight_cols].agg('sum')
    pt_estimates['std_err'] = get_std_err(pt_estimates, weight)
    pt_estimates['mrgn_err'] = pt_estimates.std_err * critical_val

    return pt_estimates[[weight, 'std_err','mrgn_err']]

def full_crosstab(df, col_list, weight, proportion_level, critical_val=1):
    '''
    ``freq_crosstab`` plus a 'proportions' column.

    proportion_level: list of index level names (a subset of ``col_list``);
        each row's weight is divided by the total weight of the rows that
        share the same values for these levels.

    Returns the table with ``col_list`` moved back into ordinary columns.
    '''
    rv = freq_crosstab(df, col_list, weight, critical_val)
    # transform keeps the original index, so the result aligns with rv on
    # every pandas version (groupby.apply prepends the group keys to the
    # index on pandas >= 2.0, which breaks the column assignment)
    group_totals = rv[weight].groupby(proportion_level).transform('sum')
    rv['proportions'] = rv[weight] / group_totals
    return rv.reset_index()


In [201]:
# Download and concatenate the PUF files for weeks 13 through 22 (inclusive).
# This performs one HTTP download per week, so the cell is slow to re-run.
core_df = get_puf_data(13,22)
# Lookup tables read from the local data/ folder; judging by the file names
# they map question and response codes to labels -- TODO confirm. Neither
# frame is referenced by the cells visible below.
q_map_df = pd.read_csv('data/question_mapping.csv')
r_map_df = pd.read_csv('data/response_mapping.csv')

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


## Tie Out Housing 1b Table

In [57]:
# rent_check is 4 wherever TENURE == 4 and RENTCUR otherwise (vectorised
# instead of a row-by-row apply over the whole frame).
# NOTE(review): rent_check is not used by the crosstab below, which still
# groups on RENTCUR -- confirm which column the tie-out is meant to use.
core_df['rent_check'] = np.where(core_df.TENURE == 4, 4, core_df.RENTCUR)
# Use core_df: `df` is only created in a later cell, so referencing it here
# raises NameError on a fresh Restart & Run All.
freq_crosstab(core_df, ['EST_MSA', 'EGENDER', 'RENTCUR'], 'PWEIGHT', critical_val=1.645).round(0)

In [202]:
# Label lookups used below to turn numeric survey codes into readable names.

# EST_ST code -> state name (keys are presumably state FIPS codes, which
# would explain the gaps in the numbering -- TODO confirm)
states = {1:'Alabama',2:'Alaska',4:'Arizona',
    5:'Arkansas',
    6:'California',
    8:'Colorado',
    9:'Connecticut',
    10:'Delaware',
    11:'District of Columbia',
    12:'Florida',
    13:'Georgia',
    15:'Hawaii',
    16:'Idaho',
    17:'Illinois',
    18:'Indiana',
    19:'Iowa',
    20:'Kansas',
    21:'Kentucky',
    22:'Louisiana',
    23:'Maine',
    24:'Maryland',
    25:'Massachusetts',
    26:'Michigan',
    27:'Minnesota',
    28:'Mississippi',
    29:'Missouri',
    30:'Montana',
    31:'Nebraska',
    32:'Nevada',
    33:'New Hampshire',
    34:'New Jersey',
    35:'New Mexico',
    36:'New York',
    37:'North Carolina',
    38:'North Dakota',
    39:'Ohio',
    40:'Oklahoma',
    41:'Oregon',
    42:'Pennsylvania',
    44:'Rhode Island',
    45:'South Carolina',
    46:'South Dakota',
    47:'Tennessee',
    48:'Texas',
    49:'Utah',
    50:'Vermont',
    51:'Virginia',
    53:'Washington',
    54:'West Virginia',
    55:'Wisconsin',
    56:'Wyoming'}
# REGION code -> short region label
regions = {1:'NE',2:'S',3:'MW',4:'W'}
# EST_MSA code -> metro area label (15 metro areas)
metros = {35620:'New York',
31080:'Los Angeles',
16980:'Chicago',
19100:'Dallas',
26420:'Houston',
47900:'DC',
33100:'Miami',
37980:'Philadelphia',
12060:'Atlanta',
38060:'Phoenix',
14460:'Boston',
41860:'San Francisco',
40140:'Riverside CA',
19820:'Detroit',
42660:'Seattle'}
# RRACE code -> race label
races = {1:'white',2:'black',3:'Asian',4:'Other'}
# RENTCUR code -> rent payment status label
rents = {1:'current',2:'behind'}
# df1: every respondent with a positive race code and a positive rent-status
#      code, labelled by state and region.
# df:  the subset of those respondents that has a metro area code, labelled
#      by metro.
# Filter first and copy once, rather than copying all of core_df twice.
df1 = core_df[(core_df.RRACE > 0) & (core_df.RENTCUR > 0)].copy()
df = df1[df1.EST_MSA.notna()].copy()

# Series.map does the dictionary lookup column-wise instead of calling a
# Python lambda once per row.
df['metro'] = df.EST_MSA.map(metros)
df['race'] = df.RRACE.map(races)
df['rent'] = df.RENTCUR.map(rents)
df1['state'] = df1.EST_ST.map(states)
df1['region'] = df1.REGION.map(regions)
df1['race'] = df1.RRACE.map(races)
df1['rent'] = df1.RENTCUR.map(rents)

# .map yields NaN for a code missing from its dictionary (a plain dict lookup
# would raise KeyError), so fail loudly rather than silently dropping rows
# in the groupby.
assert df[['metro', 'race', 'rent']].notna().all().all(), "unmapped code in df"
assert df1[['state', 'region', 'race', 'rent']].notna().all().all(), "unmapped code in df1"

# x: rent status within each week/metro/race group
# y: race within each week/metro group
x = full_crosstab(df, ['WEEK','metro','race','rent'], 'PWEIGHT', ['WEEK','metro','race'], critical_val=1.645).round(2)
y = full_crosstab(df, ['WEEK','metro','race'], 'PWEIGHT', ['WEEK','metro'], critical_val=1.645).round(2)
rv = x.merge(y, how='left', on=['WEEK','metro','race'])
rv

Unnamed: 0,WEEK,metro,race,rent,PWEIGHT_x,std_err_x,mrgn_err_x,proportions_x,PWEIGHT_y,std_err_y,mrgn_err_y,proportions_y
0,13,Atlanta,Asian,behind,5923.87,3624.72,5962.66,0.22,26458.70,7205.60,11853.21,0.03
1,13,Atlanta,Asian,current,20534.83,7041.32,11582.98,0.78,26458.70,7205.60,11853.21,0.03
2,13,Atlanta,Other,behind,8375.40,5765.38,9484.05,0.31,26728.32,10129.81,16663.54,0.03
3,13,Atlanta,Other,current,18352.92,6439.09,10592.30,0.69,26728.32,10129.81,16663.54,0.03
4,13,Atlanta,black,behind,84377.39,25286.83,41596.84,0.20,418937.82,54846.83,90223.04,0.48
...,...,...,...,...,...,...,...,...,...,...,...,...
1172,22,Seattle,Other,current,103349.20,28112.04,46244.30,0.86,120849.01,29641.23,48759.83,0.14
1173,22,Seattle,black,behind,13463.14,9451.65,15547.96,0.18,76785.02,31119.11,51190.94,0.09
1174,22,Seattle,black,current,63321.88,29651.54,48776.79,0.82,76785.02,31119.11,51190.94,0.09
1175,22,Seattle,white,behind,72582.99,15875.70,26115.53,0.13,551572.16,40315.17,66318.46,0.63


In [191]:
# Write whatever crosstab is currently bound to `rv` to crosstab2.csv in the
# working directory. The index is written too (pandas default), which is why
# a leading unnamed column appears in the file.
rv.to_csv('crosstab2.csv')

In [200]:
# Spot-check a single week and metro against the merged table above.
# A dedicated name is used instead of rebinding df1: df1 holds the
# state/region frame that later cells group by 'state' and 'region', and
# overwriting it here makes those cells fail when the notebook runs top to
# bottom.
atlanta_wk13 = df[(df.WEEK == 13) & (df.metro == 'Atlanta')].copy()
full_crosstab(atlanta_wk13, ['metro','race','rent'], 'PWEIGHT', ['metro','race'], critical_val=1.645).round(2)

Unnamed: 0,metro,race,rent,PWEIGHT,std_err,mrgn_err,proportions
0,Atlanta,Asian,behind,5923.87,3624.72,5962.66,0.22
1,Atlanta,Asian,current,20534.83,7041.32,11582.98,0.78
2,Atlanta,Other,behind,8375.4,5765.38,9484.05,0.31
3,Atlanta,Other,current,18352.92,6439.09,10592.3,0.69
4,Atlanta,black,behind,84377.39,25286.83,41596.84,0.2
5,Atlanta,black,current,334560.43,52043.74,85611.95,0.8
6,Atlanta,white,behind,32461.67,13918.24,22895.5,0.08
7,Atlanta,white,current,361599.05,46427.68,76373.54,0.92


In [217]:
# Rent status by race within each week/state (detail) joined to the race
# share within each week/state (top). df1 must carry the 'state' column.
detail = full_crosstab(
    df1,
    ['WEEK', 'state', 'race', 'rent'],
    'PWEIGHT',
    ['WEEK', 'state', 'race'],
    critical_val=1.645,
).round(2)
top = full_crosstab(
    df1,
    ['WEEK', 'state', 'race'],
    'PWEIGHT',
    ['WEEK', 'state'],
    critical_val=1.645,
).round(2)
rv = detail.merge(
    top,
    how='left',
    on=['WEEK', 'state', 'race'],
    suffixes=('_full', '_demo'),
)

# Narrow down step by step: rows for renters who are behind, then groups
# where more than 40% are behind, then estimates larger than their margin
# of error.
y = rv.loc[rv['rent'] == 'behind']
y_2 = y.loc[y['proportions_full'] > 0.4]
y_3 = y_2.loc[y_2['PWEIGHT_full'] > y_2['mrgn_err_full']]
y_3['state'].unique()


array(['Alabama', 'Iowa', 'North Carolina', 'South Dakota', 'Virginia',
       'Illinois', 'Minnesota', 'New York', 'Ohio', 'Washington',
       'Connecticut', 'Indiana', 'New Mexico', 'Oregon', 'Georgia',
       'Louisiana', 'Maryland', 'Michigan', 'Pennsylvania', 'Wisconsin',
       'Alaska', 'District of Columbia', 'Nevada', 'Tennessee',
       'Mississippi', 'Nebraska', 'Oklahoma', 'Rhode Island',
       'South Carolina', 'Missouri', 'Texas', 'Colorado', 'Delaware',
       'Florida', 'Kentucky', 'Massachusetts', 'Vermont', 'California',
       'Kansas', 'New Jersey', 'Utah'], dtype=object)

In [209]:
# Same breakdown as the state-level table, grouped by region instead.
# df1 must carry the 'region' column.
detail_r = full_crosstab(
    df1,
    ['WEEK', 'region', 'race', 'rent'],
    'PWEIGHT',
    ['WEEK', 'region', 'race'],
    critical_val=1.645,
).round(2)
top_r = full_crosstab(
    df1,
    ['WEEK', 'region', 'race'],
    'PWEIGHT',
    ['WEEK', 'region'],
    critical_val=1.645,
).round(2)
rv_r = detail_r.merge(
    top_r,
    how='left',
    on=['WEEK', 'region', 'race'],
    suffixes=('_full', '_demo'),
)

# Week/region/race groups where more than 40% of renters are behind
x = rv_r.loc[rv_r['rent'] == 'behind']
x.loc[x['proportions_full'] > 0.4]

Unnamed: 0,WEEK,region,race,rent,PWEIGHT_full,std_err_full,mrgn_err_full,proportions_full,PWEIGHT_demo,std_err_demo,mrgn_err_demo,proportions_demo
172,18,NE,black,behind,857991.48,189038.01,310967.52,0.44,1928187.96,195373.94,321390.14,0.2
292,22,MW,black,behind,733948.69,156260.04,257047.76,0.5,1479393.27,156888.39,258081.41,0.17
298,22,NE,Other,behind,396776.44,110644.16,182009.64,0.57,701300.1,117815.62,193806.69,0.08
