In [34]:
import zipfile
import tempfile
import requests
import os
import numpy as np
import pandas as pd

In [161]:
def get_puf_data(first_week, last_week):
    '''
    Download the PUF files for an inclusive range of weeks and stack them.

    Parameters
    ----------
    first_week : int
        First survey week to download.
    last_week : int
        Last survey week to download (inclusive).

    Returns
    -------
    pandas.DataFrame
        Row-wise concatenation, in week order, of the frames returned by
        ``download_url`` for each week. An empty DataFrame is returned when
        ``last_week < first_week``.
    '''
    # NOTE: the URL is pinned to the 2020 directory of the Census site, so only
    # weeks published under that path can be fetched.
    url_template = "https://www2.census.gov/programs-surveys/demo/datasets/hhp/2020/wk{w}/HPS_Week{w}_PUF_CSV.zip"

    weekly_frames = [
        download_url(url_template.format(w=week), "pulse2020_puf_{}.csv".format(week), week)
        for week in range(first_week, last_week + 1)
    ]

    # pd.concat rejects an empty list, so keep the "no weeks -> empty frame" result explicit
    if not weekly_frames:
        return pd.DataFrame()

    # concatenate once rather than re-copying the accumulated frame on every iteration
    return pd.concat(weekly_frames)


def download_url(url_str,file_str, week, chunk_size=128):
    '''
    Download one week's PUF zip archive and return its survey data merged
    with the matching replicate-weight file.

    The archive is written to a temporary directory (removed before the
    function returns) and two CSV members are read from it: ``file_str`` and
    ``pulse2020_repwgt_puf_<week>.csv``.

    Parameters
    ----------
    url_str : str
        URL of the zip archive.
    file_str : str
        Name of the main CSV member inside the archive.
    week : int
        Week number; used for the temp file name and the replicate-weight
        member name.
    chunk_size : int, optional
        Size in bytes of the chunks written to the temp file.

    Returns
    -------
    pandas.DataFrame
        Left merge of the main CSV with the replicate-weight CSV on
        ``SCRAM`` and ``WEEK``.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    '''
    r = requests.get(url_str)
    try:
        print(r)
        # stop on an HTTP error instead of trying to unzip an error page
        r.raise_for_status()
        #temp directory rather than file to hopefully accommodate windows users
        with tempfile.TemporaryDirectory() as td:
            f_name = os.path.join(td, 'week_{}'.format(week))
            with open(f_name, 'wb') as fh:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    fh.write(chunk)
            # context managers guarantee the archive and its members are closed
            # before the temp directory is deleted, even if parsing fails
            with zipfile.ZipFile(f_name) as zf:
                with zf.open(file_str) as puf_fh:
                    df = pd.read_csv(puf_fh)
                with zf.open("pulse2020_repwgt_puf_{}.csv".format(week)) as repwgt_fh:
                    df1 = pd.read_csv(repwgt_fh)
    finally:
        # release the connection whether or not the download/parse succeeded
        r.close()

    return pd.merge(df,df1,on=["SCRAM","WEEK"],how="left")


def apply_wts(df, weight_str):
    '''
    Compute a per-row standard error from a weight column and its replicate
    weight columns.

    Replicate columns are every column whose name contains ``weight_str``
    but is not exactly ``weight_str``. For each row the squared differences
    between the base weight and each replicate are summed, scaled by 4/80,
    and square-rooted. The 4/80 factor is hard-coded, so it presumes 80
    replicate-weight columns.

    Returns a 1-d numpy array with one value per row of ``df``.
    '''
    # base weight as a column vector so it broadcasts across the replicate columns
    base = df[weight_str].to_numpy().reshape(len(df), 1)

    replicate_cols = [
        col for col in df.columns
        if weight_str in col and col != weight_str
    ]
    replicates = df[replicate_cols].to_numpy()

    squared_dev = np.square(base - replicates)
    variance = np.sum(squared_dev, axis=1) * (4/80)
    return np.sqrt(variance)


def freq_crosstab(df, col_list, weight_str, critical_val):
    '''
    Weighted frequency table with a margin of error for each group.

    Parameters
    ----------
    df : pandas.DataFrame
        Data holding the grouping columns, the weight column and its
        replicate-weight columns.
    col_list : list of str
        Columns to group by.
    weight_str : str
        Name of the weight column; every column whose name contains this
        string is summed within each group.
    critical_val : float
        Multiplier applied to the standard error from ``apply_wts`` to get
        the margin of error.

    Returns
    -------
    pandas.DataFrame
        One row per group with the ``col_list`` columns, the summed
        ``weight_str`` column and ``margin_of_error``.
    '''
    weight_cols = [col for col in df.columns if weight_str in col]
    df1 = df.groupby(col_list, as_index=False)[weight_cols].agg('sum')
    df1['margin_of_error'] = apply_wts(df1, weight_str)*critical_val

    # return the weight column the caller asked for (previously hard-coded to 'PWEIGHT')
    return df1[col_list+[weight_str,'margin_of_error']]


In [162]:
# Download and stack survey weeks 13 through 21 (inclusive); this hits the network.
df = get_puf_data(first_week=13, last_week=21)
# Local mapping tables (presumably question/response code lookups -- contents not shown here).
q_map_df = pd.read_csv(filepath_or_buffer='data/question_mapping.csv')
r_map_df = pd.read_csv(filepath_or_buffer='data/response_mapping.csv')

<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>
<Response [200]>


## Metro by Race and Hispanic Ethnicity

In [163]:
# 1.645 is the standard-normal critical value for a two-sided 90% interval.
foo = freq_crosstab(
    df,
    col_list=['WEEK','EST_MSA','RRACE','RHISPANIC'],
    weight_str='PWEIGHT',
    critical_val=1.645,
)
foo.round(decimals=0) #rounded for readability

Unnamed: 0,WEEK,EST_MSA,RRACE,RHISPANIC,PWEIGHT,margin_of_error
0,13,12060.0,1,1,2182081.0,122200.0
1,13,12060.0,1,2,290540.0,68354.0
2,13,12060.0,2,1,1615628.0,97670.0
3,13,12060.0,2,2,64870.0,39410.0
4,13,12060.0,3,1,225291.0,46002.0
...,...,...,...,...,...,...
1069,21,47900.0,2,2,42260.0,18473.0
1070,21,47900.0,3,1,417412.0,75438.0
1071,21,47900.0,3,2,26007.0,15270.0
1072,21,47900.0,4,1,160349.0,35108.0
