In [1]:
import zipfile
import tempfile
import requests
import os
import numpy as np
import pandas as pd

In [2]:
def get_puf_data(first_week, last_week):
    '''
    download the puf file for every week in [first_week, last_week] and
    concatenate the weekly datasets into a single DataFrame

    Parameters
    ----------
    first_week : int
        first week number to download (inclusive)
    last_week : int
        last week number to download (inclusive)

    Returns
    -------
    pandas.DataFrame
        rows of all requested weeks stacked in week order; each week keeps
        the row index it was read with. Empty DataFrame when
        first_week > last_week.
    '''
    weekly_frames = []
    for week in range(first_week, last_week + 1):
        # NOTE(review): the week number is inserted without zero padding;
        # confirm this matches the published names for weeks below 10.
        file_str = "pulse2020_puf_{}.csv".format(week)
        url_str = "https://www2.census.gov/programs-surveys/demo/datasets/hhp/2020/wk{w}/HPS_Week{w}_PUF_CSV.zip".format(w=week)
        weekly_frames.append(download_url(url_str, file_str, week))
    if not weekly_frames:
        # pd.concat raises on an empty list; keep returning an empty frame
        return pd.DataFrame()
    # concatenate once instead of re-copying the accumulated frame per week
    return pd.concat(weekly_frames)


def download_url(url_str,file_str, week, chunk_size=128):
    '''
    download a zipped puf archive and read one csv member into a DataFrame

    The archive is streamed into a temporary directory that is deleted
    before the function returns, so no file is left behind on disk.

    Parameters
    ----------
    url_str : str
        url of the zip archive
    file_str : str
        name of the csv member inside the archive
    week : int
        week number, only used to name the temporary file
    chunk_size : int
        number of bytes requested per chunk while writing the download

    Returns
    -------
    pandas.DataFrame
        contents of the csv member

    Raises
    ------
    requests.HTTPError
        when the server answers with a 4xx/5xx status
    '''
    # stream=True pulls the body chunk by chunk instead of buffering it all;
    # the with-block releases the connection even if a later step fails
    with requests.get(url_str, stream=True) as r:
        print(r)
        # fail here on an error status rather than later with a bad-zip error
        r.raise_for_status()
        #temp directory rather than file to accommodate windows users
        with tempfile.TemporaryDirectory() as td:
            f_name = os.path.join(td, 'week_{}'.format(week))
            with open(f_name, 'wb') as fh:
                for chunk in r.iter_content(chunk_size=chunk_size):
                    fh.write(chunk)
            # archive and member are closed before the temp directory is
            # removed, even when parsing raises
            with zipfile.ZipFile(f_name) as zf, zf.open(file_str) as member:
                return pd.read_csv(member)

#columns for crosstabs 'EST_MSA','RRACE', 'RHISPANIC', 'EEDUC', 'EGENDER', 'TBIRTH_YEAR'
#It might be worth normalizing at the metro level before running crosstabs
# Download week 13 and keep only the weight and demographic columns.
puf_data = get_puf_data(13,13)
y = puf_data[['EST_MSA','PWEIGHT','HWEIGHT','RRACE', 'RHISPANIC', 'EEDUC', 'EGENDER', 'TBIRTH_YEAR']]
# Per (EST_MSA, RRACE, RHISPANIC) cell: sum and standard deviation of PWEIGHT.
# Named aggregation is used because passing a {name: func} dict to a single
# column's agg() raises SpecificationError in pandas >= 1.0.
# 'std' is the groupby sample standard deviation (ddof=1).
df1 = y.groupby(['EST_MSA', 'RRACE', 'RHISPANIC'], as_index=False).agg(
    weighted_freq=('PWEIGHT', 'sum'),
    sigma=('PWEIGHT', 'std'),
)
# NOTE(review): divides by the square root of the summed weights, not of the
# number of respondents in the cell - confirm this is the intended estimator.
df1['std_err'] = df1.sigma/np.sqrt(df1.weighted_freq)

<Response [200]>
