In [2]:
import pandas as pd
import requests
import os
from zipfile import ZipFile, is_zipfile
import time
import json

# 1. Scrape data from censusreporter.org

In [16]:
## For each request, scrape data by one table on state
## For example, table B19001 is for Household Income in the Past 12 Months (In 2017 Inflation-adjusted Dollars), and each request scrapes one state
def get_data(table_code):
    url = ('https://api.censusreporter.org/1.0/data/download/latest?table_ids=%s&geo_ids=050|01000US&format=csv' % 
           (table_code))
    #print(url)
    filename = 'data/%s.zip' % (table_code)
    if not is_zipfile(filename): ## check if the file has been already scraped, if yes then avoid to do it again
        result = requests.get(url)
        if not os.path.isdir('data/%s' % table_code):
            os.makedirs('data/%s' % table_code)
        with open(filename, 'wb') as f:
            f.write(result.content) 
    return is_zipfile(filename)

In [18]:
get_data('B992705') ## Allocation of Direct-purchase Health Insurance

True

In [19]:
get_data('B19001') ## Household Income in the Past 12 Months (In 2016 Inflation-adjusted Dollars

True

In [20]:
get_data('B11001') ## Household Type (Including Living Alone)

True

In [21]:
get_data('B01001') ## Sex by Age

True

In [22]:
get_data('B09001') ## Population Under 18 Years by Age

True

In [23]:
get_data('B20001') ## Sex by Earnings in the Past 12 Months (In 2016 Inflation-adjusted Dollars) for the Population 16 Years and Over With Earnings in the Past 12 Months

True

In [24]:
get_data('B18101') ## Sex by Age by Disability Status

True

In [25]:
get_data('B08136') ## Aggregate Travel Time to Work (In Minutes) of Workers by Means of Transportation to Work

True

In [26]:
get_data('B24011') ## Occupation by Median Earnings in the Past 12 Months (In 2016 Inflation-adjusted Dollars) for the Civilian Employed Population 16 Years and Over

True

In [27]:
get_data('B24124') ## Detailed Occupation for the Full-time, Year-round Civilian Employed Population 16 Years and Over

True

In [28]:
get_data('B23020') ## Mean Usual Hours Worked in the Past 12 Months for Workers 16 to 64 Years

True

In [29]:
get_data('B01003') ## Total Population

True

In [77]:
file_dir = 'data/'+table_code+'.zip'
with ZipFile(file_dir) as f:
    files = f.namelist()
    ## save csv file
    csv = [f for f in files if f.lower().endswith('.csv')][0]
    df = pd.read_csv(f.open(csv))

In [154]:
def open_csv(table_code):
    file_dir = 'data/'+table_code+'.zip'
    with ZipFile(file_dir) as f:
        files = f.namelist()
        ## save csv file
        csv = [f for f in files if f.lower().endswith('.csv')][0]
        df = pd.read_csv(f.open(csv))
        df['fips'] = df['geoid'].str.slice(7, 13)

        return df

In [155]:
sex_age = open_csv('B01001')
sex_age.shape

(3220, 101)

In [156]:
children = open_csv('B09001')
children.shape

(3220, 23)

In [157]:
family = open_csv('B11001')
family.shape

(3220, 21)

In [158]:
sex_age_disability = open_csv('B18101')
sex_age_disability.shape

(3220, 81)

In [159]:
household_income = open_csv('B19001')
household_income.shape

(3220, 37)

In [160]:
sex_earnings = open_csv('B20001')
sex_earnings.shape

(3220, 89)

In [161]:
travel_time = open_csv('B08136')
travel_time.shape

(3220, 27)

In [162]:
detail_occupation = open_csv('B24124')
detail_occupation.shape

(3220, 1055)

In [163]:
hours_worked = open_csv('B23020')
hours_worked.shape

(3220, 11)

In [164]:
population = open_csv('B01003')
population.shape

(3220, 5)

In [185]:
health_insurance = open_csv('B992705')
health_insurance.shape

(3220, 9)

In [186]:
def get_dictionary(table_code):
    file_dir = 'data/'+table_code+'.zip'
    with ZipFile(file_dir) as zipfile: 
        files = zipfile.namelist()
        ## save csv file
        json_file = [f for f in files if f.lower().endswith('.json')][0]
        f = zipfile.open(json_file)
        j = json.load(f)
        table_data = list(j['tables'].values())[0]
        tname = table_data['title']
        column_dic = table_data['columns']
        columns = []
        name_stack = []
        for c in sorted(column_dic.keys()):
            record = column_dic[c]
            name = record['name']
            indent = record['indent']
            if indent == 0:
                columns.append((c, name))
            else:
                if not name_stack or indent > name_stack[-1][0]:
                    name_stack.append((indent, name))
                if indent == name_stack[-1][0]:
                    name_stack[-1] = (indent, name)
                if indent < name_stack[-1][0]:
                    name_stack = [x for x in name_stack if x[0] < indent]
                    name_stack.append((indent, name))
                columns.append((c, ' '.join([x[1] for x in name_stack])))
        df = pd.DataFrame(columns, columns=['columns', 'description'])
        df['table'] = tname
        return df

In [187]:
get_dictionary('B01003')

Unnamed: 0,columns,description,table
0,B01003001,Total,Total Population


In [188]:
table_list = [f.replace('.zip','') for f in os.listdir('data') if f.endswith('.zip')]
census_dictionary = pd.concat([
    get_dictionary(table_code) for table_code in table_list
])

In [189]:
census_dictionary.sample(10)

Unnamed: 0,columns,description,table
196,B24124197,Dental assistants,"Detailed Occupation for the Full-time, Year-ro..."
412,B24124413,First-line supervisors of production and opera...,"Detailed Occupation for the Full-time, Year-ro..."
31,B18101032,Female: 35 to 64 years: With a disability,Sex by Age by Disability Status
8,B24124009,Computer and information systems managers,"Detailed Occupation for the Full-time, Year-ro..."
130,B24124131,Miscellaneous legal support workers,"Detailed Occupation for the Full-time, Year-ro..."
83,B24124084,Electrical and electronics engineers,"Detailed Occupation for the Full-time, Year-ro..."
233,B24124234,"Food preparation and serving related workers, ...","Detailed Occupation for the Full-time, Year-ro..."
298,B24124299,"Interviewers, except eligibility and loan","Detailed Occupation for the Full-time, Year-ro..."
419,B24124420,"Butchers and other meat, poultry, and fish pro...","Detailed Occupation for the Full-time, Year-ro..."
491,B24124492,Aircraft pilots and flight engineers,"Detailed Occupation for the Full-time, Year-ro..."


In [190]:
census_dictionary.to_csv('data/dictionary.csv', index=False)

In [191]:
def process_table(df, total_col_name, doordash_col='avg_subtotal'):
    df1 = df.copy()
    df1 = df1.loc[~df1['name'].duplicated(), :]
    cols = [c for c in df1.columns if 'Error' not in c and ##
            c not in ('fips', 'geoid', 'name', total_col_name)]
    to_drop = []
    if total_col_name:
        total_col = df1[total_col_name]
        for c in cols:
            df1[c] /= total_col
        df1.loc[total_col == 0, cols] = 0
    else:
        for c in cols:
            if df1[c].max() > 0:
                df1[c] /= df1[c].max()
            else:
                to_drop.append(c)
    df1 = df1.loc[:, ['fips'] + cols]
    df1.drop(to_drop, axis=1, inplace=True)
    # df1['fips'] = df1['fips'].astype(str).str.zfill(5)
    return df1.dropna()

In [194]:
census_preprocessed = process_table(health_insurance, 'B992705001').merge(
    process_table(family, 'B11001001')).merge(
    process_table(children, 'B09001001')).merge(
    process_table(sex_age, 'B01001001')).merge(
    process_table(household_income, 'B19001001')).merge(
    process_table(sex_age_disability, 'B18101001')).merge(
    process_table(detail_occupation, 'B24124001')).merge(
    process_table(hours_worked, None)).merge(
    process_table(population, None))

#family = process_census('../../census_data/data/family.csv', 'B11001001', doordash_col=dda_col)
#transportation = process_census('../../census_data/data/sex_transportaion.csv', 'B08006001', doordash_col=dda_col)
#children = process_census('../../census_data/data/children.csv', 'B09001001', doordash_col=dda_col)
#travel_time =  process_census('../../census_data/data/travel_time.csv', None, doordash_col=dda_col)
#sex_age = process_census('../../census_data/data/sex_age.csv', 'B01001001', doordash_col=dda_col)
#household_income = process_census('../../census_data/data/household_income.csv', 'B19001001', doordash_col=dda_col)
#sex_disability = process_census('../../census_data/data/sex_age_disability.csv', 'B18101001', doordash_col=dda_col)
#occupation_earning = process_census('../../census_data/data/occupation_earning.csv', 'B24011001', doordash_col=dda_col)
census_preprocessed.shape

(3220, 651)

In [195]:
census_preprocessed = census_preprocessed.loc[census_preprocessed.max(axis=1) > 0,
                                             census_preprocessed.max(axis=0) > 0] ## 
census_preprocessed.shape

(3220, 126)

In [196]:
census_preprocessed.sample(10)

Unnamed: 0,fips,B992705002,B992705003,B11001002,B11001003,B11001004,B11001005,B11001006,B11001007,B11001008,...,B18101034,B18101035,B18101036,B18101037,B18101038,B18101039,B23020001,B23020002,B23020003,B01003001
3087,55079,0.096795,0.903205,0.565549,0.347986,0.217563,0.051423,0.16614,0.434451,0.351501,...,0.0374,0.009552,0.027848,0.033932,0.017838,0.016093,0.608696,0.595489,0.668529,0.094658
222,6073,0.116826,0.883174,0.672141,0.500716,0.171425,0.051167,0.120258,0.327859,0.239812,...,0.039987,0.008397,0.03159,0.033049,0.017438,0.015611,0.621578,0.615038,0.664804,0.324931
1201,24019,0.113666,0.886334,0.659892,0.446291,0.213601,0.040031,0.17357,0.340108,0.277898,...,0.061212,0.014607,0.046605,0.047637,0.030152,0.017485,0.634461,0.627068,0.6946,0.003205
552,16003,0.108956,0.891044,0.650922,0.521313,0.129608,0.026498,0.103111,0.349078,0.325461,...,0.07604,0.018627,0.057413,0.047461,0.013014,0.034448,0.611916,0.621053,0.633147,0.00039
817,19057,0.09563,0.90437,0.630933,0.463034,0.167899,0.061233,0.106666,0.369067,0.321786,...,0.053884,0.012696,0.041187,0.049313,0.024326,0.024987,0.621578,0.628571,0.655493,0.003946
1351,27075,0.105804,0.894196,0.642857,0.536403,0.106454,0.046635,0.059819,0.357143,0.293585,...,0.05862,0.00788,0.05074,0.058332,0.018931,0.0394,0.624799,0.61203,0.681564,0.001047
189,6007,0.142363,0.857637,0.597537,0.433762,0.163775,0.050832,0.112943,0.402463,0.288312,...,0.051452,0.013143,0.038309,0.042455,0.022359,0.020096,0.57649,0.572932,0.61825,0.022285
52,1105,0.124423,0.875577,0.595591,0.265548,0.330043,0.063508,0.266535,0.404409,0.395525,...,0.053819,0.020562,0.033256,0.047629,0.032522,0.015107,0.586151,0.560902,0.664804,0.000958
1056,21127,0.058405,0.941595,0.685481,0.54653,0.138951,0.045091,0.09386,0.314519,0.28094,...,0.048491,0.024786,0.023705,0.033174,0.02021,0.012965,0.645733,0.661654,0.66108,0.001568
1794,34041,0.098577,0.901423,0.685417,0.543869,0.141549,0.046804,0.094744,0.314583,0.257799,...,0.049849,0.012366,0.037483,0.039847,0.020804,0.019043,0.621578,0.633083,0.648045,0.010597


In [197]:
census_preprocessed.to_csv('data/cencus_preprocessed.csv', index=False)