<h1>Creating an Automatic Census Data Scraper by State</h1>
<h2><b>IMPORTANT:</b><br><span style="color:red;">This product uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau.</span></h2>

<h2>Setting up the Census Bureau API</h2>

In [4]:
import pandas as pd
import numpy as np
import requests
import csv
import warnings
warnings.simplefilter('ignore') #Turn off warnings

In [5]:
base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
api_key = "672276f2a0ad053d60f8bb0848cad8a290a29427"
zcta_url = "&for=zip%20code%20tabulation%20area:" # need to include a 0 at the end when using the ZCTA range

In [6]:
variable_table_url = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP02.html'
v_table = pd.read_html(variable_table_url) # reading all available variables from API for the ACS5
variable_df = pd.DataFrame(v_table[0])
variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)
variable_df

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate Type,Group,Unnamed: 8
0,DP02_0001E,Estimate HOUSEHOLDS BY TYPE Total households,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
1,DP02_0001EA,Annotation of Estimate HOUSEHOLDS BY TYPE Tota...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
2,DP02_0001M,Margin of Error HOUSEHOLDS BY TYPE Total house...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
3,DP02_0001MA,Annotation of Margin of Error HOUSEHOLDS BY TY...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
4,DP02_0001PE,Percent HOUSEHOLDS BY TYPE Total households,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
...,...,...,...,...,...,...,...,...,...
1228,DP02_0154PE,Percent COMPUTERS AND INTERNET USE Total house...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,float,DP02,
1229,DP02_0154PEA,Annotation of Percent COMPUTERS AND INTERNET U...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
1230,DP02_0154PM,Percent Margin of Error COMPUTERS AND INTERNET...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,float,DP02,
1231,DP02_0154PMA,Annotation of Percent Margin of Error COMPUTER...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,


In [7]:
variable_table = variable_df['Name'][0:1232]
variable_table = variable_table[~variable_table.str.endswith('A')] # want to remove values ending in A
variable_table.reset_index(drop = True, inplace = True)
variable_table

0       DP02_0001E
1       DP02_0001M
2      DP02_0001PE
3      DP02_0001PM
4       DP02_0002E
          ...     
611    DP02_0153PM
612     DP02_0154E
613     DP02_0154M
614    DP02_0154PE
615    DP02_0154PM
Name: Name, Length: 616, dtype: object

In [8]:
test = ','.join(variable_table[0:48])
test_all_vars_url = f"{base_url}{test}{zcta_url}01001&key={api_key}"
response = requests.get(test_all_vars_url)

In [9]:
labels = response.json()[0][:-1]
values = response.json()[1:][0][:-1]
test_dict = {labels[i]: values[i] for i in range(len(values))}

test_df = variable_df[~variable_df["Name"].str.endswith("A")]
test_df.reset_index(drop = True, inplace = True)
test_labels = test_df["Label"][0:192]
test_dict2 = {test_labels[i]: values[i] for i in range(len(values))}

test_df = pd.DataFrame(np.array(values).reshape(-1,4)).rename(columns = {0: "Estimate", 
                                                               1: "Margin of Error", 
                                                               2: "Percent", 
                                                               3: "Percent Margin of Error"})
test_df

Unnamed: 0,Estimate,Margin of Error,Percent,Percent Margin of Error
0,6791,345,6791.0,-888888888.0
1,2959,265,43.6,4.1
2,874,173,12.9,2.6
3,726,232,10.7,3.3
4,196,129,2.9,1.9
5,1156,222,17.0,3.2
6,48,55,0.7,0.8
7,843,207,12.4,3.0
8,367,146,5.4,2.1
9,1950,314,28.7,4.0


In [10]:
test_labels = test_labels[0::4]
new_index = [test_labels[i][test_labels[i].find("BY TYPE")+8:] for i in range(0, len(test_labels), 4)]

test_df.index = new_index
test_df

Unnamed: 0,Estimate,Margin of Error,Percent,Percent Margin of Error
Total households,6791,345,6791.0,-888888888.0
Total households Married-couple household,2959,265,43.6,4.1
Total households Married-couple household With children of the householder under 18 years,874,173,12.9,2.6
Total households Cohabiting couple household,726,232,10.7,3.3
Total households Cohabiting couple household With children of the householder under 18 years,196,129,2.9,1.9
"Total households Male householder, no spouse/partner present",1156,222,17.0,3.2
"Total households Male householder, no spouse/partner present With children of the householder under 18 years",48,55,0.7,0.8
"Total households Male householder, no spouse/partner present Householder living alone",843,207,12.4,3.0
"Total households Male householder, no spouse/partner present Householder living alone 65 years and over",367,146,5.4,2.1
"Total households Female householder, no spouse/partner present",1950,314,28.7,4.0


In [11]:
social_labels = variable_df["Label"][:1232:8].reset_index(drop=True).str.title()

# Getting labels for the Demographic Table
variable_table_url = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP05.html'
v_table = pd.read_html(variable_table_url) # reading all available variables from API for the ACS5
variable_df = pd.DataFrame(v_table[0])
variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)

demographic_labels = variable_df["Label"][:712:8].reset_index(drop=True).str.title()

In [21]:
def get_social_df_url(url):
    response = requests.get(url)
    print(response.status_code) if response.status_code != 200 else False
    if response.status_code == 200:
        data = response.json()
        valid_data = np.array(data[1:][0][:-4:2]).reshape(-1,4)
        socs_df = pd.DataFrame(valid_data).rename(columns = {0: "Estimate", 
                                                             1: "Margin of Error", 
                                                             2: "Percent", 
                                                             3: "Percent Margin of Error"})

        # Need to add proper indices. 
        socs_df.index = social_labels
        # Need to replace "-888888888", need to do for rest of Annotation values.
        socs_df = test_df.replace("-888888888", "(X)")
    
        return socs_df
    else:
        return 0

<h2>Pulling ZCTA Per State to Allow Scraping by State</h2>

In [13]:
zcta_mapping_df = pd.read_excel("./ani_csv/ZIPCodetoZCTACrosswalk2021UDS.xlsx")
zcta_mapping_df

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,501,Holtsville,NY,Post Office or large volume customer,11742.0,Spatial join to ZCTA
1,544,Holtsville,NY,Post Office or large volume customer,11742.0,Spatial join to ZCTA
2,601,Adjuntas,PR,Zip Code Area,601.0,Zip matches ZCTA
3,602,Aguada,PR,Zip Code Area,602.0,Zip matches ZCTA
4,603,Aguadilla,PR,Zip Code Area,603.0,Zip matches ZCTA
...,...,...,...,...,...,...
41086,99926,Metlakatla,AK,Zip Code Area,99926.0,Zip matches ZCTA
41087,99927,Point Baker,AK,Zip Code Area,99927.0,Zip matches ZCTA
41088,99928,Ward Cove,AK,Post Office or large volume customer,99901.0,Spatial join to ZCTA
41089,99929,Wrangell,AK,Zip Code Area,99929.0,Zip matches ZCTA


In [14]:
#Clean it so it is only ZCTA and State

zcta_to_state = zcta_mapping_df[["STATE", "ZCTA"]]
zcta_to_state = zcta_to_state[~zcta_to_state["ZCTA"].isna()]
zcta_to_state["ZCTA"] = zcta_to_state["ZCTA"].astype(int)
zcta_to_state

Unnamed: 0,STATE,ZCTA
0,NY,11742
1,NY,11742
2,PR,601
3,PR,602
4,PR,603
...,...,...
41086,AK,99926
41087,AK,99927
41088,AK,99901
41089,AK,99929


<h2>Preparing State-Specific Scraping Function</h2>

In [15]:
import os

In [22]:
def get_social_df_state(abbrev):
    base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
    api_key = "&key=672276f2a0ad053d60f8bb0848cad8a290a29427"
    group = "group(DP02)"
    zcta_url = "&for=zip%20code%20tabulation%20area:0"

    ZCTA_Range = list(zcta_to_state[zcta_to_state["STATE"] == abbrev]["ZCTA"])
    
    try:
        os.mkdir("census_data")
    except:
        pass
    
    try:
        os.mkdir("./census_data/social_chars")
    except:
        pass
    
    try:
        os.mkdir(f"./census_data/social_chars/{abbrev}")
    except:
        pass

    for zcta in ZCTA_Range:
        url = f"{base_url}{group}{zcta_url}{zcta}{api_key}"
        new_social_df = get_social_df_url(url)
        if not isinstance(new_social_df, pd.DataFrame):
            print(f"Failed at zcta {zcta}")
            continue
        else:
            try:
                new_social_df.to_csv(f"census_data/social_chars/{abbrev}/{zcta}.csv")
            except:
                pass
            print(f"Succeeded at zcta {zcta}")

In [23]:
get_social_df_state("MA")

Succeeded at zcta 1001
Succeeded at zcta 1002
Succeeded at zcta 1003
Succeeded at zcta 1002
Succeeded at zcta 1005
Succeeded at zcta 1007
Succeeded at zcta 1008
Succeeded at zcta 1009
Succeeded at zcta 1010
Succeeded at zcta 1011
Succeeded at zcta 1012
Succeeded at zcta 1013
Succeeded at zcta 1013
Succeeded at zcta 1020
Succeeded at zcta 1020
Succeeded at zcta 1022
Succeeded at zcta 1026
Succeeded at zcta 1027
Succeeded at zcta 1028
Succeeded at zcta 1029
Succeeded at zcta 1030
Succeeded at zcta 1031
Succeeded at zcta 1032
Succeeded at zcta 1033
Succeeded at zcta 1034
Succeeded at zcta 1035
Succeeded at zcta 1036
Succeeded at zcta 1037
Succeeded at zcta 1038
Succeeded at zcta 1039
Succeeded at zcta 1040
Succeeded at zcta 1040
Succeeded at zcta 1050
Succeeded at zcta 1053
Succeeded at zcta 1054
Succeeded at zcta 1056
Succeeded at zcta 1057
Succeeded at zcta 1002
Succeeded at zcta 1060
Succeeded at zcta 1060
Succeeded at zcta 1062
Succeeded at zcta 1063
Succeeded at zcta 1066
Succeeded a

In [25]:
len(list(zcta_to_state[zcta_to_state["STATE"] == "MA"]["ZCTA"]))
#It took 10 minutes, or around 600 seconds to scrape 681 data sets
#Texas has 1939 ZCTAs so it will take an estimated 

681