<h1>Creating an Automatic Census Data Scraper by State</h1>
<h2><b>IMPORTANT:</b><br><span style="color:red;">This product uses the Census Bureau Data API but is not endorsed or certified by the Census Bureau.</span></h2>

<h2>Setting up the Census Bureau API</h2>

In [3]:
import pandas as pd
import numpy as np
import requests
import csv
import warnings
warnings.simplefilter('ignore') #Turn off warnings

In [4]:
base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
api_key = "672276f2a0ad053d60f8bb0848cad8a290a29427"
zcta_url = "&for=zip%20code%20tabulation%20area:" # need to include a 0 at the end when using the ZCTA range

In [5]:
variable_table_url = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP02.html'
v_table = pd.read_html(variable_table_url) # reading all available variables from API for the ACS5
variable_df = pd.DataFrame(v_table[0])
variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)
variable_df

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate Type,Group,Unnamed: 8
0,DP02_0001E,Estimate HOUSEHOLDS BY TYPE Total households,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
1,DP02_0001EA,Annotation of Estimate HOUSEHOLDS BY TYPE Tota...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
2,DP02_0001M,Margin of Error HOUSEHOLDS BY TYPE Total house...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
3,DP02_0001MA,Annotation of Margin of Error HOUSEHOLDS BY TY...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
4,DP02_0001PE,Percent HOUSEHOLDS BY TYPE Total households,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
...,...,...,...,...,...,...,...,...,...
1228,DP02_0154PE,Percent COMPUTERS AND INTERNET USE Total house...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,float,DP02,
1229,DP02_0154PEA,Annotation of Percent COMPUTERS AND INTERNET U...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
1230,DP02_0154PM,Percent Margin of Error COMPUTERS AND INTERNET...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,float,DP02,
1231,DP02_0154PMA,Annotation of Percent Margin of Error COMPUTER...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,


In [6]:
feature_list = list(variable_df["Label"])

with open('feature_list_social.txt', 'w', encoding='utf-8') as file:
    for feature in feature_list:
        if feature.startswith("Estimate"):
            file.write(feature + '\n\n') 

<h2>Getting list for Demographic Features</h2>

In [7]:
variable_table_url = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP03.html'
v_table = pd.read_html(variable_table_url) # reading all available variables from API for the ACS5
variable_df = pd.DataFrame(v_table[0])
variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)
variable_df

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate Type,Group,Unnamed: 8
0,DP03_0001E,Estimate EMPLOYMENT STATUS Population 16 years...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,int,DP03,
1,DP03_0001EA,Annotation of Estimate EMPLOYMENT STATUS Popul...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,string,DP03,
2,DP03_0001M,Margin of Error EMPLOYMENT STATUS Population 1...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,int,DP03,
3,DP03_0001MA,Annotation of Margin of Error EMPLOYMENT STATU...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,string,DP03,
4,DP03_0001PE,Percent EMPLOYMENT STATUS Population 16 years ...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,int,DP03,
...,...,...,...,...,...,...,...,...,...
1092,DP03_0137PE,Percent PERCENTAGE OF FAMILIES AND PEOPLE WHOS...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,float,DP03,
1093,DP03_0137PEA,Annotation of Percent PERCENTAGE OF FAMILIES A...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,string,DP03,
1094,DP03_0137PM,Percent Margin of Error PERCENTAGE OF FAMILIES...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,float,DP03,
1095,DP03_0137PMA,Annotation of Percent Margin of Error PERCENTA...,SELECTED ECONOMIC CHARACTERISTICS,predicate-only,,0,string,DP03,


In [8]:
feature_list = list(variable_df["Label"])

with open('feature_list_economic.txt', 'w', encoding='utf-8') as file:
    for feature in feature_list:
        if feature.startswith("Estimate"):
            file.write(feature + '\n\n') 

<h2>Getting Demographic Features</h2>

In [9]:
variable_table_url = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP05.html'
v_table = pd.read_html(variable_table_url) # reading all available variables from API for the ACS5
variable_df = pd.DataFrame(v_table[0])
variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)
variable_df

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate Type,Group,Unnamed: 8
0,DP05_0001E,Estimate SEX AND AGE Total population,ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,int,DP05,
1,DP05_0001EA,Annotation of Estimate SEX AND AGE Total popul...,ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,string,DP05,
2,DP05_0001M,Margin of Error SEX AND AGE Total population,ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,int,DP05,
3,DP05_0001MA,Annotation of Margin of Error SEX AND AGE Tota...,ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,string,DP05,
4,DP05_0001PE,Percent SEX AND AGE Total population,ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,int,DP05,
...,...,...,...,...,...,...,...,...,...
708,DP05_0089PE,"Percent CITIZEN, VOTING AGE POPULATION Citizen...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,float,DP05,
709,DP05_0089PEA,"Annotation of Percent CITIZEN, VOTING AGE POPU...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,string,DP05,
710,DP05_0089PM,"Percent Margin of Error CITIZEN, VOTING AGE PO...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,float,DP05,
711,DP05_0089PMA,"Annotation of Percent Margin of Error CITIZEN,...",ACS DEMOGRAPHIC AND HOUSING ESTIMATES,predicate-only,,0,string,DP05,


In [10]:
feature_list = list(variable_df["Label"])

with open('feature_list_demographic.txt', 'w', encoding='utf-8') as file:
    for feature in feature_list:
        if feature.startswith("Estimate"):
            file.write(feature + '\n\n') 

In [11]:
variable_table = variable_df['Name'][0:1232]
variable_table = variable_table[~variable_table.str.endswith('A')] # want to remove values ending in A
variable_table.reset_index(drop = True, inplace = True)
variable_table

0         DP05_0001E
1         DP05_0001M
2        DP05_0001PE
3        DP05_0001PM
4         DP05_0002E
           ...      
352       DP05_0089E
353       DP05_0089M
354      DP05_0089PE
355      DP05_0089PM
356    712 variables
Name: Name, Length: 357, dtype: object

In [12]:
test = ','.join(variable_table[0:48])
test_all_vars_url = f"{base_url}{test}{zcta_url}01001&key={api_key}"
response = requests.get(test_all_vars_url)

In [13]:
labels = response.json()[0][:-1]
values = response.json()[1:][0][:-1]
test_dict = {labels[i]: values[i] for i in range(len(values))}

test_df = variable_df[~variable_df["Name"].str.endswith("A")]
test_df.reset_index(drop = True, inplace = True)
test_labels = test_df["Label"][0:192]
test_dict2 = {test_labels[i]: values[i] for i in range(len(values))}

test_df = pd.DataFrame(np.array(values).reshape(-1,4)).rename(columns = {0: "Estimate", 
                                                               1: "Margin of Error", 
                                                               2: "Percent", 
                                                               3: "Percent Margin of Error"})
test_df

Unnamed: 0,Estimate,Margin of Error,Percent,Percent Margin of Error
0,16088.0,845.0,16088.0,-888888888.0
1,7996.0,529.0,49.7,2.3
2,8092.0,594.0,50.3,2.3
3,98.8,9.1,-888888888.0,-888888888.0
4,960.0,227.0,6.0,1.4
5,815.0,209.0,5.1,1.3
6,677.0,196.0,4.2,1.2
7,516.0,177.0,3.2,1.1
8,611.0,216.0,3.8,1.3
9,2389.0,463.0,14.8,2.7


In [14]:
test_labels = test_labels[0::4]
new_index = [test_labels[i][test_labels[i].find("BY TYPE")+8:] for i in range(0, len(test_labels), 4)]

test_df.index = new_index
test_df

Unnamed: 0,Estimate,Margin of Error,Percent,Percent Margin of Error
e SEX AND AGE Total population,16088.0,845.0,16088.0,-888888888.0
e SEX AND AGE Total population Male,7996.0,529.0,49.7,2.3
e SEX AND AGE Total population Female,8092.0,594.0,50.3,2.3
e SEX AND AGE Total population Sex ratio (males per 100 females),98.8,9.1,-888888888.0,-888888888.0
e SEX AND AGE Total population Under 5 years,960.0,227.0,6.0,1.4
e SEX AND AGE Total population 5 to 9 years,815.0,209.0,5.1,1.3
e SEX AND AGE Total population 10 to 14 years,677.0,196.0,4.2,1.2
e SEX AND AGE Total population 15 to 19 years,516.0,177.0,3.2,1.1
e SEX AND AGE Total population 20 to 24 years,611.0,216.0,3.8,1.3
e SEX AND AGE Total population 25 to 34 years,2389.0,463.0,14.8,2.7


In [15]:
social_labels = variable_df["Label"][:1232:8].reset_index(drop=True).str.title()

# Getting labels for the Demographic Table
variable_table_url = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP05.html'
v_table = pd.read_html(variable_table_url) # reading all available variables from API for the ACS5
variable_df = pd.DataFrame(v_table[0])
variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)

demographic_labels = variable_df["Label"][:712:8].reset_index(drop=True).str.title()

In [16]:
def get_social_df_url(url):
    response = requests.get(url)
    print(response.status_code) if response.status_code != 200 else False
    if response.status_code == 200:
        data = response.json()
        valid_data = np.array(data[1:][0][:-4:2]).reshape(-1,4)
        socs_df = pd.DataFrame(valid_data).rename(columns = {0: "Estimate", 
                                                             1: "Margin of Error", 
                                                             2: "Percent", 
                                                             3: "Percent Margin of Error"})

        # Need to add proper indices. 
        socs_df.index = social_labels
        # Need to replace "-888888888", need to do for rest of Annotation values.
        socs_df = test_df.replace("-888888888", "(X)")
    
        return socs_df
    else:
        return 0

<h2>Pulling ZCTA Per State to Allow Scraping by State</h2>

In [17]:
zcta_mapping_df = pd.read_excel("./ani_csv/ZIPCodetoZCTACrosswalk2021UDS.xlsx")
zcta_mapping_df

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,501,Holtsville,NY,Post Office or large volume customer,11742.0,Spatial join to ZCTA
1,544,Holtsville,NY,Post Office or large volume customer,11742.0,Spatial join to ZCTA
2,601,Adjuntas,PR,Zip Code Area,601.0,Zip matches ZCTA
3,602,Aguada,PR,Zip Code Area,602.0,Zip matches ZCTA
4,603,Aguadilla,PR,Zip Code Area,603.0,Zip matches ZCTA
...,...,...,...,...,...,...
41086,99926,Metlakatla,AK,Zip Code Area,99926.0,Zip matches ZCTA
41087,99927,Point Baker,AK,Zip Code Area,99927.0,Zip matches ZCTA
41088,99928,Ward Cove,AK,Post Office or large volume customer,99901.0,Spatial join to ZCTA
41089,99929,Wrangell,AK,Zip Code Area,99929.0,Zip matches ZCTA


In [18]:
#Clean it so it is only ZCTA and State

zcta_to_state = zcta_mapping_df[["STATE", "ZCTA"]]
zcta_to_state = zcta_to_state[~zcta_to_state["ZCTA"].isna()]
zcta_to_state["ZCTA"] = zcta_to_state["ZCTA"].astype(int)
zcta_to_state

Unnamed: 0,STATE,ZCTA
0,NY,11742
1,NY,11742
2,PR,601
3,PR,602
4,PR,603
...,...,...
41086,AK,99926
41087,AK,99927
41088,AK,99901
41089,AK,99929


In [19]:
clean_zip = lambda zip: str(zip) if zip > 10000 else (f"0{zip}" if zip >= 1000 else f"00{zip}")
zcta_to_state['ZCTA'] = zcta_to_state['ZCTA'].map(clean_zip)
zcta_to_state

Unnamed: 0,STATE,ZCTA
0,NY,11742
1,NY,11742
2,PR,00601
3,PR,00602
4,PR,00603
...,...,...
41086,AK,99926
41087,AK,99927
41088,AK,99901
41089,AK,99929


<h2>Preparing State-Specific Scraping Function</h2>

In [20]:
import os

In [21]:
def get_social_df_state(abbrev):
    base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
    api_key = "&key=672276f2a0ad053d60f8bb0848cad8a290a29427"
    group = "group(DP02)"
    zcta_url = "&for=zip%20code%20tabulation%20area:"

    ZCTA_Range = list(zcta_to_state[zcta_to_state["STATE"] == abbrev]["ZCTA"])
    
    try:
        os.mkdir("census_data")
    except:
        pass
    
    try:
        os.mkdir("./census_data/social_chars")
    except:
        pass
    
    try:
        os.mkdir(f"./census_data/social_chars/{abbrev}")
    except:
        pass

    for zcta in ZCTA_Range:
        url = f"{base_url}{group}{zcta_url}{zcta}{api_key}"
        new_social_df = get_social_df_url(url)
        if not isinstance(new_social_df, pd.DataFrame):
            continue
        else:
            try:
                new_social_df.to_csv(f"census_data/social_chars/{abbrev}/{zcta}.csv")
            except:
                pass

In [22]:
len(list(zcta_to_state[zcta_to_state["STATE"] == "MA"]["ZCTA"]))
#It took 10 minutes, or around 600 seconds to scrape 681 data sets
#Texas has 1939 ZCTAs so it will take an estimated 

681

In [23]:
# Getting labels for the Demographic Table
variable_table_url = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP03.html'
v_table = pd.read_html(variable_table_url) # reading all available variables from API for the ACS5
variable_df = pd.DataFrame(v_table[0])
variable_df['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)

economic_labels = variable_df['Label'][:1089:8].reset_index(drop=True).str.title()
economic_labels

0      Estimate Employment Status Population 16 Years...
1      Estimate Employment Status Population 16 Years...
2      Estimate Employment Status Population 16 Years...
3      Estimate Employment Status Population 16 Years...
4      Estimate Employment Status Population 16 Years...
                             ...                        
132    Estimate Percentage Of Families And People Who...
133    Estimate Percentage Of Families And People Who...
134    Estimate Percentage Of Families And People Who...
135    Estimate Percentage Of Families And People Who...
136    Estimate Percentage Of Families And People Who...
Name: Label, Length: 137, dtype: object

In [24]:
def get_economic_df_url(url):
    response = requests.get(url)
    if response.status_code == 200:
        data = response.json()
        valid_data = np.array(data[1][:-3:2]).reshape(-1,4)
        demo_df = pd.DataFrame(valid_data).rename(columns = {0: "Estimate", 
                                                             1: "Margin of Error", 
                                                             2: "Percent", 
                                                             3: "Percent Margin of Error"})

        # Need to add proper indices. 
        demo_df.index = economic_labels
        # Need to replace "-888888888", need to do for rest of Annotation values.
        demo_df = demo_df.replace("-888888888", "(X)")
    
        return demo_df
    else:
        return response.status_code

In [25]:
base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
api_key = "&key=672276f2a0ad053d60f8bb0848cad8a290a29427"
group = "group(DP03)"
zcta_url = "&for=zip%20code%20tabulation%20area:"

In [26]:
test_url = f"{base_url}{group}{zcta_url}01002{api_key}"
get_economic_df_url(test_url)

Unnamed: 0_level_0,Estimate,Margin of Error,Percent,Percent Margin of Error
Label,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Estimate Employment Status Population 16 Years And Over,24110,717,24110,(X)
Estimate Employment Status Population 16 Years And Over In Labor Force,15630,1029,64.8,3.4
Estimate Employment Status Population 16 Years And Over In Labor Force Civilian Labor Force,15630,1029,64.8,3.4
Estimate Employment Status Population 16 Years And Over In Labor Force Civilian Labor Force Employed,14536,994,60.3,3.5
Estimate Employment Status Population 16 Years And Over In Labor Force Civilian Labor Force Unemployed,1094,385,4.5,1.6
...,...,...,...,...
Estimate Percentage Of Families And People Whose Income In The Past 12 Months Is Below The Poverty Level All People 18 Years And Over,(X),(X),24.8,3.4
Estimate Percentage Of Families And People Whose Income In The Past 12 Months Is Below The Poverty Level All People 18 Years And Over 18 To 64 Years,(X),(X),29.1,4.0
Estimate Percentage Of Families And People Whose Income In The Past 12 Months Is Below The Poverty Level All People 18 Years And Over 65 Years And Over,(X),(X),3.7,2.1
Estimate Percentage Of Families And People Whose Income In The Past 12 Months Is Below The Poverty Level All People People In Families,(X),(X),6.7,2.6


In [27]:
def get_economic_df_state(abbrev):
    base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
    api_key = "&key=672276f2a0ad053d60f8bb0848cad8a290a29427"
    group = "group(DP03)"
    zcta_url = "&for=zip%20code%20tabulation%20area:"

    ZCTA_Range = list(zcta_to_state[zcta_to_state["STATE"] == abbrev]["ZCTA"])
    
    try:
        os.mkdir("census_data")
    except:
        pass
    
    try:
        os.mkdir("./census_data/economic_chars")
    except:
        pass
    
    try:
        os.mkdir(f"./census_data/economic_chars/{abbrev}")
    except:
        pass

    for zcta in ZCTA_Range:
        url = f"{base_url}{group}{zcta_url}{zcta}{api_key}"
        new_economic_df = get_economic_df_url(url)
        if not isinstance(new_economic_df, pd.DataFrame):
            print(f"Failed at zcta {zcta} - Error code {new_economic_df}")
            continue
        else:
            try:
                new_economic_df.to_csv(f"census_data/economic_chars/{abbrev}/{zcta}.csv")
            except:
                pass


<h2>Attempting to Scrape Asynchronously</h2>

In [28]:
!pip install aiohttp
!pip install asyncio



In [29]:
all_zctas = list(zcta_to_state["ZCTA"])

In [30]:
import aiohttp
import asyncio
import os

CONCURRENT_REQUESTS = 500

async def get_economic_df_url(session, url):
    async with session.get(url) as response:
        if response.status == 200:
            data = await response.json()
            valid_data = np.array(data[1][:-3:2]).reshape(-1, 4)
            demo_df = pd.DataFrame(valid_data).rename(columns={0: "Estimate", 
                                                               1: "Margin of Error", 
                                                               2: "Percent", 
                                                               3: "Percent Margin of Error"})
            demo_df.index = economic_labels
            demo_df = demo_df.replace("-888888888", "(X)")
            return demo_df
        else:
            return response.status

async def worker(session, semaphore, abbrev, zcta):
    async with semaphore:
        base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
        api_key = "&key=672276f2a0ad053d60f8bb0848cad8a290a29427"
        group = "group(DP03)"
        zcta_url = f"&for=zip%20code%20tabulation%20area:{zcta}"
        url = f"{base_url}{group}{zcta_url}{api_key}"
        
        new_economic_df = await get_economic_df_url(session, url)
        
        if not isinstance(new_economic_df, pd.DataFrame):
            print(f"Failed at zcta {zcta} - Error code {new_economic_df}")
        else:
            try:
                new_economic_df.to_csv(f"census_data/async_economic_chars/{zcta}.csv")
                counter += 1
                if counter % 200 == 0:
                    print(f'{round(10000 * counter/len(all_zctas))/100}% Complete')
            except:
                pass

async def get_economic_df_state(abbrev):
    ZCTA_Range = list(zcta_to_state[zcta_to_state["STATE"] == abbrev]["ZCTA"])
    
    os.makedirs(f"census_data/async_economic_chars", exist_ok=True)
    
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    
    async with aiohttp.ClientSession() as session:
        tasks = [worker(session, semaphore, abbrev, zcta) for zcta in ZCTA_Range]
        await asyncio.gather(*tasks)


In [31]:
zcta_to_state

Unnamed: 0,STATE,ZCTA
0,NY,11742
1,NY,11742
2,PR,00601
3,PR,00602
4,PR,00603
...,...,...
41086,AK,99926
41087,AK,99927
41088,AK,99901
41089,AK,99929


In [32]:
state_list = list(zcta_to_state.value_counts("STATE").index)

In [33]:
async def scrape_all_economic_data():
    counter = 0
    for state in state_list:
        await get_economic_df_state(state)

In [34]:
CONCURRENT_REQUESTS = 500

async def get_demographic_df_url(session, url):
    async with session.get(url) as response:
        if response.status == 200:
            data = await response.json()
            valid_data = np.array(data[1][:-3:2]).reshape(-1, 4)
            demo_df = pd.DataFrame(valid_data).rename(columns={0: "Estimate", 
                                                               1: "Margin of Error", 
                                                               2: "Percent", 
                                                               3: "Percent Margin of Error"})
            demo_df.index = demographic_labels
            demo_df = demo_df.replace("-888888888", "(X)")
            return demo_df
        else:
            return response.status

async def worker_demographic(session, semaphore, abbrev, zcta):
    async with semaphore:
        base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
        api_key = "&key=672276f2a0ad053d60f8bb0848cad8a290a29427"
        group = "group(DP05)"
        zcta_url = f"&for=zip%20code%20tabulation%20area:{zcta}"
        url = f"{base_url}{group}{zcta_url}{api_key}"
        
        if not os.path.exists(f'./census_data/async_demographic_chars/{zcta}.csv'):
            new_demographic_df = await get_demographic_df_url(session, url)
        
            if not isinstance(new_demographic_df, pd.DataFrame):
                print(f"Failed at zcta {zcta} - Error code {new_demographic_df}")
            else:
                try:
                    new_demographic_df.to_csv(f"census_data/async_demographic_chars/{zcta}.csv")
                    counter += 1
                    if counter % 200 == 0:
                        print(f'{round(10000 * counter/len(all_zctas))/100}% Complete')
                except:
                    pass

async def get_demographic_df_state(abbrev):
    ZCTA_Range = list(zcta_to_state[zcta_to_state["STATE"] == abbrev]["ZCTA"])
    
    os.makedirs(f"census_data/async_demographic_chars", exist_ok=True)
    
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    
    async with aiohttp.ClientSession() as session:
        tasks = [worker_demographic(session, semaphore, abbrev, zcta) for zcta in ZCTA_Range]
        await asyncio.gather(*tasks)

In [35]:
variable_table_url_demographic = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP05.html'
v_table_demographic = pd.read_html(variable_table_url_demographic) # reading all available variables from API for the ACS5
variable_df_demographic = pd.DataFrame(v_table_demographic[0])
variable_df_demographic['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)
variable_df_demographic
demographic_labels = variable_df_demographic["Label"][:712:8].reset_index(drop=True).str.title()

In [36]:
async def scrape_all_demographic_data():
    counter = 0
    for state in state_list:
        await get_demographic_df_state(state)

In [38]:
CONCURRENT_REQUESTS = 500

async def get_social_df_url(session, url):
    async with session.get(url) as response:
        if response.status == 200:
            data = await response.json()
            valid_data = np.array(data[1:][0][:-4:2]).reshape(-1,4)
            demo_df = pd.DataFrame(valid_data).rename(columns={0: "Estimate", 
                                                               1: "Margin of Error", 
                                                               2: "Percent", 
                                                               3: "Percent Margin of Error"})
            demo_df.index = social_labels
            demo_df = demo_df.replace("-888888888", "(X)")
            return demo_df
        else:
            return response.status

async def worker_social(session, semaphore, abbrev, zcta):
    async with semaphore:
        base_url = "https://api.census.gov/data/2021/acs/acs5/profile?get="
        api_key = "&key=672276f2a0ad053d60f8bb0848cad8a290a29427"
        group = "group(DP02)"
        zcta_url = f"&for=zip%20code%20tabulation%20area:{zcta}"
        url = f"{base_url}{group}{zcta_url}{api_key}"
        
        if not os.path.exists(f'./census_data/async_social_chars/{zcta}.csv'):
            new_social_df = await get_social_df_url(session, url)
        
            if not isinstance(new_social_df, pd.DataFrame):
                print(f"Failed at zcta {zcta} - Error code {new_social_df}")
            else:
                try:
                    new_social_df.to_csv(f"census_data/async_social_chars/{zcta}.csv")
                    counter += 1
                    if counter % 200 == 0:
                        print(f'{round(10000 * counter/len(all_zctas))/100}% Complete')
                except:
                    pass

async def get_social_df_state(abbrev):
    ZCTA_Range = list(zcta_to_state[zcta_to_state["STATE"] == abbrev]["ZCTA"])
    
    os.makedirs(f"census_data/async_social_chars", exist_ok=True)
    
    semaphore = asyncio.Semaphore(CONCURRENT_REQUESTS)
    
    async with aiohttp.ClientSession() as session:
        tasks = [worker_social(session, semaphore, abbrev, zcta) for zcta in ZCTA_Range]
        await asyncio.gather(*tasks)

In [39]:
variable_table_url_social = f'https://api.census.gov/data/2021/acs/acs5/profile/groups/DP02.html'
v_table_social = pd.read_html(variable_table_url_social) # reading all available variables from API for the ACS5
variable_df_social = pd.DataFrame(v_table_social[0])
variable_df_social['Label'].replace({"!!": " ", ":": ""}, regex=True, inplace=True)
social_labels = variable_df_social["Label"][:1232:8].reset_index(drop=True).str.title()
variable_df_social

Unnamed: 0,Name,Label,Concept,Required,Attributes,Limit,Predicate Type,Group,Unnamed: 8
0,DP02_0001E,Estimate HOUSEHOLDS BY TYPE Total households,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
1,DP02_0001EA,Annotation of Estimate HOUSEHOLDS BY TYPE Tota...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
2,DP02_0001M,Margin of Error HOUSEHOLDS BY TYPE Total house...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
3,DP02_0001MA,Annotation of Margin of Error HOUSEHOLDS BY TY...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
4,DP02_0001PE,Percent HOUSEHOLDS BY TYPE Total households,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,int,DP02,
...,...,...,...,...,...,...,...,...,...
1228,DP02_0154PE,Percent COMPUTERS AND INTERNET USE Total house...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,float,DP02,
1229,DP02_0154PEA,Annotation of Percent COMPUTERS AND INTERNET U...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,
1230,DP02_0154PM,Percent Margin of Error COMPUTERS AND INTERNET...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,float,DP02,
1231,DP02_0154PMA,Annotation of Percent Margin of Error COMPUTER...,SELECTED SOCIAL CHARACTERISTICS IN THE UNITED ...,predicate-only,,0,string,DP02,


In [40]:
async def scrape_all_social_data():
    counter = 0
    for state in state_list:
        await get_social_df_state(state)

In [None]:
len(social_labels) + len(economic_labels) + len(demographic_labels)

In [None]:
#The number of data points scraped
380 * 4 * 41083

<h3>Testing Data Types for MySQL</h3>

In [None]:
test_csv = pd.read_csv('./census_data/async_demographic_chars/00601.csv')
test_csv

In [None]:
for stat in test_csv["Percent"].values:
    if type(stat) == str:
        print(stat)

In [None]:
!pip install mysql-connector-python

In [44]:
import os
import mysql.connector

In [45]:
def get_census_data(zcta, cat):
    cnx = mysql.connector.connect(
        host='localhost',
        user='root',
        password='GoingForGold831LH!',
        database=f'census_{cat}_db'
    )
    
    query = f"SELECT * FROM zcta_{zcta}"
    df = pd.read_sql(query, cnx)
    
    cnx.close()
    
    return df

In [46]:
get_census_data("21231", "economic")

Unnamed: 0,Label,Estimate,Margin of Error,Percent,Percent Margin of Error
0,Estimate Employment Status Population 16 Years...,13487.0,974.0,13487.0,0.0
1,Estimate Employment Status Population 16 Years...,9897.0,827.0,73.4,2.8
2,Estimate Employment Status Population 16 Years...,9865.0,825.0,73.1,2.8
3,Estimate Employment Status Population 16 Years...,9561.0,818.0,70.9,2.8
4,Estimate Employment Status Population 16 Years...,304.0,114.0,2.3,0.9
...,...,...,...,...,...
132,Estimate Percentage Of Families And People Who...,0.0,0.0,16.2,2.6
133,Estimate Percentage Of Families And People Who...,0.0,0.0,16.2,2.7
134,Estimate Percentage Of Families And People Who...,0.0,0.0,16.4,6.1
135,Estimate Percentage Of Families And People Who...,0.0,0.0,20.7,5.3
