In [1]:
#Tabular Manipulation
import pandas as pd
import numpy as np

#File management
import os
import csv

#Asynchronous Code
import asyncio #Run !pip install asyncio
import aiohttp #Run !pip install aiohttp

#Other
import requests
import warnings
import mysql.connector #run !pip install mysql-connector-python


warnings.simplefilter('ignore') #Turn off warnings

In [2]:
#Import and clean zctas
clean_zip = lambda zip: str(zip) if zip > 10000 else (f"0{zip}" if zip >= 1000 else f"00{zip}")

zctas = list(pd.read_excel("./ani_csv/ZIPCodetoZCTACrosswalk2021UDS.xlsx")["ZIP_CODE"]
         .map(clean_zip))

In [3]:
#Getting selected features
with open("./census_data/selected_features.txt", "r") as f:
    lines = f.readlines()

selected_features = [line.replace("\n", "").title() for line in lines if line != "\n" ]

In [4]:
#Getting selected features prefixes for division purposes
prefixes = []
curr_prefix = selected_features[0]
for feature in selected_features:
    if feature.startswith(curr_prefix):
        prefixes.append(curr_prefix)
    else:
        prefixes.append(feature)
        curr_prefix = feature

In [5]:
async def get_census_data_by_cat(zcta, cat):
    return pd.read_csv(f"./census_data/async_{cat}_chars/{zcta}.csv")

async def get_census_data_helper(zcta):
    lst = []
    for cat in ["demographic", "economic", "social"]:
        lst.append(await get_census_data_by_cat(zcta, cat))
    return lst

async def get_census_data(zcta):
    df_lst = await get_census_data_helper(zcta)
    return pd.concat([df for df in df_lst], axis=0)

async def isolate_selected_features(df):
    return df[df["Label"].isin(selected_features)].drop_duplicates(subset=["Label"])

async def add_prefix(df):
    new_df = df.copy()
    prefixes = []
    selected_features = list(df["Label"])
    curr_prefix = selected_features[0]
    for feature in selected_features:
        if feature.startswith(curr_prefix):
            prefixes.append(curr_prefix)
        else:
            prefixes.append(feature)
            curr_prefix = feature
    new_df["Prefix"] = prefixes
    return new_df

async def get_cleaned_unit_df(df):
    new_df = df.copy()
    raw_features = list(new_df[new_df["Percent"] == "(X)"]["Label"])
    labels = list(new_df["Label"])
    vals = []
    for i in range(len(labels)):
        if labels[i] in raw_features:
            vals.append(list(new_df[new_df['Label'] == labels[i]]['Estimate'])[0])
        elif labels[i] == list(new_df[new_df["Label"] == labels[i]]["Prefix"])[0]:
            vals.append(1)
        else:
            vals.append(float(list(new_df[new_df['Label'] == labels[i]]['Percent'])[0]) * 0.01)
            labels[i] = f"{labels[i]} As a Percent of {list(new_df[new_df['Label'] == labels[i]]['Prefix'])[0]}"
    return pd.DataFrame({"Label": labels, "Value": vals})

async def pivot_df_for_concat(df, zcta):
    new_df = df.copy()
    new_df["ZCTA"] = zcta
    pivoted_df = new_df.pivot(index="ZCTA", columns="Label", values="Value")
    return pivoted_df

In [6]:
async def clean_zcta(zcta):
    data_full = await get_census_data(zcta)
    data_filtered = await isolate_selected_features(data_full)
    data_with_prefix = await add_prefix(data_filtered)
    unit_df = await get_cleaned_unit_df(data_with_prefix)
    pivot_df = await pivot_df_for_concat(unit_df, zcta)
    return pivot_df

In [7]:
start_df = await clean_zcta("00601")

In [8]:
len(zctas)

41091

In [9]:
len(os.listdir("./census_data/async_demographic_chars/"))

32923

In [None]:
failed = 0
succeeded = 0
for i in range(len(zctas)):
    zcta = zctas[i]
    if (i % 329 == 0):
        print(f"{i / 329}% Complete")
    try:
        start_df = pd.concat([start_df, await clean_zcta(zcta)], axis=0)
        succeeded = succeeded + 1
    except:
        failed = failed + 1

0.0% Complete
1.0% Complete
2.0% Complete
3.0% Complete
4.0% Complete
5.0% Complete
6.0% Complete
7.0% Complete
8.0% Complete
9.0% Complete
10.0% Complete
11.0% Complete
12.0% Complete
13.0% Complete
14.0% Complete
15.0% Complete
16.0% Complete
17.0% Complete
18.0% Complete
19.0% Complete
20.0% Complete
21.0% Complete
22.0% Complete
23.0% Complete
24.0% Complete
25.0% Complete
26.0% Complete
27.0% Complete
28.0% Complete
29.0% Complete
30.0% Complete
31.0% Complete
32.0% Complete
33.0% Complete
34.0% Complete
35.0% Complete
36.0% Complete
37.0% Complete
38.0% Complete
39.0% Complete
40.0% Complete
41.0% Complete
42.0% Complete
43.0% Complete
44.0% Complete
45.0% Complete
46.0% Complete
47.0% Complete
48.0% Complete
49.0% Complete
50.0% Complete
51.0% Complete
52.0% Complete
53.0% Complete
54.0% Complete
55.0% Complete
56.0% Complete
57.0% Complete
58.0% Complete
59.0% Complete
60.0% Complete
61.0% Complete
62.0% Complete
63.0% Complete
64.0% Complete
65.0% Complete
66.0% Complete
67.0%

In [12]:
feature_df_uncleaned = start_df

In [21]:
feature_df_uncleaned[feature_df_uncleaned["Estimate Computers And Internet Use Total Households With A Broadband Internet Subscription As a Percent of Estimate Computers And Internet Use Total Households"].isna()

Label,Estimate Computers And Internet Use Total Households,Estimate Computers And Internet Use Total Households With A Broadband Internet Subscription As a Percent of Estimate Computers And Internet Use Total Households,Estimate Computers And Internet Use Total Households With A Computer As a Percent of Estimate Computers And Internet Use Total Households,Estimate Disability Status Of The Civilian Noninstitutionalized Population 18 To 64 Years,Estimate Disability Status Of The Civilian Noninstitutionalized Population 18 To 64 Years With A Disability As a Percent of Estimate Disability Status Of The Civilian Noninstitutionalized Population 18 To 64 Years,Estimate Disability Status Of The Civilian Noninstitutionalized Population Total Civilian Noninstitutionalized Population,Estimate Disability Status Of The Civilian Noninstitutionalized Population Total Civilian Noninstitutionalized Population With A Disability As a Percent of Estimate Disability Status Of The Civilian Noninstitutionalized Population Total Civilian Noninstitutionalized Population,Estimate Disability Status Of The Civilian Noninstitutionalized Population Under 18 Years,Estimate Disability Status Of The Civilian Noninstitutionalized Population Under 18 Years With A Disability As a Percent of Estimate Disability Status Of The Civilian Noninstitutionalized Population Under 18 Years,Estimate Educational Attainment Population 25 Years And Over,...,Estimate Sex And Age Total Population 75 To 84 Years As a Percent of Estimate Sex And Age Total Population,Estimate Sex And Age Total Population Male As a Percent of Estimate Sex And Age Total Population,Estimate Sex And Age Total Population Median Age (Years),Estimate Sex And Age Total Population Under 5 Years As a Percent of Estimate Sex And Age Total Population,Estimate U.S. Citizenship Status Foreign-Born Population,Estimate U.S. Citizenship Status Foreign-Born Population Naturalized U.S. Citizen As a Percent of Estimate U.S. Citizenship Status Foreign-Born Population,Estimate Veteran Status Civilian Population 18 Years And Over,Estimate Veteran Status Civilian Population 18 Years And Over Civilian Veterans As a Percent of Estimate Veteran Status Civilian Population 18 Years And Over,Estimate Households By Type Total Households Average Family Size,Estimate Households By Type Total Households Average Household Size
ZCTA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00601,1,,,1,,1,,1,,1,...,0.064,0.493,43.7,0.038,1,,1,,,
00601,1,,,1,,1,,1,,1,...,0.064,0.493,43.7,0.038,1,,1,,,
00602,1,,,1,,1,,1,,1,...,0.062,0.491,44.4,0.033,1,,1,,,
00603,1,,,1,,1,,1,,1,...,0.077,0.485,44.1,0.038,1,,1,,,
00606,1,,,1,,1,,1,,1,...,0.047,0.49,44.9,0.039,1,,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
00979,1,,,1,,1,,1,,1,...,0.071,0.502,49.8,0.019,1,,1,,,
00982,1,,,1,,1,,1,,1,...,0.103,0.435,44.7,0.041,1,,1,,,
00983,1,,,1,,1,,1,,1,...,0.09,0.447,41.7,0.048,1,,1,,,
00985,1,,,1,,1,,1,,1,...,0.095,0.441,46.3,0.038,1,,1,,,


In [55]:
#Getting States per ZCTA
zcta_df = pd.read_excel("./ani_csv/ZIPCodetoZCTACrosswalk2021UDS.xlsx")
zcta_df

Unnamed: 0,ZIP_CODE,PO_NAME,STATE,ZIP_TYPE,ZCTA,zip_join_type
0,501,Holtsville,NY,Post Office or large volume customer,11742.0,Spatial join to ZCTA
1,544,Holtsville,NY,Post Office or large volume customer,11742.0,Spatial join to ZCTA
2,601,Adjuntas,PR,Zip Code Area,601.0,Zip matches ZCTA
3,602,Aguada,PR,Zip Code Area,602.0,Zip matches ZCTA
4,603,Aguadilla,PR,Zip Code Area,603.0,Zip matches ZCTA
...,...,...,...,...,...,...
41086,99926,Metlakatla,AK,Zip Code Area,99926.0,Zip matches ZCTA
41087,99927,Point Baker,AK,Zip Code Area,99927.0,Zip matches ZCTA
41088,99928,Ward Cove,AK,Post Office or large volume customer,99901.0,Spatial join to ZCTA
41089,99929,Wrangell,AK,Zip Code Area,99929.0,Zip matches ZCTA


In [56]:
#Realizing the merging was done incorrectly, so redoing for ZCTAs not in feature_df_uncleaned.index()
zcta_df = zcta_df[["STATE", "ZCTA", "ZIP_CODE"]]
zcta_df = zcta_df[~zcta_df["ZCTA"].isna()]
zcta_df["ZCTA"] = zcta_df["ZCTA"].astype(int)
zcta_df["ZIP_CODE"] = zcta_df["ZIP_CODE"].map(clean_zip)
zcta_df

Unnamed: 0,STATE,ZCTA,ZIP_CODE
0,NY,11742,00501
1,NY,11742,00544
2,PR,601,00601
3,PR,602,00602
4,PR,603,00603
...,...,...,...
41086,AK,99926,99926
41087,AK,99927,99927
41088,AK,99901,99928
41089,AK,99929,99929


In [57]:
zcta_df["ZCTA"] = zcta_df["ZCTA"].map(clean_zip)

In [62]:
true_zctas = list(zcta_df[zcta_df["ZCTA"] != zcta_df["ZIP_CODE"]]["ZCTA"])
len(true_zctas)

7944

In [None]:
for i in range(len(true_zctas)):
    zcta = true_zctas[i]
    if (i % 79 == 0):
        print(f"{i / 79}% Complete")
    try:
        feature_df_uncleaned = pd.concat([feature_df_uncleaned, await clean_zcta(zcta)], axis=0)
        succeeded = succeeded + 1
    except:
        failed = failed + 1

0.0% Complete
1.0% Complete
2.0% Complete
3.0% Complete
4.0% Complete
5.0% Complete
6.0% Complete
7.0% Complete
8.0% Complete
9.0% Complete
10.0% Complete
11.0% Complete
12.0% Complete
13.0% Complete
14.0% Complete
15.0% Complete
16.0% Complete
17.0% Complete
18.0% Complete
19.0% Complete
20.0% Complete
21.0% Complete
22.0% Complete
23.0% Complete
24.0% Complete
25.0% Complete
26.0% Complete
27.0% Complete
28.0% Complete
29.0% Complete
30.0% Complete
31.0% Complete
32.0% Complete
33.0% Complete
34.0% Complete
35.0% Complete
36.0% Complete
37.0% Complete
38.0% Complete
39.0% Complete
40.0% Complete
41.0% Complete
42.0% Complete
43.0% Complete
44.0% Complete
45.0% Complete
46.0% Complete
47.0% Complete
48.0% Complete
49.0% Complete
50.0% Complete
51.0% Complete
52.0% Complete
53.0% Complete
54.0% Complete
55.0% Complete
56.0% Complete
57.0% Complete
58.0% Complete
59.0% Complete
60.0% Complete
61.0% Complete
62.0% Complete
63.0% Complete
64.0% Complete
65.0% Complete
66.0% Complete
67.0%

In [65]:
feature_df_uncleaned

Label,Estimate Computers And Internet Use Total Households,Estimate Computers And Internet Use Total Households With A Broadband Internet Subscription As a Percent of Estimate Computers And Internet Use Total Households,Estimate Computers And Internet Use Total Households With A Computer As a Percent of Estimate Computers And Internet Use Total Households,Estimate Disability Status Of The Civilian Noninstitutionalized Population 18 To 64 Years,Estimate Disability Status Of The Civilian Noninstitutionalized Population 18 To 64 Years With A Disability As a Percent of Estimate Disability Status Of The Civilian Noninstitutionalized Population 18 To 64 Years,Estimate Disability Status Of The Civilian Noninstitutionalized Population Total Civilian Noninstitutionalized Population,Estimate Disability Status Of The Civilian Noninstitutionalized Population Total Civilian Noninstitutionalized Population With A Disability As a Percent of Estimate Disability Status Of The Civilian Noninstitutionalized Population Total Civilian Noninstitutionalized Population,Estimate Disability Status Of The Civilian Noninstitutionalized Population Under 18 Years,Estimate Disability Status Of The Civilian Noninstitutionalized Population Under 18 Years With A Disability As a Percent of Estimate Disability Status Of The Civilian Noninstitutionalized Population Under 18 Years,Estimate Educational Attainment Population 25 Years And Over,...,Estimate Sex And Age Total Population 75 To 84 Years As a Percent of Estimate Sex And Age Total Population,Estimate Sex And Age Total Population Male As a Percent of Estimate Sex And Age Total Population,Estimate Sex And Age Total Population Median Age (Years),Estimate Sex And Age Total Population Under 5 Years As a Percent of Estimate Sex And Age Total Population,Estimate U.S. Citizenship Status Foreign-Born Population,Estimate U.S. Citizenship Status Foreign-Born Population Naturalized U.S. Citizen As a Percent of Estimate U.S. Citizenship Status Foreign-Born Population,Estimate Veteran Status Civilian Population 18 Years And Over,Estimate Veteran Status Civilian Population 18 Years And Over Civilian Veterans As a Percent of Estimate Veteran Status Civilian Population 18 Years And Over,Estimate Households By Type Total Households Average Family Size,Estimate Households By Type Total Households Average Household Size
ZCTA,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00601,1,,,1,,1,,1,,1,...,0.064,0.493,43.7,0.038,1,,1,,,
00601,1,,,1,,1,,1,,1,...,0.064,0.493,43.7,0.038,1,,1,,,
00602,1,,,1,,1,,1,,1,...,0.062,0.491,44.4,0.033,1,,1,,,
00603,1,,,1,,1,,1,,1,...,0.077,0.485,44.1,0.038,1,,1,,,
00606,1,,,1,,1,,1,,1,...,0.047,0.49,44.9,0.039,1,,1,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99801,1,0.92,0.961,1,0.092,1,0.115,1,0.047,1,...,0.034,0.51,38.7,0.055,1,0.665,1,0.076,3.19,2.48
99801,1,0.92,0.961,1,0.092,1,0.115,1,0.047,1,...,0.034,0.51,38.7,0.055,1,0.665,1,0.076,3.19,2.48
99801,1,0.92,0.961,1,0.092,1,0.115,1,0.047,1,...,0.034,0.51,38.7,0.055,1,0.665,1,0.076,3.19,2.48
99901,1,0.876,0.937,1,0.117,1,0.15,1,0.047,1,...,0.038,0.516,40.3,0.051,1,0.688,1,0.096,2.99,2.5


In [77]:
print(f'The success rate to get info for each ZCTA was {1 - failed/succeeded}')

The success rate to get info for each ZCTA was 0.7992506795308176


In [76]:
zcta_df

Unnamed: 0,STATE,ZCTA,ZIP_CODE
0,NY,11742,00501
1,NY,11742,00544
2,PR,00601,00601
3,PR,00602,00602
4,PR,00603,00603
...,...,...,...
41086,AK,99926,99926
41087,AK,99927,99927
41088,AK,99901,99928
41089,AK,99929,99929


In [88]:
from sklearn.preprocessing import OneHotEncoder

ohe = OneHotEncoder()

In [93]:
os.path.getsize("./census_data/feature_df_uncleaned.csv")

35048283

OneHotEncoder()