In [38]:
import requests
import pandas as pd

In [39]:
zip_codes = []

# Read ZIP codes from the file
with open('data/zip-code-csv.txt', 'r') as file:
    zip_codes = [line.strip() for line in file.readlines()][1:]

print(zip_codes)

['94558', '95687', '94533', '94534', '95688', '95448', '94574', '95446', '95436', '95694', '94515', '95987', '95461', '95457', '95627', '95421', '95441', '95450', '94567', '95637', '95607', '95606', '95679', '94591', '95403', '95404', '95476', '95616', '95409', '95401', '95472', '94510', '95695', '95492', '94585', '94559', '95405', '95620', '94503', '95422', '95451', '94571', '95425', '95467', '95423', '95932', '95497', '95442', '95445', '94599', '94535', '95912', '94508', '95462', '95426', '95452', '95465', '95449', '95937', '95444', '95419', '95979', '95439', '95471', '95424', '95653', '95412', '94576', '95430', '95625', '95486', '95697', '94573', '95698', '95433', '94562', '94512', '95416']


DP05
Data Link: https://api.census.gov/data/2022/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US95687
Estimate, ignore, Margin of Error, ignore, Percent, ignore, Percent Margin of Error, ignore

S1901
Data Link: https://api.census.gov/data/2022/acs/acs5/subject?get=group(S1901)&ucgid=860Z200US95687

In [40]:
# Define the years and ZIP codes
years = range(2017, 2023)

labels = [
    'Total (Households)',
    'Less than $10,000',
    '$10,000 to $14,999',
    '$15,000 to $24,999',
    '$25,000 to $34,999',
    '$35,000 to $49,999',
    '$50,000 to $74,999',
    '$75,000 to $99,999',
    '$100,000 to $149,999',
    '$150,000 to $199,999',
    '$200,000 or more',
    'Median income (dollars)',
    'Mean income (dollars)'
]

all_data = []

In [41]:
# test on one case S1901

# Use a single year and zip code
year = 2017
zip_code = '95687'

# Make the API call
urls = [
    f"https://api.census.gov/data/{year}/acs/acs5/subject?get=group(S1901)&ucgid=860Z200US{zip_code}",
    f"https://api.census.gov/data/{year}/acs/acs5/subject?get=group(S1901)&ucgid=8600000US{zip_code}",
]
        
response = None
for url in urls:
    try:
        response = requests.get(url)
        if response.status_code == 200:
            break
    except ConnectionError as e:
        print(f"Connection error for URL {url}: {e}")
        continue

if response.status_code == 200:
    result = response.json()
    data = result[1][2:]  # Remove the first two entries
    # Remove unwanted values
    unwanted_values = [None, 'null', '(X)', 'N', '-888888888', '-888888888.0', '-999999999']
    data = [item for item in data if item not in unwanted_values]
    # Take only the first 26 entries
    data = data[:26]
    # Pair the data into (Estimate, Margin of Error)
    estimates = data[::2]
    margins = data[1::2]
    data_pairs = list(zip(estimates, margins))
    
    # # Ensure the lengths match
    # min_length = min(len(data_pairs), len(labels))
    # data_pairs = data_pairs[:min_length]
    # labels = labels[:min_length]
    
    # Create DataFrame
    df = pd.DataFrame(data_pairs, columns=['Estimate', 'Margin of Error'], index=labels)
    df['Year'] = year
    df['Zip Code'] = zip_code
    # Convert 'Estimate' and 'Margin of Error' to numeric
    df['Estimate'] = pd.to_numeric(df['Estimate'], errors='coerce')
    df['Margin of Error'] = pd.to_numeric(df['Margin of Error'], errors='coerce')
    # Display the DataFrame
    print(df)
else:
    print(f"Error fetching data: {response.status_code}")

                         Estimate  Margin of Error  Year Zip Code
Total (Households)        22060.0            469.0  2017    95687
Less than $10,000             3.6              0.9  2017    95687
$10,000 to $14,999            3.8              1.0  2017    95687
$15,000 to $24,999            6.1              1.1  2017    95687
$25,000 to $34,999            6.1              1.3  2017    95687
$35,000 to $49,999           11.9              1.6  2017    95687
$50,000 to $74,999           16.7              1.8  2017    95687
$75,000 to $99,999           15.7              1.9  2017    95687
$100,000 to $149,999         20.5              1.5  2017    95687
$150,000 to $199,999          9.2              1.1  2017    95687
$200,000 or more              6.2              1.0  2017    95687
Median income (dollars)   77222.0           3698.0  2017    95687
Mean income (dollars)     92441.0           3297.0  2017    95687


In [42]:
for year in years:
    print(f"Processing year {year}")
    for zip_code in zip_codes:
        
        urls = [
            f"https://api.census.gov/data/{year}/acs/acs5/subject?get=group(S1901)&ucgid=860Z200US{zip_code}",
            f"https://api.census.gov/data/{year}/acs/acs5/subject?get=group(S1901)&ucgid=8600000US{zip_code}",
        ]
        
        response = None
        for url in urls:
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    break
            except ConnectionError as e:
                print(f"Connection error for URL {url}: {e}")
                continue
            
        if response.status_code == 200:
            result = response.json()
            data = result[1][2:]  # Remove the first two entries
            # Remove unwanted values
            unwanted_values = [None, 'null', '(X)', 'N', '-888888888', '-888888888.0', '-999999999']
            data = [item for item in data if item not in unwanted_values]
            # Take only the first 26 entries
            data = data[:26]
            # Pair the data into (Estimate, Margin of Error)
            estimates = data[::2]
            margins = data[1::2]
            data_pairs = list(zip(estimates, margins))
            # Create DataFrame
            df = pd.DataFrame(data_pairs, columns=['Estimate', 'Margin of Error'], index=labels)
            df['Year'] = year
            df['Zip Code'] = zip_code
            all_data.append(df.reset_index())
    print(df)

# Concatenate all data
final_df = pd.concat(all_data, ignore_index=True)
final_df.rename(columns={'index': 'Label'}, inplace=True)
# Convert 'Estimate' and 'Margin of Error' to numeric
final_df['Estimate'] = pd.to_numeric(final_df['Estimate'], errors='coerce')
final_df['Margin of Error'] = pd.to_numeric(final_df['Margin of Error'], errors='coerce')
# Save to CSV
final_df.to_csv("data/ACS_S1901_YearZIP.csv", index=False)

Processing year 2017
                           Estimate Margin of Error  Year Zip Code
Total (Households)               32              30  2017    94512
Less than $10,000               0.0            54.4  2017    94512
$10,000 to $14,999              0.0            54.4  2017    94512
$15,000 to $24,999              0.0            54.4  2017    94512
$25,000 to $34,999              0.0            54.4  2017    94512
$35,000 to $49,999              0.0            54.4  2017    94512
$50,000 to $74,999              0.0            54.4  2017    94512
$75,000 to $99,999              0.0            54.4  2017    94512
$100,000 to $149,999           50.0            50.0  2017    94512
$150,000 to $199,999           50.0            50.0  2017    94512
$200,000 or more                0.0            54.4  2017    94512
Median income (dollars)  -666666666               -  2017    94512
Mean income (dollars)    -222222222              **  2017    94512
Processing year 2018
                    

In [43]:
# Define your labels
labels_2022 = [
    'SEX AND AGE',
    'Total population',
    'Male',
    'Female',
    'Sex ratio (males per 100 females)',
    'Under 5 years',
    '5 to 9 years',
    '10 to 14 years',
    '15 to 19 years',
    '20 to 24 years',
    '25 to 34 years',
    '35 to 44 years',
    '45 to 54 years',
    '55 to 59 years',
    '60 to 64 years',
    '65 to 74 years',
    '75 to 84 years',
    '85 years and over',
    'Median age (years)',
    'Under 18 years',
    '16 years and over',
    '18 years and over',
    '21 years and over',
    '62 years and over',
    '65 years and over',
    '18 years and over',
    'Male',
    'Female',
    'Sex ratio (males per 100 females)',
    '65 years and over',
    'Male',
    'Female',
    'Sex ratio (males per 100 females)',
    'RACE',
    'Total population',
    'One race',
    'Two or More Races',
    'One race',
    'White',
    'Black or African American',
    'American Indian and Alaska Native',
    'Cherokee tribal grouping',
    'Chippewa tribal grouping',
    'Navajo tribal grouping',
    'Sioux tribal grouping',
    'Asian',
    'Asian Indian',
    'Chinese',
    'Filipino',
    'Japanese',
    'Korean',
    'Vietnamese',
    'Other Asian',
    'Native Hawaiian and Other Pacific Islander',
    'Chamorro',
    'Native Hawaiian',
    'Samoan',
    'Other Native Hawaiian and Other Pacific Islander',
    'Some Other Race',
    'Two or More Races',
    'White and Black or African American',
    'White and American Indian and Alaska Native',
    'White and Asian',
    'White and Some Other Race',
    'Black or African American and American Indian and Alaska Native',
    'Black or African American and Some Other Race',
    'Race alone or in combination with one or more other races',
    'Total population',
    'White',
    'Black or African American',
    'American Indian and Alaska Native',
    'Asian',
    'Native Hawaiian and Other Pacific Islander',
    'Some Other Race',
    'HISPANIC OR LATINO AND RACE',
    'Total population',
    'Hispanic or Latino (of any race)',
    'Mexican',
    'Puerto Rican',
    'Cuban',
    'Other Hispanic or Latino',
    'Not Hispanic or Latino',
    'White alone',
    'Black or African American alone',
    'American Indian and Alaska Native alone',
    'Asian alone',
    'Native Hawaiian and Other Pacific Islander alone',
    'Some Other Race alone',
    'Two or More Races',
    'Two races including Some Other Race',
    'Two races excluding Some Other Race, and three or more races',
    'Total housing units',
    'CITIZEN, VOTING AGE POPULATION',
    'Citizen, 18 and over population',
    'Male',
    'Female'
]

labels = [
    'SEX AND AGE',
    'Total population',
    'Male',
    'Female',
    'Sex ratio (males per 100 females)',
    'Under 5 years',
    '5 to 9 years',
    '10 to 14 years',
    '15 to 19 years',
    '20 to 24 years',
    '25 to 34 years',
    '35 to 44 years',
    '45 to 54 years',
    '55 to 59 years',
    '60 to 64 years',
    '65 to 74 years',
    '75 to 84 years',
    '85 years and over',
    'Median age (years)',
    'Under 18 years',
    '16 years and over',
    '18 years and over',
    '21 years and over',
    '62 years and over',
    '65 years and over',
    '18 years and over',
    'Male',
    'Female',
    'Sex ratio (males per 100 females)',
    '65 years and over',
    'Male',
    'Female',
    'Sex ratio (males per 100 females)',
    'RACE',
    'Total population',
    'One race',
    'Two or more races',
    'One race',
    'White',
    'Black or African American',
    'American Indian and Alaska Native',
    'Cherokee tribal grouping',
    'Chippewa tribal grouping',
    'Navajo tribal grouping',
    'Sioux tribal grouping',
    'Asian',
    'Asian Indian',
    'Chinese',
    'Filipino',
    'Japanese',
    'Korean',
    'Vietnamese',
    'Other Asian',
    'Native Hawaiian and Other Pacific Islander',
    'Native Hawaiian',
    'Chamorro',
    'Samoan',
    'Other Pacific Islander',
    'Some other race',
    'Two or more races',
    'White and Black or African American',
    'White and American Indian and Alaska Native',
    'White and Asian',
    'Black or African American and American Indian and Alaska Native',
    'Race alone or in combination with one or more other races',
    'Total population',
    'White',
    'Black or African American',
    'American Indian and Alaska Native',
    'Asian',
    'Native Hawaiian and Other Pacific Islander',
    'Some other race',
    'HISPANIC OR LATINO AND RACE',
    'Total population',
    'Hispanic or Latino (of any race)',
    'Mexican',
    'Puerto Rican',
    'Cuban',
    'Other Hispanic or Latino',
    'Not Hispanic or Latino',
    'White alone',
    'Black or African American alone',
    'American Indian and Alaska Native alone',
    'Asian alone',
    'Native Hawaiian and Other Pacific Islander alone',
    'Some other race alone',
    'Two or more races',
    'Two races including Some other race',
    'Two races excluding Some other race, and Three or more races',
    'Total housing units',
    'CITIZEN, VOTING AGE POPULATION',
    'Citizen, 18 and over population',
    'Male',
    'Female'
]

# Define headers that contain no data
headers = {
    'SEX AND AGE',
    'RACE',
    'Race alone or in combination with one or more other races',
    'HISPANIC OR LATINO AND RACE',
    'CITIZEN, VOTING AGE POPULATION'
}

In [44]:
# Assuming `data` is the response data as a list of strings
def preprocess_2018_data(data, ugcid):
    # Remove nulls
    data = [x for x in data if x is not None]
    # Replace (X) with 0
    data = ['0' if x in('-888888888', '-888888888.0', '(X)') else x for x in data]
    # Extract the first section
    first_section = data[:data.index(ugcid)]
    # Remove UCGID at the end
    ucgid = first_section.pop()
    # Append UCGID to the specified section
    specified_section = data[data.index(ugcid)+1:]
    specified_section.append(ucgid)
    return specified_section + first_section

In [45]:
# test a single response Make the API call
year = 2018
zip_code = 94503
url = f"https://api.census.gov/data/{year}/acs/acs5/profile?get=group(DP05)&ucgid=8600000US{zip_code}"
response = requests.get(url)

if response.status_code == 200:
    result = response.json()
    if year == 2018:
        data = preprocess_2018_data(result[1],f"8600000US{zip_code}")
        print(data)
    else:
        data = result[1][:-3] # Remove the last three entries
        print(data)
        print(f"Raw data length: {len(result[1])}")
        data = data[::2]
        print(f"Data length after removing every other entry: {len(data)}")
        print(data)
        # Replace '-888888888' and '-888888888.0' with '(X)'
        data = ['(X)' if x in ('-888888888', '-888888888.0') else x for x in data]
        print(f"Data length after replacing '-888888888': {len(data)}")
        print(data)
        # Remove None and 'null' values
        data = [x for x in data if x not in [None, 'null']]
        print(f"Data length after removing None and 'null': {len(data)}")
        print(data)

    # Now, group the data into chunks of four
    data_chunks = [data[i:i+4] for i in range(0, len(data), 4)]

    # Get the list of data labels (excluding headers)
    data_labels = [label for label in labels if label not in headers]

    # Now, check if the number of data chunks matches the number of data labels
    print(f"Number of data chunks: {len(data_chunks)}")
    print(f"Number of labels: {len(labels)}")
    print(f"Number of data labels: {len(data_labels)}")

    # If data_chunks is less than data_labels, then we may be missing data
    # Adjust data_labels to match the length of data_chunks if necessary
    if len(data_chunks) < len(data_labels):
        print("Warning: Not enough data to cover all labels.")
        data_labels = data_labels[:len(data_chunks)]
    elif len(data_chunks) > len(data_labels):
        print("Warning: More data than labels. Truncating data.")
        data_chunks = data_chunks[:len(data_labels)]

    # Initialize list to store the rows
    rows = []
    data_index = 0  # Index to track position in data list

    for label in labels:
        if label in headers:
            # Headers have no data
            rows.append([label, None, None, None, None])
        else:
            # Get the next four data elements
            if data_index + 4 <= len(data):
                row_data = data[data_index:data_index+4]
                data_index += 4
            else:
                row_data = [None, None, None, None]
            rows.append([label] + row_data)

    # Create DataFrame
    df = pd.DataFrame(
        rows,
        columns=['Label', 'Estimate', 'Margin of Error', 'Percent', 'Percent Margin of Error']
    )
    # Replace any remaining '-888888888' or '-888888888.0' with '(X)'
    df.replace({'-888888888': '(X)', '-888888888.0': '(X)'}, inplace=True)
    # Convert numerical columns to numeric, ignoring errors
    numeric_cols = ['Estimate', 'Margin of Error', 'Percent', 'Percent Margin of Error']
    for col in numeric_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Display the DataFrame
    print(df)
    
    # Save DataFrame to CSV
    df.to_csv(f'ACS_Demographics_{zip_code}.csv', index=False)
else:
    print(f"Error fetching data: {response.status_code}")
    print(url)


['20306', '45', '20306', '0', '10091', '279', '49.7', '1.4', '10215', '274', '50.3', '1.4', '98.8', '5.3', '0', '0', '1290', '220', '6.4', '1.1', '1221', '162', '6.0', '0.8', '1817', '211', '8.9', '1.0', '1494', '177', '7.4', '0.9', '1531', '197', '7.5', '1.0', '2219', '234', '10.9', '1.1', '2956', '240', '14.6', '1.2', '3081', '219', '15.2', '1.1', '1271', '189', '6.3', '0.9', '1080', '163', '5.3', '0.8', '1352', '140', '6.7', '0.7', '795', '134', '3.9', '0.7', '199', '76', '1.0', '0.4', '37.3', '1.6', '0', '0', '5345', '263', '26.3', '1.3', '15657', '274', '77.1', '1.4', '14961', '261', '73.7', '1.3', '14163', '265', '69.7', '1.3', '2921', '237', '14.4', '1.2', '2346', '201', '11.6', '1.0', '14961', '261', '14961', '0', '7221', '245', '48.3', '1.3', '7740', '223', '51.7', '1.3', '93.3', '4.9', '0', '0', '2346', '201', '2346', '0', '1077', '149', '45.9', '3.9', '1269', '119', '54.1', '0', '0', '0', '0', '0', '0', '0', '0', 'ZCTA5 94503', '0', '0', '0', '0', '0', '0', '0', '0', '0', '8

In [46]:
all_data = []

for year in years:
    for zip_code in zip_codes:
        urls = [
            f"https://api.census.gov/data/{year}/acs/acs5/profile?get=group(DP05)&ucgid=8600000US{zip_code}",
            f"https://api.census.gov/data/{year}/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US{zip_code}"
        ]
        
        response = None
        for url in urls:
            try:
                response = requests.get(url)
                if response.status_code == 200:
                    break
            except ConnectionError as e:
                print(f"Connection error for URL {url}: {e}")
                continue
        
        if response and response.status_code == 200:
            result = response.json()
            if year == 2018:
                data = preprocess_2018_data(result[1],f"8600000US{zip_code}")
            else:
                data = result[1][:-3]  # Remove the last three entries

                # Remove every other entry starting from the second entry
                data = data[::2]

                # Replace '-888888888' and '-888888888.0' with '(X)'
                data = ['(X)' if x in ('-888888888', '-888888888.0') else x for x in data]

                # Remove None and 'null' values
                data = [x for x in data if x not in [None, 'null']]

            # Initialize list to store the rows
            rows = []
            data_index = 0  # Index to track position in data list
            
            # Use labels_2022 for the year 2022, otherwise use labels
            current_labels = labels_2022 if year == 2022 else labels

            for label in current_labels:
                if label in headers:
                    # Headers have no data
                    rows.append([label, None, None, None, None])
                else:
                    # Get the next four data elements
                    if data_index + 4 <= len(data):
                        row_data = data[data_index:data_index+4]
                        data_index += 4
                    else:
                        row_data = [None, None, None, None]
                    rows.append([label] + row_data)

            # Create DataFrame
            df = pd.DataFrame(
                rows,
                columns=[
                    'Label', 'Estimate', 'Margin of Error', 'Percent', 'Percent Margin of Error'
                ]
            )
            df['Year'] = year
            df['Zip Code'] = zip_code
            all_data.append(df)
        else:
            print(f"Error fetching data for ZIP {zip_code} and Year {year}: {response.status_code}")
            print(url)

# Concatenate all data
final_df = pd.concat(all_data, ignore_index=True)
# Replace any remaining '-888888888' or '-888888888.0' with '(X)'
final_df.replace({'-888888888': '(X)', '-888888888.0': '(X)'}, inplace=True)
# Convert numerical columns to numeric, ignoring errors
numeric_cols = ['Estimate', 'Margin of Error', 'Percent', 'Percent Margin of Error']
for col in numeric_cols:
    final_df[col] = pd.to_numeric(final_df[col], errors='coerce')
# Save to CSV
final_df.to_csv("data/ACS_Demographics_YearZIP.csv", index=False)

Error fetching data for ZIP 95419 and Year 2017: 204
https://api.census.gov/data/2017/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US95419
Error fetching data for ZIP 95424 and Year 2017: 204
https://api.census.gov/data/2017/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US95424
Error fetching data for ZIP 95433 and Year 2017: 204
https://api.census.gov/data/2017/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US95433
Error fetching data for ZIP 94562 and Year 2017: 204
https://api.census.gov/data/2017/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US94562
Error fetching data for ZIP 95416 and Year 2017: 204
https://api.census.gov/data/2017/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US95416
Error fetching data for ZIP 95419 and Year 2018: 204
https://api.census.gov/data/2018/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US95419
Error fetching data for ZIP 95424 and Year 2018: 204
https://api.census.gov/data/2018/acs/acs5/profile?get=group(DP05)&ucgid=860Z200US95424
Error fetching data 