In [1]:
with open('census_api_key.txt') as f:
    lines = f.readlines()
API_KEY = lines[0]

import pandas as pd
import urllib.request, json 
import requests
import os.path

In [2]:
years = list(range(2021, 2022))

In [47]:
fields = [
    ['B01001_001E', 'population', 1, 'use as is and use for calculations'], 
    # ['B01003_001E', 'populatio2', 1, 'seems to be same as B01001_001E'], 
    ['B11001_001E', 'num_households', 0, 'use it for ave household size (popn/).'],
    ['B25087_001E', 'costoliving_old', 0, ''],
    ['B25088_001E', 'costoliving_new1', 0, ''],
    ['B25089_001E', 'costoliving_new2', 0, ''],
    ['B25092_001E', 'costoliving_new3', 0, ''],
    ['B25104_001E', 'costoliving_new4', 0, ''],
    ['B25105_001E', 'costoliving_new5', 0, ''],
    # ['B25140_001E', 'costoliving_new4', 0, ''],
]
df_fields = pd.DataFrame(fields, columns=['code', 'myname', 'use_as_is', 'note'])
df_fields.head()

Unnamed: 0,code,myname,use_as_is,note
0,B01001_001E,population,1,use as is and use for calculations
1,B11001_001E,num_households,0,use it for ave household size (popn/).
2,B25087_001E,costoliving_old,0,
3,B25088_001E,costoliving_new1,0,
4,B25089_001E,costoliving_new2,0,


In [48]:
# setup some variables to be used below
textsplitter = ' zzz '
max_year = max(years)

In [49]:
def get_field_descriptions(api_key, year):
    """
    Fetch descriptions for fields for a specific year.
    
    :param api_key: Your Census API Key.
    :param year: Census year.
    :return: Dictionary with field codes as keys and descriptions as values.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5/variables"
    response = requests.get(base_url, params={"key": api_key})
    
    if response.status_code == 200:
        data = response.json()
        # try:
        #     # Extract field code and description from the data
        #     return {var["name"]: var["label"] for var in data["variables"].values()}
        # except TypeError as e:
        #     print(f"Unexpected data structure for year {year}: {data}")
        #     return data
        # descriptions = {row[0]: row[1] for row in data[1:]}
        descriptions = {row[0]: str(row[1]) + textsplitter + str(row[2]) for row in data[1:]}
        return descriptions
    else:
        print(f"Error {response.status_code}: {response.text}")
        return {}


In [None]:
# call the function and put the descriptions in a dictionary
descriptions = {}
descriptions.update(get_field_descriptions(API_KEY, max_year))
# add the descriptions to df_fields
for field in df_fields['code'].to_list():
    this_descr = descriptions.get(field, "Unknown")
    splitted = this_descr.split(textsplitter)
    df_fields.loc[df_fields['code'] == field, 'descr1'] = splitted[0]
    descr2=''
    if len(splitted)>1:
        descr2=splitted[1],
    df_fields.loc[df_fields['code'] == field, 'descr2'] = descr2

In [51]:
    
df_fields

Unnamed: 0,code,myname,use_as_is,note,descr1,descr2
0,B01001_001E,population,1,use as is and use for calculations,Estimate!!Total:,SEX BY AGE
1,B11001_001E,num_households,0,use it for ave household size (popn/).,Estimate!!Total:,HOUSEHOLD TYPE (INCLUDING LIVING ALONE)
2,B25087_001E,costoliving_old,0,,Estimate!!Total:,MORTGAGE STATUS AND SELECTED MONTHLY OWNER COSTS
3,B25088_001E,costoliving_new1,0,,Estimate!!Median selected monthly owner costs ...,MEDIAN SELECTED MONTHLY OWNER COSTS (DOLLARS) ...
4,B25089_001E,costoliving_new2,0,,Estimate!!Aggregate selected monthly owner cos...,AGGREGATE SELECTED MONTHLY OWNER COSTS (DOLLAR...
5,B25092_001E,costoliving_new3,0,,Estimate!!Median selected monthly owner costs ...,MEDIAN SELECTED MONTHLY OWNER COSTS AS A PERCE...
6,B25104_001E,costoliving_new4,0,,Estimate!!Total:,MONTHLY HOUSING COSTS
7,B25105_001E,costoliving_new5,0,,Estimate!!Median monthly housing costs,MEDIAN MONTHLY HOUSING COSTS (DOLLARS)


In [52]:
download_fields = [item[0] for item in fields]
new_field_names = pd.Series(df_fields.myname.values,index=df_fields.code).to_dict()

In [53]:
def get_census_data_by_zip(api_key, fields, year):
    """
    Fetch census data by ZIP Code Tabulation Areas (ZCTAs) for specified fields.
    
    :param api_key: Your Census API Key.
    :param fields: List of fields to fetch.
    :param year: Census year.
    :return: DataFrame with fetched data.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5"
    
    # Combine the fields into a comma-separated string
    fields_str = ",".join(fields)
    
    # Construct the final URL
    url = f"{base_url}?get={fields_str}&for=zip%20code%20tabulation%20area:*"
    
    headers = {
        "Content-Type": "application/json",
    }
    
    # Make the API request
    response = requests.get(url, headers=headers, params={"key": api_key})
    
    if response.status_code == 200:
        data = response.json()
        # Convert data to DataFrame
        df = pd.DataFrame(data[1:], columns=data[0])
        df['year'] = year
        return df
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None

In [54]:
dfraw = pd.DataFrame()
for year in years:
    print(year)
    temp_df = get_census_data_by_zip(API_KEY, download_fields, year)
    dfraw = pd.concat([dfraw, temp_df])
    print(dfraw.shape)
dfraw.rename(columns=new_field_names, inplace=True)
dfraw.rename(columns = {'zip code tabulation area':'zipcode'}, inplace = True)
# re-order columns
cols=['zipcode', 'year']
dfraw = dfraw[cols + [c for c in dfraw.columns if c not in cols]]

2021
(33774, 10)


In [55]:
dfraw.loc[dfraw['zipcode']=='90266']

Unnamed: 0,zipcode,year,population,num_households,costoliving_old,costoliving_new1,costoliving_new2,costoliving_new3,costoliving_new4,costoliving_new5
30688,90266,2021,35585,13422,9096,3762,38354400,19.7,13422,3314


In [61]:
dfraw['costoliving_new3'].max()

'9.0'

In [63]:
pd.to_numeric(dfraw)

TypeError: arg must be a list, tuple, 1-d array, or Series

In [65]:
dfzz = dfraw.astype({
    'costoliving_new3': 'float16',
})

In [69]:
dfzz['costoliving_new3'].describe()

count    33774.00000
mean            -inf
std              NaN
min             -inf
25%         13.50000
50%         16.00000
75%         18.40625
max         51.00000
Name: costoliving_new3, dtype: float64