In [1]:
with open('census_api_key.txt') as f:
    lines = f.readlines()
API_KEY = lines[0]

## imports

In [2]:
import pandas as pd
import urllib.request, json 
import requests


# census data
* ref: https://api.census.gov/data/2021/acs/acs5/variables.html


In [3]:
# list years to download
years = [2015, 2020]
max(years)

2020

## create a list of the fields to download, along with a "friendly" name

In [18]:
# list fields to download. 
#     second column is a friendly name so that I don't need to refer to the codes. 
#     third column indicates if I plan to use it in regression (some fields are just for calculations).
#     4th col is just a note to self
fields = [
    ['B01001_001E', 'population', 1, 'compare to ...1003_001E'], 
    ['B19013_001E', 'median_household_income', 1, ''],
    ['B01002_001E', 'median_age', 1, ''],
    ['B23025_002E', 'labor_force', 0, 'use it for proportions'],
    ['B23025_005E', 'unemployed', 0, 'use it for proportions'],
    ['B15003_022E', 'bachelors_degr', 0, 'use it for proportions'],
    ['B15003_023E', 'masters_degr', 0, 'use it for proportions'],
    ['B11001_001E', 'num_households', 0, 'use it for typ household size'],
    ['B25064_001E', 'median_rent', 1, ''],
    ['B25003_001E', 'total_units_1', 0, 'use for ratio'],
    ['B25003_002E', 'owner_occupied', 0, 'divide by units'],
    # ['B28011_001E', 'internet_access_perc', 1, ''],
    # ['B28011_002E', 'internet_access_with_subscrip', 1, ''],
    ['B25034_010E', 'housing_units_built_last_year', 1, 'they also have 2 years ago etc'],
    ['B11005_001E', 'num_families', 1, ''],
    ['B11005_002E', 'families_with_children', 1, 'under 18'],
    ['B17001_002E', 'below_poverty_level', 1, ''],
    ['B17001_001E', 'total_for_poverty_level_calc', 1, ''],
    ['B11001_007E', 'non_family_households', 1, 'divide by households'],
    ['B08303_001E', 'mean_travel_time_to_work', 1, ''],
    ['B25002_003E', 'vacant_units', 0, 'divide by total units'],
    ['B25002_001E', 'total_units_2', 1, 'redundant?'],
    ['B07001_017E', 'perc_moved_fr_same_county', 1, ''],
    ['B07001_018E', 'perc_moved_fr_other_county', 1, ''],
    ['B07001_019E', 'perc_moved_fr_other_state', 1, ''],
    ['B07001_020E', 'perc_moved_fr_abroad', 1, ''],
    ['B25077_001E', 'median_val_of_owner_occup', 1, ''],
    ['B25018_001E', 'ave_num_rooms', 1, ''],
    ['B25024_002E', 'single_family_units', 0, 'divide by total_units'],
    ['B25024_001E', 'total_units_3', 0, 'use for ratios'],
    ['B08301_010E', 'workers_using_public_trans', 0, 'divide by workers'],
    ['B08301_001E', 'workers', 0, 'for ratios'],
    ['B05002_013E', 'foreign_born', 0, 'divide by population'],
    ['B01002_002E', 'male', 0, 'use it for proportion'],
    ['B01002_003E', 'female', 0, 'use it for proportion'],
    ['B19001_001E', 'income', 0, '? will calc average'],
]
df_fields = pd.DataFrame(fields, columns=['code', 'myname', 'use4regression', 'note'])
# df_fields
df_fields.sort_values('code')

Unnamed: 0,code,myname,use4regression,note
0,B01001_001E,population,1,compare to ...1003_001E
2,B01002_001E,median_age,1,
31,B01002_002E,male,0,use it for proportion
32,B01002_003E,female,0,use it for proportion
30,B05002_013E,foreign_born,0,divide by population
20,B07001_017E,perc_moved_fr_same_county,1,
21,B07001_018E,perc_moved_fr_other_county,1,
22,B07001_019E,perc_moved_fr_other_state,1,
23,B07001_020E,perc_moved_fr_abroad,1,
29,B08301_001E,workers,0,for ratios


## other topics not used: 
* crime (not in census), 
* businesses (not in census), 
* race
* Proximity to Amenities (parks, schools, shopping centers) (not in census)
* tax environment, incl local and state (not in census)
* Proximity to Major Highways or Transport Hubs (not in census)


## add the census descriptions to df_fields

In [12]:
# add the census descriptions to fields_df
textsplitter = ' zzz '
def get_field_descriptions(api_key, year):
    """
    Fetch descriptions for fields for a specific year.
    
    :param api_key: Your Census API Key.
    :param year: Census year.
    :return: Dictionary with field codes as keys and descriptions as values.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5/variables"
    response = requests.get(base_url, params={"key": api_key})
    
    if response.status_code == 200:
        data = response.json()
        # try:
        #     # Extract field code and description from the data
        #     return {var["name"]: var["label"] for var in data["variables"].values()}
        # except TypeError as e:
        #     print(f"Unexpected data structure for year {year}: {data}")
        #     return data
        # descriptions = {row[0]: row[1] for row in data[1:]}
        descriptions = {row[0]: str(row[1]) + textsplitter + str(row[2]) for row in data[1:]}
        return descriptions
    else:
        print(f"Error {response.status_code}: {response.text}")
        return {}

descriptions = {}
descriptions.update(get_field_descriptions(API_KEY, max(years)))
for field in df_fields['code'].to_list():
    this_descr = descriptions.get(field, "Unknown")
    splitted = this_descr.split(textsplitter)
    df_fields.loc[df_fields['code'] == field, 'descr1'] = splitted[0]
    descr2=''
    if len(splitted)>1:
        descr2=splitted[1],
    df_fields.loc[df_fields['code'] == field, 'descr2'] = descr2

df_fields

Unnamed: 0,code,myname,use4regression,note,descr1,descr2
0,B01001_001E,population,1,compare to ...1003_001E,Estimate!!Total:,SEX BY AGE
1,B19013_001E,median_household_income,1,,Estimate!!Median household income in the past ...,MEDIAN HOUSEHOLD INCOME IN THE PAST 12 MONTHS ...
2,B01002_001E,median_age,1,,Estimate!!Median age --!!Total:,MEDIAN AGE BY SEX
3,B23025_002E,labor_force,0,use it for proportions,Estimate!!Total:!!In labor force:,EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS ...
4,B23025_005E,unemployed,0,use it for proportions,Estimate!!Total:!!In labor force:!!Civilian la...,EMPLOYMENT STATUS FOR THE POPULATION 16 YEARS ...
5,B15003_022E,bachelors_degr,0,use it for proportions,Estimate!!Total:!!Bachelor's degree,EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 Y...
6,B15003_023E,masters_degr,0,use it for proportions,Estimate!!Total:!!Master's degree,EDUCATIONAL ATTAINMENT FOR THE POPULATION 25 Y...
7,B11001_001E,num_households,0,use it for typ household size,Estimate!!Total:,HOUSEHOLD TYPE (INCLUDING LIVING ALONE)
8,B25064_001E,median_rent,1,,Estimate!!Median gross rent,MEDIAN GROSS RENT (DOLLARS)
9,B25003_001E,total_units_1,0,use for ratio,Estimate!!Total:,TENURE


## create a list of the *calculated* fields and their formulas

In [13]:
# list the calculated fields
formulas = [
    ['male_prop', 'dfx.male / dfx.population', 0, 'this is just an example'], 
    ['income_ave1', 'dfx.income / dfx.population', 0, 'another example'],
]
df_formulas = pd.DataFrame(formulas, columns=['myname', 'formula', 'use4regression', 'note'])
df_formulas

Unnamed: 0,myname,formula,use4regression,note
0,male_prop,dfx.male / dfx.population,0,this is just an example
1,income_ave1,dfx.income / dfx.population,0,another example


In [14]:

# see my spreadsheet for a way to calculate mean and standard deviation for binned data like the following: 

a = '''
    ['B19001_002E'], 'income<10', False, 'will calc proportion'],
    ['B19001_003E'], 'income10-15', False, 'will calc proportion'],
    ['B19001_004E'], 'income15-20', False, 'will calc proportion'],
    ['B19001_005E'], 'income20-25', False, 'will calc proportion'],
    ['B19001_006E'], 'income25-30', False, 'will calc proportion'],
    ['B19001_007E'], 'income30-35', False, 'will calc proportion'],
    ['B19001_008E'], 'income35-40', False, 'will calc proportion'],
    ['B19001_009E'], 'income40-45', False, 'will calc proportion'],
    ['B19001_010E'], 'income45-50', False, 'will calc proportion'],
    ['B19001_011E'], 'income50-60', False, 'will calc proportion'],
    ['B19001_012E'], 'income60-75', False, 'will calc proportion'],
    ['B19001_013E'], 'income75-100', False, 'will calc proportion'],
    ['B19001_014E'], 'income100-125', False, 'will calc proportion'],
    ['B19001_015E'], 'income125-150', False, 'will calc proportion'],
    ['B19001_016E'], 'income150-200', False, 'will calc proportion'],
    ['B19001_017E'], 'income200+', False, 'will calc proportion'],
    '''


## function to pull the census data 

In [15]:
# function to get the raw data from the census
def get_census_data_by_zip(api_key, fields, year):
    """
    Fetch census data by ZIP Code Tabulation Areas (ZCTAs) for specified fields.
    
    :param api_key: Your Census API Key.
    :param fields: List of fields to fetch.
    :param year: Census year.
    :return: DataFrame with fetched data.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5"
    
    # Combine the fields into a comma-separated string
    fields_str = ",".join(fields)
    
    # Construct the final URL
    url = f"{base_url}?get={fields_str}&for=zip%20code%20tabulation%20area:*"
    
    headers = {
        "Content-Type": "application/json",
    }
    
    # Make the API request
    response = requests.get(url, headers=headers, params={"key": api_key})
    
    if response.status_code == 200:
        data = response.json()
        # Convert data to DataFrame
        df = pd.DataFrame(data[1:], columns=data[0])
        df['year'] = year
        return df
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None


In [16]:
# a = get_census_data_by_zip(API_KEY, download_fields, 2015)
# b = get_census_data_by_zip(API_KEY, download_fields, 2020)
# print(a.shape)
# print(b.shape)
# c = pd.concat([a, b])
# print(c.shape)
# c.head()
# c.sample(10)

## get the data

In [17]:
# get the data
download_fields = [item[0] for item in fields]
dfraw = pd.DataFrame()
for year in years:
    print(year)
    temp_df = get_census_data_by_zip(API_KEY, download_fields, year)
    dfraw = pd.concat([dfraw, temp_df])
    print(dfraw.shape)


2015
Error 400: error: error: unknown variable 'B28011_001E'
(0, 0)
2020
(33120, 37)


In [19]:
# rename the field codes with 'myname'
dict1 = pd.Series(df_fields.myname.values,index=df_fields.code).to_dict()
dfraw.rename(columns=dict1, inplace=True)
# dfraw.head()

In [23]:
# rename the zip column, and re order columns
dfraw.rename(columns = {'zip code tabulation area':'zipcode'}, inplace = True)
# cols=['state', 'zipcode', 'year']
cols=['zipcode', 'year']
dfraw = dfraw[cols + [c for c in dfraw.columns if c not in cols]]
dfraw.head()

Unnamed: 0,zipcode,year,population,median_household_income,median_age,labor_force,unemployed,bachelors_degr,masters_degr,num_households,...,median_val_of_owner_occup,ave_num_rooms,single_family_units,total_units_3,workers_using_public_trans,workers,foreign_born,male,female,income
0,601,2020,16773,14398,42.9,5229,1599,1644,155,5555,...,81800,4.9,5849,7282,0,3588,38,41.6,43.8,5555
1,602,2020,37083,16771,43.6,11764,913,4210,1065,12901,...,85600,4.8,12666,17510,83,10811,255,42.9,45.0,12901
2,603,2020,45652,15786,43.2,14679,2575,5367,1686,19431,...,116000,4.9,16397,24453,28,11201,544,41.4,44.8,19431
3,606,2020,6231,14980,45.1,1612,157,319,163,1983,...,86000,4.2,2478,2789,0,1455,260,44.7,45.6,1983
4,610,2020,26502,20167,43.2,9150,877,2610,585,8864,...,87100,4.9,9233,12454,26,8208,141,41.6,45.6,8864


## add the calculated fields (and calculated values)

In [24]:
# make the columns numeric
dfraw = dfraw.apply(pd.to_numeric)

In [25]:
df_formulas

Unnamed: 0,myname,formula,use4regression,note
0,male_prop,dfx.male / dfx.population,0,this is just an example
1,income_ave1,dfx.income / dfx.population,0,another example


In [26]:
dfraw.head()

Unnamed: 0,zipcode,year,population,median_household_income,median_age,labor_force,unemployed,bachelors_degr,masters_degr,num_households,...,median_val_of_owner_occup,ave_num_rooms,single_family_units,total_units_3,workers_using_public_trans,workers,foreign_born,male,female,income
0,601,2020,16773,14398,42.9,5229,1599,1644,155,5555,...,81800,4.9,5849,7282,0,3588,38,41.6,43.8,5555
1,602,2020,37083,16771,43.6,11764,913,4210,1065,12901,...,85600,4.8,12666,17510,83,10811,255,42.9,45.0,12901
2,603,2020,45652,15786,43.2,14679,2575,5367,1686,19431,...,116000,4.9,16397,24453,28,11201,544,41.4,44.8,19431
3,606,2020,6231,14980,45.1,1612,157,319,163,1983,...,86000,4.2,2478,2789,0,1455,260,44.7,45.6,1983
4,610,2020,26502,20167,43.2,9150,877,2610,585,8864,...,87100,4.9,9233,12454,26,8208,141,41.6,45.6,8864


In [29]:
# do the calculated fields
dfx = dfraw
for newcol, formula, aa, bb in formulas:
    # print(newcol + ' = ' + formula)
    dfx = pd.eval(newcol + ' = ' + formula, target = dfx)

dfx.head()

Unnamed: 0,zipcode,year,population,median_household_income,median_age,labor_force,unemployed,bachelors_degr,masters_degr,num_households,...,single_family_units,total_units_3,workers_using_public_trans,workers,foreign_born,male,female,income,male_prop,income_ave1
0,601,2020,16773,14398,42.9,5229,1599,1644,155,5555,...,5849,7282,0,3588,38,41.6,43.8,5555,0.00248,0.331187
1,602,2020,37083,16771,43.6,11764,913,4210,1065,12901,...,12666,17510,83,10811,255,42.9,45.0,12901,0.001157,0.347895
2,603,2020,45652,15786,43.2,14679,2575,5367,1686,19431,...,16397,24453,28,11201,544,41.4,44.8,19431,0.000907,0.425633
3,606,2020,6231,14980,45.1,1612,157,319,163,1983,...,2478,2789,0,1455,260,44.7,45.6,1983,0.007174,0.318247
4,610,2020,26502,20167,43.2,9150,877,2610,585,8864,...,9233,12454,26,8208,141,41.6,45.6,8864,0.00157,0.334465


In [30]:
dfx[dfx['zipcode']==90266]

Unnamed: 0,zipcode,year,population,median_household_income,median_age,labor_force,unemployed,bachelors_degr,masters_degr,num_households,...,single_family_units,total_units_3,workers_using_public_trans,workers,foreign_born,male,female,income,male_prop,income_ave1
30101,90266,2020,35064,153926,42.9,16965,843,10285,5244,13313,...,10239,14952,113,15591,4572,42.9,43.0,13313,0.001223,0.379677


In [None]:
# generate list of fields for regression. 



In [None]:
# convert to long format


