Metro Census
============
This notebook gathers US census data from the American Community Survey (ACS) 5-year data set
and prepares it in straightforward data files for analysis, focusing on cities and metro
areas in the US.


In [10]:

from census import Census
import us
from us import states
import os
from types import SimpleNamespace
import pandas as pd
import geopandas as gpd

api_key = os.environ["CENSUS_API_KEY"]

c = Census(api_key, year=2022)



In [11]:

# use these to give human readable names to census variables
census_pretty = {
    "B01003_001E": "Total Population",
    "B19013_001E": "Median Household Income",
    "B25001_001E": "Total Housing Units",
    "B25002_002E": "Occupied Housing Units",
    "B03002_001E": "Total Population - Race",
    "B03002_003E": "White Alone",
    "B03002_004E": "Black or African American Alone",
    "B03002_012E": "Hispanic or Latino",
    "B17001_002E": "Poverty Status - Below Poverty Level",
    "B22002_002E": "Households - With Children",
    "P008003": "White Population",
    "NAME": "Name",
    "B08012_001E": "# of Commuters",
    "B08012_002E": "Less than 5 Minutes",
    "B08012_003E": "5-9 Minutes",
    "B08012_004E": "10-14 Minutes",
    "B08012_005E": "15-19 Minutes",
    "B08012_006E": "20-24 Minutes",
    "B08012_007E": "25-29 Minutes",
    "B08012_008E": "30-34 Minutes",
    "B08012_009E": "35-39 Minutes",
    "B08012_010E": "40-44 Minutes",
    "B08012_011E": "45-59 Minutes",
    "B08012_012E": "60-89 Minutes",
    "B08012_013E": ">90 Minutes",
    "GEO_ID": "GEOID",
    "metropolitan statistical area/micropolitan statistical area": "MSA"
}

# dot notation access to census variables
cv = {
    "name": "NAME",
    "total_pop": "B01003_001E",
    "median_inc": "B19013_001E",
    "total_housing": "B25001_001E",
    "occupied_housing": "B25002_002E",
    "total_pop_race": "B03002_001E",
    "asian": "B03002_006E",
    "black": "B03002_004E",
    "indian": "B03002_005E",
    "latino": "B03002_012E",
    "mixed": "B03002_009E",
    "other": "B03002_008E",
    "pacific": "B03002_007E",
    "white": "B03002_003E",
    "white_pop": "P008003",
    "poverty": "B17001_002E",
    "geoid": "GEO_ID"
}
cv = SimpleNamespace(**cv)
# Fetch data for all MSAs (metropolitan statistical areas)
# data = c.acs5.get([cv.name, cv.total_pop, cv.geoid], {'for': 'metropolitan statistical area/micropolitan statistical area:*'})
data = c.acs5.get([cv.name, cv.total_pop, cv.geoid], {'for': 'place:*'})

df = pd.DataFrame(data)
df.columns = ["name", "total_pop", "geoid", "state_fips", "place"]
df["state"] = df.state_fips.map(us.states.mapping('fips', 'abbr'))
df.sort_values("total_pop", ascending=False, inplace=True)

CensusException: <!doctype html><html lang="en"><head><title>HTTP Status 404 ? Not Found</title><style type="text/css">body {font-family:Tahoma,Arial,sans-serif;} h1, h2, h3, b {color:white;background-color:#525D76;} h1 {font-size:22px;} h2 {font-size:16px;} h3 {font-size:14px;} p {font-size:12px;} a {color:black;} .line {height:1px;background-color:#525D76;border:none;}</style></head><body><h1>HTTP Status 404 ? Not Found</h1></body></html>

In [8]:
df = df.sort_values(by="total_pop", ascending=False)
cities = df[df.total_pop > 100_000].copy()
print(len(cities))
cities["city"] = cities["name"].str.split(",", expand=True)[0].replace(" city", "", regex=True)
cities

340


Unnamed: 0,name,total_pop,geoid,state_fips,place,state,city
19124,"New York city, New York",8622467.0,1600000US3651000,36,51000,NY,New York
2879,"Los Angeles city, California",3881041.0,1600000US0644000,06,44000,CA,Los Angeles
6705,"Chicago city, Illinois",2721914.0,1600000US1714000,17,14000,IL,Chicago
27481,"Houston city, Texas",2296253.0,1600000US4835000,48,35000,TX,Houston
1226,"Phoenix city, Arizona",1609456.0,1600000US0455000,04,55000,AZ,Phoenix
...,...,...,...,...,...,...,...
1283,"San Tan Valley CDP, Arizona",101207.0,1600000US0464210,04,64210,AZ,San Tan Valley CDP
12565,"Quincy city, Massachusetts",100981.0,1600000US2555745,25,55745,MA,Quincy
27214,"Edinburg city, Texas",100964.0,1600000US4822660,48,22660,TX,Edinburg
12498,"Lynn city, Massachusetts",100653.0,1600000US2537490,25,37490,MA,Lynn


In [22]:
tiger = gpd.read_file("../data/tl_2023_us_cbsa/tl_2023_us_cbsa.shp")
tiger = tiger.rename(columns={"GEOID": "geoid"})
msa = tiger.merge(df, on="geoid", how="inner")
tiger.columns


Index(['CSAFP', 'CBSAFP', 'geoid', 'GEOIDFQ', 'NAME', 'NAMELSAD', 'LSAD',
       'MEMI', 'MTFCC', 'ALAND', 'AWATER', 'INTPTLAT', 'INTPTLON', 'geometry'],
      dtype='object')

In [2]:
tracts = gpd.read_file("https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_Census_Tracts_for_2020_US_Census/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=pgeojson")
# explore tracts as a folium map with click to show info, no hover
tracts.columns

Index(['OBJECTID', 'CTLabel', 'BoroCode', 'BoroName', 'CT2020', 'BoroCT2020',
       'CDEligibil', 'NTAName', 'NTA2020', 'CDTA2020', 'CDTANAME', 'GEOID',
       'PUMA', 'Shape__Area', 'Shape__Length', 'geometry'],
      dtype='object')

In [9]:
import geopandas as gpd
import pandas as pd

# census tract shape file
tracts = gpd.read_file("https://services5.arcgis.com/GfwWNkhOj9bNBqoJ/arcgis/rest/services/NYC_Census_Tracts_for_2020_US_Census/FeatureServer/0/query?where=1=1&outFields=*&outSR=4326&f=pgeojson")

api_key = "d98445f15e3957fcd14495fcb31e126105797cfc"

census_cols = ",".join(["NAME", "TRACT", "GEO_ID", "B19013_001E","B22002_002E"])
counties = ",".join(["005", "047", "061", "085", "081"])  # Borough codes for NYC
NYS = "36"  # New York State


url = f"https://api.census.gov/data/2020/acs/acs5?get={census_cols}&for=tract:*&in=state:{NYS}&in=county:{counties}&key={api_key}"

df = pd.read_json(url)


# rename columns and drop the first row
df.columns = ['name', 'CT2020', 'GEOID', 'Median Household Income', 'Households - With Children', 'state', 'county', 'tract']
df = df[1:]

# clean the data
df['GEOID'] = df['GEOID'].str[-11:]
cols = ["GEOID"] + list(set(df.columns) - set(tracts.columns))
df['Median Household Income'] = df['Median Household Income'].astype(int)
df['Median Household Income'] = df['Median Household Income'].astype(float)
df['Households - With Children'] = df['Households - With Children'].astype(float)

df = df[df["Median Household Income"] > 0]


data = tracts.merge(df[cols], on="GEOID", how="inner")

data["popup"] = ui.popup(

m = data.explore(column="Median Household Income",
    tiles="cartodbpositron",
    cmap="seismic_r", 
    style_kwds={'fillOpacity': .5},
    scheme="FisherJenks",
    k=10,
    tooltip=False, 
    popup="Median Household Income")

m



In [20]:
data[['OBJECTID', 'CTLabel', 'CT2020', 'BoroCT2020',
       'CDEligibil', 'NTAName', 'NTA2020', 'CDTA2020', 'CDTANAME', 
       'Households - With Children', 'Median Household Income', 'tract',
       'county', 'name']]
income = pd.DataFrame(data.groupby("NTAName")["Median Household Income"].mean().sort_values(ascending=False)).reset_index()


Unnamed: 0,NTAName,Median Household Income
0,Upper East Side-Carnegie Hill,189470.357143
1,Financial District-Battery Park City,183866.142857
2,Tribeca-Civic Center,177522.600000
3,Brooklyn Heights,154802.833333
4,Greenwich Village,154185.428571
...,...,...
196,Brownsville,29075.428571
197,West Farms,28815.200000
198,Belmont,27838.714286
199,Tremont,25699.666667


In [None]:

folium.Choropleth(
    geo_data=data,
    data=data,  # Your data DataFrame
    name='Median Household Income',
    columns=['GEOID', 'Median Household Income'],  # Columns from your data
    key_on='feature.properties.GEOID',  # Key that links your data to the GeoJSON
    fill_color='YlGn',  # Color scheme, e.g., 'YlGn' for yellow-green
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Legend Title',
    popup="tract info"
).add_to(base_map)
base_map


In [None]:




#Renaming columns of data
dict = {df.columns[0]: 'name',
        df.columns[1]: 'CT2020',
        df.columns[2]: 'GEOID',
        df.columns[3]: 'Median Household Income',
        df.columns[4]: 'Households - With Children',
        df.columns[5]: 'state',
        df.columns[6]: 'county',
        df.columns[7]: 'tract'
}
#first row is only labels so it is removed
df = df.drop(0)
df.rename(columns=dict,inplace=True)
df['Median Household Income'] = df['Median Household Income'].astype(int)
# If "my_variable" is a string
df['GEOID'] = df['GEOID'].str[-11:]
df = df[df['Median Household Income'] > 0]
cols = ["GEOID"] + list(set(df.columns) - set(loc.columns))
data = loc.merge(df[cols], on="GEOID", how="inner")
data['Median Household Income'] = data['Median Household Income'].astype(float)
data['Households - With Children'] = data['Households - With Children'].astype(float)
data = data.drop(columns = ['CT2020','BoroCT2020','NTA2020','CDTA2020','CDEligibil','PUMA','tract','CTLabel','BoroName','BoroCode','CDTANAME'])
# ax = data.plot(figsize=(10, 10), alpha=0.5, edgecolor='k')
# # Plot the choropleth map based on 'Median Household Income'
# fig, ax = plt.subplots(1, 1, figsize=(10, 10))
# data.plot(column='Median Household Income', cmap='YlGn', legend=True, ax=ax)
# # Add a title
# ax.set_title("Median Household Income by Tract NYC")
# # Show the plot
# plt.show()
# Create a map centered on a specific location (adjust as needed)
base_map = folium.Map(location=[40.7128, -74.0060], zoom_start=10)
# make the popup info column
info = ["Median Household Income","NTAName","Households - With Children"]
data["tract info"] = data.apply(ui.popup(info), axis=1)
folium.Choropleth(
    geo_data=data,
    data=data,  # Your data DataFrame
    name='Median Household Income',
    columns=['GEOID', 'Median Household Income'],  # Columns from your data
    key_on='feature.properties.GEOID',  # Key that links your data to the GeoJSON
    fill_color='YlGn',  # Color scheme, e.g., 'YlGn' for yellow-green
    fill_opacity=0.7,
    line_opacity=0.2,
    legend_name='Legend Title',
    popup="tract info"
).add_to(base_map)
base_map

In [None]:
import pandas as pd

api_key ="put your api key here"

# url for population, at the county level, filtered for NY
url = f"https://api.census.gov/data/2019/acs/acs5?get=NAME,B08006_001E&for=county:*&in=state:36&key={api_key}"

df = pd.read_json(url)
df



In [None]:

# # Create a DataFrame
df = pd.DataFrame(data[1:], columns=data[0])
print(df.columns)
df
