In [71]:
import requests
import json
import io
import pandas as pd
import numpy as np
import geopandas as gpd
import os

Step 1: Get 2010 Chicago census tracts into a pandas DataFrame \
Step 2: Get 2020 Chicago census tracts into a pandas DataFrame \
Step 3: Determine the relationship between 2010 and 2020 tracts. \
Step 4: Limit to just the census tracts in 2020 and 2010 that have a one-to-one relationship

In [72]:
# step 1
def extract_chi_census_tracts_2010():
    """
    Load the 2010 census tract geojson and return a trimmed table.

    Reads raw_data/census_tracts_2010.geojson with geopandas, keeps only
    the tract identifier/name columns plus the geometry column, and wraps
    the selection in a pandas DataFrame.

    Returns:
        pandas DataFrame with columns tractce10, geoid10, name10,
        namelsad10 and geometry.
    """
    tracts_gdf = gpd.read_file("raw_data/census_tracts_2010.geojson")

    # Identifier and name fields, plus the tract shapes themselves.
    subset = tracts_gdf[
        ["tractce10", "geoid10", "name10", "namelsad10", "geometry"]
    ]

    return pd.DataFrame(subset)

In [94]:
# step 2
def extract_chi_census_tracts_2020():
    """
    Load the 2020 census tract list and return a trimmed table.

    Reads raw_data/chi_ct_2020.csv with every column parsed as a string,
    keeps the ct_chicago and community_name columns, and renames
    ct_chicago to geoid20.

    Returns:
        pandas DataFrame with columns geoid20 and community_name.
    """
    # Stand-in source until the real 2020 tract data is available.
    source_csv = "raw_data/chi_ct_2020.csv"
    raw = pd.read_csv(source_csv, dtype=str)

    trimmed = raw.loc[:, ["ct_chicago", "community_name"]]

    return trimmed.rename(columns={"ct_chicago": "geoid20"})

In [95]:
# All Chicago census tracts from 2010 (this load is currently disabled).
# chi_census_tracts_2010 = extract_chi_census_tracts_2010()

# All Chicago census tracts from 2020: a table with the columns
# geoid20 and community_name.
chi_census_tracts_2020 = extract_chi_census_tracts_2020()

In [96]:
# Display the 2020 tract table for a visual sanity check.
chi_census_tracts_2020

Unnamed: 0,geoid20,community_name
0,17031010100,Rogers Park
1,17031010201,Rogers Park
2,17031010202,Rogers Park
3,17031010300,Rogers Park
4,17031010400,Rogers Park
...,...,...
796,17031844700,North Lawndale
797,17031980000,O'Hare
798,17031980100,Garfield Ridge
799,17043840000,


In [112]:
# step 3 / 4
# https://www.census.gov/geographies/reference-files/time-series/geo/relationship-files.2020.html#tract


def census_tracts_2020_2010_relationships():
    """
    Flag how each Chicago 2020 census tract relates to 2010 census tracts.

    Downloads the national 2020-to-2010 tract relationship file
    (pipe-delimited text, every column read as a string), keeps the rows
    whose GEOID_TRACT_20 appears in extract_chi_census_tracts_2020(), and
    labels each remaining row:

    * "one"  - the 2020 tract appears in exactly one relationship row
    * "many" - the 2020 tract appears in more than one relationship row

    Every Chicago row is returned, not only the "one" rows; callers that
    want just the single-match tracts filter on relation == "one".

    Returns:
        pandas DataFrame with columns GEOID_TRACT_20, GEOID_TRACT_10 and
        relation, on a fresh 0..n-1 index, in the file's original row order.

    Raises:
        requests.HTTPError: if the download responds with an error status.
        requests.Timeout: if the server stops responding for 60 seconds.
    """
    url = "https://www2.census.gov/geo/docs/maps-data/data/rel2020/tract/tab20_tract20_tract10_natl.txt"

    # Fail loudly on an HTTP error rather than parsing an error page as data,
    # and do not hang forever on a stalled connection.
    response = requests.get(url, timeout=60)
    response.raise_for_status()
    relationships = pd.read_csv(
        io.StringIO(response.content.decode("utf-8")), sep="|", dtype=str
    )

    chi_census_tracts_2020 = extract_chi_census_tracts_2020()

    # filter to just tracts in chicago, keyed on the 2020 tract id
    chi_geoid20 = set(chi_census_tracts_2020["geoid20"].astype(str))
    in_chicago = relationships["GEOID_TRACT_20"].isin(chi_geoid20)
    chi_relationships = relationships.loc[
        in_chicago, ["GEOID_TRACT_20", "GEOID_TRACT_10"]
    ].copy()

    # Number of relationship rows each 2020 tract appears in, aligned row by
    # row. transform("size") is used instead of value_counts().reset_index()
    # because the latter names its columns differently across pandas versions.
    rows_per_tract_20 = chi_relationships.groupby("GEOID_TRACT_20")[
        "GEOID_TRACT_20"
    ].transform("size")

    chi_relationships["relation"] = np.where(rows_per_tract_20 == 1, "one", "many")

    return chi_relationships.reset_index(drop=True)

In [111]:
# NOTE(review): in the cells shown here, `relationships` is only assigned as
# a local variable inside census_tracts_2020_2010_relationships(); nothing
# visible defines it at notebook scope, so this cell appears to rely on
# leftover kernel state and would raise NameError on a fresh kernel - confirm.
relationships

Unnamed: 0,OID_TRACT_20,GEOID_TRACT_20,NAMELSAD_TRACT_20,AREALAND_TRACT_20,AREAWATER_TRACT_20,MTFCC_TRACT_20,FUNCSTAT_TRACT_20,OID_TRACT_10,GEOID_TRACT_10,NAMELSAD_TRACT_10,AREALAND_TRACT_10,AREAWATER_TRACT_10,MTFCC_TRACT_10,FUNCSTAT_TRACT_10,AREALAND_PART,AREAWATER_PART
0,20790540092527,01001020100,Census Tract 201,9825304,28435,G5020,S,20740540092527,01001020100,Census Tract 201,9827271,28435,G5020,S,9820448,28435
1,20790540092527,01001020100,Census Tract 201,9825304,28435,G5020,S,20740540092534,01001020200,Census Tract 202,3325674,5669,G5020,S,4856,0
2,20790540092534,01001020200,Census Tract 202,3320818,5669,G5020,S,20740540092534,01001020200,Census Tract 202,3325674,5669,G5020,S,3320818,5669
3,20790540092528,01001020300,Census Tract 203,5349271,9054,G5020,S,20740540092528,01001020300,Census Tract 203,5349271,9054,G5020,S,5349271,9054
4,20790540092529,01001020400,Census Tract 204,6384282,8408,G5020,S,20740540092529,01001020400,Census Tract 204,6384282,8408,G5020,S,6384282,8408
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126445,207904252102449,78030961100,Census Tract 9611,3479232,0,G5020,S,20740228304757,78030961200,Census Tract 9612,1017540,802134,G5020,S,14104,0
126446,20790228304757,78030961200,Census Tract 9612,1101324,802134,G5020,S,20740228304707,78030960400,Census Tract 9604,11709358,413661,G5020,S,217781,0
126447,20790228304757,78030961200,Census Tract 9612,1101324,802134,G5020,S,207404252102449,78030961100,Census Tract 9611,3513895,0,G5020,S,203,0
126448,20790228304757,78030961200,Census Tract 9612,1101324,802134,G5020,S,20740228304757,78030961200,Census Tract 9612,1017540,802134,G5020,S,883340,802134


In [128]:
# Chicago 2020 -> 2010 tract relationships, each row flagged "one" or "many".
# Note: this call downloads the national relationship file.
chi_ct_relationships = census_tracts_2020_2010_relationships()
# Uncomment to inspect only the rows flagged "one":
# chi_ct_relationships[chi_ct_relationships['relation']== 'one']

In [None]:
def final_10_20_tracts():
    """
    Build the table of 2020 tracts that match a single 2010 tract.

    Keeps the relationship rows flagged "one" by
    census_tracts_2020_2010_relationships() and left-joins the 2010 tract
    geometry onto them via GEOID_TRACT_10 == geoid10. Rows whose 2010
    tract is missing from the 2010 tract table keep a null geometry.

    Returns:
        pandas DataFrame with columns GEOID_TRACT_20, GEOID_TRACT_10 and
        geometry.
    """
    tracts_2010 = extract_chi_census_tracts_2010()[["geoid10", "geometry"]]

    tract_relationships = census_tracts_2020_2010_relationships()
    # only 1:1 tracts in 2020
    tract_relationships_1_1 = tract_relationships[
        tract_relationships["relation"] == "one"
    ]

    # add on geometries to file, preference 2020 geo (2010 for now)
    final_df = tract_relationships_1_1.merge(
        tracts_2010, how="left", left_on="GEOID_TRACT_10", right_on="geoid10"
    )

    # geoid10 duplicates GEOID_TRACT_10 after the join, and relation is
    # constant ("one") after the filter above, so neither is kept.
    return final_df.drop(columns=["geoid10", "relation"])

In [145]:
# which 2010 census tracts did we keep, and which did we drop

def tracts_2010_key():
    """
    Summarise, per 2010 tract, the relation flag it ended up with.

    Collapses the Chicago relationship rows to one row per GEOID_TRACT_10
    by taking the maximum of the relation labels. The labels are compared
    as strings, and "one" sorts after "many", so a 2010 tract is labelled
    "one" when at least one of its rows is "one" and "many" otherwise.
    The result is joined to the 2010 tract table, and rows with any
    missing value are dropped, which removes 2010 tracts that have no
    match in that table.

    Returns:
        pandas DataFrame with columns geoid10, relation, namelsad10 and
        geometry.
    """
    tracts_2010 = extract_chi_census_tracts_2010()
    relationships = census_tracts_2020_2010_relationships()

    # One row per 2010 tract carrying its largest relation label.
    relation_by_tract_10 = (
        relationships.groupby("GEOID_TRACT_10")["relation"].max().reset_index()
    )

    joined = relation_by_tract_10.merge(
        tracts_2010,
        how="left",
        left_on="GEOID_TRACT_10",
        right_on="geoid10",
    )

    # Drop rows with any NaN in the kept columns.
    return joined[["geoid10", "relation", "namelsad10", "geometry"]].dropna()

In [1]:
# convert to py file
#!jupyter nbconvert --to script data_extract_census.ipynb

[NbConvertApp] Converting notebook data_extract_census.ipynb to script
[NbConvertApp] Writing 4493 bytes to data_extract_census.py
