In [1]:
import logging
import os
import sys
import time

import altair as alt
import pandas as pd
from census import Census

#logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# Census API access
# The key is read from the CENSUS_API_KEY environment variable so the credential
# is never stored in the notebook or its version history.
api_key = os.environ.get("CENSUS_API_KEY")
if not api_key:
    raise RuntimeError(
        "Set the CENSUS_API_KEY environment variable to your Census API key "
        "(request one at https://api.census.gov/data/key_signup.html)."
    )
c = Census(key=api_key)

# Conditions

What are the conditions of the census tracts within different urban cores across Contra Costa County?

## Race
What are the racial and ethnic percentages of each census tract? Are there any census tracts with MOEs that are too high to include?

In [3]:
def process_acs_df(in_df):
    '''
    Group the smaller racial/ethnic categories and compute each group's share of
    the total population, with margins of error (MOEs).

    Inputs:
    - in_df (pd.DataFrame): a DataFrame containing race/ethnicity data from Table B03002,
      with columns 'total', 'nh_white', 'nh_black', 'nh_native', 'nh_asian', 'nh_pi',
      'nh_1other', 'nh_multi' and 'hispanic', each paired with a '<name>_moe' column

    Outputs:
    A modified copy of in_df (in_df itself is left untouched) with these columns added:
    - 'nh_other' and 'nh_other_moe': the native, Pacific Islander, one-other-race and
      multiracial groups combined
    - 'pct_<group>' and 'pct_<group>_moe' for nh_white, nh_black, nh_asian, nh_other
      and hispanic
    '''

    # Work on a copy so we don't destroy the original data.
    df = in_df.copy()

    ### CLEAN UNUSUAL MOES
    # -555555555 shows up in place of some MOEs; treat it as 0 so it does not
    # swamp the arithmetic below. (The replacement is applied to the whole frame.)
    df = df.replace(-555555555.0, 0)

    ### AGGREGATE ESTIMATES
    # Columns to combine into "nh_other", and their matching MOE columns
    nh_other_cols = ['nh_native', 'nh_pi', 'nh_1other', 'nh_multi']
    nh_other_moes = [f'{col}_moe' for col in nh_other_cols]

    # The combined estimate is a plain sum...
    df['nh_other'] = df[nh_other_cols].sum(axis='columns')

    # ...and the combined MOE is the root of the summed squares of the component MOEs
    df['nh_other_moe'] = (df[nh_other_moes]**2).sum(axis='columns')**0.5

    ### CALCULATE PROPORTIONS
    for group in ['nh_white', 'nh_black', 'nh_asian', 'nh_other', 'hispanic']:
        # Proportion of the total population in this group
        share = df[group] / df['total']
        df[f'pct_{group}'] = share

        # MOE of a proportion: sqrt(moe_num^2 - p^2 * moe_den^2) / den
        numerator_term = df[f'{group}_moe']**2
        denominator_term = share**2 * df['total_moe']**2
        radicand = numerator_term - denominator_term

        # When the value under the square root is negative the proportion formula
        # breaks down (it used to produce NaN here). Census guidance is to use the
        # ratio formula instead, which adds the two terms rather than subtracting.
        radicand = radicand.where(radicand >= 0, numerator_term + denominator_term)

        df[f'pct_{group}_moe'] = radicand**0.5 / df['total']

    return df  # Don't forget to *return* the modified DataFrame

In [32]:
# Census variable code -> friendly column name for Table B03002
# (codes ending in E are estimates, codes ending in M are margins of error)
race_variables = {
    'NAME': 'NAME',
    'GEO_ID': 'GEO_ID',
    'B03002_001E': 'total',
    'B03002_001M': 'total_moe',
    'B03002_003E': 'nh_white',
    'B03002_003M': 'nh_white_moe',
    'B03002_004E': 'nh_black',
    'B03002_004M': 'nh_black_moe',
    'B03002_005E': 'nh_native',
    'B03002_005M': 'nh_native_moe',
    'B03002_006E': 'nh_asian',
    'B03002_006M': 'nh_asian_moe',
    'B03002_007E': 'nh_pi',
    'B03002_007M': 'nh_pi_moe',
    'B03002_008E': 'nh_1other',
    'B03002_008M': 'nh_1other_moe',
    'B03002_009E': 'nh_multi',
    'B03002_009M': 'nh_multi_moe',
    'B03002_012E': 'hispanic',
    'B03002_012M': 'hispanic_moe',
}

# Pull ACS 2022 5-year Table B03002 twice -- once for every tract in
# Contra Costa County (state 06, county 013) and once for the county as a
# whole -- renaming the variable codes to friendly names as each frame is built.
df_tracts = pd.DataFrame(
    c.acs5.get(
        list(race_variables),
        {'for': 'tract:*', 'in': 'state:06 county:013'},
        year=2022,
    )
).rename(columns=race_variables)
df_county = pd.DataFrame(
    c.acs5.get(
        list(race_variables),
        {'for': 'county:013', 'in': 'state:06'},
        year=2022,
    )
).rename(columns=race_variables)

# Combine the smaller races, then add percentages and percentage MOEs
df_tracts_processed = process_acs_df(df_tracts)
df_county_processed = process_acs_df(df_county)

# Export settings, for various maps and tables.
# Percentage columns available to export:
#     'pct_nh_white', 'pct_nh_white_moe',
#     'pct_nh_black', 'pct_nh_black_moe',
#     'pct_nh_asian', 'pct_nh_asian_moe',
#     'pct_nh_other', 'pct_nh_other_moe',
#     'pct_hispanic', 'pct_hispanic_moe',
df_out = df_tracts_processed.loc[:, [
    'NAME', 'GEO_ID', 'tract',
    'pct_hispanic', 'pct_hispanic_moe',
]]
# Second column: state (06) + county (013) prefix joined to the tract code
df_out.insert(loc=1, column="GEOID", value="06013" + df_out["tract"])
# Export to csv
df_out.to_csv('tract_hispanic_pct.csv', index=False)

### Race per tract

First, a small-multiples grouped bar chart of race per census tract. The MOE error bars show which individual census tracts have too much error to be considered on their own.

In [11]:
# Reshape the processed tract data into tidy (long) form for charts

# Display labels used by the inline chart
race_export_names = {
    'pct_nh_white':'White', 
    'pct_nh_white_moe':'White_moe',
    'pct_nh_black':'Black', 
    'pct_nh_black_moe':'Black_moe',
    'pct_nh_asian':'Asian', 
    'pct_nh_asian_moe':'Asian_moe',
    'pct_nh_other':'Other', 
    'pct_nh_other_moe':'Other_moe',
    'pct_hispanic':'Hispanic', 
    'pct_hispanic_moe':'Hispanic_moe',
}

# Long-form estimates: one row per (tract, race)
id_vars = ["tract"]
value_vars = ['pct_nh_white','pct_nh_black','pct_nh_asian','pct_nh_other','pct_hispanic']
rt_out = df_tracts_processed.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="race",
    value_name="race_pct",
)

# Long-form MOEs, melted in the same column order so the rows line up with rt_out
value_vars_moe = [f'{col}_moe' for col in value_vars]
rt_out_moe = df_tracts_processed.melt(
    id_vars=id_vars,
    value_vars=value_vars_moe,
    var_name="race_moe",
    value_name="race_pct_moe",
)

# Attach just the MOE values next to the estimates (both melts share the same
# row index), which avoids a duplicate tract column, then sort by tract
rt_out = pd.concat([rt_out, rt_out_moe[["race_pct_moe"]]], axis=1).sort_values(by="tract")
# Swap the raw column names in the data for their display labels
rt_out = rt_out.replace(race_export_names)
# Export to csv
rt_out.to_csv("race_per_tract.csv", index=False)

In [5]:
# Horizontal bars: one per race, showing its share of the tract population
bars = alt.Chart(rt_out).mark_bar(size=10).encode(
    x=alt.X(
        'race_pct:Q',
        scale=alt.Scale(domain=(0,1)),
        axis=alt.Axis(format='%'),
        title="Percentage of population",
    ),
    y='race:N',
    color=alt.Color('race:N', legend=None),
).properties(width=300, height=70)

# Error bars: extend each estimate by its margin of error
error_bars = alt.Chart().mark_errorbar().encode(
    x=alt.X('race_pct:Q', scale=alt.Scale(zero=False)),
    xError='race_pct_moe:Q',
    y='race:N',
)

# Layer the two marks and repeat the layered chart once per tract
alt.layer(bars, error_bars, data=rt_out).facet(row='tract:N')

### Race per county

An identical grouped bar chart but for the entire county, as a relative comparison.

In [6]:
# Reshape the processed county data into tidy (long) form for the chart.
# value_vars, value_vars_moe and race_export_names are reused from the tract cell.
id_vars = ["county"]
rc_out = df_county_processed.melt(
    id_vars=id_vars,
    value_vars=value_vars,
    var_name="race",
    value_name="race_pct",
)
rc_out_moe = df_county_processed.melt(
    id_vars=id_vars,
    value_vars=value_vars_moe,
    var_name="race_moe",
    value_name="race_pct_moe",
)

# Keep only the columns the chart needs: race, its share, and the share's MOE
rc_out = pd.concat(
    [rc_out[["race", "race_pct"]], rc_out_moe[["race_pct_moe"]]],
    axis=1,
)
# Swap the raw column names in the data for their display labels
rc_out = rc_out.replace(race_export_names)

In [7]:
# Horizontal bars: one per race, showing its share of the county population
bars = alt.Chart(
    rc_out,
    title="Race and Ethnicity for Contra Costa County",
).mark_bar(size=10).encode(
    x=alt.X(
        'race_pct:Q',
        scale=alt.Scale(domain=(0,1)),
        axis=alt.Axis(format='%'),
        title="Percentage of population",
    ),
    y='race:N',
    color=alt.Color('race:N', legend=None),
).properties(width=300, height=75)

# Error bars: extend each estimate by its margin of error
error_bars = alt.Chart().mark_errorbar().encode(
    x=alt.X('race_pct:Q', scale=alt.Scale(zero=False)),
    xError='race_pct_moe:Q',
    y='race:N',
)

# Layer the two marks into a single county-wide chart
alt.layer(bars, error_bars, data=rc_out)

## Poverty by Race, per tract

In [9]:
def process_race_tracts(in_df, min_total=25):

    '''
    Compute the share of people below and above the poverty line, with margins
    of error (MOEs).

    Input:
    - in_df: dataframe from B17020 with columns 'total', 'below_poverty' and
      'above_poverty', each paired with a '<name>_moe' column
    - min_total: rows whose 'total' is below this value get NaN for every
      percentage and percentage MOE (default 25)

    Output:
    A modified copy of in_df with 'pct_below_poverty', 'pct_above_poverty' and
    their '_moe' columns added. in_df itself is left untouched.
    '''

    df = in_df.copy()

    ### CLEAN UNUSUAL MOES
    # -555555555 shows up in place of some MOEs; treat it as 0 so it does not
    # swamp the arithmetic below. (The replacement is applied to the whole frame.)
    df = df.replace(-555555555.0, 0)

    # Rows with too few people in the universe to report a percentage
    too_small = df['total'] < min_total

    ### CALCULATE PROPORTIONS
    # Iterate through the poverty statuses
    for group in ['below_poverty', 'above_poverty']:
        # Proportion of the total in this group
        share = df[group] / df['total']
        df[f'pct_{group}'] = share

        # MOE of a proportion: sqrt(moe_num^2 - p^2 * moe_den^2) / den
        numerator_term = df[f'{group}_moe']**2
        denominator_term = share**2 * df['total_moe']**2
        radicand = numerator_term - denominator_term

        # When the value under the square root is negative the proportion formula
        # breaks down (it used to produce NaN here). Census guidance is to use the
        # ratio formula instead, which adds the two terms rather than subtracting.
        radicand = radicand.where(radicand >= 0, numerator_term + denominator_term)

        df[f'pct_{group}_moe'] = radicand**0.5 / df['total']

        # NaN-out any tracts of too-low absolute n
        df.loc[too_small, [f'pct_{group}', f'pct_{group}_moe']] = float('NaN')

        #NaN-out any tracts of too-low moe
        #df.loc[df[f'pct_{group}_moe']/df[f'pct_{group}'] > .4, f'pct_{group}'] = float('NaN')
        #df.loc[df[f'pct_{group}_moe']/df[f'pct_{group}'] > .4, f'pct_{group}_moe'] = float('NaN')

    return df

In [10]:
# Contra Costa poverty by race, from the race-specific B17020 tables:
#   B17020H  White Non-Hispanic
#   B17020I  Hispanic
#   B17020B  Black
#   B17020D  Asian
# The variable codes below pull the Hispanic table (B17020I).
poverty_in_columns = {
    "GEO_ID": "GEO_ID",
    "NAME": "NAME",
    "B17020I_001E": "total",
    "B17020I_001M": "total_moe",
    "B17020I_002E": "below_poverty",
    "B17020I_002M": "below_poverty_moe",
    "B17020I_010E": "above_poverty",
    "B17020I_010M": "above_poverty_moe"
}
poverty_out_columns = ['NAME', 'tract', 'pct_below_poverty', 'pct_below_poverty_moe']

# Fetch every tract in Contra Costa County and rename the variable codes in one go
df_poverty_tracts_2022 = pd.DataFrame(
    c.acs5.get(
        list(poverty_in_columns),
        {'for': 'tract:*', 'in': 'state:06 county:013'},
        year=2022,
    )
).rename(columns=poverty_in_columns)

# Add poverty percentages and their MOEs
df_pov_2022_proc = process_race_tracts(df_poverty_tracts_2022)

# Keep the output columns and add a GEOID (state 06 + county 013 + tract code)
# as the second column
df_p_out = df_pov_2022_proc.loc[:, poverty_out_columns]
df_p_out.insert(loc=1, column="GEOID", value='06013' + df_p_out["tract"])
# Display the tracts ordered by poverty share (the sorted frame is shown, not stored)
df_p_out.sort_values(by=["pct_below_poverty"])

# df_p_out.to_csv("poverty_tracts/bp_hispanic_ccc.csv", index=False)

Unnamed: 0,NAME,GEOID,tract,pct_below_poverty,pct_below_poverty_moe
156,Census Tract 3540.01; Contra Costa County; Cal...,06013354001,354001,0.000000,0.104839
9,Census Tract 3031.04; Contra Costa County; Cal...,06013303104,303104,0.000000,0.012322
227,Census Tract 3852; Contra Costa County; Califo...,06013385200,385200,0.000000,0.302326
8,Census Tract 3020.14; Contra Costa County; Cal...,06013302014,302014,0.000000,0.009121
4,Census Tract 3020.09; Contra Costa County; Cal...,06013302009,302009,0.000000,0.011942
...,...,...,...,...,...
58,Census Tract 3160; Contra Costa County; Califo...,06013316000,316000,0.542683,0.308319
170,Census Tract 3551.23; Contra Costa County; Cal...,06013355123,355123,0.583333,0.561172
145,Census Tract 3511.01; Contra Costa County; Cal...,06013351101,351101,,
240,Census Tract 9800; Contra Costa County; Califo...,06013980000,980000,,


## Tenure
What percent of households rent vs. own?

In [8]:
def process_acs_tenure(in_df):
    '''
    Compute the owner-occupied and renter-occupied shares, with margins of
    error (MOEs).

    Inputs:
    - in_df: a DataFrame containing owner/renter data from Table B25003, with
      columns 'total', 'owner_occupied' and 'renter_occupied', each paired with
      a '<name>_moe' column

    Outputs:
    A modified copy of in_df with 'pct_owner_occupied', 'pct_renter_occupied'
    and their '_moe' columns added. in_df itself is left untouched.
    '''
    df = in_df.copy()

    ### CLEAN UNUSUAL MOES
    # -555555555 shows up in place of some MOEs; treat it as 0 so it does not
    # swamp the arithmetic below. (The replacement is applied to the whole frame.)
    df = df.replace(-555555555.0, 0)

    ### CALCULATE PROPORTIONS
    # Iterate through the tenure
    for group in ['owner_occupied', 'renter_occupied']:
        # Proportion of the total in this group
        share = df[group] / df['total']
        df[f'pct_{group}'] = share

        # MOE of a proportion: sqrt(moe_num^2 - p^2 * moe_den^2) / den
        numerator_term = df[f'{group}_moe']**2
        denominator_term = share**2 * df['total_moe']**2
        radicand = numerator_term - denominator_term

        # When the value under the square root is negative the proportion formula
        # breaks down (it used to produce NaN here). Census guidance is to use the
        # ratio formula instead, which adds the two terms rather than subtracting.
        radicand = radicand.where(radicand >= 0, numerator_term + denominator_term)

        df[f'pct_{group}_moe'] = radicand**0.5 / df['total']

    return df

In [9]:
# Census variable code -> friendly column name for Table B25003
# (codes ending in E are estimates, codes ending in M are margins of error)
tenure_variables = {
    'NAME': 'NAME',
    'GEO_ID': 'GEO_ID',
    'B25003_001E': 'total',
    'B25003_001M': 'total_moe',
    'B25003_002E': 'owner_occupied',
    'B25003_002M': 'owner_occupied_moe',
    'B25003_003E': 'renter_occupied',
    'B25003_003M': 'renter_occupied_moe',
}

# Pull ACS 2022 5-year Table B25003 for the tracts selected by c_tracts and for
# Contra Costa County as a whole, renaming the variable codes as each frame is built.
# NOTE(review): c_tracts is not defined in any cell visible here -- confirm it is
# set before this cell runs on a fresh kernel (the other tract queries in this
# notebook pass 'tract:*').
t_tracts = pd.DataFrame(
    c.acs5.get(
        list(tenure_variables),
        {'for': c_tracts, 'in': 'state:06 county:013'},
        year=2022,
    )
).rename(columns=tenure_variables)
t_county = pd.DataFrame(
    c.acs5.get(
        list(tenure_variables),
        {'for': 'county:013', 'in': 'state:06'},
        year=2022,
    )
).rename(columns=tenure_variables)

# Add owner/renter percentages and their MOEs
t_tracts_processed = process_acs_tenure(t_tracts)
t_county_processed = process_acs_tenure(t_county)

# Stack the tract rows and the county row into one frame for export
t_out = pd.concat([t_tracts_processed, t_county_processed])
# Export to csv
#t_out.to_csv('tenure_pct.csv', index=False)