In [103]:
import time
import pandas as pd
from census import Census
import altair as alt
import logging, sys
import numpy as np

#logging.basicConfig(stream=sys.stderr, level=logging.DEBUG)

# Census API access
api_key = "639f2aedf7c17b164527591258cda00b25249b4b"
c = Census(key=api_key)

# Conditions

What are the conditions of the census tracts within different urban cores across Contra Costa County?

## Race
What is the racial and ethnic percentages of each census tract? Are there any census tracts with MOEs that are too high to include?

In [133]:
def process_acs_df(in_df):
    '''
    Inputs:
    - in_df (pd.DataFrame): a DataFrame containing race/ethnicity data from Table B03002

    Outputs:
    A modified version of in_df with certain races grouped and proportions and margins
    of error calculated
    '''
    
    # Start by making a copy of in_df, so we don't destroy the original data.
    # df is also a nice short name we can use throughout this function
    df = in_df.copy()

    ### CLEAN UNUSUAL MOES
    df = df.replace(-555555555.0, 0)

    ### AGGREGATE ESTIMATES
    # Define a list of columns to combine into "nh_other"
    nh_other_cols = ['nh_native', 'nh_pi', 'nh_1other', 'nh_multi']

    # Calculate nh_other
    df['nh_other'] = df[nh_other_cols].sum(axis='columns')

    # Use a list comprehension to append "_moe" to all strings in our list
    nh_other_moes = [f'{col}_moe' for col in nh_other_cols]

    # Use this new list to calculate nh_other_moe    
    df['nh_other_moe'] = (df[nh_other_moes]**2).sum(axis='columns')**0.5

    ### CALCULATE PROPORTIONS
    # Iterate through the racial/ethnic groups we're using
    for group in ['nh_white', 'nh_black', 'nh_asian', 'nh_other', 'hispanic']:
        # Calculate the proportion for this group
        df[f'pct_{group}'] = df[group] / df['total']
    
        # Calculate the MOE for this proportion
        df[f'pct_{group}_moe'] = (df[f'{group}_moe']**2 - df[f'pct_{group}']**2 * df['total_moe']**2)**0.5 / df['total']

        #NaN-out any tracts of too-low absolute n
        df.loc[df.total < 25, f'pct_{group}'] = float('NaN')
        df.loc[df.total < 25, f'pct_{group}_moe'] = float('NaN')
        
        #NaN-out any tracts of too-low moe
        df[f'pct_{group}_moe_ratio'] = df[f'pct_{group}_moe']/df[f'pct_{group}']
        df.loc[df[f'pct_{group}_moe_ratio'] > .4, f'pct_{group}'] = float('NaN')
        df.loc[df[f'pct_{group}_moe_ratio'] > .4, f'pct_{group}_moe'] = float('NaN')
    
    return df

In [36]:
# Define the dict of variables to pull and rename
race_in_variables = {
    'NAME': 'NAME',
    'GEO_ID': 'GEO_ID',
    'B03002_001E': 'total',
    'B03002_001M': 'total_moe',
    'B03002_003E': 'nh_white',
    'B03002_003M': 'nh_white_moe',
    'B03002_004E': 'nh_black',
    'B03002_004M': 'nh_black_moe',
    'B03002_005E': 'nh_native',
    'B03002_005M': 'nh_native_moe',
    'B03002_006E': 'nh_asian',
    'B03002_006M': 'nh_asian_moe',
    'B03002_007E': 'nh_pi',
    'B03002_007M': 'nh_pi_moe',
    'B03002_008E': 'nh_1other',
    'B03002_008M': 'nh_1other_moe',
    'B03002_009E': 'nh_multi',
    'B03002_009M': 'nh_multi_moe',
    'B03002_012E': 'hispanic',
    'B03002_012M': 'hispanic_moe',
}

def get_race_out_variables(in_race):
    race_out_variables = [
        'NAME', 'GEO_ID', 'tract', 'total',
        'pct_nh_'+in_race, 'pct_nh_'+in_race+'_moe',
        'pct_nh_'+in_race+'_moe_ratio'
    ]
    return race_out_variables

In [None]:
# Get ACS 2022 5 year Table B03002 for CC County totals, for grouped bar chart
# county:013
df_county = pd.DataFrame(
    c.acs5.get(
        list(race_in_variables.keys()),
        {'for': 'county:013', 'in': 'state:06'},
        year=2022
    )
)

# Rename the DataFrame columns again using the dict
df_county = df_county.rename(columns=race_in_variables)

# Combo races, generate percentages and percentage MOEs
df_county_processed = process_acs_df(df_county)

In [142]:
# Get ACS 2022 5 year Table B03002 for CC County totals, for per-tract county maps
df_tracts = pd.DataFrame(
    c.acs5.get(
        list(race_in_variables.keys()),
        {'for': 'tract:*', 'in': 'state:06 county:013'},
        year=2022
    )
)
# Rename the DataFrame columns again using the dict
df_tracts = df_tracts.rename(columns=race_in_variables)

# Combo races, generate percentages and percentage MOEs
df_tracts_processed = process_acs_df(df_tracts)

df_tracts_out = df_tracts_processed[get_race_out_variables('other')]
df_tracts_out.insert(1, "GEOID", "06013" + df_tracts_out["tract"])
#export to csv
#df_tracts_out.to_csv('race_tracts/tract_other_pct.csv', index=False)

TypeError: unsupported operand type(s) for ** or pow(): 'str' and 'int'

## Race per Place

In [81]:
#Variables for bar charts
race_export_names = {
    'pct_nh_white':'White', 
    'pct_nh_white_moe':'White_moe',
    'pct_nh_black':'Black', 
    'pct_nh_black_moe':'Black_moe',
    'pct_nh_asian':'Asian', 
    'pct_nh_asian_moe':'Asian_moe',
    'pct_nh_other':'Other', 
    'pct_nh_other_moe':'Other_moe',
    'pct_hispanic':'Latinae', 
    'pct_hispanic_moe':'Latinae_moe',
}
place_export_names = {
    "60620": "Richmond",
    "52162": "North Richmond",
    "21796": "El Cerrito",
    "57456": "Pittsburg",
    "04415": "Bay Point",
    "39122": "Lafayette"
}

value_vars = ['pct_nh_white','pct_nh_black','pct_nh_asian','pct_nh_other','pct_hispanic']
value_vars_moe = ['pct_nh_white_moe','pct_nh_black_moe','pct_nh_asian_moe','pct_nh_other_moe','pct_hispanic_moe']

In [129]:
def get_place_race(in_place):

    '''
    Inputs: place string you want in CCC from the census, race string you want from the census
    Output: dataframe with renamed columns
    '''
    
    # Get ACS 2022 5 year Table B03002 in select places
    df = pd.DataFrame(
        c.acs5.get(
            list(race_in_variables.keys()),
            {'for': 'place:'+in_place, 'in': 'state:06'},
            year=2022
        )
    )

    # Rename the DataFrame columns again using the dict
    df = df.rename(columns=race_in_variables)
    # There is a census bug??? nh_white_moe is coming back as an object not a float64 on every call to this table
    df['nh_white_moe'] = df['nh_white_moe'].astype(np.float64)
    
    return df

In [82]:
def make_tidy_data(df, place_var):

    '''
    Massage the data so bar charts with error lines work
    Inputs: data, the type of place you are pivoting the data on (either tract or place)
    '''

    #Manipulate the processed tract data into tidy data for charts
    rt_out = pd.melt(df,
                     [place_var],
                     value_vars,
                     var_name="race",
                     value_name="race_pct"
                    )
    
    rt_out_moe = pd.melt(df,
                     [place_var],
                     value_vars_moe,
                     var_name="race_moe",
                     value_name="race_pct_moe"
                    )
    
    rt_out = pd.concat([rt_out, rt_out_moe], axis=1)
    # concat does not remove the duplicate placetype column, so remove it, then clean and sort
    rt_out = rt_out.loc[:,~rt_out.columns.duplicated()].copy().drop(columns=["race_moe"]).sort_values(by=place_var)
    #clean up names ugh this is not the best way to do this??
    rt_out = rt_out.replace(race_export_names)
    rt_out = rt_out.replace(place_export_names)
    
    return rt_out

In [101]:
def combine_places(place_1, place_2):

    '''
    Combine two census places that have already been processed for pcts but haven't been changed into chart data format yet
    '''
    df_1 = place_1.copy()
    df_2 = place_2.copy()
    df = pd.DataFrame()
    
    #Total the totals
    df["total"] = df_1["total"]+df_2["total"]
    df["total_moe"] = np.sqrt(df_1['total_moe']**2 + df_2['total_moe']**2)
    
    ### CALCULATE PROPORTIONS
    # Iterate through the racial/ethnic groups we're using
    for group in ['nh_white', 'nh_black', 'nh_asian', 'nh_other', 'hispanic']:

        df[group] = df_1[group] + df_2[group]
        df[f'{group}_moe'] = np.sqrt(df_1[f'{group}_moe']**2 + df_2[f'{group}_moe']**2)
        
        # Calculate the proportion for this group
        df[f'pct_{group}'] = df[group] / df['total']
    
        # Calculate the MOE for this proportion
        df[f'pct_{group}_moe'] = (df[f'{group}_moe']**2 - df[f'pct_{group}']**2 * df['total_moe']**2)**0.5 / df['total']
    
    return df

In [130]:
df_richmond = get_place_race("60620")
df_n_richmond = get_place_race("52162") 
df_elcerrito = get_place_race("21796")
df_pittsburg = get_place_race("57456")
df_baypoint = get_place_race("04415")
df_lafayette = get_place_race("39122")

In [140]:
#Get Table B08122 for Richmond City and North Richmond
# N Richmond: 52162
# Richmond City: 60620
# El Cerrito: 21796
# Pittsburg: 57456
# Bay Point: 04415
# Lafayette: 39122

#Richmond
df_r_processed = process_acs_df(df_richmond)
#N Richmond
df_n_processed = process_acs_df(df_n_richmond)
#Combine Richmond and N Richmond
df_r_combo = combine_places(df_r_processed, df_n_processed)
df_r_combo.insert(0, "place", "Richmond")
df_richmond_out = make_tidy_data(df_r_combo, "place")

#El Cerrito
df_ec_processed = process_acs_df(df_elcerrito)
df_elcerrito_out = make_tidy_data(df_ec_processed, "place")
#Lafayette
df_l_processed = process_acs_df(df_lafayette)
df_lafayette_out = make_tidy_data(df_l_processed, "place")

#Pittsburg
df_p_processed = process_acs_df(df_pittsburg)
#Bay Point
df_bp_processed = process_acs_df(df_baypoint)
df_pbp_combo = combine_places(df_p_processed, df_bp_processed)
df_pbp_combo.insert(0, "place", "Pittsburg/Bay Point")
df_pbp_out = make_tidy_data(df_pbp_combo, "place")

df_place_bars = pd.concat([df_richmond_out,df_pbp_out,df_elcerrito_out,df_lafayette_out])

In [139]:
df_pbp_out

Unnamed: 0,place,race,race_pct,race_pct_moe
0,Pittsburg/Bay Point,White,0.172664,0.01575
1,Pittsburg/Bay Point,Black,0.136998,0.012513
2,Pittsburg/Bay Point,Asian,0.157162,0.01459
3,Pittsburg/Bay Point,Other,0.049144,0.008863
4,Pittsburg/Bay Point,Latinae,0.484032,0.026803


In [141]:
# Make a grouped bar chart of all the places
bars = alt.Chart(df_place_bars).mark_bar(size=10).encode(
    x=alt.X('race_pct:Q').scale(domain=(0,1)).axis(format='%').title("Percentage of population"),
    y='race:N',
    color=alt.Color('race:N').legend(None)
).properties(
    width=300,
    height=70
)

#Show MOE
error_bars = alt.Chart().mark_errorbar().encode(
    x=alt.X('race_pct:Q').scale(zero=False),
    xError=('race_pct_moe:Q'),
    y='race:N'
)

alt.layer(bars, error_bars, data=df_place_bars).facet(
    row='place:N'
)

### Race per county

An identical grouped bar chart but for the entire county, as a relative comparison.

In [6]:
# Melt county processed data into tidy data for chart
id_vars = ["county"]
# value_vars = ['pct_nh_white','pct_nh_black','pct_nh_asian','pct_nh_other','pct_hispanic']
rc_out = pd.melt(df_county_processed,
                 id_vars,
                 value_vars,
                 var_name="race",
                 value_name="race_pct"
                )
# value_vars_moe = ['pct_nh_white_moe','pct_nh_black_moe','pct_nh_asian_moe','pct_nh_other_moe','pct_hispanic_moe']
rc_out_moe = pd.melt(df_county_processed,
                 id_vars,
                 value_vars_moe,
                 var_name="race_moe",
                 value_name="race_pct_moe"
                )

rc_out = pd.concat([rc_out, rc_out_moe], axis=1)
# drop columns the chart doesn't need
rc_out = rc_out.drop(columns=["county","race_moe"])
#clean up names ugh this is not the best way to do this
rc_out = rc_out.replace(race_export_names)

In [7]:
# Make a grouped bar chart of census tracts
bars = alt.Chart(rc_out, title="Race and Ethnicity for Contra Costa County").mark_bar(size=10).encode(
    x=alt.X('race_pct:Q').scale(domain=(0,1)).axis(format='%').title("Percentage of population"),
    y='race:N',
    color=alt.Color('race:N').legend(None)
).properties(
    width=300,
    height=75
)

#Show MOE
error_bars = alt.Chart().mark_errorbar().encode(
    x=alt.X('race_pct:Q').scale(zero=False),
    xError=('race_pct_moe:Q'),
    y='race:N'
)

alt.layer(bars, error_bars, data=rc_out)

## Poverty by Race, per tract

In [143]:
def process_poverty_tracts(in_df):

    '''
    Input: dataframe from B17020
    Output:
    A modified version of in_df with proportions and margins of error calculated
    '''
    
    df = in_df.copy()

    ### CLEAN UNUSUAL MOES
    df = df.replace(-555555555.0, 0)

    ### CALCULATE PROPORTIONS
    # Iterate through the tenure
    for group in ['below_poverty', 'above_poverty']:
        # Calculate the proportion for this group
        df[f'pct_{group}'] = df[group] / df['total']
    
        # Calculate the MOE for this proportion
        df[f'pct_{group}_moe'] = (df[f'{group}_moe']**2 - df[f'pct_{group}']**2 * df['total_moe']**2)**0.5 / df['total']
  
        #NaN-out any tracts of too-low absolute n
        df.loc[df.total < 25, f'pct_{group}'] = float('NaN')
        df.loc[df.total < 25, f'pct_{group}_moe'] = float('NaN')
        
        #NaN-out any tracts of too-low moe
        df[f'pct_{group}_moe_ratio'] = df[f'pct_{group}_moe']/df[f'pct_{group}']
        df.loc[df[f'pct_{group}_moe_ratio'] > .4, f'pct_{group}'] = float('NaN')
        df.loc[df[f'pct_{group}_moe_ratio'] > .4, f'pct_{group}_moe'] = float('NaN')

    return df

In [144]:
#variables

poverty_out_columns = ['NAME', 'tract', 'total', 'pct_below_poverty', 'pct_below_poverty_moe']

def get_poverty_in_columns(race_letter_in):
    poverty_in_columns = {
        "GEO_ID": "GEO_ID",
        "NAME": "NAME",
        "B17020"+race_letter_in+"_001E": "total",
        "B17020"+race_letter_in+"_001M": "total_moe",
        "B17020"+race_letter_in+"_002E": "below_poverty",
        "B17020"+race_letter_in+"_002M": "below_poverty_moe",
        "B17020"+race_letter_in+"_010E": "above_poverty",
        "B17020"+race_letter_in+"_010M": "above_poverty_moe"
    }
    return poverty_in_columns

In [148]:
# Contra Costa Poverty by Race
# Everybody B17020
# White Non-Hispanic table B17020H
# Hispanic B17020I
# Black B17020B
# Asian B17020D

poverty_in_columns = get_poverty_in_columns("")

df_poverty_tracts_2022 = pd.DataFrame(
    c.acs5.get(
        list(poverty_in_columns.keys()),
        {'for': 'tract:*', 'in': 'state:06 county:013'},
        year=2022
    )
)
df_poverty_tracts_2022 = df_poverty_tracts_2022.rename(columns=poverty_in_columns)

df_pov_2022_proc = process_poverty_tracts(df_poverty_tracts_2022)

df_p_out = df_pov_2022_proc[poverty_out_columns]
df_p_out.insert(1, "GEOID", '06013' + df_p_out["tract"])

df_p_out.to_csv("poverty_tracts/bp_all_ccc.csv", index=False)