In [1]:
#import packages
import pandas as pd
import numpy as np
import geopandas as gpd
import matplotlib
from matplotlib import pyplot as plt

In [2]:
#####################
#####################
### EBLL_Data #######
#####################
#####################

# read in data
# rename columns

In [3]:
# Load the CSV of elevated blood lead levels (EBLL) by California census tract.
# NOTE(review): this is an absolute, machine-specific path; the notebook will only
# run where this file exists at exactly this location.
EBLL_df = pd.read_csv(
    r'C:\Users\amarq\Python_Projects\raw_data\Lead Testing by CA Census Tract 2013 to 2018.csv'
)
# Show (rows, columns) as a quick check that the load succeeded.
EBLL_df.shape

(7632, 4)

In [4]:
# Map the verbose source headers to short, code-friendly column names.
EBLL_renaming = {
    'County - Census Tract': 'Tract',
    'Number of Children Under Age 6 with Elevated Lead Levels': 'Number_EBLL',
    'Number of Tests that Children in Medi-Cal Ages 1 and 2 Should have Received': 'Number_MediCal',
    'Number of Tests that Children in Medi-Cal Ages 1 and 2 Missed': 'Number_Not_Tested',
}

# Apply the mapping directly to EBLL_df (mutates the frame in place).
EBLL_df.rename(columns=EBLL_renaming, inplace=True)


In [7]:
# Preview the first few 'Tract' labels to see the string format before parsing it.
EBLL_df['Tract'].head()

0    Alameda County - Census Tract 4003
1    Alameda County - Census Tract 4004
2    Alameda County - Census Tract 4005
3    Alameda County - Census Tract 4006
4    Alameda County - Census Tract 4007
Name: Tract, dtype: object

In [8]:
#create a county column
# Labels look like "<county name> County - Census Tract <number>". Capture everything
# before " County - Census Tract" so multi-word counties keep their full name
# (splitting on single spaces and keeping the first token would turn
# "Contra Costa" into "Contra"). Rows that do not match the pattern become NaN.
EBLL_df['County'] = (
    EBLL_df['Tract']
    .str.extract(r'^\s*(.*?)\s+County\s+-\s+Census Tract\b', expand=False)
    .str.strip()
)


In [21]:
#create a census tract column
# The tract number is the token that follows "Census Tract" at the end of the label.
# Extracting it by pattern (rather than by a fixed token position) keeps it correct
# for multi-word county names, where the number is not the sixth space-separated token.
# Values stay as strings (e.g. '4003', '409.01'); non-matching rows become NaN.
EBLL_df['Census_Tract'] = EBLL_df['Tract'].str.extract(r'Census Tract\s+(\S+)\s*$', expand=False)

0    4003
1    4004
2    4005
3    4006
4    4007
Name: Census_Tract, dtype: object

In [23]:
# The combined 'Tract' label is redundant once County and Census_Tract have been
# derived from it, so remove it from EBLL_df in place.
del EBLL_df['Tract']

Unnamed: 0,Number_EBLL,Number_MediCal,Number_Not_Tested,County,Census_Tract
0,9,110,78,Alameda,4003
1,7,79,57,Alameda,4004
2,2,96,64,Alameda,4005
3,2,61,Redacted*,Alameda,4006
4,12,229,156,Alameda,4007
...,...,...,...,...,...
7627,3,194,106,Yuba,407
7628,1,211,132,Yuba,408
7629,3,305,188,Yuba,409.01
7630,1,377,258,Yuba,410


In [None]:
#keep only alameda county
# Build a boolean mask on the County column; indexing the frame with itself
# (EBLL_df[EBLL_df]) is not a row filter. .copy() gives an independent frame so
# later edits do not touch (or warn about) the statewide EBLL_df.
EBLL_ac__df = EBLL_df[EBLL_df['County'] == 'Alameda'].copy()

In [6]:
# Load the ACS 5-year (2018) table B25126 export for Alameda County.
# NOTE(review): absolute, machine-specific path.
Age_of_Structure_df = pd.read_csv(
    r'C:\Users\amarq\Python_Projects\raw_data\ACS_5Y_2018.B25126_AgeofStructureBuilt_Alameda County.csv'
)

In [7]:
# Goal of this cleanup: build total columns for age of structure, for renters and
# for all households combined.

# Step 1: collect every margin-of-error column (header starts with 'Margin') ...
moe_columns = [name for name in Age_of_Structure_df.columns if name.startswith('Margin')]
# ... and drop them, leaving the identifier and estimate columns in a new frame.
Building_Ages_df = Age_of_Structure_df.drop(columns=moe_columns)
Building_Ages_df.columns

Index(['GEO_ID', 'Geographic Area Name', 'Estimate!!Total',
       'Estimate!!Total!!Owner occupied',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 2014 or later',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 2010 to 2013',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 2000 to 2009',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 1990 to 1999',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 1980 to 1989',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 1970 to 1979',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 1960 to 1969',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 1950 to 1959',
       'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 1940 to 1949',
       'Estimate!

In [8]:
#creating summation columns: for each "year built" bucket, add up the matching ACS estimate columns
year = ['2014','2010','2000','1990','1980','1970','1960','1950','1940','1939']
year_names = ['built_' + yr for yr in year]           # total (owner + renter) per bucket
renter_names = ['built_renter_' + yr for yr in year]  # renter-only per bucket

# Only raw ACS columns (headers starting with 'Estimate') may feed the sums.
# The generated 'built_<year>' / 'built_renter_<year>' columns also contain the year
# string, so without this filter re-running the cell would fold the previous totals
# back into the new ones.
estimate_columns = [col for col in Building_Ages_df.columns if col.startswith('Estimate')]

for yr, total_name, renter_name in zip(year, year_names, renter_names): #loop over each year bucket
    built_columns = [col for col in estimate_columns if yr in col] #estimate columns for this bucket
    built_renter_columns = [col for col in built_columns if 'Renter' in col] #renter-occupied subset

    # row-wise sums across the selected columns
    Building_Ages_df[total_name] = Building_Ages_df.loc[:, built_columns].sum(axis='columns')
    Building_Ages_df[renter_name] = Building_Ages_df.loc[:, built_renter_columns].sum(axis='columns')

In [9]:
# Sanity check: the six individual "Built 2014 or later" estimate columns, summed
# by hand, should equal the total of the derived 'built_2014' column.
built_2014_parts = [
    'Estimate!!Total!!Owner occupied!!Householder 15 to 34 years!!Built 2014 or later',
    'Estimate!!Total!!Owner occupied!!Householder 35 to 64 years!!Built 2014 or later',
    'Estimate!!Total!!Owner occupied!!Householder 65 years and over!!Built 2014 or later',
    'Estimate!!Total!!Renter occupied!!Householder 15 to 34 years!!Built 2014 or later',
    'Estimate!!Total!!Renter occupied!!Householder 35 to 64 years!!Built 2014 or later',
    'Estimate!!Total!!Renter occupied!!Householder 65 years and over!!Built 2014 or later',
]
manual_total = sum(Building_Ages_df[part].sum() for part in built_2014_parts)
print(manual_total == Building_Ages_df['built_2014'].sum())


#it worked


True


In [10]:
# Keep only the identifiers, the two overall totals, and the derived per-year sums.
columns_to_keep = [
    'GEO_ID',
    'Geographic Area Name',
    'Estimate!!Total',
    'Estimate!!Total!!Renter occupied',
    *year_names,
    *renter_names,
]

# .copy() makes ba_sums_df independent of Building_Ages_df.
ba_sums_df = Building_Ages_df.loc[:, columns_to_keep].copy()

In [11]:
# List the columns of the trimmed frame to confirm the selection.
ba_sums_df.columns

Index(['GEO_ID', 'Geographic Area Name', 'Estimate!!Total',
       'Estimate!!Total!!Renter occupied', 'built_2014', 'built_2010',
       'built_2000', 'built_1990', 'built_1980', 'built_1970', 'built_1960',
       'built_1950', 'built_1940', 'built_1939', 'built_renter_2014',
       'built_renter_2010', 'built_renter_2000', 'built_renter_1990',
       'built_renter_1980', 'built_renter_1970', 'built_renter_1960',
       'built_renter_1950', 'built_renter_1940', 'built_renter_1939'],
      dtype='object')

In [12]:
# The fetch helper defined below comes from UC Berkeley's 2020 Data for Housing Team.
# Root URL of the Census site that hosts the boundary files.
BASE_GEO = "https://www2.census.gov/geo/tiger/"
# Local path the downloaded California tract boundaries are saved to (as GeoJSON).
California_Tracts_GDF = r'C:\Users\amarq\Python_Projects\out_data\calitracts.json'

def fetch_tract_geodata(BASE, year, state, out_file_name):
    '''
    Download Census tract cartographic boundaries and save them as GeoJSON.

    The zipped shapefile URL is assembled from ``BASE``, ``year`` and ``state``,
    read into a geodataframe with geopandas, written to ``out_file_name`` using
    the GeoJSON driver, and then returned.

    Parameters:
        BASE (str): base url for the data
        year (int): the year of data
        state (str): 2-digit state FIPS code
        out_file_name (str): file name for the output geojson

    Returns:
        gdf: a geodataframe of Census tract boundaries
    '''
    print("...fetching Census tract boundaries")

    # Relative path of the zipped 1:500k tract shapefile for this year and state.
    year_text = str(year)
    zip_path = f"GENZ{year_text}/shp/cb_{year_text}_" + state + "_tract_500k.zip"

    # geopandas reads the zipped shapefile directly from the URL.
    tracts = gpd.read_file(BASE + zip_path)

    # Persist the boundaries locally before handing them back.
    print(f"...saving tract boundaries to file: {out_file_name}")
    tracts.to_file(filename=out_file_name, driver='GeoJSON')
    print("saved")

    return tracts

In [13]:
# Fetch the 2018 tract boundaries for California (state FIPS '06') and save them locally.
tracts_gdf = fetch_tract_geodata(
    BASE=BASE_GEO,
    year=2018,
    state='06',
    out_file_name=California_Tracts_GDF,
)

...fetching Census tract boundaries
...saving tract boundaries to file: C:\Users\amarq\Python_Projects\out_data\calitracts.json
saved


In [14]:
# Restrict the statewide tracts to Alameda County (rows whose COUNTYFP is '001').
is_alameda = tracts_gdf['COUNTYFP'].eq('001')
ac_tracts_gdf = tracts_gdf.loc[is_alameda]

In [15]:
# Preview the first rows of the Alameda County tract geodataframe.
ac_tracts_gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
26,6,1,425101,1400000US06001425101,6001425101,4251.01,CT,590870,2045459,"POLYGON ((-122.31419 37.84231, -122.29923 37.8..."
27,6,1,428600,1400000US06001428600,6001428600,4286.0,CT,898967,1080420,"POLYGON ((-122.27993 37.76818, -122.27849 37.7..."
28,6,1,432600,1400000US06001432600,6001432600,4326.0,CT,1673450,0,"POLYGON ((-122.16751 37.72632, -122.16108 37.7..."
29,6,1,433200,1400000US06001433200,6001433200,4332.0,CT,3174901,0,"POLYGON ((-122.16667 37.71042, -122.15559 37.7..."
30,6,1,433900,1400000US06001433900,6001433900,4339.0,CT,816585,0,"POLYGON ((-122.12091 37.69998, -122.11723 37.7..."


In [17]:
# Join the ACS sums onto the Alameda County tract geometries by matching the ACS
# 'GEO_ID' to the tract geodataframe's 'AFFGEOID'. ac_tracts_gdf is kept on the
# left (calling) side of the merge.
# Step 1: rename GEO_ID to AFFGEOID (in place) so both frames share the key name.
ba_sums_df.rename(columns={'GEO_ID': 'AFFGEOID'}, inplace=True)
# Step 2: merge on the shared key; 'inner' is the default join type, spelled out here.
ba_sums_gdf = ac_tracts_gdf.merge(ba_sums_df, how='inner', on='AFFGEOID')

In [18]:
# Summarize the merged geodataframe: row count, column dtypes, and non-null counts.
ba_sums_gdf.info()

<class 'geopandas.geodataframe.GeoDataFrame'>
Int64Index: 360 entries, 0 to 359
Data columns (total 33 columns):
 #   Column                            Non-Null Count  Dtype   
---  ------                            --------------  -----   
 0   STATEFP                           360 non-null    object  
 1   COUNTYFP                          360 non-null    object  
 2   TRACTCE                           360 non-null    object  
 3   AFFGEOID                          360 non-null    object  
 4   GEOID                             360 non-null    object  
 5   NAME                              360 non-null    object  
 6   LSAD                              360 non-null    object  
 7   ALAND                             360 non-null    int64   
 8   AWATER                            360 non-null    int64   
 9   geometry                          360 non-null    geometry
 10  Geographic Area Name              360 non-null    object  
 11  Estimate!!Total                   360 non-null    