In [77]:
#bring in packages
import pandas as pd
import numpy as np
import datetime
import geopandas as gpd

import matplotlib # base python plotting library
%matplotlib inline  
import matplotlib.pyplot as plt # more plotting stuff 

# geocoding libraries
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut

from time import sleep


Basic Cleaning

In [2]:
#read in data saved to hard drive
df = pd.read_csv(rf'~\Python_Projects\raw_data\City of Oakland Housing Habitability Complaints 2013 to 2018_raw.csv')

In [3]:
#drop columns without information
columns_to_drop = ['Project Name','Short Notes','Unnamed: 8']
df.drop(columns = columns_to_drop, inplace = True)

In [4]:
#make the description column lower case to improve later search function
df['Description'] = df['Description'].str.lower()

In [5]:
# need to understand if there are any duplicates in the record number
# df['Record Number'].nunique()
# there are 14789 records
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14789 entries, 0 to 14788
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Filed Date     14789 non-null  object
 1   Record Number  14789 non-null  object
 2   Record Type    14789 non-null  object
 3   Address        14788 non-null  object
 4   Description    14779 non-null  object
 5   Status         14782 non-null  object
dtypes: object(6)
memory usage: 693.4+ KB


In [6]:
#drop all rows that have a null description since that's our variable of interest
df.dropna(subset = ['Description'],inplace= True)
df.info()
#14779 records left

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14779 entries, 0 to 14788
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Filed Date     14779 non-null  object
 1   Record Number  14779 non-null  object
 2   Record Type    14779 non-null  object
 3   Address        14778 non-null  object
 4   Description    14779 non-null  object
 5   Status         14772 non-null  object
dtypes: object(6)
memory usage: 808.2+ KB


In [7]:
#drop the one record that doesn't have an address
df.dropna(subset = ['Address'],inplace= True)
df.info()
#14778 records

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14778 entries, 0 to 14788
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Filed Date     14778 non-null  object
 1   Record Number  14778 non-null  object
 2   Record Type    14778 non-null  object
 3   Address        14778 non-null  object
 4   Description    14778 non-null  object
 5   Status         14771 non-null  object
dtypes: object(6)
memory usage: 808.2+ KB


In [8]:
#turn the date into a python datetime format
df['Filed Date'] = pd.to_datetime(df['Filed Date'])

In [9]:
#create new column of just the year for ease of use
df['Year'] = pd.DatetimeIndex(df['Filed Date']).year

In [10]:
df.groupby('Year').count()
#reduced records over time, (Note from Alex: I think this is related to a loss of inspection staff,
# shouldn't interpret it as fewer issues)

Unnamed: 0_level_0,Filed Date,Record Number,Record Type,Address,Description,Status
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013,4338,4338,4338,4338,4338,4338
2014,2367,2367,2367,2367,2367,2365
2015,1975,1975,1975,1975,1975,1974
2016,2016,2016,2016,2016,2016,2015
2017,2294,2294,2294,2294,2294,2291
2018,1788,1788,1788,1788,1788,1788


In [11]:
# dropping 2013 to match the 2014-2018 ACS 5 year sample data
df = df.loc[df['Year'] != 2013]

Exploring the distributions of repeat violators

In [12]:
df['Address'].describe()
# there are 2628 records where addresses are repeated 

count                             10440
unique                             8190
top       344 13TH ST, Oakland CA 94612
freq                                 54
Name: Address, dtype: object

In [13]:
#trying to understand how those 2628 records are distributed 
# dups.df = 
duplicates = df.groupby('Address').count()
duplicates = duplicates[duplicates['Filed Date'] > 1]

In [14]:
duplicates
#1761 addresses have repeat violations

Unnamed: 0_level_0,Filed Date,Record Number,Record Type,Description,Status,Year
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"0 OUTLOOK AVE, Oakland CA 94608",2,2,2,2,2,2
"0 SHEPHERD CANYON RD, Oakland CA 94611",2,2,2,2,2,2
"1 LAKESIDE DR, Oakland CA 94612",3,3,3,3,3,3
"1 NORTH HILL CT, Oakland CA 94618",2,2,2,2,2,2
"100 9TH ST, Oakland CA 94607",2,2,2,2,2,2
...,...,...,...,...,...,...
"9928 HOLLY ST, Oakland CA 94603",2,2,2,2,2,2
"9939 INTERNATIONAL BLVD, Oakland CA 94603",5,5,5,5,5,5
"9943 VOLTAIRE AVE, Oakland CA 94603",3,3,3,3,3,3
"9960 MACARTHUR BLVD, Oakland CA 94605",3,3,3,3,3,3


In [15]:
duplicates.groupby('Filed Date').count()
#most duplicate addresses are 2-4 times, only a few have many complaints

Unnamed: 0_level_0,Record Number,Record Type,Description,Status,Year
Filed Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,1039,1039,1039,1039,1039
3,249,249,249,249,249
4,73,73,73,73,73
5,34,34,34,34,34
6,14,14,14,14,14
7,10,10,10,10,10
8,1,1,1,1,1
9,1,1,1,1,1
10,2,2,2,2,2
11,5,5,5,5,5


Creating 'Healthy Housing Issue' Variable
- search the description variable for existence of any of a list of key words

In [16]:
# make variables if any of the following words show up in the string. mold, mildew, water, 
# leak, plumbing, moist, seepage, black, ventilation, flood, rot, condensation, corroded, and mildrew 
key_words = ['leak', 'plumbing', 'moist', 'seepage', 'black', 'ventilation', 'flood', 'rot', 'condensation',\
             'corroded', 'mildrew', 'mold', 'mildew', 'water', 'asbestos']

for i in range(len(key_words)): #loop through the key words to find if any of the words exist 
    df[key_words[i]] = df['Description'].str.find(key_words[i]) #create a new column for each word that
    #populates -1 if no, index if yes
    
for i in range(len(key_words)): #iterate through the length of the list of key words
    df[key_words[i]] = df[key_words[i]].apply(lambda x: 0 if x == -1 else 1) #apply a transformation to each column
    #to turn -1 into 0s and other numbers into 1s
        

In [17]:
#create new dataset that only keeps those records with a True in them.
#hh is shorthand for 'healthy housing'
df['HH_Complaint'] = 0
for i in range(len(key_words)):
    df['HH_Complaint'] += df[key_words[i]]
hh_df = df.loc[df['HH_Complaint'] > 0].copy(deep=True)
#note from Alex: I'm not entirely sure why this needs to be a deep copy?
hh_df.columns

Index(['Filed Date', 'Record Number', 'Record Type', 'Address', 'Description',
       'Status', 'Year', 'leak', 'plumbing', 'moist', 'seepage', 'black',
       'ventilation', 'flood', 'rot', 'condensation', 'corroded', 'mildrew',
       'mold', 'mildew', 'water', 'asbestos', 'HH_Complaint'],
      dtype='object')

In [18]:
#then, cleaning up the new df to get rid of the individual columns
hh_df.drop(columns = key_words, inplace = True)
hh_df.columns    

Index(['Filed Date', 'Record Number', 'Record Type', 'Address', 'Description',
       'Status', 'Year', 'HH_Complaint'],
      dtype='object')

In [19]:
#appears that 2886 of the total 10440 are housing habitability complaints
print(df.shape)
print(hh_df.shape)

(10440, 23)
(3494, 8)


In [20]:
#condensing hh_df to only focus on each address, removing any duplicates
hh_df = hh_df.groupby('Address').sum().copy()
hh_df.reset_index(inplace = True)

In [21]:
hh_df.head()

Unnamed: 0,Address,Year,HH_Complaint
0,"0 LINDEN ST, Oakland CA 94602",2014,1
1,"0 THORNDALE DR, Oakland CA 94603",2015,1
2,"100 9TH ST, 208, Oakland CA 94607",2018,1
3,"100 9TH ST, Oakland CA 94607",4036,2
4,"1000 43RD ST, UNIT 7, Oakland CA 94608",2017,3


In [22]:
hh_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2886 entries, 0 to 2885
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Address       2886 non-null   object
 1   Year          2886 non-null   int64 
 2   HH_Complaint  2886 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 67.8+ KB


Preparing Addresses for Geocoding

In [25]:
# need to turn the addresses into a recoded address column that doesn't have any unit information

#first, split the address before and after oakland
address_split = hh_df['Address'].str.split(', Oakland', expand=True)
#second, split the first half of the address along thet 
address_split_first = address_split[0].str.split(',', expand=True)
#then, join the first part of 'address_split_first' and the second part of 'address_split' with Oakland in the middle
hh_df['Address_recode'] = address_split_first[0] + ', Oakland,' + address_split[1]

Geocoding Data Frame to Turn Addresses into Lat/Long

In [36]:
# following this: https://stackoverflow.com/questions/31414481/new-column-with-coordinates-using-geopy-pandas
geolocator = Nominatim(user_agent="OaklandGeocoder")


In [37]:
#create blank columns for the lat and long
hh_df['latitude'] = np.nan
hh_df['longitude'] = np.nan

In [72]:
#first time I tried this it did many of them but eventually got a timed out error from the host
#timed out error solved via this (https://gis.stackexchange.com/questions/173569/avoid-time-out-error-nominatim-geopy-open-street-maps)
#second time there was a "'NoneType' object has not attribute 'latitude'" error after 521 rows
# the error was becuase 1605 M L KING JR WY, Oakland CA doesn't bring anything back. Fixed by adding a 
# if location is None code section
def geocode_location(location, attempt=1, max_attempts=5):
   #function needed to deal with the occasional timed out error as Nominatim can be slow
   try:
        return geolocator.geocode(location)
   except GeocoderTimedOut:
        if attempt <= max_attempts:
            return do_geocode(address, attempt=attempt+1)
        raise 

#populating the lat and long for the hh data
for i in range(1026,2886): #loop over each row
    location = geocode_location(hh_df.at[i, 'Address_recode']) #grab the geolocation of each row, assign to location
    if location is None: #needed to skip over the few options that couldn't be coded
        sleep(1) #insert one second pause so they don't lock you out per Nominatim's usage agreement
    else:        
        hh_df.at[i, 'latitude'] = location.latitude #assign the latitude 
        hh_df.at[i, 'longitude'] = location.longitude #assign the longitude
        sleep(1) #insert one second pause so they don't lock you out per Nominatim's usage agreement



Turning DataFrame into Census Tract Level Data
- should include columns such as number of total complaints and number of healthy housing complaints
- should easily merge into the ACS census tract data

In [75]:
#function from UC Berkeley's 2020 Data for Housing Team to get Census tracts
# Census FTP Site
BASE_GEO = "https://www2.census.gov/geo/tiger/"
California_Tracts_GDF = rf'C:\Users\amarq\Python_Projects\out_data\calitracts.json'

def fetch_tract_geodata(BASE, year, state, out_file_name):
    '''
    The function below fetches Census cartographic boundary files from the Census FTP site. 
    The function reads the zipped geodata from the FTP url and saves it in memory as a pandas 
    geodataframe. Then it saves the geodata to a geojson file.
    
    Parameters:
        BASE (str): base url for the data
        year (int): the year of data
        state (str): 2-digit state FIPS code
        out_file_name (str): file name for the output geojson
        
    Returns:
    
        gdf: a geodataframe of Census tract boundaries
    
    '''
    # Start function
    print("...fetching Census tract boundaries")
    
    # Build enhanced base url
    param_geo = "GENZ"+str(year)+ "/shp/cb_"+str(year)+"_"+state+"_tract_500k.zip"
    
    # Read in tracts geodata
    tracts = gpd.read_file(BASE+param_geo)
    
    # Save tract boundaries to file as a geojson
    print(f"...saving tract boundaries to file: {out_file_name}")
    tracts.to_file(driver='GeoJSON', filename=out_file_name)
    print("saved")
    
    # Return geodataframe
    return tracts

In [78]:
#fetching california tracks
tracts_gdf = fetch_tract_geodata(BASE_GEO,2018,'06',California_Tracts_GDF)

...fetching Census tract boundaries
...saving tract boundaries to file: C:\Users\amarq\Python_Projects\out_data\calitracts.json
saved


In [79]:
#create alameda county only census tracts
ac_tracts_gdf = tracts_gdf[tracts_gdf['COUNTYFP'] == '001']

In [80]:
#read in the Oakland boundaries
oakland_bounds = gpd.read_file(rf'C:\Users\amarq\Python_Projects\raw_data\City of Oakland City Limits.geojson')

In [81]:
#confirm the CRS are the same, they are not
print(ac_tracts_gdf.crs)
print(oakland_bounds.crs)

epsg:4269
epsg:4326


In [82]:
#changing the oakland bounds CRS to be the same as census
oakland_bounds.to_crs(ac_tracts_gdf.crs,inplace=True)
oakland_bounds.crs == ac_tracts_gdf.crs

True

In [84]:
# adding the census tract information into our main dataset
ac_tracts_gdf.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER,geometry
26,6,1,425101,1400000US06001425101,6001425101,4251.01,CT,590870,2045459,"POLYGON ((-122.31419 37.84231, -122.29923 37.8..."
27,6,1,428600,1400000US06001428600,6001428600,4286.0,CT,898967,1080420,"POLYGON ((-122.27993 37.76818, -122.27849 37.7..."
28,6,1,432600,1400000US06001432600,6001432600,4326.0,CT,1673450,0,"POLYGON ((-122.16751 37.72632, -122.16108 37.7..."
29,6,1,433200,1400000US06001433200,6001433200,4332.0,CT,3174901,0,"POLYGON ((-122.16667 37.71042, -122.15559 37.7..."
30,6,1,433900,1400000US06001433900,6001433900,4339.0,CT,816585,0,"POLYGON ((-122.12091 37.69998, -122.11723 37.7..."


In [85]:
#turn our dataframe into a GPD gdf
hh_gdf = gpd.GeoDataFrame(hh_df, geometry=gpd.points_from_xy(hh_df.longitude, hh_df.latitude))

In [91]:
#set the CRS
hh_gdf.crs = ac_tracts_gdf.crs
print(hh_gdf.crs)

In [93]:
#perform inner spatial join
hh_tracts_gdf = gpd.sjoin(ac_tracts_gdf,hh_gdf)

In [96]:
hh_tracts_gdf.shape
#so, lost about a hundred rows that weren't properly geocoded.

(2776, 17)

In [97]:
#saving final geocoded dataframe
hh_tracts_gdf.to_csv(r'C:\Users\amarq\Python_Projects\raw_data\Oakland Housing Habitability Complaints_2014to2018_geocoded.csv', index = False)