In [3]:
#bring in packages
import datetime
import os

import pandas as pd
import numpy as np

import matplotlib # base python plotting library
%matplotlib inline  
import matplotlib.pyplot as plt # more plotting stuff 

# geocoding libraries
from geopy.geocoders import GoogleV3
import geopy.distance
import googlemaps

Basic Cleaning

In [4]:
# Load the raw complaints export stored under the user's home directory.
# Forward slashes keep the path valid on Windows, macOS and Linux alike;
# pandas expands the leading '~' to the home directory when opening the file.
RAW_DATA_PATH = '~/Python_Projects/raw_data/City of Oakland Housing Habitability Complaints 2013 to 2018_raw.csv'
df = pd.read_csv(RAW_DATA_PATH)

In [5]:
# Remove columns that carry no information.
# Rebinding (instead of inplace=True) together with errors='ignore' lets this
# cell be re-run safely: columns that are already gone are simply skipped.
columns_to_drop = ['Project Name', 'Short Notes', 'Unnamed: 8']
df = df.drop(columns=columns_to_drop, errors='ignore')

In [6]:
# Normalise the free-text description to lower case so that the keyword
# search further down does not depend on capitalisation.
df = df.assign(Description=df['Description'].str.lower())

In [7]:
# Check whether 'Record Number' uniquely identifies a complaint: the two
# numbers printed below are equal only when no record number is duplicated.
print(df['Record Number'].nunique(), 'unique record numbers in', len(df), 'rows')

# Column overview (row count, non-null counts, dtypes).
# The raw file holds 14789 rows.
df.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14789 entries, 0 to 14788
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Filed Date     14789 non-null  object
 1   Record Number  14789 non-null  object
 2   Record Type    14789 non-null  object
 3   Address        14788 non-null  object
 4   Description    14779 non-null  object
 5   Status         14782 non-null  object
dtypes: object(6)
memory usage: 693.4+ KB


In [8]:
# The description text is the variable of interest, so rows without one
# cannot be used and are removed.
df = df.dropna(subset=['Description'])
df.info()
# 14779 records remain

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14779 entries, 0 to 14788
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Filed Date     14779 non-null  object
 1   Record Number  14779 non-null  object
 2   Record Type    14779 non-null  object
 3   Address        14778 non-null  object
 4   Description    14779 non-null  object
 5   Status         14772 non-null  object
dtypes: object(6)
memory usage: 808.2+ KB


In [9]:
# A single remaining record has no address; remove it as well.
df = df.dropna(subset=['Address'])
df.info()
# 14778 records remain

<class 'pandas.core.frame.DataFrame'>
Int64Index: 14778 entries, 0 to 14788
Data columns (total 6 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   Filed Date     14778 non-null  object
 1   Record Number  14778 non-null  object
 2   Record Type    14778 non-null  object
 3   Address        14778 non-null  object
 4   Description    14778 non-null  object
 5   Status         14771 non-null  object
dtypes: object(6)
memory usage: 808.2+ KB


In [10]:
# Parse the 'Filed Date' strings into pandas datetime values.
df = df.assign(**{'Filed Date': pd.to_datetime(df['Filed Date'])})

In [11]:
# Pull the calendar year out of the filing date into its own column for
# easier grouping and filtering.
df['Year'] = df['Filed Date'].dt.year

In [12]:
# Non-null count of every column per filing year.
# Fewer records in later years (note from Alex: probably caused by a loss of
# inspection staff -- it should not be read as fewer housing issues).
df.groupby('Year').agg('count')

Unnamed: 0_level_0,Filed Date,Record Number,Record Type,Address,Description,Status
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2013,4338,4338,4338,4338,4338,4338
2014,2367,2367,2367,2367,2367,2365
2015,1975,1975,1975,1975,1975,1974
2016,2016,2016,2016,2016,2016,2015
2017,2294,2294,2294,2294,2294,2291
2018,1788,1788,1788,1788,1788,1788


Exploring the distributions of repeat violators

In [13]:
# Summary of the address column: number of rows, number of distinct addresses,
# the most frequent address and how often it occurs.
df['Address'].describe()
# 14778 rows - 12150 unique addresses = 2628 rows that repeat an address
# already seen (i.e. complaints beyond the first one filed for an address)

count                             14778
unique                            12150
top       344 13TH ST, Oakland CA 94612
freq                                 54
Name: Address, dtype: object

In [14]:
# How are the repeated addresses distributed?
# Count the rows per address, then keep only addresses that occur more than
# once; after the count, 'Filed Date' holds the number of complaints per address.
# NOTE(review): addresses are matched as exact strings and the output shows
# mixed formats ('OAKLAND CA' vs 'Oakland CA 94612'), so repeats may be
# under-counted -- confirm whether the addresses need normalising first.
duplicates = (
    df.groupby('Address')
    .count()
    .loc[lambda per_address: per_address['Filed Date'] > 1]
)

In [15]:
# Show the per-address counts for the addresses that occur more than once.
duplicates
# 1761 addresses have more than one complaint on record

Unnamed: 0_level_0,Filed Date,Record Number,Record Type,Description,Status,Year
Address,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"0 17TH ST, OAKLAND CA",2,2,2,2,2,2
"0 BALBOA DR, OAKLAND CA",3,3,3,3,3,3
"0 E 8TH ST, OAKLAND CA",2,2,2,2,2,2
"0 ELLINGTON WY, OAKLAND CA",2,2,2,2,2,2
"0 HIGH ST, OAKLAND CA",2,2,2,2,2,2
...,...,...,...,...,...,...
"9943 VOLTAIRE AVE, Oakland CA 94603",3,3,3,3,3,3
"9945 C ST, OAKLAND CA",2,2,2,2,2,2
"9950 GIBRALTAR RD, OAKLAND CA",2,2,2,2,2,2
"9960 MACARTHUR BLVD, Oakland CA 94605",3,3,3,3,3,3


In [16]:
# In `duplicates` the 'Filed Date' column holds the number of complaints per
# address, so grouping on it shows how many addresses share each repeat count.
# Most repeated addresses appear 2-4 times; only a few have many complaints.
duplicates.groupby('Filed Date').agg('count')

Unnamed: 0_level_0,Record Number,Record Type,Description,Status,Year
Filed Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2,1325,1325,1325,1325,1325
3,282,282,282,282,282
4,78,78,78,78,78
5,34,34,34,34,34
6,15,15,15,15,15
7,11,11,11,11,11
8,1,1,1,1,1
9,1,1,1,1,1
10,2,2,2,2,2
11,5,5,5,5,5


Creating 'Healthy Housing Issue' Variable
- search the description variable for existence of any of a list of key words

In [17]:
# Build one 0/1 indicator column per keyword: 1 when the keyword occurs anywhere
# in the (lower-cased) description, 0 otherwise.
# 'mildrew' is kept on purpose as a misspelling of 'mildew' seen in the data.
# NOTE(review): this is plain substring matching, so short keywords such as
# 'rot' or 'black' will also hit inside longer, unrelated words -- confirm that
# this is acceptable for the analysis.
key_words = ['leak', 'plumbing', 'moist', 'seepage', 'black', 'ventilation', 'flood', 'rot', 'condensation',
             'corroded', 'mildrew', 'mold', 'mildew', 'water', 'asbestos']

for word in key_words:
    # regex=False: treat the keyword as literal text, not as a pattern.
    # na=False: a missing description counts as "keyword absent" (0) rather
    # than being flagged as a hit.
    df[word] = df['Description'].str.contains(word, regex=False, na=False).astype(int)

In [18]:
#quick look at which words had the most hits
# TODO: still need to figure this out -- maybe a pivot table or a fancier groupby? (summing the 0/1 keyword columns, e.g. df[key_words].sum(), may be enough)

In [24]:
# Build the 'healthy housing' (hh) subset: records whose description matched at
# least one keyword.
# HH_Complaint is the number of distinct keywords found in the description
# (row-wise sum of the 0/1 keyword columns), so any value > 0 marks a hit.
df['HH_Complaint'] = df[key_words].sum(axis=1)

# .copy() makes hh_df an independent frame; without it, later modifications of
# hh_df act on a slice of df and trigger pandas' SettingWithCopyWarning.
hh_df = df.loc[df['HH_Complaint'] > 0].copy()
hh_df.columns

Index(['Filed Date', 'Record Number', 'Record Type', 'Address', 'Description',
       'Status', 'Year', 'leak', 'plumbing', 'moist', 'seepage', 'black',
       'ventilation', 'flood', 'rot', 'condensation', 'corroded', 'mildrew',
       'mold', 'mildew', 'water', 'asbestos', 'HH_Complaint'],
      dtype='object')

In [27]:
# The individual keyword indicator columns are no longer needed on the
# healthy-housing subset; only the combined HH_Complaint count is kept.
# Rebinding avoids an in-place change on a frame derived from df, and
# errors='ignore' lets the cell be re-run after the columns are already gone.
hh_df = hh_df.drop(columns=key_words, errors='ignore')
hh_df.columns


Index(['Filed Date', 'Record Number', 'Record Type', 'Address', 'Description',
       'Status', 'Year', 'HH_Complaint'],
      dtype='object')

In [28]:
# At least 4144 of the 14778 complaints (about 28%) are flagged as healthy
# housing complaints by the keyword search.
# First line printed: shape of the full frame; second line: shape of the subset.
print(df.shape, hh_df.shape, sep='\n')

(14778, 23)
(4144, 8)


Geocoding Data Frame to Turn Addresses into Lat/Long

In [39]:
# following this tutorial https://pyshark.com/geocoding-in-python/
# The Google API key is a secret and must never be written into the notebook
# (or pushed to GitHub). It is read from the GOOGLE_MAPS_API_KEY environment
# variable instead; set that variable before starting Jupyter.
API = os.environ.get('GOOGLE_MAPS_API_KEY', '')
if not API:
    # Fail here with a clear message rather than later with an opaque
    # "request was denied" error from the geocoding service.
    raise RuntimeError('Set the GOOGLE_MAPS_API_KEY environment variable to a valid Google API key.')
geolocator = GoogleV3(api_key=API)


In [40]:
# Smoke test: send the first healthy-housing address to the geocoder.
# The call goes out to the Google service, so it needs a valid API key.
first = hh_df['Address'].iat[0]
location = geolocator.geocode(query=first)

GeocoderQueryError: Your request was denied.

Turning DataFrame into Census Tract Level Data
- should include columns such as number of total complaints and number of healthy housing complaints
- should easily merge into the ACS census tract data