# Geocoding the evictions data

In [8]:
#import libraries
import pandas as pd
import geopandas as gpd
import numpy as np
import geopy
from geopy.geocoders import Nominatim
import time as time

## Instructions for Conda users: installing geopy and geopandas

Conda does not come with the geopandas and geopy libraries installed. Additionally, dependency conflicts arise when these libraries are installed with 'conda install' and then used with Jupyter Notebook.
Please see this article for instructions on how to install geopandas in a virtual environment: https://medium.com/@sourav_raj/ultimate-easiest-way-to-install-geopandas-on-windows-add-to-jupyter-notebook-which-will-a4b11223f4f2. You can follow the same instructions to install geopy in the same virtual environment. Using a virtual environment resolves the dependency issues.

This is the article I used to do the geocoding: https://towardsdatascience.com/geocode-with-python-161ec1e62b89

Enjoy! Happy to help if anyone has any questions about installation of these libraries with a virtual environment.

## Geocode a single address

In [9]:
# build a Nominatim geocoder client and look up a single, well-known address
locator = Nominatim(user_agent='myGeocoder')
location = locator.geocode('Champ de Mars, Paris, France')

In [10]:
# show the latitude and longitude of the geocoded result
print('Latitude = {}, Longitude = {}'.format(location.latitude, location.longitude))

Latitude = 48.85614465, Longitude = 2.297820393322227


## Reading in data and preparing for geocoding

In [11]:
# NYC OpenData url: https://data.cityofnewyork.us/City-Government/Evictions/6z8x-wfk4
# store the download link for our data (a static csv file) under the name 'url'
url = 'https://data.cityofnewyork.us/api/views/6z8x-wfk4/rows.csv?accessType=DOWNLOAD'

# read our data into a pandas dataframe named 'evictions_raw'
evictions_raw = pd.read_csv(url)

In [12]:
# preview the first rows of the raw data
evictions_raw.head()

Unnamed: 0,COURT_INDEX_NUMBER,DOCKET_NUMBER,EVICTION_ADDRESS,EVICTION_APT_NUM,EXECUTED_DATE,MARSHAL_FIRST_NAME,MARSHAL_LAST_NAME,RESIDENTIAL_COMMERCIAL_IND,BOROUGH,EVICTION_ZIP
0,75224/16K,61415,110 ROCHESTER AVENUE,5B,03/22/2017,Justin,Grossman,Residential,BROOKLYN,11213
1,55632/18,169708,2955 SHELL ROAD,11H,06/01/2018,Alfred,Locascio,Residential,BROOKLYN,11224
2,B030905/17,383589,766 BRADY AVENUE,336,11/14/2017,Richard,McCoy,Residential,BRONX,10462
3,R50224/17B,78385,925 TOMKINS AVENUE,,05/16/2017,Ileana,Rivera,Residential,STATEN ISLAND,10305
4,B68455/17,87213,2690 MORRIS AVENUE,6C,03/14/2018,Ileana,Rivera,Residential,BRONX,10468


In [13]:
# create a new, initially empty (None) column to hold the full address for geocoding
evictions_raw['FULL_ADDRESS'] = None

In [14]:
# add concatenated address column to dataframe: "<street address>, <borough>, <zip>"
# Vectorized string concatenation: each part is cast to str (matching str() on the
# individual values), so this does not depend on the index being 0..n-1 and avoids
# one .loc read/write per row.
evictions_raw['FULL_ADDRESS'] = (
    evictions_raw['EVICTION_ADDRESS'].astype(str) + ', '
    + evictions_raw['BOROUGH'].astype(str) + ', '
    + evictions_raw['EVICTION_ZIP'].astype(str)
)

In [15]:
# summarize columns, non-null counts and dtypes of the dataframe
evictions_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 66339 entries, 0 to 66338
Data columns (total 11 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   COURT_INDEX_NUMBER          66339 non-null  object
 1   DOCKET_NUMBER               66339 non-null  int64 
 2   EVICTION_ADDRESS            66339 non-null  object
 3   EVICTION_APT_NUM            55371 non-null  object
 4   EXECUTED_DATE               66339 non-null  object
 5   MARSHAL_FIRST_NAME          66339 non-null  object
 6   MARSHAL_LAST_NAME           66339 non-null  object
 7   RESIDENTIAL_COMMERCIAL_IND  66339 non-null  object
 8   BOROUGH                     66339 non-null  object
 9   EVICTION_ZIP                66339 non-null  int64 
 10  FULL_ADDRESS                66339 non-null  object
dtypes: int64(2), object(9)
memory usage: 5.6+ MB


In [16]:
# test the geocoder on the first eviction address
locator = Nominatim(user_agent='myGeocoder')
test_address = str(evictions_raw.loc[0, 'FULL_ADDRESS'])
location = locator.geocode(test_address)

# geocode() returns None when no match is found, so guard before reading coordinates
if location is None:
    print('No geocoding result for: {}'.format(test_address))
else:
    print('Latitude = {}, Longitude = {}'.format(location.latitude, location.longitude))

Latitude = 40.67535327272727, Longitude = -73.92782518181818


In [17]:
# check one datapoint (row label 3) in the new FULL_ADDRESS column
evictions_raw.loc[3, 'FULL_ADDRESS']

'925 TOMKINS AVENUE, STATEN ISLAND, 10305'

## Geocode dataset

In [18]:
# geocode the whole dataset - slow: requests are throttled to one per second, so
# geocoding every one of the ~66,000 rows individually takes ~20 hours
from geopy.extra.rate_limiter import RateLimiter

evictions = evictions_raw.copy()

# time code block
start = time.time()

# 1 - convenient wrapper that waits at least 1 second between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)

# 2 - create location column
#     FULL_ADDRESS has no apartment number, so several rows can share one address;
#     geocode each distinct address once and reuse the result for every matching row
location_by_address = {
    address: geocode(address) for address in evictions['FULL_ADDRESS'].unique()
}
evictions['LOCATION'] = evictions['FULL_ADDRESS'].apply(
    lambda address: location_by_address.get(address)
)

# 3 - create (latitude, longitude, altitude) tuple from location column;
#     rows the geocoder could not resolve get None
evictions['POINT'] = evictions['LOCATION'].apply(lambda loc: tuple(loc.point) if loc else None)

# 4 - split point column into latitude, longitude and altitude columns
#     Missing points are expanded to NaNs and the column names are passed explicitly,
#     so the result always has three columns even if the first row (or every row)
#     failed to geocode.
coordinate_columns = ['LATITUDE', 'LONGITUDE', 'ALTITUDE']
missing_point = (np.nan,) * len(coordinate_columns)
coordinates = [
    point if isinstance(point, tuple) else missing_point
    for point in evictions['POINT']
]
evictions[coordinate_columns] = pd.DataFrame(
    coordinates, index=evictions.index, columns=coordinate_columns
)

end = time.time()
print("Time elapsed:", end - start)

KeyboardInterrupt: 

In [None]:
# look at a sample of the geocoded data (.loc slices by label and includes both endpoints)
evictions.loc[50000:50100,]

In [None]:
# percentage of rows whose POINT is missing, i.e. addresses the geocoder could not resolve
evictions['POINT'].isna().mean() * 100

In [None]:
# write the geocoded dataframe to a csv file (relative path, so it lands in the
# current working directory)
evictions.to_csv('evictions_geocoded_Nominatum.csv')