# Geocoding the evictions data

In [None]:
#import libraries
import pandas as pd
import geopandas as gpd
import numpy as np
import geopy
from geopy.geocoders import Nominatim
import time as time

## Instructions for Conda users: installing geopy and geopandas

Conda does not come with the geopandas and geopy libraries installed. Additionally, there are dependency conflicts when 
using 'conda install' to install these libraries and use them with Jupyter Notebook. Please see this article for instructions on how to install geopandas in a virtual environment: https://medium.com/@sourav_raj/ultimate-easiest-way-to-install-geopandas-on-windows-add-to-jupyter-notebook-which-will-a4b11223f4f2. You can use the same instructions to install geopy in the same virtual environment. Using a virtual environment resolves the dependency issues.

This is the article I used to do the geocoding: https://towardsdatascience.com/geocode-with-python-161ec1e62b89

Enjoy! Happy to help if anyone has any questions about installation of these libraries with a virtual environment.

## Geocode a single address

In [None]:
# create a Nominatim geocoder; the user_agent string is passed to the service with each request
locator = Nominatim(user_agent='myGeocoder')
# look up one free-text address as a quick check that geocoding works
location = locator.geocode('Champ de Mars, Paris, France')

In [None]:
# show the coordinates returned for the test address
print('Latitude = {}, Longitude = {}'.format(location.latitude, location.longitude))

## Reading in data and preparing for geocoding

In [None]:
# NYC OpenData url: https://data.cityofnewyork.us/City-Government/Evictions/6z8x-wfk4
# store the download link for our data (a csv file) in the variable 'url'
url = 'https://data.cityofnewyork.us/api/views/6z8x-wfk4/rows.csv?accessType=DOWNLOAD'

# read the data into a pandas dataframe named 'evictions_raw'
evictions_raw = pd.read_csv(url)

In [None]:
# preview the first rows of the raw data
evictions_raw.head()

In [None]:
# create a new, empty column to hold the full address used for geocoding
evictions_raw['FULL_ADDRESS'] = None

In [None]:
# add concatenated address column to dataframe: "<address>, <borough>, <zip>"
# The columns are joined as whole Series instead of row by row with .loc:
# this avoids one scalar lookup/assignment per row and does not assume that
# the index labels are exactly 0..n-1.
# map(str) stringifies every value the same way str() did in the loop, so a
# missing value still becomes the text 'nan'.
evictions_raw['FULL_ADDRESS'] = (
    evictions_raw['EVICTION_ADDRESS'].map(str) + ', '
    + evictions_raw['BOROUGH'].map(str) + ', '
    + evictions_raw['EVICTION_ZIP'].map(str)
)

In [None]:
# print a summary of the dataframe's columns, including FULL_ADDRESS
evictions_raw.info()

In [None]:
# test on addresses: geocode the first address in the dataset
locator = Nominatim(user_agent='myGeocoder')
location = locator.geocode(str(evictions_raw.loc[0, 'FULL_ADDRESS']))
# geocode() returns None when the service finds no match; report that instead
# of failing with AttributeError on location.latitude
if location is None:
    print('No geocoding result for: {}'.format(evictions_raw.loc[0, 'FULL_ADDRESS']))
else:
    print('Latitude = {}, Longitude = {}'.format(location.latitude, location.longitude))

In [None]:
# check a single value in the new FULL_ADDRESS column (row label 3)
evictions_raw.loc[3, 'FULL_ADDRESS']

## Geocode dataset

In [None]:
# geocode the whole dataset - takes ~ 20 hours because there are 66,323 rows of addresses
# and each request is delayed by at least one second

from geopy.extra.rate_limiter import RateLimiter

# work on a copy so the raw dataframe is left untouched
evictions = evictions_raw.copy()

# time code block
start = time.time()

# 1 - convenient wrapper that enforces a delay between geocoding calls
geocode = RateLimiter(locator.geocode, min_delay_seconds=1)
# 2 - create location column
evictions['LOCATION'] = evictions['FULL_ADDRESS'].apply(geocode)
# 3 - create (latitude, longitude, altitude) tuple from location column;
#     rows without a geocoding result get None
evictions['POINT'] = evictions['LOCATION'].apply(lambda loc: tuple(loc.point) if loc else None)
# 4 - split point column into latitude, longitude and altitude columns.
#     Missing points are padded with NaN explicitly: building the frame straight
#     from the raw list only yields three columns when the first entry is a
#     tuple, so a failed first address (or no successful results at all) would
#     raise on assignment after the whole geocoding run has finished.
coord_columns = ['LATITUDE', 'LONGITUDE', 'ALTITUDE']
no_point = (np.nan, np.nan, np.nan)
coords = [pt if isinstance(pt, tuple) else no_point for pt in evictions['POINT']]
evictions[coord_columns] = pd.DataFrame(coords, index=evictions.index, columns=coord_columns)

end = time.time()
print ("Time elapsed:", end - start)

In [None]:
# look at a sample of the geocoded data (row labels 50000 through 50100)
evictions.loc[50000:50100,]

In [None]:
# percentage of rows with no POINT, i.e. addresses the geocoder returned no result for
evictions['POINT'].isna().mean() * 100

In [None]:
# write the geocoded dataframe to a csv file (relative path)
evictions.to_csv('evictions_geocoded_Nominatum.csv')