In [1]:
# Libraries for data munging and reverse geocoding
import pandas as pd
import numpy as np
from geopy.geocoders import ArcGIS
from geopy.extra.rate_limiter import RateLimiter

In [11]:
# Ingest testing data set
data_url = "https://raw.githubusercontent.com/ncov19-us/ds/staging/drive_thru_testing_locations/us-drive-thru-testing-locations.csv"
df = pd.read_csv(data_url, index_col=0)
print(df.shape)
df.head()

(487, 4)


Unnamed: 0,Name,URL,Latitude,Longitude
0,Collaborative Effort of Health Care Providers,https://www.adn.com/alaska-news/anchorage/2020...,61.18287,-149.837269
1,Alaska Healthcare,https://www.tomsguide.com/news/drive-through-c...,61.18262,-149.83806
2,Fairbanks\' Foundation Health Partners,http://www.newsminer.com/alerts/fairbanks-laun...,64.83998,-147.71432
3,Fairbanks Memorial Hospital,https://www.adn.com/alaska-news/2020/03/25/som...,64.832649,-147.741562
4,Church of the Highlands- Grant\'s Mill,https://www.google.com/amp/s/www.wvtm13.com/am...,33.516999,-86.655847


In [12]:
# Create geocoder and impose rate limit to keep providers happy (not sure if it works lol)
geolocator = ArcGIS(user_agent="drive-thru-testing-reverse-geocoding")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)

In [13]:
# Create Coordinates series (tuple of latitude and longitude values)
df["Coordinates"] = list(zip(*[df["Latitude"], df["Longitude"]]))
df["Coordinates"] = df["Coordinates"].astype(str)
df.head()

Unnamed: 0,Name,URL,Latitude,Longitude,Coordinates
0,Collaborative Effort of Health Care Providers,https://www.adn.com/alaska-news/anchorage/2020...,61.18287,-149.837269,"(61.1828699, -149.837269)"
1,Alaska Healthcare,https://www.tomsguide.com/news/drive-through-c...,61.18262,-149.83806,"(61.18261999999999, -149.83806)"
2,Fairbanks\' Foundation Health Partners,http://www.newsminer.com/alerts/fairbanks-laun...,64.83998,-147.71432,"(64.83998000000001, -147.71432)"
3,Fairbanks Memorial Hospital,https://www.adn.com/alaska-news/2020/03/25/som...,64.832649,-147.741562,"(64.832649, -147.74156200000004)"
4,Church of the Highlands- Grant\'s Mill,https://www.google.com/amp/s/www.wvtm13.com/am...,33.516999,-86.655847,"(33.51699910000001, -86.6558468)"


In [14]:
# Define a convenience function to clean and reverse geocode coordinates cell contents, returns address
def reverse_geocode(cell_contents):
    cellcontents = cell_contents.replace("(","").replace(")","")
    location = geolocator.reverse(cellcontents)
    return location.address

In [14]:
# Test on first ten rows of dataframe
df.iloc[:10]["Coordinates"].apply(reverse_geocode)

0    4115 Lake Otis Pkwy, Anchorage, Alaska 99508, USA
1        2304 E 42nd Ave, Anchorage, Alaska 99508, USA
2           895 Noble St, Fairbanks, Alaska 99701, USA
3       1651 W Cowles St, Fairbanks, Alaska 99701, USA
4     316 Highlands Ln, Birmingham, Alabama 35210, USA
5    4301 W Markham St, Little Rock, Arkansas 72205...
6           500 S 40th St, Rogers, Arkansas 72758, USA
7        2302 College Ave, Conway, Arkansas 72034, USA
8        5797 E Mayo Blvd, Phoenix, Arizona 85054, USA
9    13400 E Shea Blvd, Scottsdale, Arizona 85259, USA
Name: Coordinates, dtype: object

In [15]:
# Apply to whole dataframe and save results as new column
df["Addresses"] = df["Coordinates"].apply(reverse_geocode)

In [16]:
# Display results
df.head()

Unnamed: 0,Name,URL,Latitude,Longitude,Coordinates,Addresses
0,Collaborative Effort of Health Care Providers,https://www.adn.com/alaska-news/anchorage/2020...,61.18287,-149.837269,"(61.1828699, -149.837269)","4115 Lake Otis Pkwy, Anchorage, Alaska 99508, USA"
1,Alaska Healthcare,https://www.tomsguide.com/news/drive-through-c...,61.18262,-149.83806,"(61.18261999999999, -149.83806)","2304 E 42nd Ave, Anchorage, Alaska 99508, USA"
2,Fairbanks\' Foundation Health Partners,http://www.newsminer.com/alerts/fairbanks-laun...,64.83998,-147.71432,"(64.83998000000001, -147.71432)","895 Noble St, Fairbanks, Alaska 99701, USA"
3,Fairbanks Memorial Hospital,https://www.adn.com/alaska-news/2020/03/25/som...,64.832649,-147.741562,"(64.832649, -147.74156200000004)","1651 W Cowles St, Fairbanks, Alaska 99701, USA"
4,Church of the Highlands- Grant\'s Mill,https://www.google.com/amp/s/www.wvtm13.com/am...,33.516999,-86.655847,"(33.51699910000001, -86.6558468)","316 Highlands Ln, Birmingham, Alabama 35210, USA"


In [17]:
# Check for no data values
df["Addresses"].isna().sum()

0

In [19]:
df_copy = df.copy()

df_copy["Addresses"] = df_copy["Addresses"].str.replace(", USA", "")

df_copy["State"] = df_copy["Addresses"].str.split(",").str[-1].str.replace('\d+', '')

df_copy["City"] = df_copy["Addresses"].str.split(",").str[-2]

df_copy["Street Address"] = df_copy["Addresses"].str.split(",").str[-3]

df_copy.drop(["Addresses"], axis=1, inplace=True)

df_copy.head()

Unnamed: 0,Name,URL,Latitude,Longitude,Coordinates,State,City,Street Address
0,Collaborative Effort of Health Care Providers,https://www.adn.com/alaska-news/anchorage/2020...,61.18287,-149.837269,"(61.1828699, -149.837269)",Alaska,Anchorage,4115 Lake Otis Pkwy
1,Alaska Healthcare,https://www.tomsguide.com/news/drive-through-c...,61.18262,-149.83806,"(61.18261999999999, -149.83806)",Alaska,Anchorage,2304 E 42nd Ave
2,Fairbanks\' Foundation Health Partners,http://www.newsminer.com/alerts/fairbanks-laun...,64.83998,-147.71432,"(64.83998000000001, -147.71432)",Alaska,Fairbanks,895 Noble St
3,Fairbanks Memorial Hospital,https://www.adn.com/alaska-news/2020/03/25/som...,64.832649,-147.741562,"(64.832649, -147.74156200000004)",Alaska,Fairbanks,1651 W Cowles St
4,Church of the Highlands- Grant\'s Mill,https://www.google.com/amp/s/www.wvtm13.com/am...,33.516999,-86.655847,"(33.51699910000001, -86.6558468)",Alabama,Birmingham,316 Highlands Ln


In [34]:
df_copy["State"].str.len().value_counts(dropna=False)

10    88
9     66
13    57
15    50
14    50
11    44
12    37
16    32
6     22
8     19
7     18
22     4
Name: State, dtype: int64

In [35]:
df_copy["City"].str.len().value_counts(dropna=False)

11    85
8     77
10    68
9     62
7     46
12    43
13    35
6     30
14    17
17     7
16     6
15     6
5      5
Name: City, dtype: int64

In [36]:
df_copy["Street Address"].str.len().value_counts(dropna=False)

15    58
16    52
13    49
14    47
18    41
19    35
12    32
17    31
0     28
22    20
11    15
21    14
20    13
10    12
23    10
8      9
24     5
7      4
6      2
25     2
26     2
9      2
3      1
27     1
28     1
36     1
Name: Street Address, dtype: int64

In [37]:
# Save to csv file
df_copy.to_csv("../drive_thru_testing_locations/locations-with-addresses.csv", index=0)