Using
[https://dash.geocod.io/import] to forward geocode `GEOCODE_input.csv`

In [17]:
import pandas as pd
from geopy import *
import geopandas
from geopy.extra.rate_limiter import RateLimiter

In [18]:
# Geocoder results first, then the facility table they will be attached to.
geos = pd.read_csv('../data/alberta/GEOCODE_output.csv')
df = pd.read_csv('../data/alberta/ltc_odhf.csv')

In [19]:
# Copy the geocoder's coordinates onto the facility table as new columns.
# Series assignment aligns on the index, so this assumes row i of `geos`
# describes row i of `df` -- TODO confirm the geocoder preserved row order.
df['geo_lat'] = geos['Latitude']
df['geo_lon'] = geos['Longitude']

In [20]:
# Rows with no stored latitude, and the subset of those the geocoder can fill.
missing_lat = df['latitude'].isnull()
ind_rpl = df[missing_lat & df['geo_lat'].notnull()].index
total_missing = len(df[missing_lat].index)
print("Number of lats that will be replaced by geocoding:", len(ind_rpl), "\n",
      "Total number of null lats:", total_missing)

Number of lats that will be replaced by geocoding: 108 
 Total number of null lats: 108


In [21]:
# Fill missing stored coordinates with the geocoded ones; non-null values are
# left untouched.
# The result is assigned back to the column rather than calling
# `fillna(..., inplace=True)` on `df.latitude`: that chained in-place call
# operates on a temporary Series, which emits a FutureWarning on pandas 2.2 and
# leaves `df` unchanged once Copy-on-Write is active (the pandas 3 default).
df['latitude'] = df['latitude'].fillna(df['geo_lat'])
df['longitude'] = df['longitude'].fillna(df['geo_lon'])

**Assessing accuracy of geocoder**

In [22]:
# Absolute difference between the stored coordinate and the geocoder's value.
# Rows whose stored coordinate was null and filled from the geocoder by the
# earlier fill cell compare a value with itself, so their error is 0.
df['lat_error'] = (df['latitude'] - df['geo_lat']).abs()
df['lon_error'] = (df['longitude'] - df['geo_lon']).abs()

In [23]:
# Inspect the per-row latitude discrepancy.
df['lat_error']

0      0.003911
1      0.000000
2      0.000021
3      0.000000
4      0.000000
         ...   
172    2.112813
173    3.724698
174    0.000000
175    0.000000
176    0.000000
Name: lat_error, Length: 177, dtype: float64

In [24]:
# Keep the rows where the stored and geocoded coordinates disagree by more
# than 1 (in the coordinate's own units) on latitude or longitude; these are
# the rows written out for manual review.
# `axis` is passed by keyword because `DataFrame.any` only accepts keyword
# arguments from pandas 2.0 onward (`.any(1)` raises TypeError there). The
# boolean row mask is built directly from the two error columns instead of
# first masking the whole frame.
large_error = (df[['lat_error', 'lon_error']] > 1).any(axis=1)
check = df[large_error]

In [25]:
# Write the rows flagged for manual review to disk (index included).
check.to_csv(path_or_buf='../data/alberta/geocode_checker.csv')

**Manually check & adjust erroneous lat/lon:**

In [26]:
# Re-index by facility name so individual facilities can be addressed by label.
df2 = df.set_index(keys='Facility Name')

In [27]:
# Hand-checked longitudes, keyed by facility name (the index of df2).
corrected_longitudes = {
    'Big Country Hospital': -110.4793,
    'Hardisty Health Centre': -111.307616,
    'Provost Health Centre': -110.26484,
    'Peace River Community Health Centre': -117.354442,
}
for facility, lon in corrected_longitudes.items():
    df2.at[facility, 'longitude'] = lon

**Clean & Export:**

* drop unneccessary columns
* clean up operator type 

In [28]:
# List df2's column labels (reference for the column drop in the following cell).
df2.columns

Index(['Unnamed: 0', 'Location', 'Facility Address', 'Operator Name',
       'Operator Type', 'DSL-3', 'DSL-4', 'DSL-4-Dementia',
       'LTC-Auxilary Hospital', 'LTC-Nursing Home', 'facility_name', 'index',
       'source_facility_type', 'odhf_facility_type', 'provider', 'street_no',
       'street_name', 'postal_code', 'city', 'province',
       'source_format_str_address', 'CSDname', 'latitude', 'longitude',
       'outbreak_status', 'geo_lat', 'geo_lon', 'lat_error', 'lon_error'],
      dtype='object')

In [29]:
# Columns left out of the exported file: the leftover CSV index, the DSL
# counts, the geocoding helper/error columns, and the address/source fields.
columns_to_drop = [
    'Unnamed: 0',
    'DSL-3', 'DSL-4', 'DSL-4-Dementia',
    'geo_lat', 'geo_lon', 'lat_error', 'lon_error',
    'index',
    'source_facility_type', 'odhf_facility_type', 'provider',
    'street_no', 'street_name', 'postal_code', 'city', 'province',
    'CSDname',
    'source_format_str_address',
]
df2.drop(columns=columns_to_drop, inplace=True)

In [30]:
# Collapse the garbled duplicate label into the proper operator name.
# The result is assigned back to the column rather than calling
# `replace(..., inplace=True)` on `df2['Operator Type']`: that chained in-place
# call acts on a temporary Series, which warns on pandas 2.2 and leaves `df2`
# unchanged once Copy-on-Write is active (the pandas 3 default).
df2['Operator Type'] = df2['Operator Type'].replace(
    {'Alberta Health Alberta Health Services Services': 'Alberta Health Services'}
)

In [31]:
# Distinct operator types remaining after the clean-up.
df2.loc[:, 'Operator Type'].unique()

array(['Alberta Health Services', 'Voluntary', 'Private', 'RHA'],
      dtype=object)

In [32]:
# Write the cleaned table to disk; the facility-name index is written as the first column.
df2.to_csv(path_or_buf='../data/alberta/ltc_odhf_clean.csv')