# Geocoding Organisations

For a number of planned analyses, we require the latitude and longitude of the organisations in the data, not just their names and addresses. To do this we will first use a geocoding service to look up the latitude and longitude (lat/lng) for each organisation's address.

In [1]:
import ratelim
import json
import os
import pandas as pd
import psycopg2

from geopy.geocoders import GoogleV3
from ipywidgets import FloatProgress
from IPython.display import display

# Google Maps API key, read from the environment so it is never hardcoded in
# the notebook; os.environ.get returns None when GOOGLE_MAPS_KEY is unset.
key = os.environ.get('GOOGLE_MAPS_KEY')

In [2]:
# Google geocoder client (geopy's GoogleV3), configured with the API key held
# in `key`. This single instance is shared by the geocoding helper below.
geocoder = GoogleV3(api_key=key)

In [193]:
# Read in config file with DB params
with open('../scripts/config.json') as f:
    conf = json.load(f)

# Define a connection string
conn_string = 'host={} dbname={} user={} password={}'.format(conf.get('host'),
                                                             conf.get('database'),
                                                             conf.get('user'),
                                                             conf.get('passw'))

# Create a connection object
conn = psycopg2.connect(conn_string)

# Select the identifier, name and raw address payload of every organisation
sql_str = """
SELECT id, name, addresses from gtr.organisations;
"""

# Load the full result set into a DataFrame. Every later cell depends on `df`,
# so this read has to run for the notebook to work from a fresh kernel.
# NOTE(review): the helpers below index `addresses` as a dict, which assumes the
# column is json/jsonb so that psycopg2 decodes it - confirm against the schema.
df = pd.read_sql(sql_str, conn)

In [4]:
 # Preview the first rows of the organisations frame (id, name, addresses)
 df.head()

Unnamed: 0,id,name,addresses
0,BAB46AB6-90FC-4145-B885-0153BEA15A73,A-Tech Fabrications Limited,"{'address': [{'region': 'North East', 'type': ..."
1,BB979A0C-0F8D-4B3F-A8D0-06A5F0D45FB0,Life Sciences,{'address': []}
2,BCD9069A-6739-4039-86DB-042CB13254CB,Zurich Insurance plc,"{'address': [{'region': 'Unknown', 'line2': '3..."
3,B69FACF5-5AB9-4D97-88CD-0A38C32935FF,Limagrain UK Ltd,"{'address': [{'region': 'East Midlands', 'line..."
4,B76FE143-1E07-40B0-8932-028207296A78,China Earthquake Administration,"{'address': [{'region': 'Outside UK', 'line2':..."


In [5]:
# API rate limits with billing enabled
# (at most 100,000 calls per 24 hours and 10 calls per second)
@ratelim.greedy(100000, 60 * 60 * 24)
@ratelim.greedy(10, 1)
def geocode(address_dict):
    """Geocode an organisation's address entry with the Google geocoder.

    The postcode of the first address is used as the query when one is
    present; otherwise the query falls back to the comma-separated address
    string built by ``address_str``. Exactly one geocoding request is made
    per call, so the rate limits above apply per request.

    Parameters
    ----------
    address_dict : dict
        Address payload with an ``'address'`` key holding a list of address
        dicts.

    Returns
    -------
    The result of ``geocoder.geocode`` for the chosen query, or None when
    there is nothing to look up or the geocoding service raised an error.
    """
    # Local import: only the geocoder class is imported at the top of the file.
    from geopy.exc import GeopyError

    try:
        pc = address_dict['address'][0].get('postCode', None)
    except (AttributeError, IndexError, KeyError, TypeError):
        # Missing or malformed address list: there is no postcode to use.
        pc = None

    # Prefer the postcode; only build the address string when it is needed.
    query = pc if pc else address_str(address_dict)
    if not query:
        return None

    try:
        return geocoder.geocode(query)
    except GeopyError:
        # Best effort: one failed lookup must not abort the whole batch.
        return None

In [6]:
def postcode(address_entry):
    """Return the postcode of the first address in ``address_entry``.

    Parameters
    ----------
    address_entry : dict
        Address payload with an ``'address'`` key holding a list of address
        dicts.

    Returns
    -------
    The value stored under ``'postCode'`` in the first address, or None when
    the payload has no usable first address or that address has no postcode.
    """
    try:
        return address_entry['address'][0].get('postCode', None)
    except (AttributeError, IndexError, KeyError, TypeError):
        # Covers: no 'address' key, empty list, a None first entry, or a
        # payload that is not a dict at all (e.g. None / NaN).
        return None

In [7]:
def address_str(address_dict):
    """Build a comma-separated address string from the first address entry.

    Only the fields line1, line2, line3, county and postCode are used, in
    that order; fields that are missing or empty are skipped.

    Parameters
    ----------
    address_dict : dict
        Address payload with an ``'address'`` key holding a list of address
        dicts.

    Returns
    -------
    str or None
        The joined address string (empty when the first address has none of
        the fields above), or None when there is no address list, the list
        is empty, or the first entry is not a dict.
    """
    address_keys = ('line1', 'line2', 'line3', 'county', 'postCode')

    address_list = address_dict.get('address', None)
    if address_list is None:
        return None

    try:
        first_address = address_list[0]
    except (IndexError, KeyError, TypeError):
        return None

    if first_address is None:
        # An explicit None entry carries no fields; callers treat '' as empty.
        return ""

    try:
        parts = [str(first_address[field]).strip()
                 for field in address_keys
                 if first_address.get(field)]
    except (AttributeError, TypeError):
        # First entry is not dict-like, so no fields can be read from it.
        return None

    return ", ".join(part for part in parts if part)

In [8]:
def check_address(address_dict):
    """Report whether the payload contains a non-None first address entry.

    Parameters
    ----------
    address_dict : dict
        Address payload with an ``'address'`` key holding a list of address
        dicts.

    Returns
    -------
    bool
        True when the address list exists and its first element is not None;
        False in every other case.
    """
    address_list = address_dict.get('address', None)
    if address_list is None:
        return False
    try:
        # Always return a real boolean, including when the first entry is None.
        return address_list[0] is not None
    except (IndexError, KeyError, TypeError):
        return False

In [9]:
# Pull the postcode of each organisation's first address into its own column
df['postcode'] = df['addresses'].map(postcode)

# count() skips missing values, so the remainder are rows without a postcode
n_postcodes = df['postcode'].count()
n_no_postcodes = len(df['postcode']) - n_postcodes
print(
    '{} organisations with postcodes, {} without'.format(n_postcodes, n_no_postcodes)
)

18266 organisations with postcodes, 7299 without


We will be unable to geocode 7,299 organisations by using only a postcode in the geocode request. Let's check how many of those without postcodes also lack other address details. For instance, organisations from outside the UK would lack a postcode, but can still be geocoded using a string lookup.

In [10]:
# Whole frame: how many organisations have a usable first address entry?
df['addresses'].map(check_address).value_counts()

True     21205
False     4360
Name: addresses, dtype: int64

In [11]:
# Same check, restricted to the organisations that have no postcode
df.loc[df['postcode'].isnull(), 'addresses'].map(check_address).value_counts()

False    4360
True     2939
Name: addresses, dtype: int64

Of the 7,299 organisations without a postcode, 4,360 also have no address data. This leaves 2,939 organisations with no postcode but some data in the address field. Therefore the maximum number of geocoded organisations will be 21,205. When geocoding, we will default to the use of a postcode where possible, and fall back to an address string where a postcode isn't present.

In [16]:
# Geocode every organisation, showing progress in a widget.
latlng = []
num_addresses = df.addresses.shape[0]
f = FloatProgress(min=0, max=num_addresses)
display(f)

for address in df.addresses:
    g = geocode(address)
    try:
        latlng.append((g.latitude, g.longitude))
    except AttributeError:
        # geocode() returned None (nothing to look up / no match found)
        latlng.append(None)
    f.value += 1

# Store the results on the frame, aligned row-for-row with the addresses.
# Later cells read df['latlng'], so without this they fail on a fresh kernel.
df['latlng'] = pd.Series(latlng, index=df.index)
    

In [202]:
# Open a cursor on the database connection for issuing SQL statements
cur = conn.cursor()

In [203]:
# Add relevant columns. IF NOT EXISTS keeps this cell safe to re-run: without
# it a second execution fails because the columns are already present.
# NOTE(review): ADD COLUMN IF NOT EXISTS needs PostgreSQL 9.6+ - confirm the server version.
cur.execute("ALTER TABLE gtr.organisations ADD COLUMN IF NOT EXISTS lat numeric;")
cur.execute("ALTER TABLE gtr.organisations ADD COLUMN IF NOT EXISTS lng numeric;")
conn.commit()

In [204]:
# Positional lookup that tolerates missing entries: anything that cannot be
# indexed (e.g. None) yields None instead of raising.
def get_tuple_value(x, pos):
    """Return ``x[pos]``, or None when ``x`` does not support indexing."""
    try:
        value = x[pos]
    except TypeError:
        value = None
    return value

In [205]:
# Split the (lat, lng) tuples into two parallel lists, in row order.
# Missing results are kept as None so that they reach the database as NULL;
# casting to str would turn them into the literal text 'None' / 'nan'.
lat_list = [get_tuple_value(pair, 0) for pair in df['latlng']]
lng_list = [get_tuple_value(pair, 1) for pair in df['latlng']]

In [209]:
# Build one "(%s)" placeholder per value, comma-separated. The template is
# sized from the data list itself so the two can never disagree in length.
records_list_template = ','.join('(%s)' for _ in lat_list)

In [212]:
# Write lat values onto the existing organisation rows, matched on id.
# A plain INSERT would append brand-new rows holding only `lat`, leaving the
# real organisations without coordinates.
# (The variable keeps its original name in case other cells refer to it.)
insert_query = "UPDATE gtr.organisations SET lat = %s WHERE id = %s"

# lat_list is in the same row order as df, so it can be zipped with df['id'].
# Missing values (None, NaN, or their string forms) are sent as SQL NULL.
lat_params = [
    (None if (lat in (None, 'None', 'nan') or lat != lat) else lat, org_id)
    for lat, org_id in zip(lat_list, df['id'])
]
cur.executemany(insert_query, lat_params)

In [208]:
# Write lng values onto the existing organisation rows, matched on id.
# A plain INSERT would append brand-new rows holding only `lng`, leaving the
# real organisations without coordinates.
# (The variable keeps its original name in case other cells refer to it.)
insert_query = "UPDATE gtr.organisations SET lng = %s WHERE id = %s"

# lng_list is in the same row order as df, so it can be zipped with df['id'].
# Missing values (None, NaN, or their string forms) are sent as SQL NULL.
lng_params = [
    (None if (lng in (None, 'None', 'nan') or lng != lng) else lng, org_id)
    for lng, org_id in zip(lng_list, df['id'])
]
cur.executemany(insert_query, lng_params)

In [211]:
# Commit the transaction so the lat/lng writes issued through `cur` are persisted
conn.commit()