# Geocoding Organisations

For a number of planned analyses, we require the latitude and longitude of the organisations in the data, not just their names and addresses. To obtain these, we first use a geocoding service to look up the latitude and longitude for each organisation's address string.

In [466]:
import json
import os
import pickle
from urllib.parse import quote_plus

import numpy as np
import pandas as pd
import psycopg2
import ratelim
from geopy.geocoders import GoogleV3
from IPython.display import display
from ipywidgets import FloatProgress
from sqlalchemy import create_engine

# Google Maps API key, read from the GOOGLE_MAPS_KEY environment variable
# (None when the variable is not set)
key = os.environ.get('GOOGLE_MAPS_KEY')

In [3]:
# Google geocoder client, constructed with the `key` read from the environment
geocoder = GoogleV3(api_key=key)

In [4]:
# Read in config file with DB params
with open('../scripts/config.json') as f:
    conf = json.load(f)

# Define a connection string
conn_string = 'host={} dbname={} user={} password={}'.format(conf.get('host'),
                                                             conf.get('database'),
                                                             conf.get('user'),
                                                             conf.get('passw'))

# Select the id, name and addresses column of every organisation
sql_str = """
SELECT id, name, addresses from gtr.organisations;
"""

# Create a connection object. The query result is read into memory in a
# single call, so the connection is released as soon as the read finishes
# (or fails) instead of being left open for the rest of the session.
conn = psycopg2.connect(conn_string)
try:
    df = pd.read_sql(sql_str,
                     conn)
finally:
    conn.close()

In [5]:
def country(x):
    """Return the lower-cased country of the first address entry, or None.

    Parameters
    ----------
    x : dict or None
        Address record; the country is read from ``x['address'][0]['country']``.

    Returns
    -------
    str or None
        The lower-cased country, or None when the record, its address list,
        the first entry or the country value is missing.
    """
    try:
        return x.get('address', None)[0].get('country', None).lower()
    # Only the lookup failures a malformed record can produce are treated as
    # "no country"; KeyboardInterrupt and other unrelated errors propagate.
    except (AttributeError, IndexError, KeyError, TypeError):
        return None

In [356]:
# API rate limits with billing enabled
@ratelim.greedy(5, 1)
def geocode(address_dict):
    """Geocode one organisation's address record.

    Records whose first address entry has a UK country (or no country at
    all) are looked up by postcode, falling back to the full address string
    when there is no postcode or the postcode lookup raises. All other
    records are looked up by the non-UK address string.

    Parameters
    ----------
    address_dict : dict or None
        Address record with an ``'address'`` list; only its first entry is
        used.

    Returns
    -------
    The result of ``geocoder.geocode`` for the chosen query, or None when
    the record carries nothing that can be geocoded.
    """
    # Tolerate a missing record, a missing 'address' key and an empty list
    address_list = address_dict.get('address') if address_dict else None
    first = address_list[0] if address_list else None
    if not first:
        return None

    # If no country need to assume UK
    country_name = first.get('country')
    if country_name is None:
        country_name = 'uk'

    if country_name.lower() in uk_names:
        pc = first.get('postCode')
        if pc:
            try:
                return geocoder.geocode(pc)
            except Exception:
                # Best effort: a failed postcode lookup falls back to the
                # address string. KeyboardInterrupt is deliberately not
                # caught so a long geocoding run can still be stopped.
                pass
        address = address_str(address_dict)
    else:
        address = address_str(address_dict, uk=False)

    # An empty or missing address string means there is nothing to look up
    if address:
        return geocoder.geocode(address)
    return None

In [7]:
def postcode(address_entry):
    """Return the postcode of the first address entry, or None.

    Parameters
    ----------
    address_entry : dict or None
        Address record; the postcode is read from
        ``address_entry['address'][0]['postCode']``.

    Returns
    -------
    The stored postcode value, or None when the record, its address list,
    the first entry or the postcode is missing.
    """
    try:
        return address_entry['address'][0].get('postCode', None)
    # Only the lookup failures a malformed record can produce are treated as
    # "no postcode"; KeyboardInterrupt and other unrelated errors propagate.
    except (AttributeError, IndexError, KeyError, TypeError):
        return None

In [95]:
def address_str(address_dict, uk=True):
    """Build a comma-separated address string from the first address entry.

    Parameters
    ----------
    address_dict : dict or None
        Address record with an ``'address'`` list; only its first entry is
        used.
    uk : bool, default True
        When True use the UK fields (line1-3, county, postCode); otherwise
        use the non-UK fields (line1, county, country).

    Returns
    -------
    str or None
        The selected fields joined with ``", "`` in the order they are listed
        below, skipping fields that are absent, None or empty. Returns ``''``
        when the first entry holds no usable fields, and None when the record
        or its address list is missing or empty.
    """
    uk_address_keys = [
        'line1',
        'line2',
        'line3',
        'county',
        'postCode',
    ]

    other_address_keys = [
        'line1',
        'county',
        'country',
    ]

    address_keys = uk_address_keys if uk else other_address_keys

    if address_dict is None:
        return None
    address_list = address_dict.get('address', None)
    if address_list is None:
        return None
    try:
        first = address_list[0]
    except (IndexError, KeyError, TypeError):
        return None
    if not isinstance(first, dict):
        return ''

    # Walk the wanted fields (not the record's own keys) so the output order
    # is always line -> county -> postcode/country, and drop empty values so
    # the join can never fail on a None.
    parts = []
    for field in address_keys:
        value = first.get(field)
        if value is None or value == '':
            continue
        parts.append(str(value))

    return ", ".join(parts)

In [9]:
def check_address(address_dict):
    """Tell whether a record carries at least one non-null address entry.

    Parameters
    ----------
    address_dict : dict or None
        Address record with an optional ``'address'`` list.

    Returns
    -------
    bool
        True when ``address_dict['address'][0]`` exists and is not None,
        False in every other case. The result is always a real bool, so a
        ``value_counts()`` over the mapped column accounts for every row.
    """
    if address_dict is None:
        return False
    address_list = address_dict.get('address', None)
    if address_list is None:
        return False
    try:
        return address_list[0] is not None
    except (IndexError, KeyError, TypeError):
        return False

In [390]:
def get_tuple_value(x, pos):
    """Return ``x[pos]``, or None when ``x`` cannot be indexed.

    Only TypeError (e.g. ``x`` is None) is turned into None; any other
    error raised by the indexing propagates to the caller.
    """
    try:
        value = x[pos]
    except TypeError:
        value = None
    return value

In [10]:
# Lower-cased values of the country field that are treated as meaning the UK.
# Misspellings and stray punctuation/whitespace are intentional: they are
# matched verbatim against the lower-cased country value.
uk_names = [
    'united kingdom', 'uk', 'wales', 'england', 'scotland',
    'united kingom', 'northern ireland', 'united kindgom',
    'buckinghamshire', 'yorkshire', 'london', 'u.k.',
    'united ,kingdom', 'berkshire', 'jersey', ' united kingdom',
    'u.k',
]

In [11]:
# Extract each organisation's postcode (None where there is none) and report
# how many organisations have one
df['postcode'] = df['addresses'].map(postcode)
n_postcodes = df['postcode'].count()
n_no_postcodes = len(df['postcode']) - n_postcodes
print('{} organisations with postcodes, {} without'.format(n_postcodes,
                                                           n_no_postcodes))

18266 organisations with postcodes, 7299 without


We will be unable to geocode 7,299 organisations if we use only a postcode in the geocode request. Let's check how many of the organisations without postcodes also lack other address details. For instance, organisations from outside the UK would lack a postcode but can still be geocoded using an address string lookup.

In [12]:
# Across all organisations: how many records hold at least one address entry
df['addresses'].map(check_address).value_counts()

True     21205
False     4360
Name: addresses, dtype: int64

In [13]:
# Same check, restricted to the organisations that have no postcode
df.loc[df['postcode'].isnull(), 'addresses'].map(check_address).value_counts()

False    4360
True     2939
Name: addresses, dtype: int64

Of the 7,299 organisations without a postcode, 4,360 also have no address data. This leaves 2,939 organisations with no postcode but some data in the address field. Therefore, the maximum number of organisations that can be geocoded is 21,205. When geocoding, we will default to using the postcode where possible, and fall back to an address string where a postcode isn't present.

In [387]:
# Geocoding every organisation is slow and billed per request, so reuse the
# pickled results of a previous run when they exist and only call the API
# when they do not. Either way `latlng` is defined for the cells that follow.
if os.path.exists("latlng.p"):
    # NOTE: only load a pickle this notebook wrote itself; unpickling a file
    # from an untrusted source can execute arbitrary code.
    with open("latlng.p", "rb") as cache_file:
        latlng = pickle.load(cache_file)
else:
    latlng = []
    num_addresses = df.addresses.shape[0]
    f = FloatProgress(min=0, max=num_addresses)
    display(f)

    for address in df.addresses:
        g = geocode(address)
        try:
            latlng.append((g.latitude, g.longitude))
        except AttributeError:
            # No usable result: keep a placeholder so that positions in
            # `latlng` stay aligned with the rows of `df`
            latlng.append(None)
        f.value += 1

# The results are joined back to `df` by position, so a stale cache built
# from a different extract must not be used silently
assert len(latlng) == len(df), "latlng does not match df; delete latlng.p and re-run"

In [394]:
# Save as a pickled object; the context manager makes sure the file is
# flushed and closed even if pickling fails part-way
with open("latlng.p", "wb") as pickle_file:
    pickle.dump(latlng, pickle_file)

In [474]:
# One object column (named 0) holding a (lat, lng) tuple or None per row.
# Built from an object Series rather than np.asarray: a list mixing tuples
# and None is ragged, which NumPy now refuses to convert, and a list with no
# None at all would be turned into a two-column array instead.
df_latlng = pd.DataFrame({0: pd.Series(latlng, dtype=object)})

In [480]:
# Attach the geocoding results to the organisations by row position
df_result = (df.merge(df_latlng, left_index=True, right_index=True)
               .rename(columns={0: 'latlng'}))

# Keep only the organisations that were geocoded; copy so that the new
# columns are added to an independent frame rather than a view of the merge
df_result = df_result[df_result.latlng.notnull()].copy()

# Split the (lat, lng) tuples into two columns, aligned on the original index
df_result[['lat', 'lng']] = pd.DataFrame(df_result.latlng.tolist(),
                                         columns=['lat', 'lng'],
                                         index=df_result.index)

# Only id, name, lat and lng are needed in the output table
df_result = df_result.drop(['postcode', 'addresses', 'latlng'], axis=1)

In [481]:
# Preview the first rows only instead of rendering the whole frame
df_result.head(10)

Unnamed: 0,id,name,lat,lng
0,BAB46AB6-90FC-4145-B885-0153BEA15A73,A-Tech Fabrications Limited,54.604095,-1.574308
2,BCD9069A-6739-4039-86DB-042CB13254CB,Zurich Insurance plc,50.881574,-1.242902
3,B69FACF5-5AB9-4D97-88CD-0A38C32935FF,Limagrain UK Ltd,53.482100,-0.248484
4,B76FE143-1E07-40B0-8932-028207296A78,China Earthquake Administration,39.977022,116.306198
5,B80095E4-9D0F-4398-8A0E-07EB418EFFF6,Newmills Engineering Ltd,51.504765,-2.484122
6,B966BD7A-F468-4C74-A694-0842CB8F884F,Maastricht University,50.844288,5.688604
8,BA32CA96-1A26-415F-B3B2-0DC1DDFCE74F,Synthomer Ltd,51.784388,0.119136
9,BA59F886-138D-41EB-B9B1-0C79E45C61D3,Committee on Climate Change,51.494349,-0.140087
10,BA7D8BB2-4B17-4D57-AC78-001BA0F40F77,Agility Global Limited,51.650564,-0.149673
11,C059DB15-1836-430B-A993-0759206ABBB8,Openreach BT,51.515741,-0.097966


In [482]:
# Build the SQLAlchemy engine from the same DB params as before. The user
# name and password are URL-encoded because characters such as '@', ':' or
# '/' would otherwise be parsed as part of the URL structure.
engine = create_engine('postgresql://{}:{}@{}/{}'.format(
    quote_plus(str(conf.get('user'))),
    quote_plus(str(conf.get('passw'))),
    conf.get('host'),
    conf.get('database')))

conn = engine.connect()

In [483]:
# Write the geocoded organisations to gtr.orgs_latlng, replacing any existing
# table of that name and leaving the DataFrame index out
df_result.to_sql(name='orgs_latlng', con=conn, schema='gtr',
                 if_exists='replace', index=False)