# Geocoding Organisations

For a number of planned analyses, we require the latitude and longitude of the organisations in the data, not just their names and addresses. To do this we will first use a geocoding service to identify lat lng from address strings.

In [226]:
import pickle
import ratelim
import json
import numpy as np
import os
import pandas as pd
import psycopg2
from sqlalchemy import create_engine

from geopy.geocoders import GoogleV3
from ipywidgets import FloatProgress
from IPython.display import display

key = os.environ.get('GOOGLE_MAPS_KEY')

In [103]:
# Google geocoder
geocoder = GoogleV3(api_key=key)

In [3]:
# Read in config file with DB params
with open('../scripts/config.json') as f:
    conf = json.load(f)
    
# Define a connection string
conn_string = 'host={} dbname={} user={} password={}'.format(conf.get('host'),
                                                             conf.get('database'),
                                                             conf.get('user'),
                                                             conf.get('passw'))

# Create a connection object
conn = psycopg2.connect(conn_string)

# SQL string that unpacks nested JSON arrays
sql_str = """
SELECT id, name, addresses from gtr.organisations;
"""

# chunksize returns an iterator that reads chunk number of rows at a time
df = pd.read_sql(sql_str,
                 conn)

In [5]:
def country(x):
    try:
        return x.get('address', None)[0].get('country', None).lower()
    except:
        return None

In [152]:
# API rate limits with billing enabled
@ratelim.greedy(5, 1)
def geocode(address_dict):
    # Get the list out of the dict
    address_list = address_dict['address']

    # If no country need to assume UK
    if address_list and address_list[0].get('country', 'uk').lower() in uk_names:
        try:
            pc = address_dict['address'][0]['postCode']
            return geocoder.geocode(pc)
        except:
            address = address_str(address_dict)
            if address:
                return geocoder.geocode(address)
            else:
                return None
    else:
        address = address_str(address_dict, uk=False)
        if address:
            g = geocoder.geocode(address)
        else:
            return None

In [105]:
def get_add_part(address_entry, part):
    try:
        return address_entry['address'][0].get('{}'.format(part), None)
    except:
        return None

In [106]:
def postcode(address_entry):
    return get_add_part(address_entry, 'postcode')

In [107]:
def country(address_entry):
    c = get_add_part(address_entry, 'country')
    if c == None:
        return 'uk'
    else:
        return c.lower()

In [252]:
def line1(address_entry):
    return get_add_part(address_entry, 'line1')

In [264]:
def address_str(address_dict, uk=True):
    
    uk_address_keys = [
    'line1',
    'line2',
    'line3',
    'county',
    'postCode',
    ]
    
    other_address_keys = [
        'line1',
        'line2',
        'line3',
        'line4',
        'line5'
        'county',
        'country',
    ]
    
    if uk:
        address_keys = uk_address_keys
    else:
        address_keys = other_address_keys
    
    l = []
    address_list = address_dict.get('address', None)
    if address_list is None:
        return None
    try:
        if address_list[0] is not None:
            [l.append(address_list[0][key]) if key in address_keys else '' for key in address_list[0]]            
    except:
        return None
    
    return ", ".join(l)

In [109]:
def check_address(address_dict):
    address_list = address_dict.get('address', None)
    if address_list is None:
        return False
    try:
        if address_list[0] is not None:
            return True
    except:
        return False

In [390]:
# Function to get tuple by position, returning
# None where there isn't a valid entry
def get_tuple_value(x, pos):
    try:
        return x[pos]
    except TypeError:
        return None

In [79]:
# All the ways people put down UK in the country field
uk_names = ['united kingdom',
 'uk',
 'wales',
 'england',
 'scotland',
 'united kingom',
 'northern ireland',
 'united kindgom',
 'buckinghamshire',
 'yorkshire',
 'london',
 'u.k.',
 'united ,kingdom',
 'berkshire',
 'jersey',
 ' united kingdom',
 'u.k',
 '|united kingdom']

In [11]:
df['postcode'] = df.addresses.map(postcode)
n_postcodes = df.postcode.count()
n_no_postcodes = len(df.postcode) - n_postcodes
print('{} organisations with postcodes, {} without'.format(n_postcodes, n_no_postcodes))

18266 organisations with postcodes, 7299 without


We will be unable to geocode 7,299 organisations by using only a postcode in the geocode request. Lets check how many of those without postcodes, also lack other address details. For instance, organisations from outside the UK would lack a postcode, but can still be geocoded using a string lookup.

In [12]:
# Full dataframe check
df.addresses.map(check_address).value_counts()

True     21205
False     4360
Name: addresses, dtype: int64

In [13]:
# Subset of those without postcodes
df[df.postcode.isnull()].addresses.map(check_address).value_counts()

False    4360
True     2939
Name: addresses, dtype: int64

Of the 7,299 organisations without a postcode, 4,360 also have no address data. This leaves 2,939 organisations with no postcode but some data in the address field. Therefore the maximum number of geocoded organisations will be 21,205. When geocoding, we will default to the use of a postcode where possible, and fallback to an address string where a postcode isn't present.

In [387]:
"""latlng = []
num_addresses = df.addresses.shape[0]
f = FloatProgress(min=0, max=num_addresses)
display(f)

for address in df.addresses:
    g = geocode(address)
    try:
        latlng.append((g.latitude, g.longitude))
    except AttributeError:
        latlng.append(None)
    f.value += 1
"""   

In [394]:
# Save as a pickled object
pickle.dump( latlng, open( "latlng.p", "wb" ) )

In [34]:
df_latlng = pd.DataFrame(np.asarray(latlng))

In [480]:
df_result = df.merge(df_latlng, left_index=True, right_index=True)
df_result.rename(columns={0: 'latlng'}, inplace=True)
df_result = df_result[df_result.latlng.notnull()]
df_result[['lat', 'lng']] = df_result.latlng.apply(pd.Series)
df_result.drop(['postcode', 'addresses', 'latlng'], axis=1, inplace=True)

In [481]:
df_result

Unnamed: 0,id,name,lat,lng
0,BAB46AB6-90FC-4145-B885-0153BEA15A73,A-Tech Fabrications Limited,54.604095,-1.574308
2,BCD9069A-6739-4039-86DB-042CB13254CB,Zurich Insurance plc,50.881574,-1.242902
3,B69FACF5-5AB9-4D97-88CD-0A38C32935FF,Limagrain UK Ltd,53.482100,-0.248484
4,B76FE143-1E07-40B0-8932-028207296A78,China Earthquake Administration,39.977022,116.306198
5,B80095E4-9D0F-4398-8A0E-07EB418EFFF6,Newmills Engineering Ltd,51.504765,-2.484122
6,B966BD7A-F468-4C74-A694-0842CB8F884F,Maastricht University,50.844288,5.688604
8,BA32CA96-1A26-415F-B3B2-0DC1DDFCE74F,Synthomer Ltd,51.784388,0.119136
9,BA59F886-138D-41EB-B9B1-0C79E45C61D3,Committee on Climate Change,51.494349,-0.140087
10,BA7D8BB2-4B17-4D57-AC78-001BA0F40F77,Agility Global Limited,51.650564,-0.149673
11,C059DB15-1836-430B-A993-0759206ABBB8,Openreach BT,51.515741,-0.097966


In [375]:
engine = create_engine('postgresql://{}:{}@{}/{}'.format(conf.get('user'),
                                                         conf.get('passw'),
                                                         conf.get('host'),
                                                         conf.get('database')))

conn = engine.connect()

In [483]:
df_result.to_sql('orgs_latlng',
                 conn,
                 schema='gtr',
                 if_exists='replace',
                 index=False)

# From here delete

In [32]:
latlng = pickle.load(open('latlng.p', 'rb'))

In [22]:
g = geocoder.geocode("SG8 7RE")
g

Location(Church St, Thriplow, Royston, Cambridgeshire SG8 7RE, UK, (52.0997269, 0.1046752, 0.0))

In [185]:
df_result = df.merge(df_latlng, left_index=True, right_index=True)
df_result.rename(columns={0: 'latlng'}, inplace=True)

In [281]:
df_result[~df_result.country.isin(uk_names)].country.unique()

array(['china', 'netherlands', 'australia', 'canada', 'austria', 'ireland',
       'japan', 'norway', 'brazil', 'sri lanka', 'switzerland', 'france',
       'united states', 'kenya', 'new zealand',
       'taiwan, province of china', 'zimbabwe', 'portugal', 'germany',
       'estonia', 'india', 'russian federation', 'singapore', 'nigeria',
       'spain', 'belgium', 'poland', 'italy', 'south africa', 'ethiopia',
       'bolivia, plurinational state of', 'denmark', 'uganda', 'viet nam',
       'kyrgyzstan', 'malawi', 'malaysia', 'mexico', 'argentina',
       'hungary', 'el salvador', 'bangladesh', 'greece', 'ukraine',
       'cambodia', 'chile', 'peru', 'thailand', 'mozambique', 'ghana',
       'israel', 'cuba', 'sierra leone', 'morocco', 'hong kong', 'zambia',
       'cameroon', 'tanzania, united republic of', 'sweden', 'iceland',
       'kazakhstan', 'korea, republic of', 'bosnia and herzegovina',
       'ecuador', 'finland', 'croatia', 'saudi arabia', 'madagascar',
       'slovenia',

In [254]:
df_result['country'] = df.addresses.map(country)
df_result['line1'] = df_result.addresses.map(line1)

In [255]:
df_other_countries = df_result[~df_result.country.map(lambda x: x.lower()).isin(uk_names)]

In [425]:
usa = ['usa', 'united states']
df_usa = df_other_countries[df_other_countries['country'].isin(usa)]

In [309]:
num_addresses = df_usa.iloc[166:].addresses.shape[0]
f = FloatProgress(min=0, max=num_addresses)
display(f)

for address in df_usa.iloc[166:].addresses:
    g = geocode(address)
    try:
        us_latlng.append((g.latitude, g.longitude))
    except AttributeError:
        us_latlng.append(None)
    f.value += 1

In [427]:
df_usa.reset_index(drop=True, inplace=True)
df_usa.drop('latlng', axis=1, inplace=True)
df_usa = df_usa.merge(df_latlng, left_index=True, right_index=True)
df_usa.rename(columns={0: 'latlng'}, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app


In [428]:
df_usa

Unnamed: 0,id,name,addresses,country,line1,latlng
0,2795AB81-FD33-413D-BBE4-127D1ACE6210,University of Chicago,"{'address': [{'line4': 'Chicago', 'line1': 'Un...",united states,University of Chicago,"(41.8150541, -87.6018039)"
1,55044D17-2C14-4D68-BED1-119271B476B4,Rosenstiel School of Marine & Atmospheri,"{'address': [{'line4': 'Miami', 'line1': 'Rose...",united states,Rosentiel Sch of Marine & Atmospheric,"(25.7326204, -80.1626698)"
2,58E3A7F8-5B1E-48D2-BDC5-0E835B3FA948,Xerox Corporation,"{'address': [{'line4': 'New York 14580', 'line...",united states,800 Phillips Road,"(43.2228451, -77.4170792)"
3,5FC4C032-4778-4A82-95CD-13F5897F599B,Joint Bioenergy Institute,"{'address': [{'line1': '5885 Hollis Street', '...",united states,5885 Hollis Street,"(37.8405026, -122.2899738)"
4,527728F1-2397-4BCC-AA10-1137F55A1D9E,Aginova Inc,"{'address': [{'line1': '2226 Central Ave.,', '...",united states,"2226 Central Ave.,","(34.48994, -93.0586689)"
5,1003BE2C-9E2D-4A5D-A13D-058587870722,Auburn University,"{'address': [{'line4': 'Auburn', 'line1': '301...",united states,301 Funchess hall,"(32.6000095, -85.4823262)"
6,B3C4BA7F-0E8E-4F2F-88C2-04277E328EB0,Search for a Common Ground,{'address': [{'line1': '1601 Connecticut Ave N...,united states,"1601 Connecticut Ave NW, Suite 200","(38.9113709, -77.04408049999999)"
7,AFE5C6FD-3500-44F6-B100-184B5F2FD0D7,Microsoft Research,"{'address': [{'line4': 'Redmond', 'line1': 'On...",united states,One Microsoft Way,"(47.6393096, -122.1283559)"
8,C1DA4809-2264-4894-9C3C-190942EACF3C,Mississippi State University,{'address': [{'line1': 'Mississippi State Univ...,united states,Mississippi State University,"(33.4555279, -88.7903868)"
9,B8480401-8DDE-472E-9BFB-13DEC29BD816,Louisiana State University,"{'address': [{'line4': 'Baton Rouge', 'line1':...",united states,Louisiana State University,"(30.4539947, -91.1848407)"


In [435]:
df_usa.to_sql('orgs_latlng_us',
              conn,
              schema='gtr',
              if_exists='replace',
              index=False)