# Classifying Organisations
## Business and Non-Business Classification

One of the interesting aspects of the Gateway to Research data is that it includes a significant number of non-academic organisations. We suspect that these are, in most cases, likely to be private companies that have received funding from Innoate UK. It may also include organisations that have collaborated with universities on specific projects.

Our first appraoch to identifying academic organisations and non-academic organisations will be to use keywords. For instance, companies can be identified via words such as 'ltd', 'plc' or 'inc', whilst academic organisations will often contain 'university' or 'school'. We also include 'council' in our first round of words identifying non-business organisations. 

In [3]:
import chwrapper
import json
import numpy as np
import os
import pandas as pd
import psycopg2
import ratelim
import requests

from sqlalchemy import create_engine

In [256]:
pd.options.display.max_rows = 250

In [10]:
def check_type(c_str, company=True, custom_list=None):
    """Takes a string and checks whether it is a name representing a company or not.
    Set company=False to check whether the string DOES NOT represent a company."""
    words = set(c_str.lower().split())
    return is_company(words) if company else is_other(words, custom_list)
 
def is_company(words):
    """Takes words and compares them to a list of company identifiers. Returns True if
    any of the words match those in the identifiers list."""
    identifiers = set(['ltd.',
                       'ltd',
                       'limited',
                       'plc',
                       'plc.',
                       'inc',
                       'inc.',
                       'company',
                       'corporation'])
    return bool(words.intersection(identifiers))

def is_other(words, custom_list=None):
    """Takes words and compares them to a list of otber identifiers (denoting
    academic or public organisation keywords. Returns True if any of the words
    match those in the identifiers list."""
    if not custom_list:
        identifiers = set(['university',
                           'uni',
                           'universities',
                           'school',
                           'sch',
                           'faculty',
                           'council',
                           'nhs',
                           'academic',
                           'science',
                           'sciences',
                           'sci',
                           'office',
                           'institute'])
    else:
        identifiers=set(custom_list)
        
    return bool(words.intersection(identifiers))

def print_stats(df):
    # Number of organisations that are other (academic, public sector etc.)
    print('Other:\t\t{}'.format(len(df[df.other == True])))

    # Number of organisations that aren't companies
    print('Companies:\t{}'.format(len(df[df.company == True])))

    # Unclassified
    print('Unclassified:\t{}'.format(len(df[(df.company == False) &
                                        (df.other == False)])))

    # Clashes
    print('Clashes:\t{}'.format(len(df[(df.company == True) &
                                       (df.other == True)])))
    
def unpack_address(address_series):
    """Unpacks GtR nested addresses. Returns a pandas.Series object"""
    s = address_series.apply(lambda x: x.get('address', None)).apply(unpack_list)
    s = s.apply(dict_2_string)
    return s

def check_postcodes(row):
    """Check a dataframe rows postcode entry against that in a Companies House JSON return"""
    return ch_postcodes(row.ch_json)

def gtr_postcodes(address_series):
    """Returns a postcode entry from a row in a GtR derived address series"""
    s = address_series.apply(lambda x: x.get('address', None)).apply(unpack_list)
    return s.apply(val_from_dict, key='postCode')

def ch_postcodes(ch_json):
    """Returns all postcode entries from a Companies House JSON return"""
    items = ch_json.get('items', None)
    if items is None:
        return None
    else:
        return [x.get('address').get('postal_code') for x in items]
    
def oc_postcodes(oc_json):
    """Returns all postcodes from an Open Corporates company search results document as a list"""
    try:
        companies = oc_json.get('results').get('companies')
    except AttributeError:
        companies = False
    
    if companies:
        l = []
        for company in companies:
            try:
                l.append(company.get('company').get('registered_address').get('postal_code'))
            except AttributeError:
                l.append(None)
        return [x for x in l if x is not None]
    else:
        return []
    
def val_from_dict(d, key):
    try:
        return d.get(key, None)
    except AttributeError:
        return None

def unpack_list(l, ind=0):
    try:
        return l[ind]
    except IndexError:
        return None

def dict_2_string(d):
    try:
        return ' '.join("{!s}".format(val) for val in d.values())
    except AttributeError:
        return None

def org_name_counter(df):
    """Counts of each string occurence in name column"""
    return df.name.value_counts()

def unclassified(df):
    """Returns a df of other and company columns where both are False in df"""
    return df[(df.company == False) & (df.other == False)]

@ratelim.patient(600, 300)
def search_org(org_name, s=None):
    """Search companies house for an organisation by name (org_name)"""
    if not s:
        r = chwrapper.Search().search_companies(org_name)
    else:
        r = s.search_companies(org_name)
    return r.json()

@ratelim.greedy(1, 2)
def search_open_corporates(org_name, country_code=None, api_token=None):
    """Search the Open Corporates database using a string"""
    base_url = 'https://api.opencorporates.com/companies/search'
    params = {'q': org_name,
              'country_code': country_code,
              'api_token': api_token}
    
    r = requests.get(base_url, params=params)
    return r.json()

In [66]:
r = search_open_corporates("Nesta", country_code='gb', api_token=oc_key)

In [67]:
r['results']

{'companies': [{'company': {'branch_status': None,
    'company_number': '08506131',
    'company_type': 'Private Limited Company',
    'created_at': '2013-04-26T11:12:10+00:00',
    'current_status': 'Dissolved',
    'dissolution_date': '2014-09-30',
    'inactive': True,
    'incorporation_date': '2013-04-26',
    'jurisdiction_code': 'gb',
    'name': 'BWB (NO.3) LIMITED',
    'opencorporates_url': 'https://opencorporates.com/companies/gb/08506131',
    'previous_names': [{'company_name': 'NESTA LIMITED',
      'end_date': '2013-07-22',
      'start_date': '2013-04-26'}],
    'registered_address': {'country': 'United Kingdom',
     'locality': 'London',
     'postal_code': 'EC4M 6YH',
     'region': None,
     'street_address': '2-6 Cannon Street'},
    'registered_address_in_full': '2-6 Cannon Street, London, EC4M 6YH',
    'registry_url': 'https://beta.companieshouse.gov.uk/company/08506131',
    'restricted_for_marketing': None,
    'retrieved_at': '2016-03-28T07:04:06+00:00',
  

In [4]:
sql_str = """
    SELECT id, name, addresses 
      FROM gtr.organisations
"""

# Read in config file with DB params
with open('../scripts/config.json') as f:
    conf = json.load(f)
    
# Define a connection string
conn_string = 'host={} dbname={} user={} password={}'.format(conf.get('host'),
                                                             conf.get('database'),
                                                             conf.get('user'),
                                                             conf.get('passw'))

# Create a connection object
conn = psycopg2.connect(conn_string)

In [5]:
# Dataframe from SQL data
df = pd.read_sql(sql_str, conn)

# Lowercase all the string names for easier matching
df['name'] = df.name.map(lambda x: x.lower())

# Column of boolean values (true if is company)
df['company'] = df.name.apply(check_type)

# Column of boolean values if (true if is not company)
df['other'] = df.name.apply(check_type, company=False)

# Get postcodes
df['postcodes'] = gtr_postcodes(df.addresses)

# Unpack the addess values
df['addresses'] = unpack_address(df.addresses)

print_stats(df)

Other:		4214
Companies:	11357
Unclassified:	10063
Clashes:	69


So, we have 4,214 other organisations, 11,357 companies and 10,063 unclassified organisations. We'll take a closer look at the unclassified organisations first, to see whether we can add to our classification lists. We only have 69 clashes, which is great to see, as it shows there is a clear seperation of nomenclature betweent the types of organisations at this point.

In [6]:
# Counts of each string occurence in name column
unclassified(df)

Unnamed: 0,id,name,addresses,company,other,postcodes
4,B76FE143-1E07-40B0-8932-028207296A78,china earthquake administration,2F9F0FB5-0E96-42A6-9071-F76DD08C99FA China MAI...,False,False,
9,BA59F886-138D-41EB-B9B1-0C79E45C61D3,committee on climate change,6113B4BF-FF29-4BE9-937A-6E93AC2F3E19 SW1P 1JA ...,False,False,SW1P 1JA
11,C059DB15-1836-430B-A993-0759206ABBB8,openreach bt,A90ABD3A-335B-4455-B796-9146A9A385A7 EC1A 7AJ ...,False,False,EC1A 7AJ
16,026F1607-31FD-4EE2-B366-1470B441EAC7,economics,,False,False,
20,C8A76B9E-4F39-490B-BCCF-009F592A756C,csr corporate social responsibility,6914032F-C07E-4ED7-9F81-21ABA2BAB84F BT3 9DT M...,False,False,BT3 9DT
28,D42BD99F-AD31-43D6-8FBE-0B96F1E266D3,resonance104.4fm,1FF31D87-553C-47CB-A4A9-E0757191427E SE1 1LB M...,False,False,SE1 1LB
30,CF4A4186-2B62-41F1-BC4E-0151FE4C5784,metabolic biology,,False,False,
33,D5F4323F-A8DF-4869-B0F0-0EB45A7A7F95,electricity supply board (esb),11E150E4-2064-42A2-B01B-6AA8DCD69B64 MAIN_ADDR...,False,False,
36,D72C440C-6E4E-4FEE-94D9-0652BB970C3B,soco music project,58CDE28A-B252-4CD7-8016-BBC0DEEF1F76 SO14 6GZ ...,False,False,SO14 6GZ
39,D819E77B-1EC0-4952-BA54-04F155598E41,forum of mobilty centre,77738AC0-C325-478D-81F6-0691292B3A0A TN26 2JX ...,False,False,TN26 2JX


The list above shows that there are some very common erroneous entries in the organisation name field in Gateway to Research. We will assign all entries in the name column of the df DataFrame object as other = True and company = False, with the exception of 'unknown' and 'unlisted'.

In [14]:
org_counts = org_name_counter(df)
org_counts[(org_counts >= 5)]

unlisted                                 99
research                                 31
economics                                27
psychology                               25
history                                  24
chemistry                                21
physics                                  20
computer science                         19
english                                  17
geography                                17
law                                      17
philosophy                               17
education                                16
music                                    15
mathematics                              15
sociology                                14
grants administration                    13
politics                                 13
business school                          13
biological sciences                      13
research office                          12
unknown                                  12
finance                         

In [15]:
other_list = list(org_counts[org_counts >= 5].index)
other_list.remove('unlisted')
other_list.remove('unknown')

In [16]:
df.loc[(df.company == False) & (df.other == False), 'other'] = df[(df.company == False) &
                                                     (df.other == False)].name.isin(other_list)

In [17]:
print_stats(df)

Other:		4654
Companies:	11357
Unclassified:	9623
Clashes:	69


Let's just repeat this for those names appearing three time or more and less than 5 times

In [20]:
# Counts of each string occurence in name column
org_counts = org_name_counter(df)
org_counts[(org_counts >= 3) & (org_counts < 5)]

finance office                             4
materials                                  4
marketing                                  4
sch of modern languages                    4
research support services                  4
school of health sciences                  4
scottish association for marine science    4
administration                             4
anthropology                               4
psychiatry                                 4
sch of chemistry                           4
civil engineering                          4
institute of education                     4
school of chemistry                        4
school of physics and astronomy            4
biomedical sciences                        4
school of biological sciences              4
school of humanities                       4
biology                                    4
sch of social sciences                     4
architecture                               4
school of social sciences                  4
theology a

This time, there is one entry to exclude from the classification:
- BAE Systems (features 3 times)

In [21]:
other_list = list(org_counts[org_counts >= 3].index)
other_list.remove('unlisted')
other_list.remove('unknown')
other_list.remove('bae systems')

In [22]:
df.loc[(df.company == False) & (df.other == False), 'other'] = df[(df.company == False) &
                                                     (df.other == False)].name.isin(other_list)

In [23]:
print_stats(df)

Other:		4785
Companies:	11357
Unclassified:	9492
Clashes:	69


And once more, for org names appearing twice

In [24]:
# Counts of each string occurence in name column
org_counts = org_name_counter(df)
org_counts[(org_counts >= 2) & (org_counts < 3)]

department of physics                                2
art and design                                       2
denso corporation                                    2
materials engineering research laboratory limited    2
spend360 international limited                       2
london & scandinavian metallurgical co limited       2
medpharm ltd                                         2
mrc cognition and brain sciences unit                2
xceleron ltd                                         2
mrc toxicology unit                                  2
babraham institute                                   2
hispanic studies                                     2
university of oxford                                 2
stork technologies                                   2
criminology                                          2
management science                                   2
pipehawk plc                                         2
broad institute                                      2
agricultur

In [25]:
print('Num entries where name is >= 2 and < 3:\t{}'.format(
        org_counts[(org_counts >= 2) & (org_counts < 3)].sum()))

Num entries where name is >= 2 and < 3:	824


### Classifying with Companies House Data

At this point, manually assigning identifiers will become too problematic; there are a lot of strings to look at, and the differentiation between what is a company and what isn't is becoming less clear. An alternative method of classification for these remaining organisations is to use the Companies House API to compare organisation name strings against company names on their database. This will by no means classify all of the remainder of the organisations, but it may reduce the number remaining slightly. Specifically, we will try to classify organisations that have a specific UK postcode entry to be matched to Companies House data.

In [234]:
# first get a random sample that contain UK postcodes
# This is so we can easily test our appraoch on a smaller sample
df_rand = unclassified(df).sample(1000)
df_rand = df_rand[df_rand.postcodes.notnull()]
num_pc_entries = len(df_rand)
print("This run has {} random entries containing postcode information.".format(num_pc_entries))

This run has 619 random entries containing postcode information.


We use a simple process to define a match. First, the name of the organisation is taken from the dataframe, this is then passed as a search query to the Companies House API. If this search returns a match, then the postcode of that organisation is compared to the one returned by Companies House.

The caveat to this approach is that some organisations will have different postcodes registered with Companies House than they do on GtR.

In [235]:
df_rand['ch_json'] = df_rand.name.apply(search_org)
df_rand['ch_postcodes'] = df_rand.ch_json.apply(ch_postcodes)

In [245]:
df_rand.head()

Unnamed: 0,id,name,addresses,company,other,postcodes,ch_json,ch_postcodes
12029,D91F10D1-0DFD-4A2E-9C0B-62C0B91EA840,renew northwest,97183A3A-3B79-4952-8341-90263571A5B0 L1 4DQ Un...,False,False,L1 4DQ,"{'kind': 'search#companies', 'total_results': ...",[OL9 7NZ]
7367,703E38C8-1C6E-4EE3-9AFD-6CCA38A6F956,dr williams's trust and library,C247F5BC-1489-4401-B53F-8A6D82009DC6 WC1H 0AR ...,False,False,WC1H 0AR,"{'kind': 'search#companies', 'total_results': ...",[]
204,1F97D59E-5F53-4FB1-969C-04B180029CD2,odgers conservation,D38FD631-7F47-432E-9DF4-806FD0E8BE45 TA11 6BN ...,False,False,TA11 6BN,"{'kind': 'search#companies', 'total_results': ...",[BS1 5HH]
7434,BCC12FEE-7A90-4AFE-8594-713906D8289E,regen sw (south west),C8CFD016-12F7-4648-9277-26ED41B27A54 EX4 4RN M...,False,False,EX4 4RN,"{'kind': 'search#companies', 'total_results': ...",[]
6363,A24EE84A-BAE3-4612-8E2B-085DD0056BC6,british assoc for adoption and fostering,C57BD228-9433-49D5-842D-222DB35F0281 EC1N 8TS ...,False,False,EC1N 8TS,"{'kind': 'search#companies', 'total_results': ...",[B3 2BJ]


In [246]:
for index, row in df_rand.iterrows():
    df_rand.loc[index,'is_ch_company'] = row.postcodes in row.ch_postcodes

In [256]:
print('Out of {} organisations, {} were matched as companies.'.format(
        len(df_rand),
        len(df_rand[df_rand.is_ch_company == True])))

Out of 619 organisations, 134 were matched as companies.


We can now apply the same approach to the full dataframe

In [None]:
df['ch_json'] = df.name.apply(search_org)
df['ch_postcodes'] = df.ch_json.apply(ch_postcodes)

In [9]:
for index, row in df.iterrows():
    df.loc[index,'is_ch_company'] = row.postcodes in row.ch_postcodes

In [10]:
print('Out of {} organisations, {} were matched as companies.'.format(
        len(df[(df.company == False) & (df.other == False)]),
        len(df[(df.is_ch_company == True) & (df.company == False) & (df.other == False)])))

Out of 9492 organisations, 1495 were matched as companies.


In [30]:
# Update company column with new matches
for index, row in df.iterrows():
     if row.company:
        df.loc[index, 'company'] = row.company
     elif not row.other:
        df.loc[index, 'company'] = row.is_ch_company
     else:
        df.loc[index, 'company'] = False
        
df.drop('is_ch_company', axis=1, inplace=True)
print_stats(df)

Other:		4785
Companies:	12852
Unclassified:	7997
Clashes:	69


Our final approach to organisational classification is to use the Open Corporates database. The reason for doing so is that the database contains nformation regarding dissolved companies, whereas Companies House do not currently release this via their API (though they may have plans to do so).

In [8]:
oc_key = os.environ["OPEN_CORPS_KEY"]

In [76]:
# Update company column with new matches
l = []
for index, row in df.iterrows():
    if not row.company and not row.other:
        r = search_open_corporates(row.name,
                                   country_code='gb',
                                   api_token=oc_key)
        l.append(r)
    else:
        l.append(False)

# New column from results list        
df['oc_results'] = pd.Series(l)

In [211]:
# Column of postcodes to match on
df['oc_postcodes'] = df.oc_results.apply(oc_postcodes)

In [212]:
for index, row in df.iterrows():
    df.loc[index,'is_oc_company'] = row.postcodes in row.oc_postcodes

In [252]:
df[df.is_oc_company == True]

Unnamed: 0,id,name,addresses,company,other,postcodes,ch_json,ch_postcodes,oc_results,oc_postcodes,is_oc_company


The Open Corporates search didn't classify any of the remainder of the organisations. We now take a final step of checking the small number of clashes in the dataframe, where an organisation is classed as both a company and an organisation.

In [23]:
df[(df.company == True) & (df.other == True)]

Unnamed: 0,id,name,addresses,company,other,postcodes,ch_json,ch_postcodes,oc_results,oc_postcodes,is_oc_company
557,E0242D92-6AA5-489E-93BF-647C834422C9,sphere science ltd,HP20 2RS MAIN_ADDRESS 1459448184000 14 Bourbon...,True,True,HP20 2RS,"{'start_index': 0, 'total_results': 1, 'page_n...",[HP20 2RS],False,[],False
940,59B462B7-8D1A-43F0-9E11-C8210A295255,gardline marine sciences limited,NR30 3NG Great Yarmouth MAIN_ADDRESS 145944818...,True,True,NR30 3NG,"{'start_index': 0, 'total_results': 2, 'page_n...","[NR30 3NG, NR30 3NG]",False,[],False
945,5154184E-5F5F-4E78-BDA7-C37CC089FC21,success at school limited,NW2 5QU London MAIN_ADDRESS 1459448184000 124 ...,True,True,NW2 5QU,"{'start_index': 0, 'total_results': 2, 'page_n...","[NW2 5QU, B70 9QG]",False,[],False
1510,0D2FC2CE-DE17-4F8B-ACE2-32EB737D6387,linnaeus plant sciences inc,C1D8C174-469B-4ED1-A4DA-A48AA514A2C6 V6R 4R8 C...,True,True,V6R 4R8,"{'start_index': 0, 'total_results': 0, 'page_n...",[],False,[],False
2100,85C0137D-EFE0-4C4F-A4C6-8B31C4F586A5,field science limited,BS9 2RN Bristol United Kingdom MAIN_ADDRESS 14...,True,True,BS9 2RN,"{'start_index': 0, 'total_results': 2, 'page_n...","[BS9 2RN, B98 8LG]",False,[],False
2214,2B82458D-330C-4EB8-A6A5-EC8905115CEB,the capita group plc / nhs choices,SW1H 0XA LONDON United Kingdom MAIN_ADDRESS 14...,True,True,SW1H 0XA,"{'start_index': 0, 'total_results': 0, 'page_n...",[],False,[],False
2781,B020994B-44EA-4B36-8CF5-50ECC1785DD4,elife sciences publications ltd,"CB2 1JP MAIN_ADDRESS 1459448184000 1st Floor, ...",True,True,CB2 1JP,"{'start_index': 0, 'total_results': 0, 'page_n...",[],False,[],False
2897,0D9CAEAF-C667-4EED-8AE3-C63A0D71E244,t-ray science inc,6385F398-6982-46C2-ADFC-E310911A8C3D Canada MA...,True,True,,"{'start_index': 0, 'total_results': 0, 'page_n...",[],False,[],False
3120,E6AE9487-83B8-4FEA-9E6B-94E1757F1287,science city bristol ltd,5081BAA1-E349-48B8-8B37-CC784D8A2206 BS16 7FR ...,True,True,BS16 7FR,"{'start_index': 0, 'total_results': 1, 'page_n...",[BS6 7AW],False,[],False
3341,BB919D63-84AF-416E-BE6C-85893EDD3BF4,science practice ltd,EC2A 4NQ London MAIN_ADDRESS 1459448184000 Lon...,True,True,EC2A 4NQ,"{'start_index': 0, 'total_results': 1, 'page_n...",[EC2A 4NQ],False,[],False


In [39]:
other = [1375, 10225]
mask = df['other'].isin(other)
df.loc[(~mask) & (df.company == True) & (df.other == True), 'other'] = False

In [43]:
for x in other:
    df.set_value(x, 'company', False) 

In [44]:
print_stats(df)

Other:		4718
Companies:	12850
Unclassified:	7997
Clashes:	0


In [45]:
# Save to pickle for easy loading
df.to_pickle('df_organisation_type.p')

In [47]:
# Read in config file with DB params
with open('../scripts/config.json') as f:
    conf = json.load(f)
    
engine = create_engine('postgresql://{}:{}@{}/{}'.format(conf.get('user'),
                                                         conf.get('passw'),
                                                         conf.get('host'),
                                                         conf.get('database')))

conn = engine.connect()

df[['id', 'company', 'other']].to_sql('orgs_type',
                                      conn,
                                      schema='gtr',
                                      if_exists='replace',
                                      index=False)