### Matching Maritime & Coastguard Agency recruitment & placement agencies to companies house data

In [1]:
import requests
import io
import pandas as pd
import numpy as np
import probablepeople as pp
from bs4 import BeautifulSoup

In [2]:
# Function to strip stopwords from company names

def strip_stopwords(raw_name):    
    company_stopwords = { 'LIMITED', 'LTD', 'SERVICES', 'COMPANY', 'GROUP', 'PROPERTIES', 'CONSULTING', 
        'HOLDINGS', 'UK', 'TRADING', 'LTD.' }
    return(' '.join([raw_name_part for raw_name_part in raw_name.split() if raw_name_part not in company_stopwords]))

# Function to extract postcode as last 2 substrings in address
# To Fix: Not perfect if postcode not last entry in address

def extract_postcode(address):
    return ' '.join(address.split()[-2:])

In [3]:
# Maritime and Coastguard Agency - List of approved recruitment and placement agencies

url = "https://www.gov.uk/government/publications/recruitment-and-placement-agencies-approved-by-the-mca"

In [4]:
# Download the csv file, extract name and drop erroroneous last row with all nulls

with requests.Session() as req:
    r = req.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    targets = [f"{item['href']}" for item in soup.select(
        "a[href$='.csv']")]
    for target in targets:
        response = req.get(target)    
        mdf = pd.read_csv(io.BytesIO(response.content))
        mdf = mdf.dropna(how='all')

In [5]:
# Extract Postcode and Cleanse Name Column of Stopwords
# Create unique index column needed by Splink from datafram index

mdf['ADDRESS & CONTACT DETAILS']=mdf['ADDRESS & CONTACT DETAILS'].astype('string')
mdf['Postcode'] = mdf.apply(lambda row: extract_postcode(row['ADDRESS & CONTACT DETAILS']), axis=1)
mdf['Location'] = mdf['LOCATION'].str.upper()
mdf['CompanyName'] = mdf['COMPANY'].str.upper()
mdf['CompanyName'].replace('', np.nan, inplace=True)
mdf['CompanyName'] = mdf['CompanyName'].astype(str)
mdf['CompanyName'] = mdf.apply(lambda row: strip_stopwords(row['CompanyName']), axis=1)
mdf['unique_id'] = mdf.index

In [6]:
# Subset down to required columns
# Number of agencies to match

mdf = mdf[['Postcode','CompanyName','Location','unique_id']]
len(mdf)

103

In [7]:
# Read basic company details

cdf = pd.read_csv('basic_slim.csv')
cdf = cdf.rename(columns={"RegAddress.PostCode": "Postcode", 'RegAddress.PostTown': 'Location'})
cdf['CompanyName'] = cdf.apply(lambda row: strip_stopwords(row['CompanyName']), axis=1)
cdf['CompanyName'].replace('', np.nan, inplace=True)
cdf['unique_id'] = cdf.index

In [8]:
# Subset down to required columns
# Number of companies to match against

cdf = cdf[['Postcode','CompanyName','Location','unique_id']]
len(cdf)

5184795

In [9]:
# Number of exact matches

exact = mdf.merge(cdf,left_on=['Postcode','CompanyName'], right_on=['Postcode','CompanyName'],
          suffixes=('_left', '_right'))
len(exact)

50

In [11]:
import recordlinkage

In [12]:
# Block on postcode
# Number of possible links

indexer = recordlinkage.Index()
indexer.block("Postcode")
candidate_links = indexer.index(cdf, mdf)
len(candidate_links)

150220

In [13]:
# Number of matches on postcode and name

compare_cl = recordlinkage.Compare()
compare_cl.string("CompanyName", "CompanyName", method='jarowinkler',threshold=0.85)
compare_cl.exact("Postcode","Postcode")
features = compare_cl.compute(candidate_links, cdf, mdf)

# Name exact or approx match and postcode matches
matches = features[(features[0]==1) & (features[1]==1)]
len(matches)

121

In [14]:
# Rename indexes to allow join

matches.index.names = ['cdf','mdf']
cdf.index.names= ['cdf']
mdf.index.names= ['mdf']

# Lookup both names
matches = matches.join(cdf, how='inner')
matches = matches.join(mdf, how='inner', rsuffix='_mdf')

# Select those with only approx match not exact match
approx = matches[matches['CompanyName']!=matches['CompanyName_mdf']]
len(approx)

71

In [16]:
# List of those companies matched at least once
found = matches.index.unique(level='mdf')
len(found)

59

In [17]:
# Select those companies not found 
notfound = mdf.loc[(mdf.index.isin(found, level='mdf') == False)]
len(notfound)

44

In [19]:
# Postcode matches but name doesn't
postmatches = features[(features[0]!=1) & (features[1]==1)]
postmatches.index.names = ['cdf','mdf']

postmatches = postmatches.join(cdf, how='outer')
postmatches = postmatches.join(mdf, how='inner', rsuffix='_mdf')
len(postmatches)

150099

In [19]:
import splink

In [20]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": [
        "l.Postcode = r.Postcode",
    ],
    "comparisons": [
        cl.jaro_winkler_at_thresholds("CompanyName",distance_threshold_or_thresholds=[0.9]),
    ],    
    "retain_intermediate_calculation_columns" : True,
    "retain_matching_columns" : True
}

In [21]:
# To fix: typically doesn't calculate u value as doesn't find any matches

linker = DuckDBLinker([mdf, cdf], settings, input_table_aliases=["mdf", "cdf"])
linker.estimate_u_using_random_sampling(target_rows=1e6)

----- Estimating u probabilities using random sampling -----
u probability not trained for CompanyName - Exact match (comparison vector value: 2). This usually means the comparison level was never observed in the training data.

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - CompanyName (some u values are not trained, no m values are trained).


In [22]:
linker.estimate_parameters_using_expectation_maximisation("l.Postcode = r.Postcode")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.Postcode = r.Postcode

Parameter estimates will be made for the following comparison(s):
    - CompanyName

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.527 in the m_probability of CompanyName, level `Exact match`
Iteration 2: Largest change in params was 0.379 in the m_probability of CompanyName, level `jaro_winkler_similarity >= 0.9`
Iteration 3: Largest change in params was 0.045 in the m_probability of CompanyName, level `jaro_winkler_similarity >= 0.9`
Iteration 4: Largest change in params was -0.013 in the m_probability of CompanyName, level `Exact match`
Iteration 5: Largest change in params was -0.00466 in the m_probability of CompanyName, level `Exact match`
Iteration 6: Largest change in params was -0.00168 in the m_probability of CompanyName, level `Exact match`

<EMTrainingSession, blocking on l.Postcode = r.Postcode, deactivating comparisons >

In [23]:
results = linker.predict()


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'CompanyName':
    u values not fully trained


In [24]:
resultsdf = results.as_pandas_dataframe()
bespoke = resultsdf[resultsdf['CompanyName_l']=='AIR RESOURCES']
linker.waterfall_chart(bespoke.to_dict(orient='records'), filter_nulls=False)

In [None]:
df_splink = linker.predict(threshold_match_probability = 0.001).as_pandas_dataframe()
len(df_splink)