# Chapter 6 - Organisation Matching

## Step 1 - Data Acquisition

### Companies House Basic Data

In [1]:
import io
import json
import zipfile
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Page from which the UK Companies House Basic Company Data snapshot links are scraped.
# NOTE: the download cell slices this string at a fixed offset, so keep the value unchanged.

url = "http://download.companieshouse.gov.uk/en_output.html"

In [3]:
# Download every Basic Company Data snapshot (a zipped CSV), keep only the
# columns needed for matching and combine them into a single dataframe

keep_columns = ['RegAddress.PostCode', 'RegAddress.PostTown', 'CompanyName']
snapshot_frames = []
with requests.Session() as req:
    r = req.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    # Resolve each href against the page URL rather than slicing the URL
    # string at a hard-coded character offset.
    snapshots = [urljoin(url, item['href']) for item in soup.select(
        "a[href*='BasicCompanyData-']")]
    for snapshot in snapshots:
        # Go through the open session so its connection is reused.
        response = req.get(snapshot)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as zipsnapshot:
            member = zipsnapshot.namelist()[0]
            print(member)
            # Read the CSV straight out of the in-memory archive; nothing is
            # extracted into the working directory.
            with zipsnapshot.open(member) as csvfile:
                df_comp = pd.read_csv(csvfile, dtype='unicode', usecols=keep_columns)
        # usecols keeps file order, so re-select to fix the column order.
        snapshot_frames.append(df_comp[keep_columns])

# Concatenate once at the end: growing a dataframe inside the loop re-copies
# all previously loaded rows on every iteration.
if snapshot_frames:
    df_comptotal = pd.concat(snapshot_frames, ignore_index=True)
else:
    df_comptotal = pd.DataFrame(columns=keep_columns)

### Maritime and Coastguard Agency

In [44]:
# Maritime and Coastguard Agency publication page: list of approved recruitment and placement agencies.
# NOTE: this rebinds the same `url` name used for the Companies House page above.

url = (
    "https://www.gov.uk/government/publications/"
    "recruitment-and-placement-agencies-approved-by-the-mca"
)

In [45]:
# Download the CSV attachment linked from the publication page and drop rows
# that are entirely null (the file ends with an erroneous all-null row)

with requests.Session() as req:
    r = req.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    targets = [item['href'] for item in soup.select("a[href$='.csv']")]
    if not targets:
        # Fail loudly instead of leaving `mdf` undefined (or stale from an earlier run).
        raise RuntimeError(f"No CSV attachment link found on {url}")
    # If the page links several CSV files, the last one downloaded wins.
    for target in targets:
        response = req.get(target)
        response.raise_for_status()
        mdf = pd.read_csv(io.BytesIO(response.content))
        mdf = mdf.dropna(how='all')

### Saving to Local Storage

In [74]:
# Persist the raw downloads and reload them from disk. `cdf` (the working copy
# of the Companies House data) is defined here; every later cell depends on it,
# so these lines must run for the notebook to work on a fresh kernel.
df_comptotal.to_csv('basic_raw.csv', index=False)
# Read everything back as text, matching how the snapshots were parsed.
cdf = pd.read_csv('basic_raw.csv', dtype=str)

mdf.to_csv('mdf_raw.csv', index=False)
mdf = pd.read_csv('mdf_raw.csv')

## Step 2 - Data Standardization

### Companies House Basic Data

In [8]:
# Function to remove stopwords from company names

# Built once at definition time instead of on every call.
COMPANY_STOPWORDS = frozenset({
    'LIMITED', 'LTD', 'SERVICES', 'COMPANY', 'GROUP', 'PROPERTIES', 'CONSULTING',
    'HOLDINGS', 'UK', 'TRADING', 'LTD.', 'PLC', 'LLP',
})


def strip_stopwords(raw_name, stopwords=COMPANY_STOPWORDS):
    """Remove stopword tokens from a company name.

    The name is split on whitespace, every token that is an exact
    (case-sensitive) member of ``stopwords`` is dropped, and the remaining
    tokens are re-joined with single spaces.

    Parameters
    ----------
    raw_name : str
        Company name. The default stopwords are upper-case, so upper-case the
        name first if it should be matched against them.
    stopwords : collection of str, optional
        Tokens to remove. Defaults to ``COMPANY_STOPWORDS``.

    Returns
    -------
    str
        The name without stopword tokens. A non-string input (e.g. the
        ``NaN``/``None`` pandas uses for a missing name) is returned unchanged
        so that a later ``dropna`` can deal with it.
    """
    if not isinstance(raw_name, str):
        return raw_name
    return ' '.join(part for part in raw_name.split() if part not in stopwords)

In [85]:
# Map over the single column: a row-wise DataFrame.apply(axis=1) builds a row
# object per record, which is needlessly slow on the full company register.
cdf['CompanyName'] = cdf['CompanyName'].map(strip_stopwords)
cdf = cdf.rename(columns={"RegAddress.PostCode": "Postcode", 'RegAddress.PostTown': 'Location'})

In [86]:
# Use the row index as the record identifier, then keep only the matching columns.
cdf = (
    cdf
    .assign(unique_id=cdf.index)
    .loc[:, ['Postcode', 'CompanyName', 'Location', 'unique_id']]
)

### Maritime and Coastguard Agency

In [53]:
# Function to extract postcode using regular expression

import re

# Compiled once at definition time. Matches an upper-case outward code
# (1-2 letters, a digit, an optional letter/digit), a single space, then the
# inward code (a digit followed by two letters from the permitted set).
POSTCODE_PATTERN = re.compile(r'([A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2})')


def extract_postcode(address):
    """Return the first postcode found in ``address``.

    Parameters
    ----------
    address : str
        Free-text address. Matching is case-sensitive and requires exactly one
        space between the outward and inward parts of the postcode.

    Returns
    -------
    str or None
        The first substring matching ``POSTCODE_PATTERN``; ``None`` when there
        is no match or when ``address`` is not a string (e.g. the ``NaN``
        pandas uses for a missing cell).
    """
    if not isinstance(address, str):
        return None
    postcode = POSTCODE_PATTERN.search(address)
    return postcode.group() if postcode is not None else None

In [75]:
# Add upper-cased copies of the raw location and company columns under the
# column names shared with the Companies House frame.
for source_column, target_column in (('LOCATION', 'Location'), ('COMPANY', 'CompanyName')):
    mdf[target_column] = mdf[source_column].str.upper()

In [76]:
# Column-wise operations instead of row-wise DataFrame.apply(axis=1).
mdf['CompanyName'] = mdf['CompanyName'].map(strip_stopwords)
mdf['Postcode'] = mdf['ADDRESS & CONTACT DETAILS'].map(extract_postcode)
# A Location of '0' is a placeholder: blank it out so it counts as missing.
mdf['Location'] = mdf['Location'].mask(mdf['Location'] == '0')

In [77]:
# Discard rows with a missing value in any column, then record the surviving
# row index as the record identifier.
mdf = mdf.dropna().assign(unique_id=lambda frame: frame.index)

In [78]:
# Keep only the columns used for matching and report how many records remain.
mdf = mdf.loc[:, ['Postcode', 'CompanyName', 'Location', 'unique_id']]
mdf.shape[0]

94

### Saving to Local Storage

In [79]:
# Optional on-disk copy of the standardised Companies House frame (disabled here;
# `cdf` is carried forward in memory instead).
#cdf.to_csv('basic_clean.csv')
#cdf = pd.read_csv('basic_clean.csv')

# Write the standardised MCA frame to disk (without the index) and read it straight back in.
mdf.to_csv('mdf_clean.csv', index=False)
mdf = pd.read_csv('mdf_clean.csv')

## Step 3 - Record Blocking and Attribute Comparison

In [87]:
# Baseline: records that agree exactly on name, location and postcode.
# The `_m` / `_c` suffixes tell the MCA and Companies House unique_id columns apart.
exact = pd.merge(
    mdf,
    cdf,
    on=['CompanyName', 'Location', 'Postcode'],
    suffixes=('_m', '_c'),
)
exact

Unnamed: 0,Postcode,CompanyName,Location,unique_id_m,unique_id_c
0,EH7 4HG,ADVANCE GLOBAL RECRUITMENT,EDINBURGH,2,186608
1,PO6 4PR,ADVANCED RESOURCE MANAGERS,PORTSMOUTH,3,189061
2,M3 5FS,AIR RESOURCES,MANCHESTER,4,222539
3,CO4 9RS,ALIS GLOBAL,COLCHESTER,5,265985
4,NN2 6NY,BELINDA KING CREATIVE PRODUCTIONS,NORTHAMPTON,8,594523
5,BH25 5SJ,BESPOKE CREW,NEW MILTON,9,615803
6,EH12 6DE,CARA LEES YACHT CREW,EDINBURGH,11,884835
7,SO15 1ST,CARNIVAL,SOUTHAMPTON,12,898423
8,BR2 9AT,CELOX YACHTING,BROMLEY,14,936580
9,G51 2SE,CLYDE MARINE RECRUITMENT,GLASGOW,16,1062396


In [88]:
import splink

In [109]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
# Splink settings: candidate pairs are generated where Postcode or CompanyName
# agree exactly, and each pair is compared on exact CompanyName and exact Location.
settings = dict(
    link_type="link_only",
    blocking_rules_to_generate_predictions=[
        "l.Postcode = r.Postcode",
        "l.CompanyName = r.CompanyName",
    ],
    comparisons=[
        cl.exact_match("CompanyName"),
        # cl.jaro_winkler_at_thresholds("CompanyName",[0.9]),
        cl.exact_match("Location"),
    ],
    retain_intermediate_calculation_columns=True,
    retain_matching_columns=True,
)
linker = DuckDBLinker(
    [mdf, cdf],
    settings,
    input_table_aliases=["mdf", "cdf"],
)

In [None]:
# Use 1e7 target rows to ensure u estimation; a smaller sample can leave
# u values untrained (predict() then falls back to default values).

linker.estimate_u_using_random_sampling(target_rows=1e7)

----- Estimating u probabilities using random sampling -----


In [None]:
# Run expectation-maximisation parameter estimation using the blocking rule
# "l.Postcode = r.Postcode" (i.e. over pairs whose postcodes are equal).
linker.estimate_parameters_using_expectation_maximisation("l.Postcode = r.Postcode")

In [112]:
# Score the candidate pairs, keep those at or above a 0.5 match probability,
# convert the result to a pandas dataframe and report how many pairs remain.
predictions = linker.predict(threshold_match_probability=0.5)
df_splink = predictions.as_pandas_dataframe()
len(df_splink)


You have called predict(), but there are some parameter estimates which have neither been estimated or specified in your settings dictionary.  To produce predictions the following untrained trained parameters will use default values.
Comparison: 'CompanyName':
    u values not fully trained


0

In [32]:
# Display the scored candidate pairs returned by predict().
df_splink

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,CompanyName_l,CompanyName_r,gamma_CompanyName,bf_CompanyName,Location_l,Location_r,gamma_Location,bf_Location,Postcode_l,Postcode_r,gamma_Postcode,bf_Postcode
0,19.011623,0.999998,cdf,mdf,1062396,16,CLYDE MARINE RECRUITMENT,CLYDE MARINE RECRUITMENT,2,1808.470584,GLASGOW,GLASGOW,1,13.284868,G51 2SE,G51 2SE,1,219966.516804
1,19.011623,0.999998,cdf,mdf,2909536,50,LUNA ROSSA PRODUCTIONS,LUNA ROSSA PRODUCTIONS,2,1808.470584,LONDON,LONDON,1,13.284868,N20 9RT,N20 9RT,1,219966.516804
2,19.011623,0.999998,cdf,mdf,1608162,29,ERSG,ERSG,2,1808.470584,BROMLEY,BROMLEY,1,13.284868,BR1 1WA,BR1 1WA,1,219966.516804
3,19.011623,0.999998,cdf,mdf,1608163,29,ERSG,ERSG,2,1808.470584,BROMLEY,BROMLEY,1,13.284868,BR1 1WA,BR1 1WA,1,219966.516804
4,19.011623,0.999998,cdf,mdf,1623923,31,ETPM,ETPM,2,1808.470584,ABERDEEN,ABERDEEN,1,13.284868,AB12 3AX,AB12 3AX,1,219966.516804
5,15.55431,0.999979,cdf,mdf,2452767,44,JAMES FISHER (ABERDEEN),JAMES FISHER MARINE,1,164.648088,BARROW-IN-FURNESS,BARROW-IN-FURNESS,1,13.284868,LA14 1HR,LA14 1HR,1,219966.516804
6,15.55431,0.999979,cdf,mdf,2452774,44,JAMES FISHER EVERARD,JAMES FISHER MARINE,1,164.648088,BARROW-IN-FURNESS,BARROW-IN-FURNESS,1,13.284868,LA14 1HR,LA14 1HR,1,219966.516804
7,15.55431,0.999979,cdf,mdf,2452775,44,JAMES FISHER,JAMES FISHER MARINE,1,164.648088,BARROW-IN-FURNESS,BARROW-IN-FURNESS,1,13.284868,LA14 1HR,LA14 1HR,1,219966.516804
8,19.011623,0.999998,cdf,mdf,2452776,44,JAMES FISHER MARINE,JAMES FISHER MARINE,2,1808.470584,BARROW-IN-FURNESS,BARROW-IN-FURNESS,1,13.284868,LA14 1HR,LA14 1HR,1,219966.516804
9,15.55431,0.999979,cdf,mdf,2452778,44,JAMES FISHER MFE,JAMES FISHER MARINE,1,164.648088,BARROW-IN-FURNESS,BARROW-IN-FURNESS,1,13.284868,LA14 1HR,LA14 1HR,1,219966.516804


## Step 4 - Match Classification