# Chapter 6 - Organisation Matching

## Step 1 - Data Acquisition

### Companies House Basic Data

In [1]:
import requests
import json
import zipfile
import io
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [2]:
# UK Companies House Basic Company Data download page

url="http://download.companieshouse.gov.uk/en_output.html"

In [3]:
# Download snapshots, convert json to dataframe, remove unwanted columns and append to a single dataframe

df_ct = pd.DataFrame()
with requests.Session() as req:
        r = req.get(url)
        soup = BeautifulSoup(r.content, 'html.parser')
        snapshots = [f"{url[:38]}{item['href']}" for item in soup.select(
            "a[href*='BasicCompanyData-']")]
        for snapshot in snapshots:    
            response = requests.get(snapshot).content     
            zipsnapshot = zipfile.ZipFile(io.BytesIO(response))
            tempfile = zipsnapshot.extract(zipsnapshot.namelist()[0])
            print(zipsnapshot.namelist()[0])
            df_c = pd.read_csv(tempfile, dtype='unicode')
            df_c = df_c[['RegAddress.PostCode','RegAddress.PostTown','CompanyName']]
            df_ct = pd.concat([df_ct, df_c], ignore_index=True)

### Maritime and Coastguard Agency

In [2]:
# Maritime and Coastguard Agency - List of approved recruitment and placement agencies

url = "https://www.gov.uk/government/publications/recruitment-and-placement-agencies-approved-by-the-mca"

In [3]:
# Download the csv file, extract name and drop erroroneous last row with all nulls

with requests.Session() as req:
    r = req.get(url)
    soup = BeautifulSoup(r.content, 'html.parser')
    targets = [f"{item['href']}" for item in soup.select(
        "a[href$='.csv']")]
    for target in targets:
        response = req.get(target)    
        df_m = pd.read_csv(io.BytesIO(response.content))
        df_m = df_m.dropna(how='all')

### Saving to Local Storage

In [5]:
#df_ct.to_csv('basic_raw.csv', index=False)
df_c = pd.read_csv('basic_raw.csv')

#df_m.to_csv('mari_raw.csv', index=False)
df_m = pd.read_csv('mari_raw.csv')

## Step 2 - Data Standardization

### Companies House Basic Data

In [6]:
# Function to remove stopwards from company names

def strip_stopwords(raw_name):    
    company_stopwords = { 'LIMITED', 'LTD', 'SERVICES', 'COMPANY', 'GROUP', 'PROPERTIES', 'CONSULTING', 
        'HOLDINGS', 'UK', 'TRADING', 'LTD.', 'PLC','LLP' }
    return(' '.join([raw_name_part for raw_name_part in raw_name.split() if raw_name_part not in company_stopwords]))

In [7]:
df_c['CompanyName'] = df_c.apply(lambda row: strip_stopwords(row['CompanyName']), axis=1)
df_c = df_c.rename(columns={"RegAddress.PostCode": "Postcode", 'RegAddress.PostTown': 'Location'})

In [8]:
df_c['unique_id'] = df_c.index
df_c = df_c[['Postcode','CompanyName','Location','unique_id']]

### Maritime and Coastguard Agency

In [9]:
# Function to extract postcode using regular expression

import re

def extract_postcode(address):
    pattern = re.compile(r'([A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2})')
    postcode = pattern.search(address)
    if(postcode is not None):
         return postcode.group()
    else:
         return None

In [10]:
df_m['Location'] = df_m['LOCATION'].str.upper()
df_m['CompanyName'] = df_m['COMPANY'].str.upper()

In [11]:
df_m['CompanyName'] = df_m.apply(lambda row: strip_stopwords(row['CompanyName']), axis=1)
df_m['Postcode'] = df_m.apply(lambda row: extract_postcode(row['ADDRESS & CONTACT DETAILS']), axis=1)
df_m['Location'] = df_m.apply(lambda row: row['Location'] if row['Location']!='0' else None, axis=1)

In [12]:
df_m = df_m.dropna()
df_m['unique_id'] = df_m.index

In [13]:
df_m = df_m[['Postcode','CompanyName','Location','unique_id']]
len(df_m)

94

### Saving to Local Storage

In [2]:
#df_c.to_csv('basic_clean.csv')
df_c = pd.read_csv('basic_clean.csv')

#df_m.to_csv('mari_clean.csv', index=False)
df_m = pd.read_csv('mari_clean.csv')

# Step 3 - Record Blocking and Attribute Comparison

In [3]:
exact = df_m.merge(df_c,left_on=['CompanyName','Location','Postcode'], right_on=['CompanyName','Location','Postcode'],
          suffixes=('_m', '_c'))
exact

Unnamed: 0.1,Postcode,CompanyName,Location,unique_id_m,Unnamed: 0,unique_id_c
0,EH7 4HG,ADVANCE GLOBAL RECRUITMENT,EDINBURGH,2,186608,186608
1,PO6 4PR,ADVANCED RESOURCE MANAGERS,PORTSMOUTH,3,189061,189061
2,M3 5FS,AIR RESOURCES,MANCHESTER,4,222539,222539
3,CO4 9RS,ALIS GLOBAL,COLCHESTER,5,265985,265985
4,NN2 6NY,BELINDA KING CREATIVE PRODUCTIONS,NORTHAMPTON,8,594523,594523
5,BH25 5SJ,BESPOKE CREW,NEW MILTON,9,615803,615803
6,EH12 6DE,CARA LEES YACHT CREW,EDINBURGH,11,884835,884835
7,SO15 1ST,CARNIVAL,SOUTHAMPTON,12,898423,898423
8,BR2 9AT,CELOX YACHTING,BROMLEY,14,936580,936580
9,G51 2SE,CLYDE MARINE RECRUITMENT,GLASGOW,16,1062396,1062396


In [4]:
import splink

In [5]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": [
        "l.Postcode = r.Postcode",
        "l.CompanyName = r.CompanyName",
    ],
    "comparisons": [
        cl.jaro_winkler_at_thresholds("CompanyName",[0.9,0.8]),
        cl.exact_match("Location"),
    ],
    "retain_intermediate_calculation_columns" : True,
    "retain_matching_columns" : True
}
linker = DuckDBLinker([df_m, df_c], settings, input_table_aliases=["_m", "_c"])

In [6]:
# Use 1e7 targets rows to ensure u estimation

linker.estimate_u_using_random_sampling(target_rows=1e7)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - CompanyName (no m values are trained).
    - Location (no m values are trained).


In [7]:
linker.estimate_parameters_using_expectation_maximisation("l.Postcode = r.Postcode")
#linker.estimate_parameters_using_expectation_maximisation("l.CompanyName = r.CompanyName")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.Postcode = r.Postcode

Parameter estimates will be made for the following comparison(s):
    - CompanyName
    - Location

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was -0.379 in the m_probability of CompanyName, level `Exact match`
Iteration 2: Largest change in params was 0.519 in the m_probability of CompanyName, level `All other comparisons`
Iteration 3: Largest change in params was 0.365 in the m_probability of CompanyName, level `All other comparisons`
Iteration 4: Largest change in params was 0.533 in probability_two_random_records_match
Iteration 5: Largest change in params was 0.389 in probability_two_random_records_match
Iteration 6: Largest change in params was 0.0209 in probability_two_random_records_match
Iteration 7: Largest change in params was 0.000732 in proba

<EMTrainingSession, blocking on l.Postcode = r.Postcode, deactivating comparisons >

In [11]:
#linker.load_settings_from_json("Chapter6emOnlyPostcode.json")

In [13]:
df_splink = linker.predict(threshold_match_probability=0.9).as_pandas_dataframe()
len(df_splink)

46

In [14]:
df_splink[df_splink['CompanyName_l']!=df_splink['CompanyName_r']]

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,CompanyName_l,CompanyName_r,gamma_CompanyName,bf_CompanyName,Location_l,Location_r,gamma_Location,bf_Location,Postcode_l,Postcode_r,match_key


In [15]:
df_splink[df_splink['Postcode_l']!=df_splink['Postcode_r']]

Unnamed: 0,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,CompanyName_l,CompanyName_r,gamma_CompanyName,bf_CompanyName,Location_l,Location_r,gamma_Location,bf_Location,Postcode_l,Postcode_r,match_key
37,3.396617,0.91328,_c,_m,136776,1,ABLY RESOURCES,ABLY RESOURCES,3,3252.321816,GLASGOW,GLASGOW,1,32.37774,G1 3PT,G2 1PB,1
38,3.396617,0.91328,_c,_m,136777,1,ABLY RESOURCES,ABLY RESOURCES,3,3252.321816,GLASGOW,GLASGOW,1,32.37774,G1 3PT,G2 1PB,1
39,3.396617,0.91328,_c,_m,2698146,47,KUIPER,KUIPER,3,3252.321816,ABERDEEN,ABERDEEN,1,32.37774,AB10 1XL,AB15 6BL,1
40,3.396617,0.91328,_c,_m,3749643,63,PRECISE CONSULTANTS,PRECISE CONSULTANTS,3,3252.321816,LONDON,LONDON,1,32.37774,EC2M 4YT,E1 6DY,1
41,3.396617,0.91328,_c,_m,1430320,27,DRILLMAR RESOURCES,DRILLMAR RESOURCES,3,3252.321816,INVERURIE,INVERURIE,1,32.37774,AB51 3QQ,AB51 5NQ,1
42,3.396617,0.91328,_c,_m,2459873,45,JANSEN MARITIME CREW,JANSEN MARITIME CREW,3,3252.321816,BRIGHTON,BRIGHTON,1,32.37774,BN1 1YR,BN1 4ST,1
43,3.396617,0.91328,_c,_m,3285183,55,MYMUYBUENO RECRUITMENT,MYMUYBUENO RECRUITMENT,3,3252.321816,LONDON,LONDON,1,32.37774,W6 0LJ,SW3 6RD,1
44,3.396617,0.91328,_c,_m,4197604,70,SEAMARINER,SEAMARINER,3,3252.321816,SOUTHAMPTON,SOUTHAMPTON,1,32.37774,SO45 1DD,SO45 1TA,1
45,3.396617,0.91328,_c,_m,3466430,60,OGENUS OFFSHORE,OGENUS OFFSHORE,3,3252.321816,ABERDEEN,ABERDEEN,1,32.37774,AB21 0BH,AB25 2UX,1


In [16]:
results = df_m.merge(df_splink,left_on=['unique_id'], right_on=['unique_id_r'],how='left',
          suffixes=('_m', '_s'))

In [17]:
sub = results.merge(exact,left_on=['unique_id_l'], right_on=['unique_id_c'],how='right')

In [18]:
sub

Unnamed: 0.1,Postcode_x,CompanyName_x,Location_x,unique_id,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,...,bf_Location,Postcode_l,Postcode_r,match_key,Postcode_y,CompanyName_y,Location_y,unique_id_m,Unnamed: 0,unique_id_c
0,EH7 4HG,ADVANCE GLOBAL RECRUITMENT,EDINBURGH,2,3.396617,0.91328,_c,_m,186608.0,2.0,...,32.37774,EH7 4HG,EH7 4HG,0,EH7 4HG,ADVANCE GLOBAL RECRUITMENT,EDINBURGH,2,186608,186608
1,PO6 4PR,ADVANCED RESOURCE MANAGERS,PORTSMOUTH,3,3.396617,0.91328,_c,_m,189061.0,3.0,...,32.37774,PO6 4PR,PO6 4PR,0,PO6 4PR,ADVANCED RESOURCE MANAGERS,PORTSMOUTH,3,189061,189061
2,M3 5FS,AIR RESOURCES,MANCHESTER,4,3.396617,0.91328,_c,_m,222539.0,4.0,...,32.37774,M3 5FS,M3 5FS,0,M3 5FS,AIR RESOURCES,MANCHESTER,4,222539,222539
3,CO4 9RS,ALIS GLOBAL,COLCHESTER,5,3.396617,0.91328,_c,_m,265985.0,5.0,...,32.37774,CO4 9RS,CO4 9RS,0,CO4 9RS,ALIS GLOBAL,COLCHESTER,5,265985,265985
4,NN2 6NY,BELINDA KING CREATIVE PRODUCTIONS,NORTHAMPTON,8,3.396617,0.91328,_c,_m,594523.0,8.0,...,32.37774,NN2 6NY,NN2 6NY,0,NN2 6NY,BELINDA KING CREATIVE PRODUCTIONS,NORTHAMPTON,8,594523,594523
5,BH25 5SJ,BESPOKE CREW,NEW MILTON,9,3.396617,0.91328,_c,_m,615803.0,9.0,...,32.37774,BH25 5SJ,BH25 5SJ,0,BH25 5SJ,BESPOKE CREW,NEW MILTON,9,615803,615803
6,EH12 6DE,CARA LEES YACHT CREW,EDINBURGH,11,3.396617,0.91328,_c,_m,884835.0,11.0,...,32.37774,EH12 6DE,EH12 6DE,0,EH12 6DE,CARA LEES YACHT CREW,EDINBURGH,11,884835,884835
7,SO15 1ST,CARNIVAL,SOUTHAMPTON,12,3.396617,0.91328,_c,_m,898423.0,12.0,...,32.37774,SO15 1ST,SO15 1ST,0,SO15 1ST,CARNIVAL,SOUTHAMPTON,12,898423,898423
8,BR2 9AT,CELOX YACHTING,BROMLEY,14,3.396617,0.91328,_c,_m,936580.0,14.0,...,32.37774,BR2 9AT,BR2 9AT,0,BR2 9AT,CELOX YACHTING,BROMLEY,14,936580,936580
9,G51 2SE,CLYDE MARINE RECRUITMENT,GLASGOW,16,3.396617,0.91328,_c,_m,1062396.0,16.0,...,32.37774,G51 2SE,G51 2SE,0,G51 2SE,CLYDE MARINE RECRUITMENT,GLASGOW,16,1062396,1062396


In [19]:
results[results['match_weight'].notnull()]

Unnamed: 0,Postcode,CompanyName,Location,unique_id,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,...,CompanyName_r,gamma_CompanyName,bf_CompanyName,Location_l,Location_r,gamma_Location,bf_Location,Postcode_l,Postcode_r,match_key
1,G2 1PB,ABLY RESOURCES,GLASGOW,1,3.396617,0.91328,_c,_m,136776.0,1.0,...,ABLY RESOURCES,3.0,3252.321816,GLASGOW,GLASGOW,1.0,32.37774,G1 3PT,G2 1PB,1
2,G2 1PB,ABLY RESOURCES,GLASGOW,1,3.396617,0.91328,_c,_m,136777.0,1.0,...,ABLY RESOURCES,3.0,3252.321816,GLASGOW,GLASGOW,1.0,32.37774,G1 3PT,G2 1PB,1
3,EH7 4HG,ADVANCE GLOBAL RECRUITMENT,EDINBURGH,2,3.396617,0.91328,_c,_m,186608.0,2.0,...,ADVANCE GLOBAL RECRUITMENT,3.0,3252.321816,EDINBURGH,EDINBURGH,1.0,32.37774,EH7 4HG,EH7 4HG,0
4,PO6 4PR,ADVANCED RESOURCE MANAGERS,PORTSMOUTH,3,3.396617,0.91328,_c,_m,189061.0,3.0,...,ADVANCED RESOURCE MANAGERS,3.0,3252.321816,PORTSMOUTH,PORTSMOUTH,1.0,32.37774,PO6 4PR,PO6 4PR,0
5,M3 5FS,AIR RESOURCES,MANCHESTER,4,3.396617,0.91328,_c,_m,222539.0,4.0,...,AIR RESOURCES,3.0,3252.321816,MANCHESTER,MANCHESTER,1.0,32.37774,M3 5FS,M3 5FS,0
6,CO4 9RS,ALIS GLOBAL,COLCHESTER,5,3.396617,0.91328,_c,_m,265985.0,5.0,...,ALIS GLOBAL,3.0,3252.321816,COLCHESTER,COLCHESTER,1.0,32.37774,CO4 9RS,CO4 9RS,0
9,NN2 6NY,BELINDA KING CREATIVE PRODUCTIONS,NORTHAMPTON,8,3.396617,0.91328,_c,_m,594523.0,8.0,...,BELINDA KING CREATIVE PRODUCTIONS,3.0,3252.321816,NORTHAMPTON,NORTHAMPTON,1.0,32.37774,NN2 6NY,NN2 6NY,0
10,BH25 5SJ,BESPOKE CREW,NEW MILTON,9,3.396617,0.91328,_c,_m,615803.0,9.0,...,BESPOKE CREW,3.0,3252.321816,NEW MILTON,NEW MILTON,1.0,32.37774,BH25 5SJ,BH25 5SJ,0
12,EH12 6DE,CARA LEES YACHT CREW,EDINBURGH,11,3.396617,0.91328,_c,_m,884835.0,11.0,...,CARA LEES YACHT CREW,3.0,3252.321816,EDINBURGH,EDINBURGH,1.0,32.37774,EH12 6DE,EH12 6DE,0
13,SO15 1ST,CARNIVAL,SOUTHAMPTON,12,3.396617,0.91328,_c,_m,898423.0,12.0,...,CARNIVAL,3.0,3252.321816,SOUTHAMPTON,SOUTHAMPTON,1.0,32.37774,SO15 1ST,SO15 1ST,0


In [20]:
results[results['match_weight'].isnull()]

Unnamed: 0,Postcode,CompanyName,Location,unique_id,match_weight,match_probability,source_dataset_l,source_dataset_r,unique_id_l,unique_id_r,...,CompanyName_r,gamma_CompanyName,bf_CompanyName,Location_l,Location_r,gamma_Location,bf_Location,Postcode_l,Postcode_r,match_key
0,W1T 2NS,19 LONDON - AS 19 YACHT CREW,LONDON,0,,,,,,,...,,,,,,,,,,
7,AB21 0BH,ARAMARK,ABERDEEN,6,,,,,,,...,,,,,,,,,,
8,PO15 5TU,ARCHER RESOURCING,FAREHAM,7,,,,,,,...,,,,,,,,,,
11,CO3 8PH,C POWER ENERGY,COLCHESTER,10,,,,,,,...,,,,,,,,,,
14,SO14 3JZ,CB MEDIA (YOTSPOT),SOUTHAMPTON,13,,,,,,,...,,,,,,,,,,
16,AB21 7GQ,CLAN PARTNERS (INCORPORATING GLOBAL RESOURCE M...,ABERDEEN,15,,,,,,,...,,,,,,,,,,
18,HU10 7WG,CP MARINE,KINGSTON UPON HULL,17,,,,,,,...,,,,,,,,,,
19,BS31 1TP,CREW AND CONCIERGE,BRISTOL,18,,,,,,,...,,,,,,,,,,
20,M50 2EQ,CREW BOARD,SALFORD,19,,,,,,,...,,,,,,,,,,
22,HU9 1TY,CROWN CREWING (UK),KINGSTON UPON HULL,21,,,,,,,...,,,,,,,,,,


In [24]:
#linker.save_settings_to_json("Chapter6nopost.json")

{'link_type': 'link_only',
 'blocking_rules_to_generate_predictions': ['l.Postcode = r.Postcode',
  'l.CompanyName = r.CompanyName'],
 'comparisons': [{'output_column_name': 'CompanyName',
   'comparison_levels': [{'sql_condition': '"CompanyName_l" IS NULL OR "CompanyName_r" IS NULL',
     'label_for_charts': 'Null',
     'is_null_level': True},
    {'sql_condition': '"CompanyName_l" = "CompanyName_r"',
     'label_for_charts': 'Exact match',
     'm_probability': 0.0003093648667639807,
     'u_probability': 1.6613216411731322e-07},
    {'sql_condition': 'jaro_winkler_similarity("CompanyName_l", "CompanyName_r") >= 0.9',
     'label_for_charts': 'Jaro_winkler_similarity >= 0.9',
     'm_probability': 0.0001450779737996377,
     'u_probability': 1.993585969407759e-06},
    {'sql_condition': 'jaro_winkler_similarity("CompanyName_l", "CompanyName_r") >= 0.8',
     'label_for_charts': 'Jaro_winkler_similarity >= 0.8',
     'm_probability': 0.0004991882512688222,
     'u_probability': 0.000

# Step 4 - Match Classification

In [13]:
linker.missingness_chart()

In [16]:
linker.m_u_parameters_chart()

In [15]:
linker.unlinkables_chart()

In [17]:
linker.match_weights_chart()

In [21]:
# Show waterfall charts for not exact matches

linker.waterfall_chart(df_splink.to_dict(orient="records"))

In [22]:
record = {'unique_id': 1,
 'Postcode': "BH15 4QE",
 'CompanyName': "VANTAGE YACHT RECRUITMENT",
 'Location': "POOLE"
}

df_inc = linker.find_matches_to_new_records([record], blocking_rules=[]).as_pandas_dataframe()
df_inc.sort_values("match_weight", ascending=False)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,CompanyName_l,CompanyName_r,gamma_CompanyName,bf_CompanyName,Location_l,Location_r,gamma_Location,bf_Location,Postcode_l,Postcode_r
0,3.396617,0.91328,93,1,VANTAGE YACHT RECRUITMENT,VANTAGE YACHT RECRUITMENT,3,3252.321816,POOLE,POOLE,1,32.37774,BH15 4QE,BH15 4QE


In [23]:
linker.waterfall_chart(df_inc.to_dict(orient="records"))

In [57]:
import jellyfish as jf

In [74]:
jf.jaro_winkler_similarity('VIKING MARITIME (INCLUDING VIKING CREW','VIKING MARITIME')

0.8789473684210527

In [None]:
# postcode match
# 0.8 jaro
