# Chapter 6 - Organisation Matching

## Step 1 - Data Acquisition

### Companies House Basic Data

In [None]:
import requests
import json
import zipfile
import io
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup

In [None]:
# UK Companies House Basic Company Data download page.
# The snapshot download cell scrapes this page for links to the zipped
# snapshot files.
# NOTE: `url` is rebound to a different page in the Maritime and Coastguard
# Agency section, so re-run this cell before re-running the snapshot download.

url="http://download.companieshouse.gov.uk/en_output.html"

In [None]:
# Download snapshots, read each zipped CSV into a dataframe, keep only the
# postcode and company name columns and combine into a single dataframe

from urllib.parse import urljoin

wanted_columns = ['RegAddress.PostCode', 'CompanyName']
snapshot_frames = []
with requests.Session() as req:
    r = req.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    # Resolve each snapshot link against the page address instead of slicing
    # a fixed number of characters off `url`.
    snapshots = [urljoin(url, item['href']) for item in soup.select(
        "a[href*='BasicCompanyData-']")]
    for snapshot in snapshots:
        # Reuse the open session for the (large) snapshot downloads and stop
        # on an HTTP error rather than handing an error page to zipfile.
        response = req.get(snapshot)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as zipsnapshot:
            member = zipsnapshot.namelist()[0]
            print(member)
            # Read the CSV straight from the archive (no extracted copy is
            # left on disk) and load only the two columns that are kept.
            with zipsnapshot.open(member) as csv_file:
                df_c = pd.read_csv(csv_file, dtype='unicode',
                                   usecols=wanted_columns)
        df_c = df_c[wanted_columns]
        snapshot_frames.append(df_c)

# Concatenate once at the end; concatenating inside the loop copies the
# accumulated rows again for every snapshot.
df_ct = (pd.concat(snapshot_frames, ignore_index=True)
         if snapshot_frames else pd.DataFrame())

### Maritime and Coastguard Agency

In [None]:
# Maritime and Coastguard Agency - List of approved recruitment and placement agencies
# (publication page; the following cell scrapes it for links ending in '.csv').
# NOTE: this rebinds `url`, which earlier held the Companies House download page.

url = "https://www.gov.uk/government/publications/recruitment-and-placement-agencies-approved-by-the-mca"

In [None]:
# Download the csv file and drop the erroneous last row with all nulls

with requests.Session() as req:
    r = req.get(url)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    targets = [item['href'] for item in soup.select("a[href$='.csv']")]
    # If the page links to more than one csv file, each download overwrites
    # df_m, so the last link on the page is the one that is kept.
    for target in targets:
        response = req.get(target)
        # Stop on an HTTP error instead of parsing an error page as CSV.
        response.raise_for_status()
        df_m = pd.read_csv(io.BytesIO(response.content))
        df_m = df_m.dropna(how='all')

### Saving to Local Storage

In [None]:
# Checkpoint of the raw downloads. The to_csv lines are commented out, so as
# written this cell only reloads files saved by an earlier run; uncomment them
# after a fresh download to rewrite the checkpoint files.
# Note that the combined Companies House download (df_ct) is reloaded as df_c.
#df_ct.to_csv('basic_raw.csv', index=False)
df_c = pd.read_csv('basic_raw.csv')

#df_m.to_csv('mari_raw.csv', index=False)
df_m = pd.read_csv('mari_raw.csv')

## Step 2 - Data Standardization

In [None]:
#%pip install matplotlib

In [None]:
import matplotlib.pyplot as plt

# Histogram of the number of comma-separated parts in each MCA address.
# Counted with vectorised string methods; rows with a missing address are
# left out rather than raising on the non-string value.
address_part_counts = (
    df_m['ADDRESS & CONTACT DETAILS'].str.split(',').str.len().dropna()
)
plt.hist(address_part_counts.tolist())

### Companies House Basic Data

In [None]:
# Function to remove stopwords from company names

# Built once at definition time instead of on every call; frozen because it
# is only used for membership tests.
COMPANY_STOPWORDS = frozenset({
    'LIMITED', 'LTD', 'SERVICES', 'COMPANY', 'GROUP', 'PROPERTIES', 'CONSULTING',
    'HOLDINGS', 'UK', 'TRADING', 'LTD.', 'PLC', 'LLP',
})


def strip_stopwords(raw_name):
    """Return *raw_name* with common company stopwords removed.

    The name is split on whitespace, every part that exactly matches an
    entry in COMPANY_STOPWORDS is dropped, and the remaining parts are
    re-joined with single spaces. Matching is case-sensitive and the
    stopwords are upper case, so callers should upper-case names first.

    Non-string values (for example None or NaN for a missing name) are
    returned unchanged so that missing data stays missing instead of
    raising.
    """
    if not isinstance(raw_name, str):
        return raw_name
    return ' '.join(
        part for part in raw_name.split() if part not in COMPANY_STOPWORDS
    )

In [None]:
# Strip company name and rename postcode column

# strip_stopwords only needs the CompanyName value, so map over that column
# directly instead of building a row object for every record with
# DataFrame.apply(axis=1).
df_c['CompanyName'] = df_c['CompanyName'].map(strip_stopwords)
df_c = df_c.rename(columns={"RegAddress.PostCode": "Postcode"})

In [None]:
# Keep only the two matching attributes and copy the row index into an
# explicit unique_id column

df_c = df_c.loc[:, ['Postcode', 'CompanyName']]
df_c = df_c.assign(unique_id=df_c.index)

### Maritime and Coastguard Agency

In [None]:
# Function to extract postcode using regular expression

import re

# Compiled once at definition time instead of on every call.
POSTCODE_PATTERN = re.compile(r'([A-Z]{1,2}[0-9][A-Z0-9]? [0-9][ABD-HJLNP-UW-Z]{2})')


def extract_postcode(address):
    """Return the first postcode found in *address*, or None.

    The pattern matches upper-case text only and requires exactly one space
    between the outward and inward parts. Non-string values (for example
    None or NaN for a missing address) yield None instead of raising.
    """
    if not isinstance(address, str):
        return None
    match = POSTCODE_PATTERN.search(address)
    return match.group() if match is not None else None

In [None]:
# Company name to uppercase, stored in a new CompanyName column (the original
# COMPANY column is left untouched). The stopword list used by strip_stopwords
# is upper case, so names must be upper-cased before stripping.

df_m['CompanyName'] = df_m['COMPANY'].str.upper()

In [None]:
# Strip company name and extract postcode

# Each function reads a single column, so map over that column directly
# instead of building a row object with DataFrame.apply(axis=1).
df_m['CompanyName'] = df_m['CompanyName'].map(strip_stopwords)
df_m['Postcode'] = df_m['ADDRESS & CONTACT DETAILS'].map(extract_postcode)

In [None]:
# Keep the two matching attributes, discard rows where either is missing,
# then copy the surviving row index into an explicit unique_id column

df_m = df_m.loc[:, ['Postcode', 'CompanyName']].dropna()
df_m = df_m.assign(unique_id=df_m.index)

df_m.shape[0]

### Saving to Local Storage

In [None]:
# Checkpoint of the standardised tables: each one is written to CSV and read
# straight back. The files are written with index=False, so after the reload
# the row index is the default one generated by read_csv and unique_id is an
# ordinary column carrying the ids assigned before saving.
df_c.to_csv('basic_clean.csv', index=False)
df_c = pd.read_csv('basic_clean.csv')

df_m.to_csv('mari_clean.csv', index=False)
df_m = pd.read_csv('mari_clean.csv')
# Row count of the reloaded MCA table
len(df_m)

## Step 3 - Record Blocking and Attribute Comparison

In [None]:
import splink

In [None]:
# Predict only on exact company name or postcode match

from splink.duckdb.linker import DuckDBLinker
from splink.duckdb import comparison_library as cl
settings = {
    # Link type passed to Splink; NOTE(review): "link_only" presumably links
    # records between the two input tables without deduplicating within
    # each - confirm against the Splink documentation.
    "link_type": "link_only",
    # Candidate pairs are generated only where at least one of these
    # equalities holds between the left (l) and right (r) record.
    "blocking_rules_to_generate_predictions": [
        "l.Postcode = r.Postcode",
        "l.CompanyName = r.CompanyName",
    ],
    # Single comparison: Jaro-Winkler on CompanyName with thresholds at 0.9
    # and 0.8.
    "comparisons": [
        cl.jaro_winkler_at_thresholds("CompanyName",[0.9,0.8]),
    ],
    "retain_intermediate_calculation_columns" : True,
    "retain_matching_columns" : True
}
# The two tables being linked: df_m under alias "_m" and df_c under alias "_c".
linker = DuckDBLinker([df_m, df_c], settings, input_table_aliases=["_m", "_c"])

In [None]:
# Chart the cumulative number of comparisons generated by the blocking rules
linker.cumulative_num_comparisons_from_blocking_rules_chart()

In [None]:
# Estimate the u values by random sampling, with max_pairs set to
# 1e7 (ten million, not one million)

linker.estimate_u_using_random_sampling(max_pairs=1e7)

In [None]:
# Estimate parameters with expectation maximisation, using the rule
# "l.Postcode = r.Postcode" (records sharing a postcode) for this training run
linker.estimate_parameters_using_expectation_maximisation("l.Postcode = r.Postcode")

In [None]:
# Load model settings from a JSON file; the commented-out line is the call
# that writes that file.
# NOTE(review): loading presumably replaces the parameters estimated in the
# preceding cells - confirm against the Splink documentation.
#linker.save_model_to_json("Chapter6_Splink_Settings.json", overwrite=True)
linker.load_settings("Chapter6_Splink_Settings.json")

In [None]:
# Display Splink's match weights chart for the linker's current model
linker.match_weights_chart()

In [None]:
# Display Splink's m and u parameters chart for the linker's current model
linker.m_u_parameters_chart()

## Step 4 - Match Classification

In [None]:
# Calculate predictions with a match probability threshold of 0.1 and
# convert the result to a pandas dataframe

df_pred = linker.predict(threshold_match_probability=0.1).as_pandas_dataframe()
# Number of predicted pairs returned
len(df_pred)

In [None]:
# Number of distinct right-hand company names among all predicted pairs
# (a missing name counts as one value)
df_pred['CompanyName_r'].nunique(dropna=False)

In [None]:
# Predicted pairs that agree exactly on both company name and postcode
postname = df_pred.loc[
    df_pred['CompanyName_l'].eq(df_pred['CompanyName_r'])
    & df_pred['Postcode_l'].eq(df_pred['Postcode_r'])
]
postname.shape[0]

In [None]:
# Number of distinct right-hand company names among the exact name-and-postcode pairs
postname['CompanyName_r'].nunique(dropna=False)

In [None]:
# Predicted pairs whose company names are not exactly equal
notname = df_pred.loc[df_pred['CompanyName_l'].ne(df_pred['CompanyName_r'])]
notname.shape[0]

In [None]:
# Number of distinct right-hand company names among the pairs with differing names
notname['CompanyName_r'].nunique(dropna=False)

In [None]:
# Predicted pairs whose postcodes are not exactly equal
notpost = df_pred.loc[df_pred['Postcode_l'].ne(df_pred['Postcode_r'])]
notpost.shape[0]

In [None]:
# Number of distinct right-hand company names among the pairs with differing postcodes
notpost['CompanyName_r'].nunique(dropna=False)

In [None]:
# Left-join the predictions onto the MCA table by id; MCA rows without any
# predicted pair come through with a null match_weight and are shown below
results = pd.merge(
    df_m,
    df_pred,
    how='left',
    left_on='unique_id',
    right_on='unique_id_r',
    suffixes=('_m', '_p'),
)
results.loc[results['match_weight'].isna()]

In [None]:
# Waterfall chart for the predictions; every row of df_pred is passed in,
# converted to a list of per-row dicts
linker.waterfall_chart(df_pred.to_dict(orient="records"))

## Step 5 - Resolve New Entities

In [None]:
# A single new record with the same three fields as the standardised tables
record = {'unique_id': 1,
 'Postcode': "BH15 4QE",
 'CompanyName': "VANTAGE YACHT RECRUITMENT",
}

# Find matches for the new record using the existing linker.
# NOTE(review): blocking_rules=[] passes an empty rule list - presumably so
# the record is compared without any blocking restriction; confirm against
# the Splink documentation.
df_new = linker.find_matches_to_new_records([record], blocking_rules=[]).as_pandas_dataframe()
# Show the candidate matches, highest match weight first
df_new.sort_values("match_weight", ascending=False)