### Climate Match matching entities to basic companies house records

In [1]:
import tabula
import pandas as pd
import numpy as np

In [2]:
url = "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/attachment_data/file/310599/legal_entities.pdf"

In [3]:
# Read pdf and extract GB Companies

df = tabula.read_pdf(url, pages='all', multiple_tables = False, encoding='cp1252')
cldf = df[0][df[0]['COUNTRY']=='GB']

In [4]:
# Function to remove stopwards from companies

def strip_stopwords(raw_name):    
    company_stopwords = { 'LIMITED', 'LTD', 'SERVICES', 'COMPANY', 'GROUP', 'PROPERTIES', 'CONSULTING', 
        'HOLDINGS', 'UK', 'TRADING', 'LTD.' }
    return(' '.join([raw_name_part for raw_name_part in raw_name.split() if raw_name_part not in company_stopwords]))

In [5]:
# Rename columns and strip stopwords
# Add unique id column Splink needs

# ToFix: SettingWithCopy Warning

cldf['Postcode'] = cldf['POSTCODE']
cldf['Location'] = cldf['CITY'].str.upper()
cldf['CompanyName'] = cldf['LEGAL ENTITY'].str.upper()
cldf['CompanyName'] = cldf.apply(lambda row: strip_stopwords(row['CompanyName']), axis=1)
cldf['unique_id'] = cldf.index

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pand

In [6]:
# Remove unwanted columns
# Number of climate companies to match

cldf = cldf[['Postcode','CompanyName','Location','unique_id']]
len(cldf)

334

In [7]:
# Read basic companies house data as prepared by Download Data notebook

cdf = pd.read_csv('basic_slim.csv')
cdf = cdf.rename(columns={"RegAddress.PostCode": "Postcode", 'RegAddress.PostTown': 'Location'})
cdf['CompanyName'] = cdf.apply(lambda row: strip_stopwords(row['CompanyName']), axis=1)
cdf['CompanyName'].replace('', np.nan, inplace=True)
cdf['unique_id'] = cdf.index

In [8]:
# Remove unwanted columns

cdf = cdf[['Postcode','CompanyName','Location','unique_id']]

In [9]:
# Number of exact matches

exact = cldf.merge(cdf,left_on=['Postcode','CompanyName'], right_on=['Postcode','CompanyName'],
          suffixes=('_left', '_right'))
len(exact)

56

In [10]:
import recordlinkage

In [11]:
indexer = recordlinkage.Index()
indexer.block("Postcode")
candidate_links = indexer.index(cdf, cldf)
len(candidate_links)

101605

In [12]:
compare_cl = recordlinkage.Compare()
compare_cl.string("CompanyName", "CompanyName", method='jarowinkler',threshold=0.85)
compare_cl.exact("Postcode","Postcode")
features = compare_cl.compute(candidate_links, cdf, cldf)

# Name exact or approx match and postcode matches
matches = features[(features[0]==1) & (features[1]==1)]
len(matches)

659

In [13]:
# Set index names to allow join 
matches.index.names = ['cdf','cldf']
cdf.index.names= ['cdf']
cldf.index.names= ['cldf']

# Lookup both names
matches = matches.join(cdf, how='inner')
matches = matches.join(cldf, how='inner', rsuffix='_cldf')

# Select those with only approx match not exact match
approx = matches[matches['CompanyName']!=matches['CompanyName_cldf']]
len(approx)

603

In [14]:
# List of those companies matched at least once
found = matches.index.unique(level='cldf')
len(found)

89

In [15]:
# Select those companies not found 
notfound = cldf.loc[(cldf.index.isin(found, level='cldf') == False)]
len(notfound)

245

In [16]:
import splink

In [17]:
from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": [
        "l.Postcode = r.Postcode",
    ],
    "comparisons": [
        cl.jaro_winkler_at_thresholds("CompanyName",distance_threshold_or_thresholds=[0.9]),
    ],    
    "retain_intermediate_calculation_columns" : True,
    "retain_matching_columns" : True
}

In [18]:
linker = DuckDBLinker([cldf, cdf], settings, input_table_aliases=["cldf", "cdf"])
linker.estimate_u_using_random_sampling(target_rows=5e7)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - CompanyName (no m values are trained).


In [19]:
linker.estimate_parameters_using_expectation_maximisation("l.Postcode = r.Postcode")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.Postcode = r.Postcode

Parameter estimates will be made for the following comparison(s):
    - CompanyName

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was 0.525 in the m_probability of CompanyName, level `jaro_winkler_similarity >= 0.9`
Iteration 2: Largest change in params was 0.247 in the m_probability of CompanyName, level `jaro_winkler_similarity >= 0.9`
Iteration 3: Largest change in params was 0.000918 in the m_probability of CompanyName, level `jaro_winkler_similarity >= 0.9`
Iteration 4: Largest change in params was 2.36e-06 in the m_probability of CompanyName, level `jaro_winkler_similarity >= 0.9`

EM converged after 4 iterations

Your model is fully trained. All comparisons have at least one estimate for their m and u values


<EMTrainingSession, blocking on l.Postcode = r.Postcode, deactivating comparisons >

In [20]:
df_splink = linker.predict(threshold_match_probability = 0.9).as_pandas_dataframe()
len(df_splink)

279

In [21]:
df_nonexact = df_splink[(df_splink['CompanyName_l']!=df_splink['CompanyName_r'])]

In [22]:
df_nonexact

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,CompanyName_l,CompanyName_r,gamma_CompanyName,bf_CompanyName,Postcode_l,Postcode_r
1,3.829464,0.934281,cdf,268002,cldf,28,ALLEN & OVERY (AUSTRALIA) LLP,ALLEN & OVERY LLP,1,142147.82345,E1 6AD,E1 6AD
2,3.829464,0.934281,cdf,268003,cldf,28,ALLEN & OVERY (BELGIUM) LLP,ALLEN & OVERY LLP,1,142147.82345,E1 6AD,E1 6AD
3,3.829464,0.934281,cdf,268005,cldf,28,ALLEN & OVERY (HOLDINGS),ALLEN & OVERY LLP,1,142147.82345,E1 6AD,E1 6AD
4,3.829464,0.934281,cdf,268007,cldf,28,ALLEN & OVERY (LONDON),ALLEN & OVERY LLP,1,142147.82345,E1 6AD,E1 6AD
5,3.829464,0.934281,cdf,268008,cldf,28,ALLEN & OVERY (SOUTH AFRICA) LLP,ALLEN & OVERY LLP,1,142147.82345,E1 6AD,E1 6AD
...,...,...,...,...,...,...,...,...,...,...,...,...
267,3.829464,0.934281,cdf,2605036,cldf,296,KNAUF,KNAUF GMBH,1,142147.82345,ME9 8SR,ME9 8SR
270,3.829464,0.934281,cdf,2658860,cldf,312,LAMB-WESTON/MEIJER,LAMB WESTON/MEIJER,1,142147.82345,PE13 2RN,PE13 2RN
273,3.829464,0.934281,cdf,4345855,cldf,457,STANDARD BANK LONDON,STANDARD BANK PLC,1,142147.82345,EC2V 7JE,EC2V 7JE
275,3.829464,0.934281,cdf,4378661,cldf,465,STOELZLE FLACONNAGE,STOLZLE FLACONNAGE,1,142147.82345,WF11 8AP,WF11 8AP


In [23]:
len(df_splink['CompanyName_r'].unique())

77