# Chapter 5 - Matching At Scale

## Step 1 - Data Acquisition

### Wikipedia

In [1]:
import io
import json
import zipfile
from urllib.parse import urljoin

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup

In [2]:
# Get the Wikipedia webpage and find all tables.

url = "https://en.wikipedia.org/wiki/List_of_MPs_elected_in_the_2019_United_Kingdom_general_election"

# Fail fast on a stalled connection or an HTTP error status instead of
# silently parsing an error page as if it were the article.
response = requests.get(url, timeout=30)
response.raise_for_status()

website_page = response.text
soup = BeautifulSoup(website_page, 'html.parser')
tables = soup.find_all('table')

In [3]:
# Find the table with members returned, extract rows as a list of lists and load into dataframe

for table in tables:
    if 'Member returned' not in table.text:
        continue
    # Only the first five <th> texts are used as column names; 'Wikilink' is
    # the extra column this cell derives for every row.
    headers = [header.text.strip() for header in table.find_all('th')][:5]
    headers.append('Wikilink')
    dfrows = []
    for row in table.find_all('tr'):
        dfrow = []
        # Reset per row: a row without a link must not inherit the link found
        # on an earlier row (that would later fetch someone else's birthday).
        wikilink = None
        for element in row.find_all('td'):
            if element.text != '\n':
                dfrow.append(element.text)
            if element.select("a[class='image']"):
                # When a cell holds several titled links, the last one wins.
                for link in element.select("a[title]"):
                    wikilink = f"https://en.wikipedia.org{link['href']}"
        dfrow.append(wikilink)
        dfrows.append(dfrow)

# Rows with a missing value in any column (including rows for which no link
# was found) are removed by dropna().
df_w = pd.DataFrame(dfrows, columns=headers)
df_w = df_w.dropna()

In [4]:
def get_bday(url):
    """Fetch a web page and return the text of its birthday element.

    The page at ``url`` is downloaded and parsed, and the first ``<span>``
    whose ``class`` attribute is exactly ``bday`` is looked up.

    Args:
        url: Address of the page to download.

    Returns:
        The text of the matching element, or an empty string when the page
        contains no such element.
    """
    page_html = requests.get(url).text
    parsed_page = BeautifulSoup(page_html, 'html.parser')
    bday_span = parsed_page.select_one("span[class='bday']")
    return bday_span.text if bday_span is not None else ''

In [5]:
# Look up every MP's birthday from the page in their Wikilink column.
# get_bday downloads one page per row, so this cell is slow on the full table.
# Mapping over the single column avoids building a row object per MP and is
# well defined even when df_w has no rows.
df_w['Birthday'] = df_w['Wikilink'].map(get_bday)

In [6]:
# Preview the first five scraped rows.
df_w.head(5)

Unnamed: 0,Constituency,Party of incumbentbefore election,Member returned,Party of incumbentafter election,Notes,Wikilink,Birthday
1,Aberavon\n,Labour\n,Stephen Kinnock\n,Labour\n,Seat held\n,https://en.wikipedia.org/wiki/Stephen_Kinnock,1970-01-01
2,Aberconwy\n,Conservative\n,Robin Millar\n,Conservative\n,"Previous incumbent, Guto Bebb, did not stand\n",https://en.wikipedia.org/wiki/Robin_Millar_(po...,1968-10-15
3,Aberdeen North\n,Scottish National\n,Kirsty Blackman\n,Scottish National\n,Seat held\n,https://en.wikipedia.org/wiki/Kirsty_Blackman,1986-03-20
4,Aberdeen South\n,Conservative\n,Stephen Flynn\n,Scottish National\n,"Previous incumbent, Ross Thomson, did not stand\n",https://en.wikipedia.org/wiki/Stephen_Flynn_(S...,1988-10-13
5,Airdrie and Shotts\n,Scottish National\n,Neil Gray\n,Scottish National\n,Seat held\n,https://en.wikipedia.org/wiki/Neil_Gray,1986-03-16


### Companies House Persons of Significant Control

In [7]:
# UK Companies House Persons with Significant Control Download Page
# NOTE: the snapshot download cell derives the address of every snapshot file
# from this value, so it must stay pointing at the page that lists the
# snapshot links.

url = "http://download.companieshouse.gov.uk/en_pscdata.html"

In [8]:
# Download snapshots, convert json to dataframe, remove unwanted columns and append to a single dataframe
# Ignore last file 23of23

psc_columns = [
    'company_number',
    'data.name_elements.surname',
    'data.name_elements.forename',
    'data.date_of_birth.month',
    'data.date_of_birth.year',
]

psc_frames = []
with requests.Session() as req:
    r = req.get(url, timeout=60)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, 'html.parser')
    # Resolve each link against the page address rather than slicing a fixed
    # number of characters off `url`; this also copes with absolute hrefs.
    snapshots = [urljoin(url, item['href'])
                 for item in soup.select("a[href*='psc-snapshot']")]
    for snapshot in snapshots:
        # Reuse the session for the downloads as well as the index page.
        response = req.get(snapshot, timeout=60)
        response.raise_for_status()
        with zipfile.ZipFile(io.BytesIO(response.content)) as zipsnapshot:
            member = zipsnapshot.namelist()[0]
            if member.endswith("23of23.txt"):
                continue
            # The first archive member is extracted into the working directory.
            extracted_path = zipsnapshot.extract(member)
        print(member)
        # Each line of the extracted file is parsed as one JSON document.
        with open(extracted_path, encoding="utf8") as snapshot_lines:
            records = [json.loads(line) for line in snapshot_lines]
        df_psc = pd.json_normalize(records)
        # Keep only records that carry a forename.
        df_psc = df_psc.dropna(subset=['data.name_elements.forename'])
        psc_frames.append(df_psc[psc_columns])

# Concatenate once at the end instead of re-copying the running total on
# every iteration; fall back to an empty frame when nothing was downloaded.
df_psctotal = pd.concat(psc_frames, ignore_index=True) if psc_frames else pd.DataFrame()

psc-snapshot-2023-04-12_1of23.txt
psc-snapshot-2023-04-12_2of23.txt
psc-snapshot-2023-04-12_3of23.txt
psc-snapshot-2023-04-12_4of23.txt
psc-snapshot-2023-04-12_5of23.txt
psc-snapshot-2023-04-12_6of23.txt
psc-snapshot-2023-04-12_7of23.txt
psc-snapshot-2023-04-12_8of23.txt
psc-snapshot-2023-04-12_9of23.txt
psc-snapshot-2023-04-12_10of23.txt
psc-snapshot-2023-04-12_11of23.txt
psc-snapshot-2023-04-12_12of23.txt
psc-snapshot-2023-04-12_13of23.txt
psc-snapshot-2023-04-12_14of23.txt
psc-snapshot-2023-04-12_15of23.txt
psc-snapshot-2023-04-12_16of23.txt
psc-snapshot-2023-04-12_17of23.txt
psc-snapshot-2023-04-12_18of23.txt
psc-snapshot-2023-04-12_19of23.txt
psc-snapshot-2023-04-12_20of23.txt
psc-snapshot-2023-04-12_21of23.txt
psc-snapshot-2023-04-12_22of23.txt


### Saving to Local Storage

In [9]:
# The to_csv lines are commented out, so as written this cell only reloads
# previously saved raw extracts (presumably to avoid repeating the downloads;
# uncomment them after a fresh acquisition run).
#df_w.to_csv('mps_wiki_bday_raw.csv', index=False)
df_w = pd.read_csv('mps_wiki_bday_raw.csv')
#df_psctotal.to_csv('psc_raw.csv')
# Company numbers are identifiers, not quantities: values such as 04569484 and
# SC042883 occur, so read them as text to keep leading zeros and avoid a
# mixed-type column.
df_psc = pd.read_csv(
    'psc_raw.csv',
    dtype={
        'company_number': 'string',
        'data.name_elements.surname': 'string',
        'data.name_elements.forename': 'string',
    },
)

## Step 2 - Data Standardization

### Wikipedia

In [10]:
# Discard rows with any missing value, then derive integer birth Year and
# Month columns from a single parse of the Birthday column.
df_w = df_w.dropna()
birthdays = pd.to_datetime(df_w['Birthday'])
df_w = df_w.assign(
    Year=birthdays.dt.year.astype('int64'),
    Month=birthdays.dt.month.astype('int64'),
)

In [11]:
# 'Member returned' holds the MP's full name. The scraped cell text carries
# newline characters, which are stripped from both ends before splitting.
df_w = df_w.rename(columns={'Member returned': 'Fullname'})
fullname = df_w['Fullname'].str.strip("\n")
df_w = df_w.assign(
    Fullname=fullname,
    # First whitespace-separated token is the first name ...
    Firstname=fullname.str.split().str[0],
    # ... and every remaining token, re-joined with spaces, is the last name.
    Lastname=fullname.astype(str).map(lambda name: ' '.join(name.split()[1:])),
)

In [12]:
# Add a row identifier taken from the current index and an empty
# company_number placeholder, then keep only the columns listed below.
df_w = df_w.assign(unique_id=df_w.index, company_number=np.nan)
df_w = df_w.loc[:, ['Firstname', 'Lastname', 'Month', 'Year', 'unique_id', 'company_number']]

In [13]:
# Preview the first five standardized rows.
df_w.head(5)

Unnamed: 0,Firstname,Lastname,Month,Year,unique_id,company_number
0,Stephen,Kinnock,1,1970,0,
1,Robin,Millar,10,1968,1,
2,Kirsty,Blackman,3,1986,2,
3,Stephen,Flynn,10,1988,3,
4,Neil,Gray,3,1986,4,


### Companies House Persons of Significant Control

In [14]:
# Rename and convert columns for matching

# Rows with any missing value are discarded first so the integer casts below
# operate on complete data.
df_psc = df_psc.dropna()

# Create unique index column needed by Splink from dataframe index
df_psc = df_psc.assign(
    Year=df_psc['data.date_of_birth.year'].astype('int64'),
    Month=df_psc['data.date_of_birth.month'].astype('int64'),
    Firstname=df_psc['data.name_elements.forename'],
    Lastname=df_psc['data.name_elements.surname'],
    unique_id=df_psc.index,
)

# Subset down to required columns
df_psc = df_psc.loc[:, ['Lastname', 'Firstname', 'company_number', 'Year', 'Month', 'unique_id']]

### Saving to Local Storage

In [2]:
# The to_csv lines are commented out, so as written this cell only reloads the
# previously saved standardized tables.
#df_w.to_csv('mps_wiki_bday_clean.csv', index=False)
df_w = pd.read_csv('mps_wiki_bday_clean.csv')
#df_psc.to_csv('psc_clean.csv', index=False)
# Company numbers are identifiers (e.g. 04569484, SC042883): force text so
# type inference can never strip leading zeros or produce a mixed-type column.
df_psc = pd.read_csv('psc_clean.csv', dtype={'company_number': str})

## Step 3 - Record Blocking and Attribute Comparison

In [3]:
# Number of PSC records available for matching.
df_psc.shape[0]

9956208

In [4]:
# Count of simple merge on matching Year and Month
# (every MP is paired with every PSC record sharing the same birth year and month).

blocking_keys = ['Year', 'Month']
df_mp = pd.merge(df_w, df_psc, on=blocking_keys, suffixes=('_w', '_psc'))
df_mp.shape[0]

10891131

In [5]:
# Calculate exact match using a simple join
# (names and birth year/month must all agree exactly).

exact_keys = ['Lastname', 'Firstname', 'Year', 'Month']
df_result = pd.merge(df_w, df_psc, on=exact_keys, suffixes=('_w', '_psc'))
df_result

Unnamed: 0,Firstname,Lastname,Month,Year,unique_id_w,company_number_w,company_number_psc,unique_id_psc
0,Robin,Millar,10,1968,1,,04569484,1231331
1,Robin,Millar,10,1968,1,,06975241,4255139
2,Leo,Docherty,10,1976,5,,08204196,621947
3,Wendy,Morton,11,1967,6,,02715837,3297130
4,Graham,Brady,5,1967,7,,07484717,6535629
...,...,...,...,...,...,...,...,...
257,Mark,Jenkinson,1,1982,638,,10791041,3537960
258,Mark,Jenkinson,1,1982,638,,12131090,5998989
259,Steven,Baker,6,1971,642,,07355501,234561
260,Mark,Garnier,2,1963,644,,13636383,8245158


In [6]:
# Splink settings to block on year and month matches and then compare First and Last names

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl

# Candidate pairs are only generated where birth year and month both agree.
prediction_blocking_rules = ["l.year = r.year and l.month = r.month"]

# Names are compared with Jaro-Winkler similarity at a single 0.9 threshold;
# Month and Year are compared by exact match, with term-frequency adjustments
# requested for Year only.
attribute_comparisons = [
    cl.jaro_winkler_at_thresholds("Firstname", [0.9]),
    cl.jaro_winkler_at_thresholds("Lastname", [0.9]),
    cl.exact_match("Month"),
    cl.exact_match("Year", term_frequency_adjustments=True),
]

settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": prediction_blocking_rules,
    "comparisons": attribute_comparisons,
}

In [7]:
# Setup linker and profile columns

# The aliases label the two input tables in the same order as the frames.
input_frames = [df_w, df_psc]
input_aliases = ["df_w", "df_psc"]
linker = DuckDBLinker(input_frames, settings, input_table_aliases=input_aliases)

profiled_columns = ["Firstname", "Lastname", "Month", "Year"]
linker.profile_columns(profiled_columns, top_n=10, bottom_n=5)

In [8]:
# Estimate u values
# Per this cell's own output, the u probabilities are estimated by random
# sampling and the m values for all four comparisons remain untrained until
# the expectation-maximisation cell is run.

linker.estimate_u_using_random_sampling(target_rows=1e7)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - Firstname (no m values are trained).
    - Lastname (no m values are trained).
    - Month (no m values are trained).
    - Year (no m values are trained).


In [9]:
# Calculate m values
# Two EM sessions with complementary blocking rules: per the training log, a
# session cannot estimate parameters for the comparisons used in its own
# blocking rule, so the first session trains Firstname and Year and the second
# deactivates Firstname and Year.

em_rule_lastname_month = "l.Lastname = r.Lastname and l.Month = r.Month"
em_rule_firstname_year = "l.Firstname = r.Firstname and  l.Year = r.Year"

linker.estimate_parameters_using_expectation_maximisation(em_rule_lastname_month)
linker.estimate_parameters_using_expectation_maximisation(em_rule_firstname_year)


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.Lastname = r.Lastname and l.Month = r.Month

Parameter estimates will be made for the following comparison(s):
    - Firstname
    - Year

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 
    - Lastname
    - Month

Iteration 1: Largest change in params was 0.041 in the m_probability of Year, level `Exact match`
Iteration 2: Largest change in params was 0.00503 in the m_probability of Year, level `Exact match`
Iteration 3: Largest change in params was 0.00472 in the m_probability of Firstname, level `Jaro_winkler_similarity >= 0.9`
Iteration 4: Largest change in params was 0.00591 in the m_probability of Firstname, level `Jaro_winkler_similarity >= 0.9`
Iteration 5: Largest change in params was -0.00532 in the m_probability of Firstname, level `Exact match`
Iteration 6: Largest change in params was -0.00387 in the m_proba

<EMTrainingSession, blocking on l.Firstname = r.Firstname and  l.Year = r.Year, deactivating comparisons Firstname, Year>

In [10]:
# Display the linker's match-weights chart for the trained model.
linker.match_weights_chart()

In [11]:
# Display the linker's chart of the estimated m and u parameters.
linker.m_u_parameters_chart()

## Step 4 - Match Classification

In [12]:
# Predict matches and convert to dataframe

# Only pairs scored at or above a 0.99 match probability are requested.
results = linker.predict(threshold_match_probability=0.99)

# NOTE(review): the join and rename below treat the `_l` side of every pair as
# the PSC record and the `_r` side as the Wikipedia record. That agrees with
# the displayed output, but confirm how Splink assigns the two sides before
# changing the table aliases or their order.
side_labels = {
    "Firstname_l": "Firstname_psc",
    "Lastname_l": "Lastname_psc",
    "Firstname_r": "Firstname_w",
    "Lastname_r": "Lastname_w",
}
report_columns = ['match_weight', 'match_probability', 'Firstname_psc', 'Firstname_w',
                  'Lastname_psc', 'Lastname_w', 'company_number']

pres = (
    results.as_pandas_dataframe()
    # Bring in company_number from the PSC table via the PSC record id.
    .merge(df_psc, left_on=['unique_id_l'], right_on=['unique_id'])
    .rename(columns=side_labels)
    .loc[:, report_columns]
)
pres

Unnamed: 0,match_weight,match_probability,Firstname_psc,Firstname_w,Lastname_psc,Lastname_w,company_number
0,12.558151,0.999834,Jamie,Jamie,Wallis,Wallis,08134008
1,12.638040,0.999843,Peter,Peter,Gibson,Gibson,06301682
2,14.787390,0.999965,Jack,Jack,Dromey,Dromey,10277215
3,13.799580,0.999930,Stephen,Stephen,Timms,Timms,01333367
4,12.972798,0.999876,Craig,Craig,Whittaker,Whittaker,10284374
...,...,...,...,...,...,...,...
337,12.419389,0.999817,James,James,Davies,Davies,12010734
338,14.302048,0.999950,David,David,Jones,Jones,12011911
339,11.719844,0.999704,Anthony,Anthony,Brown,Browne,12521856
340,12.972798,0.999876,Craig,Craig,Whittaker,Whittaker,14750574


In [13]:
# Select matches that aren't exact
# (either the last names or the first names differ between the two sources).

lastname_differs = pres['Lastname_psc'].ne(pres['Lastname_w'])
firstname_differs = pres['Firstname_psc'].ne(pres['Firstname_w'])
pres[lastname_differs | firstname_differs]

Unnamed: 0,match_weight,match_probability,Firstname_psc,Firstname_w,Lastname_psc,Lastname_w,company_number
5,11.668858,0.999693,Richard,Richard,Thomas,Thomson,10609747
6,13.514815,0.999915,John,John,Mcdonnell,McDonnell,05350064
7,11.100448,0.999545,James,Jamie,Stone,Stone,SC042883
8,9.983912,0.999013,Jeffrey,Jeff,Smith,Smith,09530115
15,9.983912,0.999013,Jeffrey,Jeff,Smith,Smith,13311041
...,...,...,...,...,...,...,...
319,11.917547,0.999742,Mark,Mark,Gardner,Garnier,06540656
322,8.767288,0.997710,Johan,John,Glennmo,Glen,12254404
327,9.681758,0.998784,Theodora,Theo,Clarke,Clarke,08140289
331,8.767288,0.997710,Johan,John,Glennmo,Glen,14106498


In [14]:
# Save all predicted matches to pres.csv, without the index column.
pres.to_csv('pres.csv', index=False)

In [15]:
# Keep the predicted matches whose last names or first names are not identical.
lastname_differs = pres['Lastname_psc'].ne(pres['Lastname_w'])
firstname_differs = pres['Firstname_psc'].ne(pres['Firstname_w'])
nonexact = pres[lastname_differs | firstname_differs]


In [16]:
# Save the non-exact matches to nonexact.csv, without the index column.
nonexact.to_csv('nonexact.csv', index=False)