### Madien Speech Example

Matches UK House of Commons MPs Madien Speeches to UK Members of Parliament List

In [1]:
import splink
import pandas as pd

In [2]:
import probablepeople as pp

# Extract firstname, removing Honorific Prefix (eg Dr)

def clean_firstname(raw_name):
    first_name=raw_name
    if(type(raw_name)==str):
        for parsed_value, parsed_type in pp.parse(raw_name):
            if parsed_type == 'GivenName':
                first_name = parsed_value
    return first_name

In [3]:
# Download table of MP madien speeches from 2015
# Spreadsheet is from https://commonslibrary.parliament.uk/research-briefings/sn04588/
# Direct download into notebook disabled by cloudflare protection

df=pd.read_excel("SN04588.xlsx", skiprows=1, header=1, sheet_name="2015 - Present")

In [4]:
# Cleanse and Standardise Madien Speech table

df['Firstname'] = df.apply(lambda row: clean_firstname(row['First Name']), axis=1)
df['Surname']=df['Surname'].replace('-', ' ', regex=True)
df['Party'] = df['Party'].map({'Lab': 'Labour', 'Con': 'Conservative', 'LD': 'Liberal Democrat'})
df = df[['Firstname','Surname','Party','Constituency']]

#Strip spaces at start, end and convert to uppercase
df[df.columns] = df.apply(lambda x: x.str.strip())
df = df.apply(lambda x: x.astype(str).str.upper())

#Add unique_id column
df['unique_id'] = df.index

In [5]:
# Download all current Members of Parliament

df_mp=pd.read_csv("https://www.theyworkforyou.com/mps/?f=csv", header=0)

In [6]:
# Cleanse MP table

df_mp['Firstname'] = df_mp.apply(lambda row: clean_firstname(row['First name']), axis=1)
df_mp['Surname']=df_mp['Last name'].replace('-', ' ', regex=True)
df_mp = df_mp[['Firstname','Surname','Party','Constituency']]

#Convert to uppercase
df_mp = df_mp.apply(lambda x: x.astype(str).str.upper())

#Add unique_id column
df_mp['unique_id'] = df_mp.index

In [7]:
#Perfect matches on Constituency, Party, Firstname and Surname

df_result = df.merge(df_mp, left_on=['Constituency','Firstname','Surname','Party'], right_on=['Constituency','Firstname','Surname','Party'],
          suffixes=('_l', '_r'))
len(df_result)

253

In [8]:
#Perfect matches on Constituency, Firstname and Surname

df_result = df.merge(df_mp, left_on=['Constituency','Firstname','Surname'], right_on=['Constituency','Firstname','Surname'],
          suffixes=('_l', '_r'))
len(df_result)

324

In [9]:
#Simple merge on Consitutency

df_result = df.merge(df_mp, left_on=['Constituency'], right_on=['Constituency'],
          suffixes=('_l', '_r'))
len(df_result)

408

In [10]:
# Firstname matches but Surname doesn't match, maybe marriage

df_namemismatch = df_result[(df_result['Surname_l'] != df_result['Surname_r']) & (df_result['Firstname_l'] == df_result['Firstname_r'])]
df_namemismatch = df_namemismatch[['Firstname_l','Surname_l','Surname_r']]
df_namemismatch

Unnamed: 0,Firstname_l,Surname_l,Surname_r
8,ANUM,QAISAR JAVED,QAISAR
58,KATE,GRIFFITHS,KNIVETON
157,JAMES,FRITH,DALY
212,JULIA,DOCKERILL,LOPEZ
388,SUELLA,FERNANDES,BRAVERMAN


In [11]:
# Surname matches but Firstname doesn't match

df_namemismatch = df_result[(df_result['Surname_l'] == df_result['Surname_r']) & (df_result['Firstname_l'] != df_result['Firstname_r'])]
df_namemismatch = df_namemismatch[['Firstname_l','Firstname_r','Surname_l']]
df_namemismatch

Unnamed: 0,Firstname_l,Firstname_r,Surname_l
31,NICK,NICHOLAS,FLETCHER
138,FREYAL,FERYAL,CLARK
211,ANNALIESE,ANNELIESE,DODDS
291,NAZ,NASEEM,SHAH
320,TOM,THOMAS,TUGENDHAT
376,THANGHAM,THANGAM,DEBBONAIRE


In [12]:
import splink

In [13]:
# Splink settings to block on constituency and match Name and Party

from splink.duckdb.duckdb_linker import DuckDBLinker
from splink.duckdb import duckdb_comparison_library as cl
settings = {
    "link_type": "link_only",
    "blocking_rules_to_generate_predictions": [
        "l.Constituency = r.Constituency"
    ],
    "comparisons": [
        cl.jaccard_at_thresholds("Firstname",distance_threshold_or_thresholds=[0.9]),
        cl.exact_match("Surname"),
        cl.exact_match("Party"),
    ],
    "retain_intermediate_calculation_columns": True,
}

In [14]:
linker = DuckDBLinker([df, df_mp], settings, input_table_aliases=["df_l", "df_r"])
linker.estimate_u_using_random_sampling(target_rows=1e5)

----- Estimating u probabilities using random sampling -----

Estimated u probabilities using random sampling

Your model is not yet fully trained. Missing estimates for:
    - Firstname (no m values are trained).
    - Surname (no m values are trained).
    - Party (no m values are trained).


In [15]:
linker.estimate_parameters_using_expectation_maximisation("l.Constituency = r.Constituency")
linker.estimate_parameters_using_expectation_maximisation("l.Surname = r.Surname and l.Firstname = r.Firstname")


----- Starting EM training session -----

Estimating the m probabilities of the model by blocking on:
l.Constituency = r.Constituency

Parameter estimates will be made for the following comparison(s):
    - Firstname
    - Surname
    - Party

Parameter estimates cannot be made for the following comparison(s) since they are used in the blocking rules: 

Iteration 1: Largest change in params was 0.674 in probability_two_random_records_match
Iteration 2: Largest change in params was 0.129 in probability_two_random_records_match
Iteration 3: Largest change in params was 0.00806 in probability_two_random_records_match
Iteration 4: Largest change in params was -0.00725 in the m_probability of Surname, level `Exact match`
Iteration 5: Largest change in params was 0.00219 in the m_probability of Surname, level `All other comparisons`
Iteration 6: Largest change in params was -0.000275 in the m_probability of Surname, level `Exact match`
Iteration 7: Largest change in params was 2.97e-05 in t

<EMTrainingSession, blocking on l.Surname = r.Surname and l.Firstname = r.Firstname, deactivating comparisons Firstname, Surname>

In [16]:
linker.match_weights_chart()

In [17]:
# Number of matches 

df_splink = linker.predict(threshold_match_probability = 0.1).as_pandas_dataframe()
len(df_splink)

327

In [18]:
# Select not exact matches 

df_extra = df_splink[(df_splink['Firstname_l'] != df_splink['Firstname_r'])]
df_extra

Unnamed: 0,match_weight,match_probability,source_dataset_l,unique_id_l,source_dataset_r,unique_id_r,Firstname_l,Firstname_r,gamma_Firstname,bf_Firstname,Surname_l,Surname_r,gamma_Surname,bf_Surname,Party_l,Party_r,gamma_Party,bf_Party,Constituency_l,Constituency_r
42,1.903442,0.789076,df_l,108,df_r,107,FREYAL,FERYAL,1,28.408382,CLARK,CLARK,1,498.089929,LABOUR,LABOUR,1,2.643597,ENFIELD NORTH,ENFIELD NORTH
59,1.903442,0.789076,df_l,382,df_r,151,THANGHAM,THANGAM,1,28.408382,DEBBONAIRE,DEBBONAIRE,1,498.089929,LABOUR,LABOUR,1,2.643597,BRISTOL WEST,BRISTOL WEST
64,-2.090691,0.190131,df_l,181,df_r,159,ANNALIESE,ANNELIESE,1,28.408382,DODDS,DODDS,1,498.089929,NAN,LABOUR/CO-OPERATIVE,0,0.165898,OXFORD EAST,OXFORD EAST


In [19]:
# Show waterfall charts for not exact matches

linker.waterfall_chart(df_extra.to_dict(orient="records"))

In [20]:
import recordlinkage

# Number of potential links

indexer = recordlinkage.Index()
indexer.block("Constituency")
candidate_links = indexer.index(df, df_mp)
len(candidate_links)

408

In [21]:
# Number of exact matches

compare_cl = recordlinkage.Compare()
compare_cl.string("Firstname", "Firstname", method='jarowinkler',threshold=0.8, label="FirstnameJaro")
compare_cl.exact("Firstname", "Firstname", label="FirstnameExact")
compare_cl.exact("Surname", "Surname", label="Surname")
compare_cl.exact("Party", "Party", label="Party")
features = compare_cl.compute(candidate_links, df, df_mp)
len(features[(features['FirstnameExact']==1) & (features['Surname']==1)])

324

In [22]:
# Numbeer of jarowinkler matches

len(features[(features['FirstnameJaro']==1) & (features['Surname']==1)])

328

In [23]:
# Features match table for non exact matches

features[(features['FirstnameExact']!=1) & (features['FirstnameJaro']==1) & (features['Surname']==1)]

Unnamed: 0,Unnamed: 1,FirstnameJaro,FirstnameExact,Surname,Party
108,107,1.0,0,1,1
181,159,1.0,0,1,0
312,602,1.0,0,1,1
382,151,1.0,0,1,1


In [24]:
df.iloc[312]

Firstname                         TOM
Surname                     TUGENDHAT
Party                    CONSERVATIVE
Constituency    TONBRIDGE AND MALLING
unique_id                         312
Name: 312, dtype: object

In [25]:
df_mp.iloc[602]

Firstname                      THOMAS
Surname                     TUGENDHAT
Party                    CONSERVATIVE
Constituency    TONBRIDGE AND MALLING
unique_id                         602
Name: 602, dtype: object