# Chapter 10 - Clustering

In [1]:
import pandas as pd

## Simple Exact Match Clustering

In [2]:
# Column labels for the toy person records used throughout the clustering examples.
clmns = ['ID', 'Firstname', 'Lastname', 'Year']

# Six hand-made records: exact duplicates, a nickname variant, a different
# first name and a misspelt last name.
table = [
    [0, 'Michael', 'Shearer', 1970],
    [1, 'Michael', 'Shearer', 1970],
    [2, 'Mike', 'Shearer', 1970],
    [3, 'Michael', 'Shearer', 1971],
    [4, 'Michelle', 'Shearer', 1971],
    [5, 'Mike', 'Sheare', 1971],
]

df_ms = pd.DataFrame(data=table, columns=clmns)

In [3]:
# Records that share an identical (Firstname, Lastname) pair fall into the same
# group; ngroup() numbers those groups and the number becomes the cluster label.
name_groups = df_ms.groupby(['Firstname', 'Lastname'])
df_ms['cluster'] = name_groups.ngroup()
df_ms

Unnamed: 0,ID,Firstname,Lastname,Year,cluster
0,0,Michael,Shearer,1970,0
1,1,Michael,Shearer,1970,0
2,2,Mike,Shearer,1970,3
3,3,Michael,Shearer,1971,0
4,4,Michelle,Shearer,1971,1
5,5,Mike,Sheare,1971,2


## Approximate Match Clustering

In [4]:
import itertools

# Enumerate every unordered pair of records; column A holds the first record
# of the pair and column B the second, each still as a raw list.
record_pairs = list(itertools.combinations(table, 2))
df_combs = pd.DataFrame(record_pairs, columns=['A', 'B'])
df_combs

Unnamed: 0,A,B
0,"[0, Michael, Shearer, 1970]","[1, Michael, Shearer, 1970]"
1,"[0, Michael, Shearer, 1970]","[2, Mike, Shearer, 1970]"
2,"[0, Michael, Shearer, 1970]","[3, Michael, Shearer, 1971]"
3,"[0, Michael, Shearer, 1970]","[4, Michelle, Shearer, 1971]"
4,"[0, Michael, Shearer, 1970]","[5, Mike, Sheare, 1971]"
5,"[1, Michael, Shearer, 1970]","[2, Mike, Shearer, 1970]"
6,"[1, Michael, Shearer, 1970]","[3, Michael, Shearer, 1971]"
7,"[1, Michael, Shearer, 1970]","[4, Michelle, Shearer, 1971]"
8,"[1, Michael, Shearer, 1970]","[5, Mike, Sheare, 1971]"
9,"[2, Mike, Shearer, 1970]","[3, Michael, Shearer, 1971]"


In [5]:
# Two-level column labels: the top level names the side of the pair (A or B),
# the second level repeats the record attribute names.
clmnsA = pd.MultiIndex.from_arrays([['A'] * len(clmns), clmns])
clmnsB = pd.MultiIndex.from_arrays([['B'] * len(clmns), clmns])

# Expand the list held in each cell into one column per attribute, then place
# the A-side and B-side attribute frames next to each other.
side_a = pd.DataFrame(df_combs['A'].values.tolist(), columns=clmnsA)
side_b = pd.DataFrame(df_combs['B'].values.tolist(), columns=clmnsB)
df_edges = pd.concat([side_a, side_b], axis=1)
df_edges

Unnamed: 0_level_0,A,A,A,A,B,B,B,B
Unnamed: 0_level_1,ID,Firstname,Lastname,Year,ID,Firstname,Lastname,Year
0,0,Michael,Shearer,1970,1,Michael,Shearer,1970
1,0,Michael,Shearer,1970,2,Mike,Shearer,1970
2,0,Michael,Shearer,1970,3,Michael,Shearer,1971
3,0,Michael,Shearer,1970,4,Michelle,Shearer,1971
4,0,Michael,Shearer,1970,5,Mike,Sheare,1971
5,1,Michael,Shearer,1970,2,Mike,Shearer,1970
6,1,Michael,Shearer,1970,3,Michael,Shearer,1971
7,1,Michael,Shearer,1970,4,Michelle,Shearer,1971
8,1,Michael,Shearer,1970,5,Mike,Sheare,1971
9,2,Mike,Shearer,1970,3,Michael,Shearer,1971


In [6]:
#%pip install jellyfish
import jellyfish as jf

def is_match(row):
    """Decide whether the two records held in one pair row refer to the same person.

    The row is indexed first by side ('A' or 'B') and then by attribute name.
    A pair counts as a match only when the Jaro-Winkler similarity of both the
    first names and the last names is strictly greater than 0.9.

    Returns True when both names pass the threshold, otherwise False.
    """
    rec_a, rec_b = row['A'], row['B']
    # Both similarities are always evaluated, first name before last name.
    name_checks = [
        jf.jaro_winkler_similarity(rec_a[field], rec_b[field]) > 0.9
        for field in ('Firstname', 'Lastname')
    ]
    return all(name_checks)

# Evaluate the name-similarity rule once per candidate pair (axis=1 passes
# each row to is_match) and store the verdict alongside the pair.
match_flags = df_edges.apply(is_match, axis=1)
df_edges['Match'] = match_flags
df_edges

Unnamed: 0_level_0,A,A,A,A,B,B,B,B,Match
Unnamed: 0_level_1,ID,Firstname,Lastname,Year,ID,Firstname,Lastname,Year,Unnamed: 9_level_1
0,0,Michael,Shearer,1970,1,Michael,Shearer,1970,True
1,0,Michael,Shearer,1970,2,Mike,Shearer,1970,False
2,0,Michael,Shearer,1970,3,Michael,Shearer,1971,True
3,0,Michael,Shearer,1970,4,Michelle,Shearer,1971,True
4,0,Michael,Shearer,1970,5,Mike,Sheare,1971,False
5,1,Michael,Shearer,1970,2,Mike,Shearer,1970,False
6,1,Michael,Shearer,1970,3,Michael,Shearer,1971,True
7,1,Michael,Shearer,1970,4,Michelle,Shearer,1971,True
8,1,Michael,Shearer,1970,5,Mike,Sheare,1971,False
9,2,Mike,Shearer,1970,3,Michael,Shearer,1971,False


In [7]:
#%pip install networkx
import networkx as nx

# Build an undirected graph whose edges are the record pairs flagged as
# matches; every connected component of that graph is one cluster of records.
G = nx.from_pandas_edgelist(df_edges[df_edges['Match']], source=('A','ID'), target=('B','ID'))

# A record that matched nothing contributes no edge and would therefore be
# missing from the graph altogether. Register every record ID (first element
# of each row in `table`) so unmatched records show up as single-member
# clusters instead of silently disappearing from the result.
G.add_nodes_from(record[0] for record in table)

list(nx.connected_components(G))

[{0, 1, 3, 4}, {2, 5}]

# Sample Problem

## Step 1 - Data Acquisition

In [None]:
# Reload raw file from Chapter 5

# Name and nationality fields are read with the pandas 'string' dtype.
string_columns = [
    'data.name_elements.surname',
    'data.name_elements.forename',
    'data.name_elements.middle_name',
    'data.name_elements.title',
    'data.nationality',
]
df_psc = pd.read_csv('psc_raw.csv', dtype={column: 'string' for column in string_columns})

## Step 2 - Data Standardization

In [None]:
# Rows lacking a birth year or birth month are discarded; the remaining values
# are then stored as 64-bit integers in the new Year and Month columns.
df_psc = (
    df_psc
    .dropna(subset=['data.date_of_birth.year', 'data.date_of_birth.month'])
    .assign(
        Year=lambda frame: frame['data.date_of_birth.year'].astype('int64'),
        Month=lambda frame: frame['data.date_of_birth.month'].astype('int64'),
    )
)

In [None]:
# Map the nested raw field names onto the short attribute names used by the
# rest of the notebook.
short_names = {
    "data.name_elements.surname": "Lastname",
    "data.name_elements.forename": "Firstname",
    "data.name_elements.middle_name": "Middlename",
    "data.name_elements.title": "Title",
    "data.nationality": "Nationality",
}
df_psc = df_psc.rename(columns=short_names)

In [None]:
# Keep only the attributes needed for matching plus the company number, and
# expose the DataFrame index as an explicit unique_id column.
kept_columns = ['Lastname', 'Middlename', 'Firstname', 'company_number',
                'Year', 'Month', 'Title', 'Nationality']
df_psc = df_psc[kept_columns].assign(unique_id=lambda frame: frame.index)

## Step 3 - Record Blocking and Attribute Comparison

In [None]:
from splink.duckdb.linker import DuckDBLinker
from splink.duckdb import comparison_library as cl

# Rule that generates the candidate pairs scored at prediction time: both
# records must agree on birth year, birth month and last name.
prediction_blocking_rules = [
    "l.Year = r.Year and l.Month = r.Month and l.Lastname = r.Lastname"
]

# One comparison per attribute. First and middle names use Jaro-Winkler with a
# single 0.9 threshold; every other attribute is compared for exact equality,
# with term-frequency adjustments switched on for Year only.
attribute_comparisons = [
    cl.jaro_winkler_at_thresholds("Firstname", [0.9]),
    cl.jaro_winkler_at_thresholds("Middlename", [0.9]),
    cl.exact_match("Lastname"),
    cl.exact_match("Title"),
    cl.exact_match("Nationality"),
    cl.exact_match("Month"),
    cl.exact_match("Year", term_frequency_adjustments=True),
]

settings = dict(
    link_type="dedupe_only",  # a single dataset is deduplicated against itself
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    comparisons=attribute_comparisons,
    retain_matching_columns=True,
    retain_intermediate_calculation_columns=True,
    max_iterations=10,
    em_convergence=0.01,
    additional_columns_to_retain=["company_number"],
)

linker = DuckDBLinker(df_psc, settings)

### Data Analysis

In [None]:
# Profile the value distribution of every attribute used in the comparisons.
profiled_columns = ["Firstname","Middlename","Lastname","Title","Nationality","Month","Year"]
linker.profile_columns(profiled_columns, top_n=10, bottom_n=5)

### Test Blocking Rules

In [None]:
# Candidate rule: same last name, birth month, title and nationality.
blocking_rule = "l.Lastname = r.Lastname and l.Month = r.Month and l.Title = r.Title and l.Nationality = r.Nationality"
linker.count_num_comparisons_from_blocking_rule(blocking_rule)

In [None]:
# Candidate rule: same first name, birth year and middle name.
blocking_rule = "l.Firstname = r.Firstname and l.Year = r.Year and l.Middlename = r.Middlename"
linker.count_num_comparisons_from_blocking_rule(blocking_rule)

In [None]:
# Candidate rule: same last name and middle name (the same rule string is
# passed to the first EM training call later in the notebook).
blocking_rule = "l.Lastname = r.Lastname and l.Middlename = r.Middlename"
linker.count_num_comparisons_from_blocking_rule(blocking_rule)

In [None]:
# Candidate rule: same first name, birth month, birth year, title and
# nationality (the same rule string is passed to the second EM training call
# later in the notebook).
blocking_rule = "l.Firstname = r.Firstname and l.Month = r.Month and l.Year = r.Year and l.Title = r.Title and l.Nationality = r.Nationality"
linker.count_num_comparisons_from_blocking_rule(blocking_rule)

In [None]:
# Estimate the u probabilities from randomly sampled record pairs, capped at
# ten million pairs.
max_sampled_pairs = 1e7
linker.estimate_u_using_random_sampling(max_pairs=max_sampled_pairs)

In [None]:
# Two EM training passes, each run over the pairs produced by a different
# blocking rule. fix_u_probabilities=False is passed on both calls
# (presumably so the u values are re-estimated during EM — confirm against
# the splink documentation).
lastname_middlename_rule = "l.Lastname = r.Lastname and l.Middlename = r.Middlename"
firstname_dob_rule = "l.Firstname = r.Firstname and l.Month = r.Month and l.Year = r.Year and l.Title = r.Title and l.Nationality = r.Nationality"

linker.estimate_parameters_using_expectation_maximisation(lastname_middlename_rule, fix_u_probabilities=False)
linker.estimate_parameters_using_expectation_maximisation(firstname_dob_rule, fix_u_probabilities=False)

In [None]:
# Saving the model is disabled here; uncomment the next line to write the
# current model to Chapter10_Splink_Settings.json.
#linker.save_model_to_json("Chapter10_Splink_Settings.json", overwrite=True)
# Load settings from the JSON file instead.
# NOTE(review): this presumably replaces the parameters estimated in the cells
# above and needs the file to exist already — confirm that is the intended
# behaviour on a fresh Restart & Run All.
linker.load_settings("Chapter10_Splink_Settings.json")

## Step 4 - Match Classification and Clustering

In [None]:
# Score the candidate pairs, keeping only those whose match probability
# reaches the threshold.
prediction_threshold = 0.9
df_predict = linker.predict(threshold_match_probability=prediction_threshold)

In [None]:
# Group the pairwise predictions into clusters at the 0.9 match-probability
# threshold, then pull the result into pandas.
clusters = linker.cluster_pairwise_predictions_at_threshold(
    df_predict,
    threshold_match_probability=0.9,
)
df_clusters = clusters.as_pandas_dataframe()

In [None]:
# Peek at the first five clustered records.
df_clusters.head(5)

In [None]:
# Collapse each cluster to one row. For every listed attribute the cell holds
# the distinct values found in the cluster, as a list built from a set (so
# element order is not guaranteed). sort=False keeps clusters in order of
# first appearance.
cluster_attributes = ['unique_id', 'Firstname', 'Title', 'Nationality']
clusters_by_id = df_clusters.groupby(['cluster_id'], sort=False)
df_cgroup = (
    clusters_by_id[cluster_attributes]
    .agg(lambda values: list(set(values)))
    .reset_index()
)

In [None]:
# Summarise the per-cluster frame (row count, columns, dtypes, memory use).
df_cgroup.info()

## Step 5 - Visualization

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Histogram of cluster sizes, where the size of a cluster is the number of
# distinct unique_id values it contains.
mybins = [1, 2, 10, 100, 1000, 10000]
bin_labels = ['1', '2-10', '10-100', '100-1000', '1000+']

cluster_sizes = df_cgroup['unique_id'].apply(len)

fig, ax = plt.subplots()
counts, bins, patches = ax.hist(cluster_sizes, bins=mybins)

# One text label per bin, drawn just below the axis in place of tick labels.
# NOTE(review): the centres are arithmetic midpoints although the x-axis is
# switched to a log scale further down — confirm the label placement is as intended.
bin_centers = bins[:-1] + 0.5 * np.diff(bins)
for label, x in zip(bin_labels, bin_centers):
    ax.annotate(
        label,
        xy=(x, 0),
        xycoords=('data', 'axes fraction'),
        xytext=(0, -10),
        textcoords='offset points',
        va='top',
        ha='right',
    )

# Hide the numeric tick labels; the annotations above replace them.
ax.tick_params(labelbottom=False)
ax.xaxis.set_label_coords(0, -0.1)
ax.set_xlabel('Number of controlled companies')
ax.set_ylabel('Count')
ax.set_title('Distribution of significant company control')

# Both axes are logarithmic.
ax.set_yscale('log')
ax.set_xscale('log')

fig.tight_layout()
plt.show()

## Step 6 - Cluster Analysis

In [None]:
# Select clusters of exactly six records that contain more than one distinct
# first name, title and nationality.
varied_firstname = df_cgroup['Firstname'].apply(len) > 1
varied_title = df_cgroup['Title'].apply(len) > 1
varied_nationality = df_cgroup['Nationality'].apply(len) > 1
six_records = df_cgroup['unique_id'].apply(len) == 6

df_cselect = df_cgroup[varied_firstname & varied_title & varied_nationality & six_records]

In [None]:
# Peek at the first five selected clusters.
df_cselect.head(5)

In [None]:
# Summarise the selected clusters (row count, columns, dtypes, memory use).
df_cselect.info()

In [None]:
# Write a cluster studio dashboard for the selected clusters to an HTML file,
# overwriting any previous export.
selected_cluster_ids = df_cselect['cluster_id'].to_list()
linker.cluster_studio_dashboard(
    df_predict,
    clusters,
    "Chapter10_cluster_studio.html",
    cluster_ids=selected_cluster_ids,
    overwrite=True,
)

In [None]:
from IPython.display import IFrame

# Embed the exported dashboard in the notebook at full width.
IFrame(src="Chapter10_cluster_studio.html", width="100%", height=1200)