In [1]:
from splink.duckdb.duckdb_linker import DuckDBLinker

## Read in data

In [2]:
import pandas as pd

# Raise the row display cap so longer frames render in full in this notebook.
pd.set_option("display.max_rows", 1000)

# Load the demo input records and preview the first five rows.
df = pd.read_csv("./data/fake_1000.csv")
df.head()

Unnamed: 0,unique_id,first_name,surname,dob,city,email,cluster
0,0,Robert,Alan,1971-06-24,,robert255@smith.net,0
1,1,Robert,Allen,1971-05-24,,roberta25@smith.net,0
2,2,Rob,Allen,1971-06-24,London,roberta25@smith.net,0
3,3,Robert,Alen,1971-06-24,Lonon,,0
4,4,Grace,,1997-04-26,Hull,grace.kelly52@jones.com,1


## Initialise the linker, passing in the input dataset(s)

In [3]:

# Create the DuckDB-backed linker over the input DataFrame loaded above.
linker = DuckDBLinker(df)

## Load estimated model parameters from previous notebook

In [4]:
# Load the model settings saved to JSON (per the heading above, the parameters
# estimated in the previous notebook) so no training is needed here.
linker.load_settings_from_json("./demo_settings/saved_model_from_demo.json")

# Predicting match weights using the trained model

In [5]:
# Score record pairs with the loaded model; the output below shows a
# match_weight and match_probability for each pair.
df_predictions = linker.predict()
# Preview 5 rows of the predictions as a pandas DataFrame.
df_predictions.as_pandas_dataframe(limit=5)

Unnamed: 0,match_weight,match_probability,unique_id_l,unique_id_r,first_name_l,first_name_r,gamma_first_name,bf_first_name,surname_l,surname_r,...,bf_tf_adj_city,tf_city_l,tf_city_r,email_l,email_r,gamma_email,bf_email,cluster_l,cluster_r,match_key
0,5.756288,0.981836,4,5,Grace,Grace,2,85.525038,,Kelly,...,1.0,0.00123,,grace.kelly52@jones.com,grace.kelly52@jones.com,1,256.837973,1,1,0
1,-4.285461,0.048779,9,922,Evie,Evie,2,85.525038,Dean,Jones,...,1.0,0.00123,,evihd56@earris-bailey.net,eviejones@brewer-sparks.org,0,0.43749,3,230,0
2,-4.285461,0.048779,14,998,Oliver,Oliver,2,85.525038,Griffiths,Bird,...,1.0,0.00123,,o.griffiths90@reyes-coleman.com,oliver.b@smith.net,0,0.43749,5,250,0
3,-3.092784,0.104916,18,475,Caleb,Caleb,2,85.525038,Rwoe,Scott,...,1.0,0.04059,,,c.scott@brooks.com,-1,1.0,8,119,0
4,-5.412137,0.022946,21,917,Darcy,Darcy,2,85.525038,Bernass,Rhodes,...,1.0,0.00861,0.0492,darcy.b@silva.com,drhodes16@johnson-robinson.com,0,0.43749,9,229,0


# Visualising results

## Waterfall chart
You can view individual rows of the predictions as a waterfall chart as follows:

In [7]:
# Pull 5 prediction rows as record dictionaries to feed the chart.
records_to_plot = df_predictions.as_record_dict(limit=5)
# NOTE(review): filter_nulls=False presumably keeps comparisons involving null
# values in the chart — confirm against the Splink documentation.
linker.waterfall_chart(records_to_plot, filter_nulls=False)

## Match weights histogram
A histogram showing the distribution of match weights can be viewed as follows:

In [8]:
# Plot the distribution of match weights across the predictions.
linker.match_weight_histogram(df_predictions)