## Quality assurance when you have fully labelled data

In this example, our data contains a fully-populated ground-truth column called `cluster`, which enables us to analyse the accuracy of the final model.

In [1]:
import pandas as pd 
import altair as alt
alt.renderers.enable("mimetype")

# Load the sample dataset. Its `cluster` column holds the ground-truth labels
# that the accuracy analysis later in this notebook is scored against.
df = pd.read_csv("./data/fake_1000.csv")
# Preview the first two rows to confirm the columns loaded as expected.
df.head(2)

Unnamed: 0,unique_id,first_name,surname,dob,city,email,cluster
0,0,Robert,Alan,1971-06-24,,robert255@smith.net,0
1,1,Robert,Allen,1971-05-24,,roberta25@smith.net,0


In [2]:
from splink.duckdb.duckdb_linker import DuckDBLinker
import splink.duckdb.duckdb_comparison_template_library as ctl
import splink.duckdb.duckdb_comparison_library as cl

# Splink model settings used to build the linker in the next cell.
settings = {
    # Deduplicate records within the single input table.
    "link_type": "dedupe_only",
    # Candidate pairs for prediction: records sharing a first name OR a surname.
    "blocking_rules_to_generate_predictions": [
        "l.first_name = r.first_name",
        "l.surname = r.surname",
    ],
    # One comparison per column that is scored when two records are compared.
    "comparisons": [
        ctl.name_comparison("first_name"),
        ctl.name_comparison("surname"),
        # dob is read from CSV as text, hence the cast to a date type.
        ctl.date_comparison("dob", cast_strings_to_date=True),
        cl.exact_match("city", term_frequency_adjustments=True),
        cl.levenshtein_at_thresholds("email", 2),
    ],
    # Keep the input columns and the intermediate gamma_*/bf_*/tf_* columns in
    # the output; they are visible in the prediction-error table further down.
    "retain_matching_columns": True,
    "retain_intermediate_calculation_columns": True,
}

In [3]:
# Rules that are assumed to identify true matches with high confidence. They
# are only used to estimate the prior probability that two random records match.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email",
]

# Build the DuckDB-backed linker over the input frame with the settings above.
linker = DuckDBLinker(df, settings, set_up_basic_logging=False)

# recall=0.7 is the assumed share of true matches that the deterministic rules
# recover (see the Splink API docs for this method).
linker.estimate_probability_two_random_records_match(deterministic_rules, recall=0.7)


In [4]:
# Estimate the u parameters from randomly sampled record pairs (capped at
# max_pairs). The fixed seed is there to make the sample repeatable.
linker.estimate_u_using_random_sampling(max_pairs=1e6, seed=3)

In [5]:
# Two expectation-maximisation training sessions, each blocked on a different
# column. Per the Splink docs, a session cannot estimate parameters for the
# column it blocks on, so the dob-blocked and email-blocked sessions
# complement each other.
session_dob = linker.estimate_parameters_using_expectation_maximisation("l.dob = r.dob")
session_email = linker.estimate_parameters_using_expectation_maximisation("l.email = r.email")

In [6]:
# Confusion-matrix counts and derived metrics (precision, recall, f1, ...) per
# match-weight threshold, scored against the `cluster` ground-truth labels.
# Thresholds are rounded to the nearest 0.1; only the first 5 rows are shown.
linker.truth_space_table_from_labels_column(
    "cluster", match_weight_round_to_nearest=0.1
).as_pandas_dataframe(limit=5)

Unnamed: 0,truth_threshold,match_probability,row_count,p,n,tp,tn,fp,fn,P_rate,N_rate,tp_rate,tn_rate,fp_rate,fn_rate,precision,recall,f1
0,-17.2,7e-06,4353.0,2031.0,2322.0,2031.0,0.0,2322.0,0.0,0.0,0.533425,1.0,0.0,1.0,0.0,0.466575,1.0,0.636278
1,-16.6,1e-05,4353.0,2031.0,2322.0,2029.0,0.0,2322.0,2.0,0.0,0.533425,0.999015,0.0,1.0,0.000985,0.46633,0.999015,0.635851
2,-16.5,1.1e-05,4353.0,2031.0,2322.0,2029.0,234.0,2088.0,2.0,0.0,0.533425,0.999015,0.100775,0.899225,0.000985,0.492835,0.999015,0.660052
3,-16.0,1.5e-05,4353.0,2031.0,2322.0,2029.0,429.0,1893.0,2.0,0.0,0.533425,0.999015,0.184755,0.815245,0.000985,0.517338,0.999015,0.681788
4,-15.4,2.3e-05,4353.0,2031.0,2322.0,2027.0,429.0,1893.0,4.0,0.0,0.533425,0.998031,0.184755,0.815245,0.001969,0.517092,0.998031,0.681345


In [7]:
# ROC chart of the model's predictions, scored against the `cluster` labels.
linker.roc_chart_from_labels_column("cluster")

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [8]:
# Precision-recall chart of the model's predictions, scored against the
# `cluster` labels.
linker.precision_recall_chart_from_labels_column("cluster")

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html


In [9]:
# Tabulate a sample of prediction errors judged against the `cluster` labels.
# Both false negatives and false positives are requested; limit=5 caps the
# number of rows pulled into pandas.
linker.prediction_errors_from_labels_column(
    "cluster", include_false_negatives=True, include_false_positives=True
).as_pandas_dataframe(limit=5)

Unnamed: 0,clerical_match_score,found_by_blocking_rules,match_weight,match_probability,unique_id_l,unique_id_r,first_name_l,first_name_r,gamma_first_name,bf_first_name,...,tf_city_r,bf_city,bf_tf_adj_city,email_l,email_r,gamma_email,bf_email,cluster_l,cluster_r,match_key
0,1.0,True,-4.621133,0.039048,177,178,Ellie,Ellie,3,85.824271,...,,1.0,1.0,,elliee@oguzmanoc.m,-1,1.0,48,48,0
1,1.0,True,-2.770475,0.127823,248,249,Joshua,Joshua,3,85.824271,...,,1.0,1.0,,j.williams@levine-johnson.com,-1,1.0,64,64,0
2,1.0,True,-5.770653,0.017988,324,328,Kai,Kai,3,85.824271,...,,1.0,1.0,k.t50eherand@z.ncom,k.t50@her.andezncodm,0,0.124985,87,87,0
3,1.0,True,-0.142871,0.475262,361,362,Mohammed,Mohammed,3,85.824271,...,,1.0,1.0,,mohammedfox24@wilson.com,-1,1.0,95,95,0
4,1.0,True,-2.770475,0.127823,376,380,Eliza,Eliza,3,85.824271,...,,1.0,1.0,,elizataylor@marshall.com,-1,1.0,98,98,0


In [10]:
# Query the prediction errors (false negatives and false positives) judged
# against the `cluster` labels.
error_pairs = linker.prediction_errors_from_labels_column(
    "cluster",
    include_false_negatives=True,
    include_false_positives=True,
)

# Take up to five of them as record dictionaries, the input format passed to
# the waterfall chart.
records = error_pairs.as_record_dict(limit=5)

# Waterfall chart for the sampled error records.
linker.waterfall_chart(records)

<VegaLite 4 object>

If you see this message, it means the renderer has not been properly enabled
for the frontend that you are using. For more information, see
https://altair-viz.github.io/user_guide/troubleshooting.html
