## Evaluation when you have fully labelled data

In this example, our data contains a fully-populated ground-truth column called `cluster`, which lets us analyse the accuracy of the final model.


<a target="_blank" href="https://colab.research.google.com/github/moj-analytical-services/splink/blob/splink4_dev/docs/demos/examples/duckdb/accuracy_analysis_from_labels_column.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>


In [1]:
# Uncomment and run this cell if you're running in Google Colab.
# !pip install git+https://github.com/moj-analytical-services/splink.git@splink4_dev

In [2]:
from splink import splink_datasets

# Load the `fake_1000` dataset exposed by `splink_datasets` and preview it with
# `.head(2)`. Per the notebook introduction, this data carries the `cluster`
# ground-truth column used for the accuracy analysis further down.
df = splink_datasets.fake_1000
df.head(2)

In [3]:
from splink import SettingsCreator, Linker, block_on, DuckDBAPI
import splink.comparison_template_library as ctl
import splink.comparison_library as cl

# Blocking rules used when generating predictions: one on first_name, one on surname.
prediction_blocking_rules = [block_on(col) for col in ("first_name", "surname")]

# One comparison per input column, built in the order they are passed to the model.
name_comparisons = [ctl.NameComparison(col) for col in ("first_name", "surname")]
# dob is held as a string; the metric/threshold lists are presumably paired
# positionally (1 month, 1 year, 10 years) — confirm against the Splink docs.
dob_comparison = ctl.DateComparison(
    "dob",
    input_is_string=True,
    datetime_metrics=["month", "year", "year"],
    datetime_thresholds=[1, 1, 10],
)
# Exact match on city, configured with term frequency adjustments switched on.
city_comparison = cl.ExactMatch("city").configure(term_frequency_adjustments=True)
email_comparison = ctl.EmailComparison("email", include_username_fuzzy_level=False)

# Deduplication model over a single input table; intermediate calculation columns
# are retained in the output.
settings = SettingsCreator(
    link_type="dedupe_only",
    blocking_rules_to_generate_predictions=prediction_blocking_rules,
    comparisons=[
        *name_comparisons,
        dob_comparison,
        city_comparison,
        email_comparison,
    ],
    retain_intermediate_calculation_columns=True,
)

In [4]:
# SQL conditions over a left (`l`) and right (`r`) record, passed below as the
# deterministic rules for estimating the match prior.
deterministic_rules = [
    "l.first_name = r.first_name and levenshtein(r.dob, l.dob) <= 1",
    "l.surname = r.surname and levenshtein(r.dob, l.dob) <= 1",
    "l.first_name = r.first_name and levenshtein(r.surname, l.surname) <= 2",
    "l.email = r.email",
]

# Build the linker over `df` with the settings above, backed by DuckDB.
db_api = DuckDBAPI()
linker = Linker(df, settings, database_api=db_api)

# Estimate the probability that two random records match from the rules above.
# recall=0.7 is presumably the assumed share of true matches the rules recover —
# TODO confirm against the Splink docs.
linker.training.estimate_probability_two_random_records_match(
    deterministic_rules,
    recall=0.7,
)

In [5]:
# Estimate the u parameters using random sampling, with `max_pairs` set to 1e6.
# The seed is fixed (5) rather than left to a default.
linker.training.estimate_u_using_random_sampling(max_pairs=1e6, seed=5)

In [6]:
# Two expectation-maximisation training passes, each using a different blocking
# rule: first on dob, then on email. The returned session objects are kept in
# case they are inspected later.
session_dob = linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("dob")
)
session_email = linker.training.estimate_parameters_using_expectation_maximisation(
    block_on("email")
)

In [7]:
# Accuracy analysis against the `cluster` label column, requested as a table and
# converted to pandas with `limit=5` for display.
# NOTE(review): this and the following accuracy calls are made directly on
# `linker`, whereas the other linker calls in this notebook go through a
# namespace (`linker.training`, `linker.inference`, `linker.visualisations`) —
# confirm this matches the installed Splink API.
linker.accuracy_analysis_from_labels_column(
    "cluster", output_type="table"
).as_pandas_dataframe(limit=5)

In [8]:
# Same accuracy analysis against the `cluster` labels, this time requesting the
# "roc" output type instead of a table.
linker.accuracy_analysis_from_labels_column("cluster", output_type="roc")

In [9]:
# Accuracy analysis against the `cluster` labels with the "threshold_selection"
# output type.
# `add_metrics=["f1"]` presumably adds F1 to whatever metrics are shown by
# default, and `threshold_actual=0.5` looks like a cut-off applied to the labels —
# TODO confirm both against the Splink docs.
linker.accuracy_analysis_from_labels_column(
    "cluster",
    output_type="threshold_selection",
    threshold_actual=0.5,
    add_metrics=["f1"],
)

In [10]:
# Tabulate prediction errors relative to the `cluster` labels. Both false
# negatives and false positives are requested, and the result is converted to
# pandas with `limit=5`.
linker.inference.prediction_errors_from_labels_column(
    "cluster", include_false_negatives=True, include_false_positives=True
).as_pandas_dataframe(limit=5)

In [11]:
# Run the prediction-errors query again (false negatives and false positives
# against the `cluster` labels), this time as record dictionaries with `limit=5`.
records = linker.inference.prediction_errors_from_labels_column(
    "cluster", include_false_negatives=True, include_false_positives=True
).as_record_dict(limit=5)

# Render a waterfall chart for those records.
linker.visualisations.waterfall_chart(records)