# 07 - Contrasting privacy score between source structured records and clinical note extraction

In notebook 3, the loss of individual identifiers between the original Synthea records and the extracts generated by NER was measured. This notebook examines the effect of that loss on the privacy risk score.

The approach taken is to compute the privacy risk score on both the Synthea records and the extracted records. Ideally, the two sets of scores would agree perfectly.

In [None]:
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
from privacy_fingerprint.common.config import (
    load_experiment_config,
    load_experiment_config_from_file,
    load_global_config_from_file,
)

# Example config files are available in the config directory.
# They will need to be modified with the path to the Julia executable
# before they can be used.

# Load the global and the experiment configuration from their YAML files.
# NOTE(review): presumably this is what makes load_experiment_config()
# usable in later cells - confirm against privacy_fingerprint.common.config.
load_global_config_from_file("../configs/global_config.yaml")
load_experiment_config_from_file("../configs/experiment_config.yaml")

In [None]:
# Fetch the experiment configuration and select the "rarest" encoding scheme
# for scoring before any scores are calculated below.
experiment_config = load_experiment_config()
experiment_config.scoring.encoding_scheme = "rarest"

In [None]:
import privacy_fingerprint.extract.aws_comprehend as aws
import privacy_fingerprint.generate.synthea as synthea
from privacy_fingerprint.score import PrivacyRiskScorer, encode, preprocess

In [None]:
# The dataset will be loaded from the directory created in notebook 2.
output_dir = "../../local/experiment_data/"

# Each file is opened with an explicit UTF-8 encoding so that reading the JSON
# does not depend on the platform's locale default.
with open(
    os.path.join(output_dir, "synthea_dataset.json"), encoding="utf-8"
) as fp:
    synthea_records = json.load(fp)

with open(
    os.path.join(output_dir, "llm_dataset.json"), encoding="utf-8"
) as fp:
    llm_results = json.load(fp)

with open(
    os.path.join(output_dir, "ner_dataset.json"), encoding="utf-8"
) as fp:
    ner_records = json.load(fp)

## Generate scores on extracted records

The following cells calculate the privacy risk score as in the other notebooks using the entire pipeline.

In [None]:
# The format of the NER records must be standardised to enable scoring.
# The AWS module's default identifier set is used together with the NER
# records loaded above.
common_ner_results = aws.prepare_common_records(
    aws.DEFAULT_IDENTIFIERS, ner_records
)

In [None]:
# Preprocess the standardised NER records ready for encoding and scoring.
# NOTE(review): the result is selected by column and transformed below, so it
# is presumably a pandas DataFrame - confirm against `preprocess`.
pcm_dataset = preprocess(common_ner_results)

In [None]:
# Show the preprocessed NER dataset as the cell output for a visual check.
pcm_dataset

In [None]:
def simplify_ethnicity(text):
    """Collapse a free-text ethnicity description into a coarse category.

    The text is lower-cased and split into alphanumeric words. A keyword
    counts as mentioned when some word *starts with* it. Anchoring the match
    at the start of a word, instead of searching for a bare substring, stops
    "caucasian" from being counted as a mention of "asian" while plurals
    such as "asians" are still recognised.

    Args:
        text: Ethnicity description as a string (may be empty).

    Returns:
        "white", "black" or "asian" when exactly one category is mentioned,
        "mixed" when more than one category is mentioned, and "unknown" when
        the text is empty or mentions none of the keywords.
    """
    # Keyword -> category; several keywords may map onto the same category.
    keyword_labels = (
        ("white", "white"),
        ("black", "black"),
        ("african", "black"),
        ("asian", "asian"),
        ("indian", "asian"),
        ("pakistani", "asian"),
        ("chinese", "asian"),
    )
    # Replace punctuation with spaces so "white/asian" or "black-african"
    # split into separate words.
    words = "".join(
        ch if ch.isalnum() else " " for ch in text.lower()
    ).split()
    mentioned = {
        label
        for keyword, label in keyword_labels
        if any(word.startswith(keyword) for word in words)
    }
    if len(mentioned) > 1:
        return "mixed"
    if mentioned:
        return next(iter(mentioned))
    return "unknown"


def simplify_date_of_birth(date):
    """Reduce a date of birth to the first year of its decade.

    Args:
        date: A value accepted by ``pd.to_datetime``; values that cannot be
            parsed are coerced to a null timestamp rather than raising.

    Returns:
        The decade as an integer (for example 1980 for any date in the
        1980s), or ``None`` when the date is missing or cannot be parsed.
    """
    parsed = pd.to_datetime(date, errors="coerce")
    if pd.notnull(parsed):
        birth_year = parsed.year
        # Dropping the last digit of the year leaves the decade.
        return birth_year - birth_year % 10
    return None

In [None]:
def _normalise_gender(value):
    """Lower-case a gender value; anything but female/male is "unknown"."""
    lowered = value.lower()
    return lowered if lowered in ["female", "male"] else "unknown"


# Per-column simplifications applied before encoding. Columns without an
# entry here are passed through unchanged.
transformations = {
    "gender": _normalise_gender,
    "ethnicity": simplify_ethnicity,
    "date_of_birth": simplify_date_of_birth,
}

# Columns of the preprocessed dataset that take part in the scoring.
cols = [
    "date_of_birth",
    "gender",
    "ethnicity",
    "disease",
    "symptoms",
    "treatment",
    "prescriptions",
]

# Simplify and encode the NER-extracted dataset.
simplified_dataset = pcm_dataset[cols].transform(
    {name: transformations.get(name, lambda value: value) for name in cols}
)
encoded_dataset, lookup = encode(simplified_dataset)

# Score the extracted records: population uniqueness first, then the
# per-record scores from the fitted scorer.
scorer = PrivacyRiskScorer()
population_score = scorer.calculate_population_uniqueness(encoded_dataset)
print(population_score)
scorer.fit(encoded_dataset)
e2e = {
    "population_score": population_score,
    "individual_scores": scorer.predict(encoded_dataset),
}

## Generate scores on Synthea records

The following cells calculate the privacy risk scores using the original Synthea records.

In [None]:
# Standardise the original Synthea records using the Synthea module's
# default identifier set.
common_results = synthea.prepare_common_records(
    synthea.DEFAULT_IDENTIFIERS, synthea_records
)

In [None]:
# Preprocess the standardised Synthea records ready for encoding and scoring.
synthea_pcm_dataset = preprocess(common_results)

In [None]:
def _normalise_gender(value):
    """Lower-case a gender value; anything but female/male is "unknown"."""
    lowered = value.lower()
    return lowered if lowered in ["female", "male"] else "unknown"


# Per-column simplifications applied before encoding. Columns without an
# entry here are passed through unchanged.
transformations = {
    "gender": _normalise_gender,
    "ethnicity": simplify_ethnicity,
    "date_of_birth": simplify_date_of_birth,
}

# Columns of the preprocessed dataset that take part in the scoring.
cols = [
    "date_of_birth",
    "gender",
    "ethnicity",
    "disease",
    "symptoms",
    "treatment",
    "prescriptions",
]

# Simplify and encode the original Synthea dataset.
simplified_dataset = synthea_pcm_dataset[cols].transform(
    {name: transformations.get(name, lambda value: value) for name in cols}
)
encoded_dataset, lookup = encode(simplified_dataset)

# Score the Synthea records: population uniqueness first, then the
# per-record scores from the fitted scorer.
scorer = PrivacyRiskScorer()
population_score = scorer.calculate_population_uniqueness(encoded_dataset)
print(population_score)
scorer.fit(encoded_dataset)
initial_records = {
    "population_score": population_score,
    "individual_scores": scorer.predict(encoded_dataset),
}

## Comparison

In [None]:
# Population uniqueness of each dataset, reported one after the other.
for heading, result in [
    ("Population uniqueness on initial records", initial_records),
    ("Population uniqueness on extracted records", e2e),
]:
    print(heading, result["population_score"])

# Correlation of the per-record scores between the two datasets.
score_correlation = initial_records["individual_scores"].corr(
    e2e["individual_scores"]
)
print(
    "Correlation between privacy risk scores on the initial Synthea records and extracted records",
    score_correlation,
)

# Scatter of per-record scores: Synthea on x, NER extract on y.
fig, ax = plt.subplots(1, 1)
ax.plot(initial_records["individual_scores"], e2e["individual_scores"], "k.")
ax.set_xlabel("Initial structured records")
ax.set_ylabel("NER extracted records")

In [None]:
# Collect both sets of per-record scores in one frame for plotting.
df_plot = pd.DataFrame()

df_plot["init_ius"] = initial_records["individual_scores"]
df_plot["e2e_ius"] = e2e["individual_scores"]

# Joint plot: scatter of the two scores with a histogram of each on the
# margins. Axis limits extend slightly beyond 0 and 1.
g = sns.JointGrid(
    data=df_plot,
    x="init_ius",
    y="e2e_ius",
    # The plot type is chosen by the plot_joint call below.
    xlim=(-0.01, 1.01),
    ylim=(-0.01, 1.01),
    # The marker size is passed to plot_joint below.
)
g.plot_joint(sns.scatterplot, s=5, alpha=0.85)
g.plot_marginals(sns.histplot, bins=25)

g.ax_joint.set_xlabel("Initial structured records")
g.ax_joint.set_ylabel("NER extracted records")

In [None]:
# Per-record scores side by side, plus the absolute gap between them.
comparison = pd.DataFrame(
    {
        "initial": initial_records["individual_scores"],
        "extract": e2e["individual_scores"],
    }
)
comparison["difference"] = (
    comparison["initial"].sub(comparison["extract"]).abs()
)
comparison

In [None]:
# Examine individual records with the largest difference in score

for idx in comparison.sort_values("difference", ascending=False).index[:10]:
    print(comparison.loc[idx])
    # Convert each record to a dict once per record, not once per field.
    source_fields = common_results[idx].dict()
    extracted_fields = common_ner_results[idx].dict()
    for key, source_value in source_fields.items():
        # Only the fields that were used for scoring are of interest.
        if key not in cols:
            continue
        # Field name, Synthea value, NER-extracted value.
        print(
            "{0:<15} {1:<30} {2}".format(
                key,
                str(source_value),
                str(extracted_fields[key]),
            )
        )

    print()

In [None]:
# Compare the ordering of records by privacy risk in the Synthea and extracted datasets


def compare_scores(a, b, label, ax=None, color=None):
    """Plot how well the ordering of records by ``a`` is preserved by ``b``.

    Every record is ranked twice, by ``b`` and by ``a`` (rank 1 is the lowest
    score). The records are then visited in ascending order of ``a``; the
    value plotted at position ``i`` is the number of records visited before
    it whose ``b`` rank is no greater than ``i + 1``, the ``a`` rank of the
    record at position ``i``. Note that this is a count, not a fraction.

    Args:
        a: Reference scores, one per record.
        b: Scores to compare against ``a``; must have the same length.
        label: Legend label for the plotted line.
        ax: Axes to draw on. A new figure is created when omitted.
        color: Optional line colour; the Matplotlib default is used when
            omitted.

    Returns:
        The axes the line was drawn on.
    """
    assert len(a) == len(b), "Lengths must match"
    if ax is None:
        fig, ax = plt.subplots(1, 1)
    n_records = len(a)
    ranked = pd.DataFrame({"a": a, "b": b})
    ranked = ranked.sort_values("b")
    ranked["b_rank"] = range(1, 1 + n_records)
    ranked = ranked.sort_values("a")
    # After the final sort the row at position i has an ``a`` rank of i + 1,
    # so the ``b`` ranks can be compared as a plain array. This avoids
    # building a row Series and a frame slice on every iteration.
    b_ranks = ranked["b_rank"].to_numpy()
    counts_below = [(b_ranks[:i] <= i + 1).sum() for i in range(n_records)]
    plot_kwargs = {"label": label}
    if color:
        plot_kwargs["color"] = color
    ax.plot(counts_below, **plot_kwargs)
    return ax


# Three curves on shared axes: the Synthea scores against themselves
# (best case), against the NER extract, and against a shuffled copy of
# themselves (chance level). The shuffle is unseeded, so the "Random" curve
# changes between runs.
ax = None
for other_scores, curve_label, curve_color in [
    (comparison.initial.tolist(), "Identity", "#555555"),
    (comparison.extract.tolist(), "Extract", "#c10078"),
    (comparison.initial.sample(frac=1).tolist(), "Random", "#cccccc"),
]:
    ax = compare_scores(
        comparison.initial.tolist(),
        other_scores,
        curve_label,
        ax=ax,
        color=curve_color,
    )
ax.legend()
ax.set_xlabel("Ranked scores from Synthea records")
ax.set_ylabel("Agreement following NER extraction")