# 04 - Calculating and modifying privacy risk score

In this notebook we:

* Calculate the privacy risk score
* Remove identifiers and assess the impact on the score
* Modify identifiers and assess the impact on the score

In [None]:
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from privacy_fingerprint.common.config import (
    load_experiment_config_from_file,
    load_global_config_from_file,
)

# Load the global and experiment configuration files before the extract/score
# modules are imported and used in the cells below. The paths are relative to
# the notebook's working directory.
# NOTE(review): presumably these calls populate module-level configuration
# that the rest of privacy_fingerprint reads — confirm in common.config.
load_global_config_from_file("../configs/global_config.yaml")
load_experiment_config_from_file("../configs/experiment_config.yaml")

In [None]:
import privacy_fingerprint.extract.aws_comprehend as aws
from privacy_fingerprint.score import PrivacyRiskScorer, encode, preprocess

In [None]:
# The dataset will be loaded from the directory created in notebook 2.
output_dir = "../experiments/02_generate_dataset"


def _load_json(filename):
    """Read and parse one JSON file stored in ``output_dir``.

    The file is opened explicitly as UTF-8 so that the result does not depend
    on the platform's default locale encoding.

    Parameters
    ----------
    filename : str
        Name of the file inside ``output_dir``.

    Returns
    -------
    object
        Whatever ``json.load`` produces for the file's contents.
    """
    with open(os.path.join(output_dir, filename), encoding="utf-8") as fp:
        return json.load(fp)


synthea_records = _load_json("synthea_dataset.json")
llm_results = _load_json("llm_dataset.json")
ner_records = _load_json("ner_dataset.json")

In [None]:
# The format of the NER records must be standardised to enable scoring.
# The module's default identifier set (aws.DEFAULT_IDENTIFIERS) is used here;
# the result is passed to preprocess() in the next cell.
common_ner_results = aws.prepare_common_records(
    aws.DEFAULT_IDENTIFIERS, ner_records
)

In [None]:
# Build the table that every scoring cell below works from.
# NOTE(review): assumed to be a pandas DataFrame, since later cells use
# .columns, column selection and .transform on it — confirm in score.preprocess.
pcm_dataset = preprocess(common_ner_results)

In [None]:
# List the available columns: the cells below decide which columns to drop by
# matching these names against a list of prefixes.
print(pcm_dataset.columns)

The cell below calculates the privacy risk score for the entire dataset and for subsets that exclude specific columns. The goal is to maximise privacy while maintaining utility.

In [None]:
# Each scenario lists the column-name prefixes removed before scoring.
# The first (empty) scenario scores the complete dataset as a baseline.
drop_scenarios = [
    [],
    ["nhs_number"],
    ["nhs_number", "name"],
    [
        "nhs_number",
        "name",
        "date_of_birth",
        "date_of_visit",
        "gender",
        "ethnicity",
    ],
    [
        "nhs_number",
        "name",
        "date_of_birth",
        "date_of_visit",
        "gender",
        "ethnicity",
        "department",
        "provider",
    ],
    [
        "nhs_number",
        "name",
        "date_of_visit",
        "gender",
        "ethnicity",
        "department",
        "provider",
    ],
    [
        "nhs_number",
        "name",
        "date_of_visit",
        "ethnicity",
        "department",
        "provider",
    ],
    ["nhs_number", "name", "date_of_visit", "department", "provider"],
    [
        "nhs_number",
        "name",
        "date_of_visit",
        "department",
        "provider",
        "date_of_birth",
    ],
]

score_rows = []
for to_drop in drop_scenarios:
    # str.startswith accepts a tuple of prefixes; an empty tuple matches
    # nothing, so the baseline scenario keeps every column.
    dropped_prefixes = tuple(to_drop)
    cols = [
        column
        for column in pcm_dataset.columns
        if not column.startswith(dropped_prefixes)
    ]
    encoded_dataset, lookup = encode(pcm_dataset[cols])
    scorer = PrivacyRiskScorer()
    result = scorer.calculate_population_uniqueness(encoded_dataset)

    # One summary row per scenario: "X" marks a column that was kept.
    row = dict.fromkeys(pcm_dataset.columns, "")
    row.update(dict.fromkeys(cols, "X"))
    row["SCORE"] = result
    score_rows.append(row)

# Transpose so that scenarios become columns and dataset columns become rows.
score_summary = pd.DataFrame(score_rows).T
score_summary

Reducing the options for gender and ethnicity can improve privacy while likely maintaining much of the utility.

In [None]:
def simplify_ethnicity(text):
    """Collapse a free-text ethnicity description into a coarse category.

    The text is lower-cased and searched for a fixed list of keywords, each of
    which maps to one of the labels ``"white"``, ``"black"`` or ``"asian"``.

    A keyword only counts when it is not preceded by another letter. Plain
    substring matching would find "asian" inside "caucasian" and mislabel the
    record, so "caucasian" is instead listed as a keyword for ``"white"``.

    Parameters
    ----------
    text : str
        Free-text ethnicity. Any non-string value (for example ``None`` or
        NaN) is treated as missing.

    Returns
    -------
    str
        ``"mixed"`` when keywords for more than one label are present, the
        single matching label when exactly one is present, and ``"unknown"``
        for empty, missing or unrecognised input.
    """
    import re  # local import keeps this cell self-contained

    if not isinstance(text, str):
        return "unknown"
    text = text.lower()

    keyword_labels = (
        ("white", "white"),
        ("caucasian", "white"),
        ("black", "black"),
        ("african", "black"),
        ("asian", "asian"),
        ("indian", "asian"),
        ("pakistani", "asian"),
        ("chinese", "asian"),
    )
    # Collect the distinct labels whose keywords start a word in the text.
    labels = {
        label
        for keyword, label in keyword_labels
        if re.search(r"(?<![a-z])" + re.escape(keyword), text)
    }

    if len(labels) > 1:
        return "mixed"
    if labels:
        return next(iter(labels))
    return "unknown"


# Per-column simplifications applied before scoring. Gender is reduced to
# "female" / "male" / "unknown"; ethnicity goes through simplify_ethnicity.
transformations = {
    "gender": lambda x: (
        x.lower() if x.lower() in ("female", "male") else "unknown"
    ),
    "ethnicity": simplify_ethnicity,
}

# Column-name prefixes removed in each scenario (with and without DOB).
drop_scenarios = [
    ["nhs_number", "name", "date_of_visit", "department", "provider"],
    [
        "nhs_number",
        "name",
        "date_of_visit",
        "department",
        "provider",
        "date_of_birth",
    ],
]

score_rows = []
for to_drop in drop_scenarios:
    dropped_prefixes = tuple(to_drop)
    cols = [
        column
        for column in pcm_dataset.columns
        if not column.startswith(dropped_prefixes)
    ]
    # Columns without a registered simplification pass through unchanged.
    column_transforms = {
        column: transformations.get(column, lambda x: x) for column in cols
    }
    encoded_dataset, lookup = encode(
        pcm_dataset[cols].transform(column_transforms)
    )
    scorer = PrivacyRiskScorer()
    result = scorer.calculate_population_uniqueness(encoded_dataset)

    # Summary row: "simplify" = transformed, "X" = kept as-is, "" = dropped.
    row = {}
    for column in pcm_dataset.columns:
        if column in transformations:
            row[column] = "simplify"
        elif column in cols:
            row[column] = "X"
        else:
            row[column] = ""
    row["SCORE"] = result
    score_rows.append(row)

score_summary = pd.DataFrame(score_rows).T
score_summary

Grouping dates of birth into decades also improves privacy. In adults, the utility will likely be maintained although a different approach may be needed for children.

In [None]:
def simplify_date_of_birth(date):
    """Reduce a date of birth to the first year of its decade.

    The value is parsed with ``pd.to_datetime(..., errors="coerce")``.

    Parameters
    ----------
    date
        Value to parse as a date.

    Returns
    -------
    int or None
        The year rounded down to a multiple of ten (e.g. 1985 -> 1980), or
        ``None`` when the parsed value is null.
    """
    parsed = pd.to_datetime(date, errors="coerce")
    if pd.isnull(parsed):
        return None
    return parsed.year - parsed.year % 10


# Per-column simplifications: as before for gender and ethnicity, plus the
# date of birth reduced to its decade.
transformations = {
    "gender": lambda x: (
        x.lower() if x.lower() in ("female", "male") else "unknown"
    ),
    "ethnicity": simplify_ethnicity,
    "date_of_birth": simplify_date_of_birth,
}

# A single scenario: direct identifiers dropped, date of birth retained.
drop_scenarios = [
    ["nhs_number", "name", "date_of_visit", "department", "provider"],
]

score_rows = []
for to_drop in drop_scenarios:
    dropped_prefixes = tuple(to_drop)
    cols = [
        column
        for column in pcm_dataset.columns
        if not column.startswith(dropped_prefixes)
    ]
    # Columns without a registered simplification pass through unchanged.
    column_transforms = {
        column: transformations.get(column, lambda x: x) for column in cols
    }
    encoded_dataset, lookup = encode(
        pcm_dataset[cols].transform(column_transforms)
    )
    scorer = PrivacyRiskScorer()
    result = scorer.calculate_population_uniqueness(encoded_dataset)

    # Summary row: "simplify" = transformed, "X" = kept as-is, "" = dropped.
    row = {}
    for column in pcm_dataset.columns:
        if column in transformations:
            row[column] = "simplify"
        elif column in cols:
            row[column] = "X"
        else:
            row[column] = ""
    row["SCORE"] = result
    score_rows.append(row)

score_summary = pd.DataFrame(score_rows).T
score_summary

Focusing on the date of birth, we can visualise the impact on individual records of either converting it to decades or removing it entirely.

A symlog scale is used on the y axis to give greater visual separation.

In [None]:
all_transformations = {
    "gender": lambda x: x.lower()
    if x.lower() in ["female", "male"]
    else "unknown",
    "ethnicity": simplify_ethnicity,
    "date_of_birth": simplify_date_of_birth,
}

# Identifier prefixes removed in every scenario below.
BASE_DROP_PREFIXES = (
    "nhs_number",
    "name",
    "date_of_visit",
    "department",
    "provider",
)


def _score_scenario(drop_prefixes, simplified_columns):
    """Score one column-selection / simplification scenario.

    Parameters
    ----------
    drop_prefixes : iterable of str
        Columns of ``pcm_dataset`` whose name starts with any of these
        prefixes are excluded.
    simplified_columns : iterable of str
        Keys of ``all_transformations`` to apply; every other retained column
        is passed through unchanged.

    Returns
    -------
    tuple
        ``(row, record_results)``: the summary row for this scenario
        ("simplify" = transformed, "X" = kept as-is, "" = dropped, plus the
        population uniqueness under "SCORE"), and the per-record output of
        ``scorer.predict``.
    """
    transformations = {
        name: all_transformations[name] for name in simplified_columns
    }
    prefixes = tuple(drop_prefixes)
    cols = [
        column
        for column in pcm_dataset.columns
        if not column.startswith(prefixes)
    ]
    encoded_dataset, _ = encode(
        pcm_dataset[cols].transform(
            {column: transformations.get(column, lambda x: x) for column in cols}
        )
    )
    scorer = PrivacyRiskScorer()
    # Call order kept from the original cell: population score first, then
    # fit and predict on the same encoded data.
    result = scorer.calculate_population_uniqueness(encoded_dataset)
    scorer.fit(encoded_dataset)
    record_results = scorer.predict(encoded_dataset)

    retained = set(cols)
    row = {}
    for column in pcm_dataset.columns:
        # Only report "simplify" for a column that is actually retained, so a
        # dropped column can never be mislabelled.
        if column in transformations and column in retained:
            row[column] = "simplify"
        elif column in retained:
            row[column] = "X"
        else:
            row[column] = ""
    row["SCORE"] = result
    return row, record_results


score_rows = []

# Include DOB without alteration
row, record_results_with_dob = _score_scenario(
    BASE_DROP_PREFIXES, ["gender", "ethnicity"]
)
score_rows.append(row)
print("Completed with DOB")

# Include DOB but convert to decades
row, record_results_simplified_dob = _score_scenario(
    BASE_DROP_PREFIXES, ["gender", "ethnicity", "date_of_birth"]
)
score_rows.append(row)
print("Completed with decade DOB")

# Exclude DOB
row, record_results_no_dob = _score_scenario(
    BASE_DROP_PREFIXES + ("date_of_birth",), ["gender", "ethnicity"]
)
score_rows.append(row)
print("Completed without DOB")


# Transpose so that scenarios become columns and dataset columns become rows.
score_summary = pd.DataFrame(score_rows).T
score_summary

In [None]:
# Preview the per-record output of scorer.predict for the scenario that keeps
# the date of birth unaltered, before plotting the three distributions.
record_results_with_dob.head()

In [None]:
fig, ax = plt.subplots(1, 1)

# Evaluate every density estimate on the same grid of 41 points over [0, 1].
score_grid = np.linspace(0, 1, 41)

# One curve per date-of-birth scenario: (per-record results, legend, colour).
# NOTE(review): the numbers in the legend labels are hard-coded; presumably
# they are the SCORE values from an earlier run — confirm they still match
# the score_summary table above whenever the dataset changes.
curves = [
    (record_results_with_dob, "DOB (0.967)", "#ddd"),
    (record_results_simplified_dob, "Pooled DOB (0.611)", "#C10078"),
    (record_results_no_dob, "No DOB (0.393)", "#4D58E5"),
]
for record_results, legend_label, line_colour in curves:
    record_results.plot.kde(
        ax=ax, ind=score_grid, label=legend_label, color=line_colour
    )

ax.set_ylim(0, 40)
ax.set_xlabel("Privacy Score")
# Symlog gives greater visual separation between the curves on the y axis.
ax.set_yscale("symlog")
ax.legend()
sns.despine()