# 05 - Exploring the impact of rare diseases on privacy risk

In this notebook we:

* Identify the occurrence of unique diseases
* Assess the impact of these diseases on the individual privacy risk
* Extend the analysis to other unique identifiers

In [None]:
import json
import os
import re
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from privacy_fingerprint.common.config import (
    load_experiment_config,
    load_experiment_config_from_file,
    load_global_config_from_file,
)

# Example config files are available in the configs directory.
# They will need to be modified with the path to the Julia executable
# before this cell is run.

# Load the global settings first, then the experiment settings.
load_global_config_from_file("../configs/global_config.yaml")
load_experiment_config_from_file("../configs/experiment_config.yaml")

# Fetch the loaded experiment config and override its scoring encoding scheme
# for this notebook.
# NOTE(review): presumably "rarest" tells the encoder to keep the rarest value
# of a multi-valued field - confirm against privacy_fingerprint.score.encode.
experiment_config = load_experiment_config()
experiment_config.scoring.encoding_scheme = "rarest"

In [None]:
import privacy_fingerprint.extract.aws_comprehend as aws
from privacy_fingerprint.score import PrivacyRiskScorer, encode, preprocess

In [None]:
# All three JSON files are read from the directory created in notebook 2.
output_dir = "../experiments/02_generate_dataset"

synthea_path = os.path.join(output_dir, "synthea_dataset.json")
with open(synthea_path) as synthea_file:
    synthea_records = json.load(synthea_file)

llm_path = os.path.join(output_dir, "llm_dataset.json")
with open(llm_path) as llm_file:
    llm_results = json.load(llm_file)

ner_path = os.path.join(output_dir, "ner_dataset.json")
with open(ner_path) as ner_file:
    ner_records = json.load(ner_file)

In [None]:
# The format of the NER records must be standardised to enable scoring.
# The AWS Comprehend helper converts the raw records loaded above, using the
# module's default identifier set.
common_ner_results = aws.prepare_common_records(
    aws.DEFAULT_IDENTIFIERS, ner_records
)

In [None]:
# Preprocess the standardised records into the table used for the rest of the
# notebook (the cells below group, index and filter it by column name).
pcm_dataset = preprocess(common_ner_results)

In [None]:
# Count the diseases that occur in exactly one record
pcm_dataset.groupby("disease").size().eq(1).sum()

In [None]:
def simplify_ethnicity(text):
    """Collapse a free-text ethnicity description into a coarse category.

    The text is searched, case-insensitively, for a fixed set of keywords,
    each of which maps to one of the labels "white", "black" or "asian".

    A keyword only counts when it starts a word, i.e. it is not preceded by
    another letter. Plain substring matching would find "asian" inside
    "caucasian" and mislabel those records (or turn "white caucasian" into
    "mixed"). Suffixes are still allowed, so "asians" or "african-american"
    continue to match.

    Parameters
    ----------
    text : str or None
        Free-text ethnicity. Missing values (None, NaN) and any other
        non-string input are treated as unknown rather than raising.

    Returns
    -------
    str
        "mixed" when keywords for more than one label are present, the single
        matching label when exactly one is present, and "unknown" otherwise
        (including for the empty string).
    """
    # Missing values arrive as None/NaN, which have no .lower() to call.
    if not isinstance(text, str):
        return "unknown"
    text = text.lower()

    keyword_labels = [
        ("white", "white"),
        ("black", "black"),
        ("african", "black"),
        ("asian", "asian"),
        ("indian", "asian"),
        ("pakistani", "asian"),
        ("chinese", "asian"),
    ]
    # Only the distinct labels matter, so a set replaces the mention counter.
    # The lookbehind rejects matches that start in the middle of a word.
    labels = {
        label
        for keyword, label in keyword_labels
        if re.search(r"(?<![a-z])" + keyword, text)
    }

    if len(labels) > 1:
        return "mixed"
    if labels:
        return next(iter(labels))
    return "unknown"


def simplify_date_of_birth(date):
    """Reduce a date of birth to the first year of its decade.

    The value is parsed with pandas; anything that cannot be parsed is
    coerced to a null timestamp instead of raising.

    Returns the decade as an integer year (e.g. 1980 for a 1985 date), or
    None when the parsed value is null.
    """
    parsed = pd.to_datetime(date, errors="coerce")
    if not pd.isnull(parsed):
        # Drop the final digit of the year: 1985 -> 1980.
        return parsed.year - parsed.year % 10
    return None

In [None]:
# Column-specific simplifications applied before encoding. Gender is reduced
# to "female" / "male" / "unknown"; ethnicity and date of birth use the helper
# functions defined above.
transformations = {
    "gender": lambda x: (
        x.lower() if x.lower() in ["female", "male"] else "unknown"
    ),
    "ethnicity": simplify_ethnicity,
    "date_of_birth": simplify_date_of_birth,
}

# Columns passed on to the encoder and scorer.
cols = [
    "date_of_birth",
    "gender",
    "ethnicity",
    "disease",
    "symptoms",
    "treatment",
    "prescriptions",
]

# Columns without a dedicated simplification pass through unchanged.
column_transforms = {
    col: transformations.get(col, lambda x: x) for col in cols
}
simplified_dataset = pcm_dataset[cols].transform(column_transforms)
encoded_dataset, lookup = encode(simplified_dataset)

scorer = PrivacyRiskScorer()
population_score = scorer.calculate_population_uniqueness(encoded_dataset)
print("The overall population uniqueness is", population_score)

# Fit on the encoded dataset, then score the same records. The later cells
# filter these scores with boolean masks built from pcm_dataset.
scorer.fit(encoded_dataset)
individual_scores = scorer.predict(encoded_dataset)

From the individual scores, the relationship with unique diseases can be assessed.

In [None]:
# Diseases that occur in exactly one record
disease_counts = pcm_dataset.groupby("disease").size()
rare_disease_list = disease_counts[disease_counts == 1].index.tolist()
print("Diseases mentioned only once", len(rare_disease_list))

# Split the individual scores by whether the record's disease is rare
has_rare_disease = pcm_dataset.disease.isin(rare_disease_list)
rare_disease_scores = individual_scores[has_rare_disease]
common_disease_scores = individual_scores[~has_rare_disease]

print(
    "Average score for records mentioning a rare disease",
    rare_disease_scores.mean(),
)
print(
    "Average score for records with more common diseases",
    common_disease_scores.mean(),
)

# Evaluate both density estimates on the same 41-point grid over [0, 1]
score_grid = np.linspace(0, 1, 41)
fig, ax = plt.subplots(1, 1)
rare_disease_scores.plot.kde(ax=ax, ind=score_grid, label="Rare")
common_disease_scores.plot.kde(ax=ax, ind=score_grid, label="Common")
ax.set_yscale("symlog")
ax.set_ylim(0, 1000)
ax.legend()
sns.despine()
ax.set_title("Relationship between rare diseases and risk score")
plt.show()

## Rare symptoms

A similar analysis can be repeated for symptoms.

In [None]:
# Symptoms that occur in exactly one record
symptom_counts = pcm_dataset.groupby("symptoms").size()
rare_symptom_list = symptom_counts[symptom_counts == 1].index.tolist()
print("Symptoms mentioned only once", len(rare_symptom_list))

# Build the mask once and split the individual scores by it
has_rare_symptom = pcm_dataset.symptoms.isin(rare_symptom_list)
rare_symptom_scores = individual_scores[has_rare_symptom]
common_symptom_scores = individual_scores[~has_rare_symptom]

print(
    "Average score for records mentioning a rare symptom",
    rare_symptom_scores.mean(),
)
print(
    "Average score for records with more common symptoms",
    common_symptom_scores.mean(),
)

# Evaluate both density estimates on the same 41-point grid over [0, 1]
score_grid = np.linspace(0, 1, 41)
fig, ax = plt.subplots(1, 1)
rare_symptom_scores.plot.kde(ax=ax, ind=score_grid, label="Rare")
common_symptom_scores.plot.kde(ax=ax, ind=score_grid, label="Common")
ax.set_yscale("symlog")
ax.set_ylim(0, 1000)
ax.legend()
sns.despine()
# Titled like the disease and combination figures so the plot stands alone
ax.set_title("Relationship between rare symptoms and risk score")
plt.show()

## Rare treatments

A similar analysis can be repeated for treatments.

In [None]:
# Treatments that occur in exactly one record
treatment_counts = pcm_dataset.groupby("treatment").size()
rare_treatment_list = treatment_counts[treatment_counts == 1].index.tolist()
print("Treatments mentioned only once", len(rare_treatment_list))

# Build the mask once and split the individual scores by it
has_rare_treatment = pcm_dataset.treatment.isin(rare_treatment_list)
rare_treatment_scores = individual_scores[has_rare_treatment]
common_treatment_scores = individual_scores[~has_rare_treatment]

print(
    "Average score for records mentioning a rare treatment",
    rare_treatment_scores.mean(),
)
print(
    "Average score for records with more common treatments",
    common_treatment_scores.mean(),
)

# Evaluate both density estimates on the same 41-point grid over [0, 1]
score_grid = np.linspace(0, 1, 41)
fig, ax = plt.subplots(1, 1)
rare_treatment_scores.plot.kde(ax=ax, ind=score_grid, label="Rare")
common_treatment_scores.plot.kde(ax=ax, ind=score_grid, label="Common")
ax.set_yscale("symlog")
ax.set_ylim(0, 1000)
ax.legend()
sns.despine()
# Titled like the disease and combination figures so the plot stands alone
ax.set_title("Relationship between rare treatments and risk score")
plt.show()

## Rare prescriptions

A similar analysis can be repeated for prescriptions.

In [None]:
# Prescriptions that occur in exactly one record
prescription_counts = pcm_dataset.groupby("prescriptions").size()
rare_prescriptions_list = prescription_counts[
    prescription_counts == 1
].index.tolist()
print("Prescriptions mentioned only once", len(rare_prescriptions_list))

# Build the mask once and split the individual scores by it
has_rare_prescription = pcm_dataset.prescriptions.isin(
    rare_prescriptions_list
)
rare_prescription_scores = individual_scores[has_rare_prescription]
common_prescription_scores = individual_scores[~has_rare_prescription]

print(
    "Average score for records mentioning a rare prescription",
    rare_prescription_scores.mean(),
)
print(
    "Average score for records with more common prescriptions",
    common_prescription_scores.mean(),
)

# Evaluate both density estimates on the same 41-point grid over [0, 1]
score_grid = np.linspace(0, 1, 41)
fig, ax = plt.subplots(1, 1)
rare_prescription_scores.plot.kde(ax=ax, ind=score_grid, label="Rare")
common_prescription_scores.plot.kde(ax=ax, ind=score_grid, label="Common")
ax.set_yscale("symlog")
ax.set_ylim(0, 1000)
ax.legend()
sns.despine()
# Titled like the disease and combination figures so the plot stands alone
ax.set_title("Relationship between rare prescriptions and risk score")
plt.show()

## Combinations

Combinations can also be investigated.

In [None]:
# A record is "rare" here when at least one of its disease, symptoms,
# treatment or prescriptions values appears only once in the dataset. The
# mask is built once and reused for both the averages and the plot.
has_rare_identifier = (
    pcm_dataset.disease.isin(rare_disease_list)
    | pcm_dataset.symptoms.isin(rare_symptom_list)
    | pcm_dataset.treatment.isin(rare_treatment_list)
    | pcm_dataset.prescriptions.isin(rare_prescriptions_list)
)
rare_identifier_scores = individual_scores[has_rare_identifier]
common_identifier_scores = individual_scores[~has_rare_identifier]

# Report the size of the combined group. The previous message repeated the
# prescription-only count from the cell above.
print(
    "Records mentioning at least one rare identifier",
    has_rare_identifier.sum(),
)

print(
    "Average score for records mentioning a rare disease, symptom, treatment or prescription",
    rare_identifier_scores.mean(),
)
print(
    "Average score for records with more common diseases, symptoms, treatments and prescriptions",
    common_identifier_scores.mean(),
)

# Evaluate both density estimates on the same 41-point grid over [0, 1]
score_grid = np.linspace(0, 1, 41)
fig, ax = plt.subplots(1, 1)
rare_identifier_scores.plot.kde(ax=ax, ind=score_grid, label="Rare")
common_identifier_scores.plot.kde(ax=ax, ind=score_grid, label="Common")
ax.set_yscale("symlog")
ax.set_ylim(0, 1000)
ax.legend()
sns.despine()
ax.set_title("Relationship between rare identifiers and risk score")
plt.show()