# 06 - Investigating the effect of dataset size



In [None]:
import json
import os
from collections import defaultdict

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

In [None]:
from privacy_fingerprint.common.config import (
    load_experiment_config,
    load_experiment_config_from_file,
    load_global_config_from_file,
)

# Example config files are available in the configs directory.
# They will need to be modified with the path to the Julia executable

# Load the global and experiment configuration from the YAML files.
load_global_config_from_file("../configs/global_config.yaml")
load_experiment_config_from_file("../configs/experiment_config.yaml")

# Fetch the loaded experiment config and override the scoring encoding
# scheme to "rarest" for this notebook.
experiment_config = load_experiment_config()
experiment_config.scoring.encoding_scheme = "rarest"

In [None]:
import privacy_fingerprint.extract.aws_comprehend as aws
from privacy_fingerprint.score import PrivacyRiskScorer, encode, preprocess

In [None]:
# The dataset will be loaded from the directory created in notebook 2.
output_dir = "../experiments/02_generate_dataset"


def _load_json(filename):
    """Return the parsed contents of the JSON file ``filename`` in ``output_dir``."""
    # Read as UTF-8 explicitly instead of relying on the platform's locale
    # encoding, so the notebook behaves the same on every operating system.
    with open(os.path.join(output_dir, filename), encoding="utf-8") as fp:
        return json.load(fp)


synthea_records = _load_json("synthea_dataset.json")
llm_results = _load_json("llm_dataset.json")
ner_records = _load_json("ner_dataset.json")

In [None]:
# The format of the NER records must be standardised to enable scoring.
# The AWS Comprehend helper is given its default identifier list together
# with the NER records loaded from disk.
common_ner_results = aws.prepare_common_records(
    aws.DEFAULT_IDENTIFIERS, ner_records
)

In [None]:
# Preprocess the standardised records for scoring.
# NOTE(review): later cells use `.index` and `.loc[rows, cols]` on the
# result, so it is presumably a pandas DataFrame - confirm against `preprocess`.
pcm_dataset = preprocess(common_ner_results)

In [None]:
def simplify_ethnicity(text):
    """Collapse a free-text ethnicity description into a coarse category.

    Args:
        text: Free-text ethnicity description. Matching is case-insensitive.

    Returns:
        One of "white", "black", "asian", "mixed" (more than one category
        is mentioned) or "unknown" (empty, non-string or unrecognised input).
    """
    # Imported locally because the file does not import ``re`` at the top.
    import re

    # Missing values (None, NaN) carry no information, like the empty string.
    if not isinstance(text, str):
        return "unknown"
    text = text.lower()

    keyword_labels = (
        ("white", "white"),
        ("caucasian", "white"),
        ("black", "black"),
        ("african", "black"),
        ("asian", "asian"),
        ("indian", "asian"),
        ("pakistani", "asian"),
        ("chinese", "asian"),
    )
    # A keyword only counts when it starts a word: plain substring matching
    # found "asian" inside "caucasian" and mislabelled those records.
    labels = {
        label
        for keyword, label in keyword_labels
        if re.search(r"(?<![a-z])" + keyword, text)
    }
    if len(labels) > 1:
        return "mixed"
    if labels:
        return next(iter(labels))
    return "unknown"


def simplify_date_of_birth(date):
    """Reduce a date of birth to the decade it falls in.

    Args:
        date: Anything ``pd.to_datetime`` accepts; unparseable values are
            coerced to a missing timestamp rather than raising.

    Returns:
        The first year of the decade (e.g. 1980 for 1985-03-12), or ``None``
        when the value is missing or cannot be parsed.
    """
    parsed = pd.to_datetime(date, errors="coerce")
    # Dropping the last digit of the year rounds down to the decade.
    return None if pd.isnull(parsed) else parsed.year - parsed.year % 10

In [None]:
# This cell can be very slow to run: it encodes and scores the dataset
# 10 repeats x 5 dataset sizes = 50 times.


def _simplify_gender(value):
    """Return the lower-cased gender if it is female/male, else "unknown"."""
    lowered = value.lower()
    return lowered if lowered in ["female", "male"] else "unknown"


def _identity(value):
    """Return ``value`` unchanged (used for columns with no simplifier)."""
    return value


# Per-column simplifiers that coarsen the demographic columns before encoding.
transformations = {
    "gender": _simplify_gender,
    "ethnicity": simplify_ethnicity,
    "date_of_birth": simplify_date_of_birth,
}

cols = [
    "date_of_birth",
    "gender",
    "ethnicity",
    "disease",
    "symptoms",
    "treatment",
    "prescriptions",
]

# One transformer per selected column, built once outside the loops;
# columns without a dedicated simplifier pass through unchanged.
column_transformers = {
    col: transformations.get(col, _identity) for col in cols
}

size_results = []
for repeat in range(10):
    # A fresh random ordering of the records for every repeat.
    idx = pcm_dataset.index.tolist()
    np.random.shuffle(idx)
    for dataset_size in (1000, 750, 500, 250, 100):
        # Every size takes a prefix of the same shuffled order, so each
        # smaller sample is contained in all larger ones of this repeat.
        sample = pcm_dataset.loc[idx[:dataset_size], cols]
        encoded_dataset, lookup = encode(sample.transform(column_transformers))

        scorer = PrivacyRiskScorer()
        population_score = scorer.calculate_population_uniqueness(
            encoded_dataset
        )
        print(population_score)

        # Fit on the sample, then score the same records individually.
        scorer.fit(encoded_dataset)
        individual_scores = scorer.predict(encoded_dataset)
        size_results.append(
            {
                "repeat": repeat,
                "size": dataset_size,
                "population_score": population_score,
                "individual_scores": individual_scores,
            }
        )

In [None]:
import joblib

# Persist the scoring results next to the dataset in `output_dir`.
# NOTE(review): presumably saved so the slow scoring cell does not have to
# be re-run before the analysis below - confirm intended workflow.
joblib.dump(
    size_results,
    os.path.join(output_dir, "dataset_size_score_comparison.joblib"),
)

In [None]:
# Reload the results written by the `joblib.dump` call on the same path.
# NOTE: joblib files are pickle-based, so only load files you created or
# otherwise trust.
size_results = joblib.load(
    os.path.join(output_dir, "dataset_size_score_comparison.joblib")
)

In [None]:
# One row per (repeat, size) run; the per-record scores are dropped so only
# the scalar summary columns remain.
pop_comparison = pd.DataFrame(size_results).drop(columns="individual_scores")

In [None]:
# Box plot of the population score across repeats, one box per dataset size.
fig, ax = plt.subplots()
pop_comparison.plot.box(column="population_score", by="size", ax=ax)
ax.set(ylabel="Score", xlabel="Dataset size")
ax.set_title("Population privacy score")

In [None]:
# The scores for the same 100 records are compared across every dataset size.
# For each repeat, the records of the size-100 run are looked up in the
# individual scores of every run of that repeat; their score distribution is
# plotted and its median and mean are recorded.

repeats = sorted({result["repeat"] for result in size_results})
sampling_summary = []
for r in repeats:
    fig, ax = plt.subplots(1, 1)
    repeat_results = {i["size"]: i for i in size_results if i["repeat"] == r}
    print(len(repeat_results))
    # Index of the records in the size-100 run; it is the same for every
    # size within a repeat, so it is looked up once per repeat.
    reference_index = repeat_results[100]["individual_scores"].index
    for k, result in repeat_results.items():
        repeat_subset = result["individual_scores"].loc[reference_index]
        # Density estimate evaluated at 41 evenly spaced points on [0, 1].
        repeat_subset.plot.kde(
            ax=ax,
            label=str(k),
            ind=np.linspace(0, 1, 41),
        )
        sampling_summary.append(
            {
                "repeat": r,
                "size": k,
                "median": repeat_subset.median(),
                "mean": repeat_subset.mean(),
            }
        )
    # Symmetric-log y axis with fixed limits so the panels of different
    # repeats share the same scale.
    ax.set_yscale("symlog")
    ax.set_ylim(0, 1000)
    ax.legend()
    sns.despine()
    plt.show()

sampling_summary = pd.DataFrame(sampling_summary)
sampling_summary

In [None]:
# Box plot of the "median" column of the sampling summary, grouped by
# dataset size.
sampling_summary.plot.box(by="size", column="median")

In [None]:
# Box plot of the "mean" column of the sampling summary, one box per
# dataset size.
fig, ax = plt.subplots()
sampling_summary.plot.box(column="mean", by="size", ax=ax)
ax.set(ylabel="Score", xlabel="Dataset size")
ax.set_title("Mean individual record privacy score")