# 08 - Common identifiers in rare combinations

How do privacy risk scores react when identifiers are common individually but uncommon together?

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

In [None]:
from privacy_fingerprint.common.config import (
    load_experiment_config_from_file,
    load_global_config_from_file,
)

# Load the global and experiment configuration from YAML files.
# Example config files are available in the configs directory (the relative
# paths below are resolved against the kernel's working directory).
# They will need to be modified with the path to the Julia executable.

load_global_config_from_file("../configs/global_config.yaml")
load_experiment_config_from_file("../configs/experiment_config.yaml")

In [None]:
from privacy_fingerprint.score import PrivacyRiskScorer, encode

We will create a dummy dataset with only two identifiers, each with two values. This simulates a worst-case scenario for common identifiers in rare combinations.

In [None]:
# Two frequent disease/treatment pairings: 13 rows of the first followed by
# 5 rows of the second, indexed 0..17.
common_pair = {"disease": "headache", "treatment": "aspirin"}
less_common_pair = {"disease": "laceration", "treatment": "stitches"}
pcm_dataset = pd.DataFrame([common_pair] * 13 + [less_common_pair] * 5)

# Swap one value in each of the first two rows so that they hold individually
# common values in combinations that occur only once:
#   row 0 -> laceration + aspirin, row 1 -> headache + stitches.
pcm_dataset.at[0, "disease"] = "laceration"
pcm_dataset.at[1, "treatment"] = "stitches"

# Show how many records share each (disease, treatment) combination.
display(pcm_dataset.groupby(["disease", "treatment"]).size())

In [None]:
# Encode the dummy dataset for the scorer. `encode` returns two values;
# presumably the second is a lookup back to the original values (it is not
# referenced again in the cells shown here).
encoded_dataset, lookup = encode(pcm_dataset)
scorer = PrivacyRiskScorer()
# Dataset-level score, requested before the scorer is fitted.
population_score = scorer.calculate_population_uniqueness(encoded_dataset)
# Fit on the encoded records, then score those same records individually.
scorer.fit(encoded_dataset)
individual_scores = scorer.predict(encoded_dataset)
# NOTE(review): presumably the records mapped into the fitted copula space;
# not referenced by the cells visible here - confirm whether it is still needed.
transformed_dataset = scorer.map_records_to_copula(encoded_dataset)

print(population_score)
# Number of records whose individual score exceeds 1e-7.
print((individual_scores > 1e-7).sum())
# Attach to every record the number of records that share its exact
# combination of values; the count lands in a column named "size".
identifier_columns = list(pcm_dataset.columns)
combination_counts = pcm_dataset.groupby(
    identifier_columns, as_index=False
).size()
num_records = pd.merge(
    pcm_dataset,
    combination_counts,
    how="left",
    on=identifier_columns,
)

In [None]:
# Plot each record's privacy risk score against the number of records that
# share its exact combination of values.
sns.set_style("white")
fig, ax = plt.subplots(1, 1)
# Second y-axis on the same record x-axis: the two series have very different
# scales, so each gets its own colour-matched axis.
ax2 = ax.twinx()
individual_scores.plot(ax=ax, color="#c10078")
ax.set_ylabel("Privacy risk score", color="#c10078")
num_records["size"].plot(ax=ax2, color="#4d58e5")
ax2.set_ylabel("Number of similar records", color="#4d58e5")
ax.set_xlabel("Record")
# One tick per record, derived from the data instead of a hard-coded count so
# the axis stays correct if the dummy dataset is resized.
ax.set_xticks(range(len(num_records)))
ax.set_title("Privacy risk score and number of similar records per record")
plt.show()

Here we see that records that appear in the dataset multiple times have very low privacy risk scores (records 2-12, using the zero-based record index shown on the x-axis). Records that appear slightly less frequently but still multiple times have a higher but still low privacy risk score (records 13-17).

However, the records with a rare combination of common identifiers (records 0-1) also have a low score. The rarity of the combination is not reflected in an increased privacy risk score.