
# IRRS — Synthetic Data Reidentification Risk Score

This notebook implements the IRRS metric with **improved notebook output**, including:
- structured configuration table
- risk score table
- recommendations in Markdown


In [None]:
import math
import pandas as pd
from IPython.display import display, Markdown

# =========================
#  Normalization 0–1
# =========================

def sensitivity_score(cfg):
    """Map the configured training-data sensitivity label to a 0-1 score.

    Reads ``cfg["training_data_sensitivity"]``. ``"low"`` scores 0.2 and
    ``"moderate"`` scores 0.6; every other value is scored as high
    sensitivity (1.0).
    """
    level = cfg["training_data_sensitivity"]
    for label, score in (("low", 0.2), ("moderate", 0.6)):
        if level == label:
            return score
    # Anything that is not explicitly low/moderate is treated as "high".
    return 1.0


def model_complexity_score(cfg):
    """Score the generator type named in ``cfg["model_type"]`` on a 0-1 scale.

    ``gan``, ``vae``, ``diffusion`` and ``autoregressive`` score 1.0;
    ``statistical`` and ``simple_noise`` score 0.2; any other model type
    receives the middle value 0.5.
    """
    model = cfg["model_type"]
    scored_high = {"gan", "vae", "diffusion", "autoregressive"}
    scored_low = {"statistical", "simple_noise"}

    if model in scored_high:
        score = 1.0
    elif model in scored_low:
        score = 0.2
    else:
        # Unlisted model types fall back to the mid-range score.
        score = 0.5
    return score


def quasi_identifier_score(cfg, max_qi=10):
    """Normalize the number of quasi-identifiers to the range [0, 1].

    Args:
        cfg: Configuration mapping; ``cfg["n_quasi_identifiers"]`` is read.
        max_qi: Count at (or above) which the score saturates at 1.0.

    Returns:
        ``n_quasi_identifiers / max_qi`` clamped to [0.0, 1.0].

    Raises:
        KeyError: If ``n_quasi_identifiers`` is missing from ``cfg``.
        ZeroDivisionError: If ``max_qi`` is 0.
    """
    ratio = cfg["n_quasi_identifiers"] / max_qi
    # Clamp on both sides (like dataset_size_score does) so that a negative
    # count cannot produce a negative component in the weighted risk sums.
    return max(0.0, min(1.0, ratio))


def rare_records_score(cfg):
    """Return the proportion of rare records as a score in [0, 1].

    Reads ``cfg["prop_rare_records"]`` and converts it with ``float``.
    The result is clamped to [0.0, 1.0] so that an out-of-range input
    (for example a percentage such as 12 entered instead of 0.12) cannot
    push this component outside the normalized range used by the other
    scores.

    Raises:
        KeyError: If ``prop_rare_records`` is missing from ``cfg``.
        ValueError / TypeError: If the value cannot be converted to float.
    """
    proportion = float(cfg["prop_rare_records"])
    return max(0.0, min(1.0, proportion))


def mode_score(cfg):
    """Score the release mode given in ``cfg["synthetic_mode"]``.

    ``"row_level"`` scores 1.0; every other mode scores 0.2.
    """
    if cfg["synthetic_mode"] == "row_level":
        return 1.0
    return 0.2


def dataset_size_score(cfg):
    """Turn the record count into a 0-1 score that shrinks as data grows.

    ``cfg["n_records"]`` is converted with ``int`` and floored at 1. The
    score is ``1 - log10(N) / 6``, limited to [0.0, 1.0]: one record gives
    1.0 and a million or more records give 0.0.
    """
    record_count = int(cfg["n_records"])
    if record_count < 1:
        # log10 is undefined for non-positive values; treat them as 1 record.
        record_count = 1

    raw = 1.0 - (math.log10(record_count) / 6.0)

    if raw < 0.0:
        return 0.0
    if raw > 1.0:
        return 1.0
    return raw


def privacy_protection_score(cfg):
    """Combine the reported privacy techniques into one 0-1 protection score.

    Each known technique has an effectiveness ``e``; the combined score is
    ``1 - prod(1 - e)`` over the distinct techniques listed in
    ``cfg["privacy_techniques"]``. Unknown technique names contribute
    nothing.

    Input handling:
      * a missing key, ``None`` or an empty list means "no techniques" (0.0);
      * a single technique given as a bare string is treated as a
        one-element list instead of being iterated character by character;
      * a technique listed more than once is counted once, so repeating a
        name cannot inflate the protection score.

    Args:
        cfg: Configuration mapping; ``privacy_techniques`` is optional.

    Returns:
        Protection score as a float in [0.0, 1.0).
    """
    techniques = cfg.get("privacy_techniques") or []
    if isinstance(techniques, str):
        techniques = [techniques]

    eff_map = {
        "differential_privacy": 0.6,
        "k_anonymity": 0.4,
        "l_diversity": 0.3,
        "t_closeness": 0.3,
        "aggregation_only": 0.5,
        "generalization": 0.25,
        "suppression": 0.2,
        "noise_addition": 0.2,
    }

    # Residual (unprotected) fraction left after applying every technique.
    residual = 1.0
    # dict.fromkeys removes duplicates while keeping the original order.
    for technique in dict.fromkeys(techniques):
        residual *= (1.0 - eff_map.get(technique, 0.0))

    return 1.0 - residual


def score_membership_inference_irrs(cfg):
    """Compute the membership-inference risk score on a 0-100 scale.

    The score is a weighted combination of the normalized sensitivity,
    model complexity, quasi-identifier, rare-record and release-mode
    scores, reduced by the privacy-protection score. The raw value is
    limited to [0, 1] and then multiplied by 100.
    """
    # (weight, component) pairs; the negative weight is the protection credit.
    weighted_components = (
        (0.35, sensitivity_score(cfg)),
        (0.20, model_complexity_score(cfg)),
        (0.15, quasi_identifier_score(cfg)),
        (0.15, rare_records_score(cfg)),
        (0.10, mode_score(cfg)),
        (-0.25, privacy_protection_score(cfg)),
    )

    # Plain left-to-right accumulation keeps the float result identical to
    # writing the weighted sum out as a single expression.
    raw = 0.0
    for weight, component in weighted_components:
        raw += weight * component

    bounded = max(0.0, min(1.0, raw))
    return 100 * bounded


def score_attribute_inference_irrs(cfg):
    """Compute the attribute-inference risk score on a 0-100 scale.

    The score is a weighted combination of the normalized sensitivity,
    model complexity, quasi-identifier and rare-record scores, reduced by
    the privacy-protection score. The raw value is limited to [0, 1] and
    then multiplied by 100.
    """
    # (weight, component) pairs; the negative weight is the protection credit.
    weighted_components = (
        (0.30, sensitivity_score(cfg)),
        (0.25, model_complexity_score(cfg)),
        (0.25, quasi_identifier_score(cfg)),
        (0.10, rare_records_score(cfg)),
        (-0.30, privacy_protection_score(cfg)),
    )

    # Plain left-to-right accumulation keeps the float result identical to
    # writing the weighted sum out as a single expression.
    raw = 0.0
    for weight, component in weighted_components:
        raw += weight * component

    bounded = max(0.0, min(1.0, raw))
    return 100 * bounded


def score_singling_out_irrs(cfg):
    """Compute the singling-out risk score on a 0-100 scale.

    The score is a weighted combination of the normalized quasi-identifier,
    rare-record, dataset-size and release-mode scores, reduced by the
    privacy-protection score. The raw value is limited to [0, 1] and then
    multiplied by 100.
    """
    # (weight, component) pairs; the negative weight is the protection credit.
    weighted_components = (
        (0.30, quasi_identifier_score(cfg)),
        (0.30, rare_records_score(cfg)),
        (0.20, dataset_size_score(cfg)),
        (0.10, mode_score(cfg)),
        (-0.25, privacy_protection_score(cfg)),
    )

    # Plain left-to-right accumulation keeps the float result identical to
    # writing the weighted sum out as a single expression.
    raw = 0.0
    for weight, component in weighted_components:
        raw += weight * component

    bounded = max(0.0, min(1.0, raw))
    return 100 * bounded


def global_risk_category(score):
    """Translate a 0-100 risk score into a category label.

    Scores below 30 are ``"Low"``, scores below 60 are ``"Moderate"`` and
    everything else is ``"High"``.
    """
    # Exclusive upper bounds, checked in ascending order.
    for upper_bound, label in ((30, "Low"), (60, "Moderate")):
        if score < upper_bound:
            return label
    return "High"


def compute_irrs(cfg):
    """Compute the three partial risk scores and the global IRRS.

    Returns a dict with the membership (``R_mem``), attribute (``R_attr``)
    and singling-out (``R_single``) scores, the global ``IRRS`` obtained by
    weighting them 0.4 / 0.3 / 0.3, and the ``category`` label derived from
    the global score.
    """
    membership = score_membership_inference_irrs(cfg)
    attribute = score_attribute_inference_irrs(cfg)
    singling_out = score_singling_out_irrs(cfg)

    overall = 0.4 * membership + 0.3 * attribute + 0.3 * singling_out

    report = {
        "R_mem": membership,
        "R_attr": attribute,
        "R_single": singling_out,
        "IRRS": overall,
    }
    report["category"] = global_risk_category(overall)
    return report


def generate_recommendations_simple(result, cfg):
    """Build Markdown bullet recommendations from the scores and the config.

    Args:
        result: Mapping with the partial scores ``R_mem``, ``R_attr`` and
            ``R_single`` (0-100).
        cfg: Configuration mapping. ``training_data_sensitivity`` is
            required; ``privacy_techniques`` is optional and may be
            ``None``, a list of names, or a single name as a string.

    Returns:
        A non-empty list of strings, each a Markdown bullet (``"- ..."``).
        When no rule fires, a single "risk is controlled" bullet is
        returned.

    Raises:
        KeyError: If a required key is missing from ``result`` or ``cfg``.
    """
    recs = []

    # Normalize the technique list: a missing key or None means "none
    # reported", and a bare string is one technique, not a character sequence.
    techniques = cfg.get("privacy_techniques") or []
    if isinstance(techniques, str):
        techniques = [techniques]

    sens = cfg["training_data_sensitivity"]
    # sensitivity_score() scores every label other than "low"/"moderate" as
    # high sensitivity; use the same rule here so the advice matches the score.
    highly_sensitive = sens not in ("low", "moderate")

    if result["R_mem"] >= 60:
        recs.append(
            "- High membership inference risk: consider formal mechanisms such as differential privacy or stronger regularization."
        )

    if result["R_attr"] >= 60:
        recs.append(
            "- High attribute inference risk: review quasi-identifiers and apply generalization/suppression to reduce predictability."
        )

    if result["R_single"] >= 60:
        recs.append(
            "- High singling-out risk: reduce the proportion of rare records and evaluate k-anonymity-like guarantees."
        )

    if highly_sensitive and "differential_privacy" not in techniques:
        recs.append(
            "- Training data are highly sensitive and no differential privacy was reported: consider introducing at least one formal PET."
        )

    if not techniques:
        recs.append(
            "- No explicit privacy-enhancing techniques were reported: consider a privacy-by-design strategy and document safeguards."
        )

    if not recs:
        recs.append(
            "- IRRS suggests relatively controlled reidentification risk under the current assumptions. Periodic review is still recommended."
        )

    return recs


def show_irrs_report(result, cfg):
    """Render the IRRS report in the notebook output.

    Displays, in order: a title, a table with the input configuration, a
    table with the three partial scores and the global IRRS (rounded to one
    decimal, with the category shown on the global row), and the
    recommendations as a Markdown bullet list.

    Args:
        result: Mapping with ``R_mem``, ``R_attr``, ``R_single``, ``IRRS``
            and ``category``, as returned by ``compute_irrs``.
        cfg: The configuration mapping the scores were computed from.
            ``privacy_techniques`` is optional and may be ``None``, a list
            of names, or a single name as a string.

    Returns:
        None. Output is produced through ``IPython.display.display``.
    """
    display(Markdown("## 🧮 IRRS — Synthetic Data Reidentification Risk Score"))

    # The scoring functions read privacy_techniques with .get(), so the key
    # is optional; do not fail here on a config they accepted.
    techniques = cfg.get("privacy_techniques") or []
    if isinstance(techniques, str):
        techniques = [techniques]
    techniques_text = (
        ", ".join(str(t) for t in techniques) if techniques else "(none)"
    )

    cfg_table = pd.DataFrame(
        {
            "Parameter": [
                "n_records",
                "n_quasi_identifiers",
                "prop_rare_records",
                "privacy_techniques",
                "training_data_sensitivity",
                "model_type",
                "synthetic_mode",
            ],
            "Value": [
                cfg["n_records"],
                cfg["n_quasi_identifiers"],
                cfg["prop_rare_records"],
                techniques_text,
                cfg["training_data_sensitivity"],
                cfg["model_type"],
                cfg["synthetic_mode"],
            ],
        }
    )

    display(Markdown("### Input configuration"))
    display(cfg_table)

    risk_table = pd.DataFrame(
        {
            "Risk Dimension": [
                "Membership inference (R_mem)",
                "Attribute inference (R_attr)",
                "Singling-out (R_single)",
                "Global IRRS",
            ],
            "Score (0–100)": [
                round(result["R_mem"], 1),
                round(result["R_attr"], 1),
                round(result["R_single"], 1),
                round(result["IRRS"], 1),
            ],
            # Only the global score is categorized; partial rows stay blank.
            "Category": [
                "",
                "",
                "",
                result["category"],
            ],
        }
    )

    display(Markdown("### Risk scores"))
    display(risk_table)

    recs = generate_recommendations_simple(result, cfg)
    rec_md = "### Recommendations\n" + "\n".join(recs)
    display(Markdown(rec_md))



## ▶️ Example usage

Edit the configuration below and run the cell to compute the IRRS score.


In [None]:
# Example configuration; edit the values and re-run the cell.
config = dict(
    n_records=50_000,                # number of training records
    n_quasi_identifiers=6,           # score saturates at 10 by default
    prop_rare_records=0.12,          # proportion of rare records
    privacy_techniques=["k_anonymity", "noise_addition"],
    training_data_sensitivity="high",  # "low", "moderate", otherwise high
    model_type="gan",                # e.g. "gan", "vae", "statistical"
    synthetic_mode="row_level",      # "row_level" scores highest
)

# Compute the scores, then render the report in the notebook output.
result = compute_irrs(config)
show_irrs_report(result, config)