In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter

In [None]:
# Load the experiment results table that every later cell works on.
df = pd.read_csv("./formatted_results.csv")

# The "GraphMatchingAttack" value of the row labelled 0 decides how the x-axis
# of all plots is named; the column plotted on that axis is always "Overlap".
gma_enabled_on_dataset = df.at[0, "GraphMatchingAttack"]
if gma_enabled_on_dataset:
    overlap_or_training_proportion = "Overlap"
else:
    overlap_or_training_proportion = "Training Proportion"

In [None]:
# Cast the columns used as plot axes / averaged metrics to float.
for numeric_column in ("Overlap", "TrainedF1", "ReidentificationRate"):
    df[numeric_column] = df[numeric_column].astype(float)
# Baseline precision / recall / Dice values per dataset.
# Each row below is (dataset key, precision, recall, dice) and is expanded into
# the nested mapping {dataset: {"Precision": ..., "Recall": ..., "Dice Coefficient": ...}}.
# Note: these keys carry no ".tsv" suffix, unlike the keys of dataset_map.
baseline_metrics = {
    dataset_key: {"Precision": precision, "Recall": recall, "Dice Coefficient": dice}
    for dataset_key, precision, recall, dice in (
        ("fakename_1k", 0.2162, 0.2476, 0.2300),
        ("fakename_2k", 0.2131, 0.2452, 0.2271),
        ("fakename_5k", 0.2144, 0.2470, 0.2287),
        ("fakename_10k", 0.2151, 0.2467, 0.2289),
        ("fakename_20k", 0.2153, 0.2473, 0.2293),
        ("fakename_50k", 0.2151, 0.2463, 0.2288),
        ("titanic_full", 0.2468, 0.3770, 0.2896),
        ("euro_person", 0.2197, 0.2446, 0.2306),
    )
}

# Encoding identifier (as stored in the "Encoding" column) -> display name.
encoding_map = dict(
    BloomFilter="Bloom Filter",
    TabMinHash="Tabulation Minhash",
    TwoStepHash="Two-Step Hash",
)

# Dataset file name (as stored in the "Dataset" column) -> display name.
dataset_map = dict([
    ("fakename_1k.tsv", "FakeName (1k)"),
    ("fakename_2k.tsv", "FakeName (2k)"),
    ("fakename_5k.tsv", "FakeName (5k)"),
    ("fakename_10k.tsv", "FakeName (10k)"),
    ("fakename_20k.tsv", "FakeName (20k)"),
    ("fakename_50k.tsv", "FakeName (50k)"),
    ("titanic_full.tsv", "Titanic"),
    ("euro_person.tsv", "EuroPerson"),
])

# Canonical plotting order of the dataset files: FakeName by size, then the rest.
dataset_order = list((
    "fakename_1k.tsv", "fakename_2k.tsv", "fakename_5k.tsv",
    "fakename_10k.tsv", "fakename_20k.tsv", "fakename_50k.tsv",
    "titanic_full.tsv",
    "euro_person.tsv",
))
# Dataset file name -> position in dataset_order.
dataset_order_index = dict(zip(dataset_order, range(len(dataset_order))))


def sort_datasets(sequence):
    """Order dataset names by their position in ``dataset_order``.

    Names that are not listed in ``dataset_order`` all share the rank one past
    the last known dataset, so they are placed at the end; because the sort is
    stable they keep the relative order they had in ``sequence``.

    Returns a new list; ``sequence`` itself is not modified.
    """
    unknown_rank = len(dataset_order_index)

    def rank(name):
        return dataset_order_index.get(name, unknown_rank)

    return sorted(sequence, key=rank)



In [None]:
# Metric columns that are averaged in each of the aggregates below.
metric_columns = ["TrainedPrecision", "TrainedRecall", "TrainedF1", "ReidentificationRate"]


def _mean_metrics(group_keys):
    """Average ``metric_columns`` of ``df`` for every combination of ``group_keys``.

    Returns a flat DataFrame in which the group keys are ordinary columns.
    """
    per_group_means = df.groupby(group_keys)[metric_columns].mean()
    return per_group_means.reset_index()


# One row per (encoding, overlap), per (dataset, overlap) and per
# (encoding, overlap, dataset) respectively.
groupedByEncoding = _mean_metrics(["Encoding", "Overlap"])
groupedByDataset = _mean_metrics(["Dataset", "Overlap"])
groupedByEncodingAndDataset = _mean_metrics(["Encoding", "Overlap", "Dataset"])

In [None]:
# Overall summary across all rows of df: mean and maximum of the F1 column
# ("TrainedF1") and of the re-identification rate (printed as "PRR").
f1_scores = df["TrainedF1"]
reid_rates = df["ReidentificationRate"]

overall_avg_f1 = f1_scores.mean()
overall_avg_prr = reid_rates.mean()
print(f"Average F1:  {overall_avg_f1:.4f}")
print(f"Average PRR: {overall_avg_prr:.4f}")

# Index labels of the rows holding the maxima, then the full rows themselves.
best_f1_idx = f1_scores.idxmax()
best_reid_idx = reid_rates.idxmax()
best_f1_exp = df.loc[best_f1_idx]
best_reid_exp = df.loc[best_reid_idx]

print(f"Best F1: {best_f1_exp['TrainedF1']:.4f}")
print(f"Best PRR: {best_reid_exp['ReidentificationRate']:.4f}")

# === AVERAGES ===
# Mean F1 and mean re-identification rate per encoding, reported under the
# column names "AvgF1" and "AvgPRR".
encoding_means = df.groupby("Encoding")[["TrainedF1", "ReidentificationRate"]].mean()
encoding_means.columns = ["AvgF1", "AvgPRR"]
avg_per_encoding = encoding_means.reset_index()

print("\nAverage Per Encoding")
print(avg_per_encoding)



# === BEST PER ENCODING ===
# For every encoding, pick the single row with the highest value of one metric
# and report that row's dataset together with both metrics.
report_columns = ["Encoding", "Dataset", "TrainedF1", "ReidentificationRate"]

# Row with the highest F1 per encoding.
top_f1_rows = df.groupby("Encoding")["TrainedF1"].idxmax()
best_per_encoding = df.loc[top_f1_rows, report_columns]
best_per_encoding = best_per_encoding.rename(
    columns={"TrainedF1": "F1", "ReidentificationRate": "PRRAtBestF1"}
)
best_per_encoding = best_per_encoding.reset_index(drop=True)
print("\nBest Per Encoding (by F1 Score)")
print(best_per_encoding)

# Row with the highest re-identification rate per encoding.
top_reid_rows = df.groupby("Encoding")["ReidentificationRate"].idxmax()
best_per_encoding_reid = df.loc[top_reid_rows, report_columns]
best_per_encoding_reid = best_per_encoding_reid.rename(
    columns={"ReidentificationRate": "PRR", "TrainedF1": "F1AtBestPRR"}
)
best_per_encoding_reid = best_per_encoding_reid.reset_index(drop=True)
print("\nBest Per Encoding (by PRR)")
print(best_per_encoding_reid)



In [None]:
# Mean Dice coefficient ("TrainedF1") against Overlap, one line per encoding
# scheme, with the dataset-averaged baseline Dice as a dashed reference line.
fig, ax = plt.subplots(figsize=(12, 7))

# sorted() returns a plain list for whatever array type unique() hands back;
# an in-place .sort() is only guaranteed to exist on NumPy arrays.
encodings = sorted(groupedByEncoding['Encoding'].unique())

colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # Blue, Orange, Green

for i, encoding in enumerate(encodings):
    encoding_data = groupedByEncoding[groupedByEncoding['Encoding'] == encoding]

    # Print (x, y) pairs
    print(f"\nEncoding: {encoding_map.get(encoding, encoding)}")
    for x, y in zip(encoding_data['Overlap'], encoding_data['TrainedF1']):
        print(f"({x}, {y})")

    # Plot the line. The palette index wraps around so that a fourth encoding
    # reuses a color instead of raising IndexError (same as the dataset plots).
    ax.plot(encoding_data['Overlap'], encoding_data['TrainedF1'],
            marker='o',
            color=colors[i % len(colors)],
            linewidth=2.5,
            markersize=8,
            markeredgecolor="white",
            label=encoding_map.get(encoding, encoding),
            alpha=0.8)

# Add baseline Dice Coefficient (averaged across all datasets) as a horizontal line
baseline_f1s = [v['Dice Coefficient'] for v in baseline_metrics.values()]
avg_baseline_f1 = sum(baseline_f1s) / len(baseline_f1s)
ax.axhline(y=avg_baseline_f1, color='black', linestyle='--', linewidth=2, label=f'Baseline (avg. Dice = {avg_baseline_f1:.3f})')

# Customize the plot
ax.set_xlabel(overlap_or_training_proportion, fontsize=14, fontweight='bold')
# PercentFormatter(1.0) renders an x value of 1.0 as "100%".
ax.xaxis.set_major_formatter(PercentFormatter(1.0))
ax.set_ylabel('Dice Coefficient', fontsize=14, fontweight='bold')
ax.set_title(f"Dice Coefficient on Unseen Records vs {overlap_or_training_proportion} by Encoding Scheme", fontsize=16, fontweight='bold', pad=20)
# Legend outside the plot
ax.legend(fontsize=11, loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)

# Grid improvements
ax.grid(True, alpha=0.2, linestyle="--")

ax.set_ylim(0, 1)

# Improve layout
fig.tight_layout()
plt.show()

In [None]:
# Mean re-identification rate against Overlap, one line per encoding scheme.
fig, ax = plt.subplots(figsize=(12, 7))

# Get unique encoding schemes in sorted order, so that each encoding gets the
# same color as in the Dice-by-encoding plot regardless of row order.
encodings = sorted(groupedByEncoding['Encoding'].unique())

# Create a color palette for the different encoding schemes
colors = ['#1f77b4', '#ff7f0e', '#2ca02c']  # Blue, Orange, Green

for i, encoding in enumerate(encodings):
    # Filter data for this encoding
    encoding_data = groupedByEncoding[groupedByEncoding['Encoding'] == encoding]

    # Print (x, y) pairs (the rate is printed as stored, not multiplied by 100)
    print(f"\nEncoding: {encoding_map.get(encoding, encoding)}")
    for x, y in zip(encoding_data['Overlap'], encoding_data['ReidentificationRate']):
        print(f"({x}, {y})")

    # Plot the line; the rate is scaled by 100 to match the percent y-axis.
    # The palette index wraps around so that a fourth encoding reuses a color
    # instead of raising IndexError (same as the dataset plots).
    ax.plot(encoding_data['Overlap'], encoding_data['ReidentificationRate'] * 100,
            marker='o',
            color=colors[i % len(colors)],
            markeredgecolor="white",   # makes markers clearer
            linewidth=2.5,
            markersize=8,
            label=encoding_map.get(encoding, encoding),
            alpha=0.8)

# Customize the plot
ax.set_xlabel(overlap_or_training_proportion, fontsize=14, fontweight='bold')
ax.set_ylabel('Perfect Re-Identification Rate', fontsize=14, fontweight='bold')
# y values are already multiplied by 100 (default formatter scale); x values
# are formatted with 1.0 meaning "100%".
ax.yaxis.set_major_formatter(PercentFormatter())
ax.xaxis.set_major_formatter(PercentFormatter(1.0))

ax.set_title(f"Perfect Re-Identification Rate vs {overlap_or_training_proportion} by Encoding Scheme", fontsize=16, fontweight='bold', pad=20)
# Legend outside the plot
ax.legend(fontsize=11, loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)

# Grid improvements
ax.grid(True, alpha=0.2, linestyle="--")

# Improve layout
fig.tight_layout()
plt.show()

In [None]:
# Mean Dice coefficient ("TrainedF1") against Overlap, one line per dataset,
# with the dataset-averaged baseline Dice as a dashed reference line.
fig, ax = plt.subplots(figsize=(12, 7))

colors = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
    "#9467bd", "#8c564b", "#e377c2", "#7f7f7f"
]

# Datasets present in the aggregate, in canonical plotting order.
datasets = sort_datasets(groupedByDataset['Dataset'].unique())

for i, dataset in enumerate(datasets):
    dataset_data = groupedByDataset[groupedByDataset['Dataset'] == dataset]
    overlaps = dataset_data['Overlap']
    dice_scores = dataset_data['TrainedF1']

    # Dump the plotted points as text as well.
    print(f"\nDataset: {dataset}")
    for x, y in zip(overlaps, dice_scores):
        print(f"({x}, {y})")

    ax.plot(overlaps, dice_scores,
            marker='o',
            markersize=7,
            markeredgecolor="white",   # white rim keeps overlapping markers readable
            linewidth=2.5,
            color=colors[i % len(colors)],
            label=dataset_map.get(dataset, dataset),
            alpha=0.9)

# Axis labels and title
ax.set_xlabel(overlap_or_training_proportion, fontsize=14, fontweight='bold')
ax.set_ylabel('Dice Coefficient', fontsize=14, fontweight='bold')
ax.set_title(f"Dice Coefficient on Unseen Records vs {overlap_or_training_proportion} by Dataset", fontsize=16, fontweight='bold', pad=20)

# Baseline: Dice averaged over every entry of baseline_metrics.
baseline_f1s = [v['Dice Coefficient'] for v in baseline_metrics.values()]
avg_baseline_f1 = sum(baseline_f1s) / len(baseline_f1s)
ax.axhline(y=avg_baseline_f1, color='black', linestyle='--', linewidth=2, label=f'Baseline (avg. Dice = {avg_baseline_f1:.3f})')

# Legend sits to the right of the axes; it is created after the baseline so
# the baseline entry is included.
ax.legend(fontsize=11, loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)
ax.xaxis.set_major_formatter(PercentFormatter(1.0))
ax.grid(True, alpha=0.2, linestyle="--")
ax.set_ylim(0, 1)

fig.tight_layout()
plt.show()

In [None]:
# Mean re-identification rate against Overlap, one line per dataset.
fig, ax = plt.subplots(figsize=(12, 7))

# Datasets present in the aggregate, in canonical plotting order.
datasets = sort_datasets(groupedByDataset['Dataset'].unique())

colors = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
    "#9467bd", "#8c564b", "#e377c2", "#7f7f7f"
]

for i, dataset in enumerate(datasets):
    dataset_data = groupedByDataset[groupedByDataset['Dataset'] == dataset]

    # The rate is scaled by 100 to match the percent y-axis below.
    ax.plot(dataset_data['Overlap'], dataset_data['ReidentificationRate'] * 100,
            marker='o',
            markersize=7,
            markeredgecolor="white",   # makes markers clearer
            linewidth=2.5,
            color=colors[i % len(colors)],
            # Display name instead of the raw file name, as in the
            # Dice-by-dataset plot; unknown datasets fall back to the raw name.
            label=dataset_map.get(dataset, dataset),
            alpha=0.9)

# Labels & title
ax.set_xlabel(overlap_or_training_proportion, fontsize=14, fontweight='bold')
ax.set_ylabel('Perfect Re-Identification Rate', fontsize=14, fontweight='bold')
# Same axis formatting as the by-encoding PRR plot: y values are already
# multiplied by 100, x values are formatted with 1.0 meaning "100%".
ax.yaxis.set_major_formatter(PercentFormatter())
ax.xaxis.set_major_formatter(PercentFormatter(1.0))

ax.set_title(f"Perfect Re-Identification Rate vs {overlap_or_training_proportion} by Dataset", fontsize=16, fontweight='bold', pad=20)

# Legend outside the plot
ax.legend(fontsize=11, loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)

# Grid improvements
ax.grid(True, alpha=0.2, linestyle="--")

fig.tight_layout()
plt.show()

In [None]:
# One figure per encoding scheme: mean Dice coefficient ("TrainedF1") against
# Overlap, with one line per dataset.

# Get unique encoding schemes. sorted() returns a plain list for whatever
# array type unique() hands back; an in-place .sort() is only guaranteed to
# exist on NumPy arrays.
encodings = sorted(groupedByEncodingAndDataset['Encoding'].unique())

# Palette is loop-invariant, so it is defined once for all figures.
colors = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
    "#9467bd", "#8c564b", "#e377c2", "#7f7f7f"
]

for encoding in encodings:
    fig, ax = plt.subplots(figsize=(12, 7))
    print("Encoding: " + encoding)
    encoding_label = encoding_map.get(encoding, encoding)
    encoding_data = groupedByEncodingAndDataset[groupedByEncodingAndDataset['Encoding'] == encoding]
    datasets = sort_datasets(encoding_data['Dataset'].unique())

    # A separate index name keeps the inner loop from shadowing an outer one.
    for color_idx, dataset in enumerate(datasets):
        dataset_data = encoding_data[encoding_data['Dataset'] == dataset]

        ax.plot(dataset_data['Overlap'], dataset_data['TrainedF1'],
                marker='o',
                markersize=7,
                markeredgecolor="white",   # makes markers clearer
                linewidth=2.5,
                color=colors[color_idx % len(colors)],
                # Display name instead of the raw file name; unknown datasets
                # fall back to the raw name.
                label=dataset_map.get(dataset, dataset),
                alpha=0.9)

    # Labels & title
    ax.set_xlabel(overlap_or_training_proportion, fontsize=14, fontweight='bold')
    ax.set_ylabel('Dice Coefficient', fontsize=14, fontweight='bold')
    # PercentFormatter(1.0) renders an x value of 1.0 as "100%".
    ax.xaxis.set_major_formatter(PercentFormatter(1.0))

    # The encoding is part of the title so the figures can be told apart
    # without the printed text above them.
    ax.set_title(f"Dice Coefficient vs {overlap_or_training_proportion} by Dataset ({encoding_label})", fontsize=16, fontweight='bold', pad=20)

    # Legend outside the plot
    ax.legend(fontsize=11, loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)

    # Grid improvements
    ax.grid(True, alpha=0.2, linestyle="--")

    fig.tight_layout()
    plt.show()

In [None]:
# One figure per encoding scheme: mean re-identification rate against Overlap,
# with one line per dataset.

# Get unique encoding schemes. sorted() returns a plain list for whatever
# array type unique() hands back; an in-place .sort() is only guaranteed to
# exist on NumPy arrays.
encodings = sorted(groupedByEncodingAndDataset['Encoding'].unique())

# Palette is loop-invariant, so it is defined once for all figures.
colors = [
    "#1f77b4", "#ff7f0e", "#2ca02c", "#d62728",
    "#9467bd", "#8c564b", "#e377c2", "#7f7f7f"
]

for encoding in encodings:
    fig, ax = plt.subplots(figsize=(12, 7))
    print("Encoding: " + encoding)
    encoding_label = encoding_map.get(encoding, encoding)
    encoding_data = groupedByEncodingAndDataset[groupedByEncodingAndDataset['Encoding'] == encoding]
    datasets = sort_datasets(encoding_data['Dataset'].unique())

    # A separate index name keeps the inner loop from shadowing an outer one.
    for color_idx, dataset in enumerate(datasets):
        dataset_data = encoding_data[encoding_data['Dataset'] == dataset]

        # The rate is scaled by 100 to match the percent y-axis below.
        ax.plot(dataset_data['Overlap'], dataset_data['ReidentificationRate'] * 100,
                marker='o',
                markersize=7,
                markeredgecolor="white",   # makes markers clearer
                linewidth=2.5,
                color=colors[color_idx % len(colors)],
                # Display name instead of the raw file name; unknown datasets
                # fall back to the raw name.
                label=dataset_map.get(dataset, dataset),
                alpha=0.9)

    # Labels & title
    ax.set_xlabel(overlap_or_training_proportion, fontsize=14, fontweight='bold')
    ax.set_ylabel('Perfect Re-Identification Rate', fontsize=14, fontweight='bold')
    # y values are already multiplied by 100 (default formatter scale);
    # x values are formatted with 1.0 meaning "100%".
    ax.yaxis.set_major_formatter(PercentFormatter())
    ax.xaxis.set_major_formatter(PercentFormatter(1.0))
    # The encoding is part of the title so the figures can be told apart
    # without the printed text above them.
    ax.set_title(f"Perfect Re-Identification Rate vs {overlap_or_training_proportion} by Dataset ({encoding_label})", fontsize=16, fontweight='bold', pad=20)

    # Legend outside the plot
    ax.legend(fontsize=11, loc='center left', bbox_to_anchor=(1, 0.5), frameon=False)

    # Grid improvements
    ax.grid(True, alpha=0.2, linestyle="--")

    fig.tight_layout()
    plt.show()