In [1]:
#imports and setup

!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset, random_split, Subset

import numpy as np
from collections import defaultdict, Counter
import random

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score, roc_curve
from scipy.stats import entropy

# Set device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("cpu")

Looking in indexes: https://download.pytorch.org/whl/cu121


In [2]:
print(torch.cuda.is_available())

True


In [3]:
# Define the transformations (normalization is key for training)
transform = transforms.Compose([
    transforms.ToTensor(),
    # Normalization parameters for CIFAR-10
    # mean for CIFAR-10 = (0.4914, 0.4822, 0.4465)
    #std for CIFAR-10  = (0.2023, 0.1994, 0.2010)
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010))
])

# Load original datasets
original_trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
original_testset = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform)

100%|██████████| 170M/170M [00:13<00:00, 12.2MB/s]


In [4]:
# Reproducibility
SEED = 32
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED); torch.cuda.manual_seed_all(SEED)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# ---- AlexNet variant for CIFAR-10 (32x32) ----
class CIFARAlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super().__init__()
        self.features = nn.Sequential(
            # 32x32 -> 32x32
            nn.Conv2d(3, 64, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            # 32x32 -> 16x16
            nn.MaxPool2d(kernel_size=2, stride=2),

            # 16x16 -> 16x16
            nn.Conv2d(64, 192, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),
            # 16x16 -> 8x8
            nn.MaxPool2d(kernel_size=2, stride=2),

            # 8x8 -> 8x8
            nn.Conv2d(192, 384, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            # 8x8 -> 8x8
            nn.Conv2d(384, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            # 8x8 -> 8x8
            nn.Conv2d(256, 256, kernel_size=3, padding=1),
            nn.ReLU(inplace=True),

            # 8x8 -> 4x4
            nn.MaxPool2d(kernel_size=2, stride=2),
        )
        # 256 * 4 * 4 = 4096
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256*4*4, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, num_classes),
        )

    def forward(self, x):
        x = self.features(x)
        x = torch.flatten(x, 1)  # (N, 256*4*4)
        x = self.classifier(x)
        return x

# Augmentation for training (paper §6.3): flip, ±10° rotation, ±10% translate, ~0.2% zoom
train_transform_w_aug = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomAffine(degrees=10, translate=(0.10, 0.10), scale=(0.998, 1.002)),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# No-aug (for eval and loaders that shouldn't augment)
eval_transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

In [5]:
# Use the already-downloaded datasets but re-wrap them with the correct transforms when needed
full_train_eval = torchvision.datasets.CIFAR10(root='./data', train=True, download=False, transform=eval_transform)
full_test_eval  = torchvision.datasets.CIFAR10(root='./data', train=False, download=False, transform=eval_transform)

NUM_TRAIN = len(full_train_eval)   # 50_000
NUM_TEST  = len(full_test_eval)    # 10_000

#Attacker Dataset - 20K members from CIFAR training, 10K nonmembers from CIFAR training
attacker_idxs = np.random.permutation(np.arange(NUM_TRAIN))

shadow_member_idx = attacker_idxs[:20000]
shadow_nonmembers_idx = attacker_idxs[20000:30000]

Dtrain_attack_members = Subset(full_train_eval, shadow_member_idx.tolist())
Dtrain_attack_nonmembers  = Subset(full_train_eval, shadow_nonmembers_idx.tolist())

# === Summary of Dataset Partitioning (CIFAR-10) ===
print("===== Dataset Partition Summary =====")
print(f"Total CIFAR-10 Train Samples: {len(full_train_eval)}")
print(f"Total CIFAR-10 Test Samples:  {len(full_test_eval)}\n")

print("\n--- Attacker Train Set ---")
print(f"Members (from train):      {len(Dtrain_attack_members)}")
print(f"Non-members:   {len(Dtrain_attack_nonmembers)}")

===== Dataset Partition Summary =====
Total CIFAR-10 Train Samples: 50000
Total CIFAR-10 Test Samples:  10000


--- Attacker Train Set ---
Members (from train):      20000
Non-members:   10000


In [6]:
BATCH_SIZE = 128

def make_subset_dataset(indices, with_aug: bool):
    base = torchvision.datasets.CIFAR10(root='./data', train=True, download=False,
                                        transform=train_transform_w_aug if with_aug else eval_transform)
    return Subset(base, indices.tolist())

train_loader = []
eval_loader  = []

ds_train = make_subset_dataset(shadow_member_idx, with_aug=True)
ds_eval  = make_subset_dataset(shadow_nonmembers_idx, with_aug=False)
train_loader.append(DataLoader(ds_train, batch_size=BATCH_SIZE, shuffle=True,  num_workers=2, pin_memory=True))
eval_loader.append( DataLoader(ds_eval,  batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True))

# Full test loader for reporting model accuracies (like Table 2)
full_test_loader = DataLoader(full_test_eval, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

In [7]:
# models = []
# acc_table = []
# for i, tr_loader in enumerate(train_loader, start=1):
#     print(f"\nTraining shadow_model (with augmentation)...")
#     net = build_model()
#     net = train_one_model(net, tr_loader, epochs=60)

#     # Store in-memory
#     models.append(net)

#     # Evaluate accuracy
#     acc = accuracy(net, full_test_loader)
#     acc_table.append((f"f{i}", acc))
#     print(f"Shadow model accuracy on CIFAR-10 test (with aug): {acc:.2f}%")

#      # ---- SAVE TO DISK ----
#     torch.save(net.state_dict(), f"shadow_model.pth")
#     print(f"Saved model as shadow_model.pth")

In [8]:
# ---- weight init helper (optional) ----
def kaiming_init(m):
    if isinstance(m, (nn.Conv2d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight, nonlinearity='relu')
        if m.bias is not None:
            nn.init.constant_(m.bias, 0.0)

def build_model():
    net = CIFARAlexNet(num_classes=10)
    net.apply(kaiming_init)
    return net

def train_one_model(model, train_loader, epochs=60, device=device):
    model = model.to(device)
    criterion = nn.CrossEntropyLoss()

    optimizer = optim.Adam(model.parameters(), lr=1e-3, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.MultiStepLR(optimizer, milestones=[30, 45, 55], gamma=0.1)

    model.train()
    for epoch in range(epochs):
        running = 0.0
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            optimizer.zero_grad(set_to_none=True)
            out = model(xb)
            loss = criterion(out, yb)
            loss.backward()
            optimizer.step()
            running += loss.item() * xb.size(0)
        scheduler.step()
        if (epoch+1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs} - loss: {running/len(train_loader.dataset):.4f}")
    return model

@torch.no_grad()
def accuracy(model, data_loader, device=device):
    model.eval()
    correct = 0
    total = 0
    for xb, yb in data_loader:
        xb, yb = xb.to(device), yb.to(device)
        logits = model(xb)
        pred = torch.argmax(logits, dim=1)
        correct += (pred == yb).sum().item()
        total += yb.size(0)
    return 100.0 * correct / total

In [9]:
def load_model(path):
    device = torch.device("cpu")
    model = build_model().to(device)
    state_dict = torch.load(path, map_location=device)
    model.load_state_dict(state_dict)
    model.eval()
    return model

shadow_path = "shadow_model.pth"
shadow_model = load_model(shadow_path)

In [10]:
shadow_eval_members = make_subset_dataset(shadow_member_idx, with_aug=False)
shadow_eval_nonmembers = make_subset_dataset(shadow_nonmembers_idx, with_aug=False)

shadow_eval_members_loader = DataLoader(shadow_eval_members, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
shadow_eval_nonmembers_loader = DataLoader(shadow_eval_nonmembers, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

In [11]:
#collect features for training shadow model classifier
criterion = nn.CrossEntropyLoss(reduction="none")

def collect_features(model, data_loader, member_label, device = device):
    model.eval()
    all_features = []
    all_membership = []

    for xb, yb in data_loader:
        xb, yb = xb.to(device), yb.to(device)

        logits = model(xb)

        pred = torch.argmax(logits, dim=1)
        correct = (pred == yb).float()

        probs = F.softmax(logits, dim = 1)

        max_prob, _ = probs.max(dim = 1)

        loss = criterion(logits, yb)

        eps = 1e-12
        entropy = -(probs * (probs + eps).log()).sum(dim = 1)

        top_values, _ = probs.topk(2, dim=1)
        margin = top_values[:, 0] - top_values[:, 1]

        model_features = torch.stack([correct, max_prob, loss, entropy, margin], dim = 1)

        all_features.append(model_features.cpu())
        all_membership.append(torch.full((model_features.size(0),), member_label, dtype  = torch.long))

    all_features = torch.cat(all_features, dim = 0).detach().cpu().numpy()
    all_membership = torch.cat(all_membership, dim = 0).detach().cpu().numpy()

    return all_features, all_membership

In [12]:
#collect the shadow model features which will be used to train the attacker
shadow_x_member, shadow_y_member = collect_features(shadow_model, shadow_eval_members_loader, member_label = 1)
shadow_x_nonmember, shadow_y_nonmember = collect_features(shadow_model, shadow_eval_nonmembers_loader, member_label = 0)

#sanity check to make sure there are 20k members with 5 features
print("Shadow Member Feature Shape:", shadow_x_member.shape)
print("Shadow Member Labels Shape:", shadow_y_member.shape)

#sanity check to make sure there are 10k nonmembers with 5 features
print("Shadow Nonmember Feature Shape:", shadow_x_nonmember.shape)
print("Shadow Nonmember Labels Shape:", shadow_y_nonmember.shape)

Shadow Member Feature Shape: (20000, 5)
Shadow Member Labels Shape: (20000,)
Shadow Nonmember Feature Shape: (10000, 5)
Shadow Nonmember Labels Shape: (10000,)


In [13]:
shadow_attack_x = np.vstack([shadow_x_member, shadow_x_nonmember])
shadow_attack_y = np.concatenate([shadow_y_member, shadow_y_nonmember])

In [14]:
shadow_attack_x = np.vstack([shadow_x_member, shadow_x_nonmember])
shadow_attack_y = np.concatenate([shadow_y_member, shadow_y_nonmember])

clf = RandomForestClassifier(n_estimators = 200, max_depth = None, min_samples_split = 2, min_samples_leaf = 1, n_jobs = -1, class_weight = "balanced")
clf.fit(shadow_attack_x, shadow_attack_y)

shadow_attack_scores = clf.predict_proba(shadow_attack_x)[:, 1]

shadow_auc = roc_auc_score(shadow_attack_y, shadow_attack_scores)
print("Shadow AUC:", shadow_auc)

shadow_fpr, shadow_tpr, thresholds = roc_curve(shadow_attack_y, shadow_attack_scores)
shadow_advantage = (shadow_tpr - shadow_fpr).max()

print("Shadow Attack Advantage:", shadow_advantage)

Shadow AUC: 1.0
Shadow Attack Advantage: 1.0


# MCE CODE

In [15]:
# Load the five models
f1 = load_model("f1.pth")
f2 = load_model("f2.pth")
f3 = load_model("f3.pth")
f4 = load_model("f4.pth")
f5 = load_model("f5.pth")

models = [f1, f2, f3, f4, f5]

# (Optional) double-check f1 accuracy achieved in the setup
print("f1 test accuracy:", accuracy(f1, full_test_loader))


f1 test accuracy: 73.59


In [16]:
#collect features from any of the models to use for attack
def collect_features(dataset, models, member_label, attack_type, ece_scores, device = device):
    all_features = []
    all_membership = []
    eps = 1e-12

    for i in range(len(dataset)):
        if i  % 1000 == 0:
            print(i)
        xb, yb = dataset[i]
        xb = xb.to(device)

        if torch.is_tensor(yb):
            yb = int(yb.item())

        #call specific prediction function for each attack type
        if attack_type == 'CDE':
            probs, excluded_idx = confidence_deviation_predict(xb, models, device)
        elif attack_type == 'HCE':
            probs, excluded_idx = calibration_weighted_predict(xb, models, device, ece_scores)
        elif attack_type == 'KLD':
            probs, excluded_idx = hybrid_kl_predict(xb, models, device)
        else:
            print("Error, no such attack function.")
        max_prob = float(probs.max())

        pred = int(np.argmax(probs))
        correct = 1.0 if pred == yb else 0.0

        true_class = float(probs[yb])
        loss = -np.log(true_class + eps)

        entropy = -float(np.sum(probs * np.log(probs + eps)))

        sorted_probs = np.sort(probs)[::-1]
        margin = float(sorted_probs[0] - sorted_probs[1])

        all_features.append([correct, max_prob, loss, entropy, margin])
        all_membership.append(member_label)

    all_features = np.array(all_features, dtype=np.float32)
    all_membership = np.array(all_membership, dtype=np.int64)

    return all_features, all_membership

In [17]:
attack_aucs = []

# IMPROVED MCE ATTACK

In [18]:
# ---- Disjoint split of the training set into n equal parts ----
n = 5

all_train_idx = np.arange(NUM_TRAIN)
np.random.shuffle(all_train_idx)
splits = np.array_split(all_train_idx, n)  # list of 5 arrays (~10k each)

# ---- EO train set: 2.5k x n members (from train) + 5k non-members (from test) ----
EO_MEM_PER_SPLIT = 2500
eo_mem_indices = []
for s in splits:
    eo_mem_indices.extend(np.random.choice(s, size=EO_MEM_PER_SPLIT, replace=False))
eo_mem_indices = np.array(eo_mem_indices)  # length = 12_500

eo_nonmem_indices = np.random.choice(np.arange(NUM_TEST), size=5000, replace=False)

Dtrain_EO_members     = Subset(full_train_eval, eo_mem_indices.tolist())
Dtrain_EO_nonmembers  = Subset(full_test_eval,  eo_nonmem_indices.tolist())

# ---- MIAShield test set: 5k members (from train) + 5k non-members (from test)
# ensure disjointness with Dtrain_EO to avoid bias/leakage
remaining_train = np.setdiff1d(all_train_idx, eo_mem_indices, assume_unique=False)
remaining_test  = np.setdiff1d(np.arange(NUM_TEST), eo_nonmem_indices, assume_unique=False)
test_mem_indices    = np.random.choice(remaining_train, size=5000, replace=False)
test_nonmem_indices = np.random.choice(remaining_test,  size=5000, replace=False)

Dtest_MIASHIELD_members    = Subset(full_train_eval, test_mem_indices.tolist())
Dtest_MIASHIELD_nonmembers = Subset(full_test_eval,  test_nonmem_indices.tolist())

# === Summary of Dataset Partitioning (CIFAR-10) ===
print("\n--- MIAShield Test Set ---")
print(f"Members (from train):      {len(Dtest_MIASHIELD_members)}")
print(f"Non-members (from test):   {len(Dtest_MIASHIELD_nonmembers)}")


--- MIAShield Test Set ---
Members (from train):      5000
Non-members (from test):   5000


In [19]:
# EO & MIAShield loaders (evaluation only — no aug)
loader_train_EO_mem    = DataLoader(Dtrain_EO_members,    batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
loader_train_EO_nonmem = DataLoader(Dtrain_EO_nonmembers, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)

loader_test_MIASHIELD_mem    = DataLoader(Dtest_MIASHIELD_members,    batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)
loader_test_MIASHIELD_nonmem = DataLoader(Dtest_MIASHIELD_nonmembers, batch_size=BATCH_SIZE, shuffle=False, num_workers=2, pin_memory=True)


In [20]:
# --- 1. Confidence Deviation Exclusion (CDE) ---
def confidence_deviation_predict(x_tensor, ensemble_models, device, **kwargs):
  """
  Excludes the model whose confidence on the majority label deviates most
  (absolute difference) from the ensemble mean.
  """
  all_probs = []
  for model in ensemble_models:
    model.eval() # Set model to evaluation mode
    with torch.no_grad():
      output = model(x_tensor.unsqueeze(0).to(device))
      all_probs.append(torch.softmax(output, dim=1).cpu().numpy()[0])
  predictions = np.array(all_probs) # Shape: (5, 10)

  # Identify Majority Label
  top_labels = [np.argmax(p) for p in predictions]
  if not top_labels:
    return np.zeros(predictions.shape[1]), 0
  majority_label = Counter(top_labels).most_common(1)[0][0]

  # Calculate Deviation
  target_confidence = predictions[:, majority_label]
  mean_confidence = np.mean(target_confidence)
  deviation = np.abs(target_confidence - mean_confidence)

  # Exclude the outlier
  excluded_idx = np.argmax(deviation)

  remaining_preds = np.delete(predictions, excluded_idx, axis=0)
  final_pred = np.mean(remaining_preds, axis=0)

  return final_pred, excluded_idx

# --- 2. Historical Calibration Error (HCE) Helpers ---
def compute_ece(model, loader, device, n_bin=10):
  """
  Calculates expected calibration error (ECE) for a single model.
  """
  model.eval()
  bin_boundaries = torch.linspace(0, 1, n_bin + 1)
  confidence_list = []
  predictions_list = []
  labels_list = []

  with torch.no_grad():
    for xb, yb in loader:
      xb, yb = xb.to(device), yb.to(device)
      logits = model(xb)
      probs = torch.softmax(logits, dim=1)
      conf, preds = torch.max(probs, 1)
      confidence_list.append(conf)
      predictions_list.append(preds)
      labels_list.append(yb)

  confidence = torch.cat(confidence_list)
  predictions = torch.cat(predictions_list)
  labels = torch.cat(labels_list)
  accuracies = predictions.eq(labels)

  ece = torch.zeros(1, device=device)
  for bin_lower, bin_upper in zip(bin_boundaries[:-1], bin_boundaries[1:]):
    in_bin = confidence.gt(bin_lower.item()) * confidence.le(bin_upper.item())
    prop_in_bin = in_bin.float().mean()
    if prop_in_bin.item() > 0:
      accuracy_in_bin = accuracies[in_bin].float().mean()
      avg_confidence_in_bin = confidence[in_bin].mean()
      ece += torch.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
  return ece.item()

def calibration_weighted_predict(x_tensor, ensemble_models, device, ece_scores=None):
  """
  Excludes model based on Confidence weighted by historical calibration error.
  Score = Confidence * (1 + ECE). Higher score = higher likelihood of being
  excluded.
  """
  if ece_scores is None:
    raise ValueError("ECE scores must be provided.")

  all_probs = []
  for model in ensemble_models:
    model.eval() # Set model to evaluation
    with torch.no_grad():
      output = model(x_tensor.unsqueeze(0).to(device))
      all_probs.append(torch.softmax(output, dim=1).cpu().numpy()[0])

  predictions = np.array(all_probs) # Shape
  top_labels = [np.argmax(p) for p in predictions]
  majority_label = Counter(top_labels).most_common(1)[0][0]

  target_confidence = predictions[:, majority_label]

  # Weight the confidence by model's general calibration error
  exclusion_scores = []
  for i, conf in enumerate(target_confidence):
    score = conf * (1.0 + ece_scores[i])
    exclusion_scores.append(score)

  excluded_idx = np.argmax(exclusion_scores)
  remaining_preds = np.delete(predictions, excluded_idx, axis=0)
  final_pred = np.mean(remaining_preds, axis=0)
  return final_pred, excluded_idx

# --- 3. Hybrid Approach (KL Divergence) ---
def hybrid_kl_predict(x_tensor, ensemble_models, device, **kwargs):
  """
  Excludes the model whose output distribution diverges most (KL Divergence)
  from the consensus distribution of the ensemble.
  """
  all_prods = []
  for model in ensemble_models:
    model.eval() # Set model to evaluation
    with torch.no_grad():
      output = model(x_tensor.unsqueeze(0).to(device))
      all_prods.append(torch.softmax(output, dim=1).cpu().numpy()[0])

  predictions = np.array(all_prods) # Shape

  # Calculate Consensus (Mean Distribution)
  consensus = np.mean(predictions, axis=0)

  # Calculate KL Divergence for each model vs Consensus
  # entropy(pk, qk) calculates KL(pk || qk)
  kl_divergences = [entropy(pred, consensus) for pred in predictions]

  excluded_idx = np.argmax(kl_divergences)
  remaining_preds = np.delete(predictions, excluded_idx, axis=0)
  final_pred = np.mean(remaining_preds, axis=0)
  return final_pred, excluded_idx

##CDE

In [21]:
#collect CDE features which will be used with classifer
cde_x_member, cde_y_member = collect_features(full_train_eval, models, member_label = 1, attack_type = 'CDE', ece_scores = None, device = device)
cde_x_nonmember, cde_y_nonmember = collect_features(full_test_eval, models, member_label = 0, attack_type = 'CDE', ece_scores = None, device = device)

print("CDE Features:")
print("Member Feature Shape:", cde_x_member.shape)
print("Member Labels Shape:", cde_y_member.shape)
print("Nonmember Feature Shape:", cde_x_nonmember.shape)
print("Nonmember Labels Shape:", cde_y_nonmember.shape)

cde_attack_x = np.vstack([cde_x_member, cde_x_nonmember])
cde_attack_y = np.concatenate([cde_y_member, cde_y_nonmember])

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
CDE Features:
Member Feature Shape: (50000, 5)
Member Labels Shape: (50000,)
Nonmember Feature Shape: (10000, 5)
Nonmember Labels Shape: (10000,)


In [22]:
cde_attack_scores = clf.predict_proba(cde_attack_x)[:, 1]

cde_auc = roc_auc_score(cde_attack_y, cde_attack_scores)

print("CDE AUC:", cde_auc)

cde_fpr, cde_tpr, thresholds = roc_curve(cde_attack_y, cde_attack_scores)
cde_advantage = (cde_tpr - cde_fpr).max()

print("CDE Attack Advantage:", cde_advantage)

attack_aucs.append({'model': 'CDE', 'AUC': cde_auc, 'Advantage': cde_advantage})

CDE AUC: 0.510212668
CDE Attack Advantage: 0.017320000000000002


##HCE

In [23]:
# --- Setup: Calculate ECE Scores (Required for Strategy 2) ---
print("Computing ECE scores...")

ece_scores = [compute_ece(m, loader_test_MIASHIELD_nonmem, device) for m in models]
print(f"ECE Scores: {ece_scores}\n")

# --- Generic Evaluation Runner ---
def run_evaluation(strategy, predict_fn, dataset, models, device, **kwargs):
  correct = 0
  exclusion_counts = np.zeros(len(models), dtype=int)
  total_samples = len(dataset)

  print(f"--- Evaluating {strategy} ---")

  for i in range(total_samples):
    x_tensor, y_true = dataset[i]

    # Execute the specific prediction strategy
    pred_probs, excluded = predict_fn(x_tensor, models, device, **kwargs)

    y_pred = np.argmax(pred_probs)
    if y_pred == y_true:
      correct += 1
    exclusion_counts[excluded] += 1

  accuracy = correct / total_samples
  print(f"Accuracy: {accuracy * 100:.2f}%")
  print(f"Exclusion Counts: {exclusion_counts.tolist()}\n")


Computing ECE scores...
ECE Scores: [0.09007111191749573, 0.08758401870727539, 0.0854107141494751, 0.1029433012008667, 0.07500015944242477]



In [24]:
#collect HCE features which will be used with classifer
hce_x_member, hce_y_member = collect_features(full_train_eval, models, member_label = 1, attack_type = 'HCE', ece_scores = ece_scores, device = device)
hce_x_nonmember, hce_y_nonmember = collect_features(full_test_eval, models, member_label = 0, attack_type = 'HCE', ece_scores = ece_scores, device = device)

print("HCE Features:")
print("Member Feature Shape:", hce_x_member.shape)
print("Member Labels Shape:", hce_y_member.shape)
print("Nonmember Feature Shape:", hce_x_nonmember.shape)
print("Nonmember Labels Shape:", hce_y_nonmember.shape)

hce_attack_x = np.vstack([hce_x_member, hce_x_nonmember])
hce_attack_y = np.concatenate([hce_y_member, hce_y_nonmember])

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
HCE Features:
Member Feature Shape: (50000, 5)
Member Labels Shape: (50000,)
Nonmember Feature Shape: (10000, 5)
Nonmember Labels Shape: (10000,)


In [25]:
hce_attack_scores = clf.predict_proba(hce_attack_x)[:, 1]

hce_auc = roc_auc_score(hce_attack_y, hce_attack_scores)
print("HCE AUC:", hce_auc)

hce_fpr, hce_tpr, thresholds = roc_curve(hce_attack_y, hce_attack_scores)
hce_advantage = (hce_tpr - hce_fpr).max()

print("HCE Attack Advantage:", hce_advantage)

attack_aucs.append({'model': 'HCE', 'AUC': hce_auc, 'Advantage': hce_advantage})

HCE AUC: 0.508926113
HCE Attack Advantage: 0.014880000000000004


##KLD

In [26]:
#collect MCE features which will be used with classifer
kld_x_member, kld_y_member = collect_features(full_train_eval, models, member_label = 1, attack_type = 'KLD', ece_scores = None, device = device)
kld_x_nonmember, kld_y_nonmember = collect_features(full_test_eval, models, member_label = 0,  attack_type = 'KLD', ece_scores = None, device = device)

print("KLD Features:")
print("Member Feature Shape:", kld_x_member.shape)
print("Member Labels Shape:", kld_y_member.shape)
print("Nonmember Feature Shape:", kld_x_nonmember.shape)
print("Nonmember Labels Shape:", kld_y_nonmember.shape)

kld_attack_x = np.vstack([kld_x_member, kld_x_nonmember])
kld_attack_y = np.concatenate([kld_y_member, kld_y_nonmember])

0
1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000
29000
30000
31000
32000
33000
34000
35000
36000
37000
38000
39000
40000
41000
42000
43000
44000
45000
46000
47000
48000
49000
0
1000
2000
3000
4000
5000
6000
7000
8000
9000
KLD Features:
Member Feature Shape: (50000, 5)
Member Labels Shape: (50000,)
Nonmember Feature Shape: (10000, 5)
Nonmember Labels Shape: (10000,)


In [27]:
kld_attack_scores = clf.predict_proba(kld_attack_x)[:, 1]

kld_auc = roc_auc_score(kld_attack_y, kld_attack_scores)

print("KLD AUC:", kld_auc)

kld_fpr, kld_tpr, thresholds = roc_curve(kld_attack_y, kld_attack_scores)
kld_advantage = (kld_tpr - kld_fpr).max()

print("KLD Attack Advantage:", kld_advantage)

attack_aucs.append({'model': 'KLD', 'AUC': kld_auc, 'Advantage': kld_advantage})

KLD AUC: 0.510310242
KLD Attack Advantage: 0.017939999999999956


In [29]:
for attack in attack_aucs:
    print(f"Model under attack: {attack['model']}")
    print(f"Attack AUC: {attack['AUC']}")
    print(f"Attack advantage: {attack['Advantage']} \n")

Model under attack: CDE
Attack AUC: 0.510212668
Attack advantage: 0.017320000000000002 

Model under attack: HCE
Attack AUC: 0.508926113
Attack advantage: 0.014880000000000004 

Model under attack: KLD
Attack AUC: 0.510310242
Attack advantage: 0.017939999999999956 

