In [1]:
%load_ext autoreload
%autoreload 2
import torch
import torch.utils.data
import numpy as np
import sklearn.datasets
import sklearn.model_selection
import matplotlib.pyplot as plt

import abstract_gradient_training as agt

In [87]:
"""Initialise the halfmoons training data."""
seed = 0
batchsize = 5000  # number of samples per batch
test_size = 500  # number of held-out test samples
n_users = 100  # number of distinct user ids handed out to the samples
# (for a quick smoke test, try batchsize = 3, test_size = 1, n_users = 1)
n_batches = 2  # number of batches per epoch
n_epochs = 5  # number of epochs

n_samples = n_batches * batchsize + test_size  # total number of points to generate

torch.manual_seed(seed)
# Seeded NumPy generator, so the user assignment below is reproducible across
# kernel restarts (the global np.random state is never seeded in this notebook).
user_rng = np.random.default_rng(seed)

# load the dataset
x, y = sklearn.datasets.make_moons(noise=0.1, n_samples=n_samples, random_state=seed)
# to make it easier to train, we'll space the moons out a bit and add some polynomial features
x[y == 0, 1] += 0.2
# feature layout: [x0, x1, x0^2, x1^2, x0*x1, x0^3, x1^3] -> 7 columns
x = np.hstack((x, x**2, (x[:, 0] * x[:, 1])[:, None], x**3))

# Assign a random user id in 1..n_users (inclusive) to each datapoint
user_labels = user_rng.integers(1, n_users + 1, size=len(x))

# Train-test split. Passing test_size as an int requests exactly that many test
# samples, avoiding any float rounding when the fraction is converted back to a count.
x_train, x_test, y_train, y_test, user_train, user_test = sklearn.model_selection.train_test_split(
    x, y, user_labels, test_size=test_size, random_state=seed
)

# Convert to PyTorch tensors (features as float32, labels/user ids keep their integer dtype)
x_train = torch.from_numpy(x_train).float()
x_test = torch.from_numpy(x_test).float()
y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)
user_train = torch.from_numpy(user_train)
user_test = torch.from_numpy(user_test)

# Combine inputs, user ids and class labels into TensorDatasets.
# Each item is an (input, user, label) triple, in that order.
dataset_train = torch.utils.data.TensorDataset(x_train, user_train, y_train)
dataset_test = torch.utils.data.TensorDataset(x_test, user_test, y_test)

# Sanity check: the split sizes must match the batch configuration above
assert len(dataset_train) == n_batches * batchsize, "unexpected training set size"
assert len(dataset_test) == test_size, "unexpected test set size"

print(x_train)
print(user_train)

# Create DataLoaders (only the training loader is shuffled)
dataloader_train = torch.utils.data.DataLoader(dataset_train, batch_size=batchsize, shuffle=True)
dataloader_test = torch.utils.data.DataLoader(dataset_test, batch_size=batchsize, shuffle=False)


# for batch_idx, (inputs, user_labels, targets) in enumerate(dataloader_train):
#     if batch_idx >= 2:  # Print only the first 2 batches
#         break
#     print(f"Batch {batch_idx + 1}:")
#     print("Inputs:")
#     print(inputs)  # This will show the features (x)
#     print("User Labels:")
#     print(user_labels)  # This will show the random user labels
#     print("Targets:")
#     print(targets)  # This will show the target labels (y)
#     print("="*50)

tensor([[ 9.5001e-02,  6.8417e-02,  9.0252e-03,  ...,  6.4997e-03,
          8.5740e-04,  3.2026e-04],
        [ 9.8377e-01, -4.7357e-01,  9.6781e-01,  ..., -4.6589e-01,
          9.5210e-01, -1.0621e-01],
        [-7.9546e-01,  5.5757e-01,  6.3276e-01,  ..., -4.4352e-01,
         -5.0334e-01,  1.7334e-01],
        ...,
        [ 8.3354e-01,  6.2519e-01,  6.9480e-01,  ...,  5.2112e-01,
          5.7914e-01,  2.4436e-01],
        [ 8.8961e-03,  1.8451e-01,  7.9140e-05,  ...,  1.6414e-03,
          7.0403e-07,  6.2816e-03],
        [ 9.3713e-01,  5.1568e-01,  8.7822e-01,  ...,  4.8326e-01,
          8.2301e-01,  1.3713e-01]])
tensor([100,  36,   3,  ...,  87,  34,  73])


In [89]:
# Train a small one-hidden-layer ReLU classifier on the halfmoons data, once per value of k_private.
# NOTE: no explicit importlib.reload(agt) is needed here; `%autoreload 2` in the first
# cell already reloads edited modules before each cell runs.
model = torch.nn.Sequential(
    torch.nn.Linear(7, 128),  # 7 input features -> 128 hidden units
    torch.nn.ReLU(),          # non-linearity on the hidden layer
    torch.nn.Linear(128, 2),  # 128 hidden units -> 2 class logits
)

# Use the GPU when one is present, otherwise fall back to the CPU so the notebook still runs.
agt_device = "cuda:0" if torch.cuda.is_available() else "cpu"

config = agt.AGTConfig(
    learning_rate=0.5,
    n_epochs=n_epochs,  # defined alongside the dataset settings
    loss="cross_entropy",
    log_level="WARNING",
    device=agt_device,
    clip_gamma=0.1,
)

# Using more values here will improve the guarantees AGT will give.
# k = 0 must be included: the private-prediction cell below indexes bounded_model_dict[0].
k_values = [0, 1, 10, 20, 50, 100]
bounded_model_dict = {}  # we'll store our results for each value of 'k' as a dictionary from 'k' to the bounded model

for k_private in k_values:
    config.k_private = k_private
    torch.manual_seed(seed)  # re-seed so every run sees the same torch RNG stream
    bounded_model = agt.bounded_models.IntervalBoundedModel(model)
    bounded_model = agt.privacy_certified_training_user_level(bounded_model, config, dataloader_train)
    bounded_model_dict[k_private] = bounded_model

    # as a metric, compute the number of predictions in the test set certified at this value of k_private
    certified_preds = agt.test_metrics.certified_predictions(bounded_model, x_test)
    print(f"Certified Predictions at k={k_private}: {certified_preds:.2f}")

Batch content: 6
Shape of batch: torch.Size([5000, 7]), users: torch.Size([5000]), labels: torch.Size([5000])
Unique users: tensor([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  27,  28,
         29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,  40,  41,  42,
         43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,  55,  56,
         57,  58,  59,  60,  61,  62,  63,  64,  65,  66,  67,  68,  69,  70,
         71,  72,  73,  74,  75,  76,  77,  78,  79,  80,  81,  82,  83,  84,
         85,  86,  87,  88,  89,  90,  91,  92,  93,  94,  95,  96,  97,  98,
         99, 100]), Inverse indices: tensor([18, 17, 10,  ...,  4, 31, 27])
Shape of batch: torch.Size([5000, 7]), users: torch.Size([5000]), labels: torch.Size([5000])
Unique users: tensor([  1,   2,   3,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  14,
         15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,  

In [None]:
"""Let's use this set of bounded models to for better private prediction using the smooth sensitivity mechanism."""

epsilon = 0.5  # privacy loss

# Every accuracy below is evaluated on the k=0 entry, so bounded_model_dict must contain key 0.
reference_model = bounded_model_dict[0]

# baseline: accuracy without any added noise
noise_free_acc = agt.test_metrics.test_accuracy(reference_model, x_test, y_test)[0]

# smooth sensitivity Cauchy mechanism: calibrate the noise level from the full
# dictionary of bounded models, then evaluate the noisy accuracy at that level
smooth_sens_noise_level = agt.privacy_utils.get_calibrated_noise_level(
    x_test,
    bounded_model_dict,
    epsilon,
    noise_type="cauchy",
)
smooth_sens_acc = agt.privacy_utils.noisy_test_accuracy(
    reference_model,
    x_test,
    y_test,
    noise_level=smooth_sens_noise_level,
    noise_type="cauchy",
)

# global sensitivity mechanism: noise level fixed at 1 / epsilon
global_sens_noise_level = 1.0 / epsilon
global_sens_acc = agt.privacy_utils.noisy_test_accuracy(
    reference_model,
    x_test,
    y_test,
    noise_level=global_sens_noise_level,
)

print(f"Noise Free Accuracy: {noise_free_acc:.2f}")
print(f"Smooth Sensitivity Accuracy: {smooth_sens_acc:.2f}")
print(f"Global Sensitivity Accuracy: {global_sens_acc:.2f}")

Noise Free Accuracy: 0.86
Smooth Sensitivity Accuracy: 0.85
Global Sensitivity Accuracy: 0.58


: 

In [None]:
# """Initialise a large model (which will be random here but would be a pre-trained model in practice)."""
# model = torch.nn.Sequential(
#     torch.nn.Linear(7, 128),
#     torch.nn.ReLU(),
#     torch.nn.Linear(128, 128),
#     torch.nn.ReLU(),
#     torch.nn.Linear(128, 128),
#     torch.nn.ReLU(),
#     torch.nn.Linear(128, 2)
# )
# config = agt.AGTConfig(
#     learning_rate=0.5,
#     n_epochs=2,
#     loss="cross_entropy",
#     log_level="INFO",
#     device="cuda:0",
#     clip_gamma=0.1,
#     k_private=10
# )

# # first try training the whole thing - observe that the certified accuracy goes to zero
# bounded_model = agt.bounded_models.IntervalBoundedModel(model)
# bounded_model = agt.privacy_certified_training_user_level(bounded_model, config, dataloader_train, dataloader_test)

# # second, split the model into a fixed part and a trainable part
# fixed_layers, trainable_layers = model[:4], model[4:]
# # wrap both in bounded models, using the first as the 'transform' argument to the second
# transform = agt.bounded_models.IntervalBoundedModel(fixed_layers, trainable=False)
# bounded_model = agt.bounded_models.IntervalBoundedModel(trainable_layers, transform=transform)
# # train the model
# bounded_model = agt.privacy_certified_training_user_level(bounded_model, config, dataloader_train, dataloader_test)

[AGT] [INFO    ] [03:30:35] Starting epoch 1


Batch content: 6


[AGT] [INFO    ] [03:30:35] Batch 1. Loss (accuracy): 0.460 <= 0.460 <= 0.460
[AGT] [INFO    ] [03:30:36] Starting epoch 2
[AGT] [INFO    ] [03:30:36] Batch 2. Loss (accuracy): 0.496 <= 0.498 <= 0.944
[AGT] [INFO    ] [03:30:37] Batch 3. Loss (accuracy): 0.000 <= 0.896 <= 1.000
[AGT] [INFO    ] [03:30:38] Final Eval. Loss (accuracy): 0.000 <= 0.898 <= 1.000
[AGT] [INFO    ] [03:30:38] Starting epoch 1


Batch content: 6


[AGT] [INFO    ] [03:30:38] Batch 1. Loss (accuracy): 0.460 <= 0.460 <= 0.460
[AGT] [INFO    ] [03:30:39] Starting epoch 2
[AGT] [INFO    ] [03:30:39] Batch 2. Loss (accuracy): 0.498 <= 0.498 <= 0.498
[AGT] [INFO    ] [03:30:40] Batch 3. Loss (accuracy): 0.582 <= 0.782 <= 0.920
[AGT] [INFO    ] [03:30:41] Final Eval. Loss (accuracy): 0.582 <= 0.912 <= 0.982


: 