<a href="https://colab.research.google.com/github/mgozon/DLG-UROP/blob/main/dlg_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DLG Models
This notebook provides models with and without DLG.
- batch_DLG_Adam
- label_to_onehot
- train_net_Adam
- test_net
- train_net_Adam_DLG

`dlg_stats` is also imported so that DLG results can be analysed while training with DLG (future work: consider removing this dependency and doing all analysis in a separate notebook).

In [None]:
import numpy as np
import torch
import torch.nn.functional as F
from torch.autograd import grad
from torch.utils.data import RandomSampler                                        # sample random minibatch
from tqdm.notebook import trange
from math import sqrt

In [None]:
#@title Adding dlg_stats
from google.colab import drive
drive.mount('/content/drive/')
%run 'drive/MyDrive/UROP/Project Files/dlg_stats.ipynb'

Mounted at /content/drive/
defined: assign_guess(guess, gt_dataset, n, verbose = False): guess_perm
defined: assign_best(guess, gt_dataset, n, verbose = False): best_match
defined: compute_stats(guess_perm, gt_data, recovered_threshold = 0.25): rel_errors, recovered_rate, cos_angles


# DLG Algorithm

In [None]:
#@title Batch-DLG with LBFGS (not updated)
# ***NOTE - this gives NaNs when ReLU is used, since LBFGS requires a differentiable loss function
# --> also possibly due to a PyTorch implementation error (https://github.com/pytorch/pytorch/issues/5953)

# DLG algorithm on a given set of flowers and returns the hypothesized input
def batch_DLG_LBFGS(original_dy_dx, batch_size, gt_data_len, gt_onehot_label_len, verbose = False):
    """Run the batch DLG attack with an LBFGS optimizer.

    Randomly initialised dummy data/labels are optimised so that the gradient
    they induce on the network matches ``original_dy_dx``.

    Relies on the notebook-level globals ``net``, ``criterion`` and ``device``,
    and stores its evaluation counter in the global ``opt_steps``.

    Args:
        original_dy_dx: gradients to match, iterated in lockstep with the
            gradients taken w.r.t. ``net.parameters()``.
        batch_size: number of dummy samples to reconstruct.
        gt_data_len: number of columns of the dummy data tensor.
        gt_onehot_label_len: number of columns of the dummy label tensor.
        verbose: if true, print the gradient-matching loss after every outer step.

    Returns:
        Tuple ``(dummy_data, opt_steps, losses)``: the optimised dummy data
        tensor, the number of closure evaluations, and the loss recorded
        after each of the 100 outer steps.
    """
    loss_history = []

    # the attack variables: a guess for the data and (pre-softmax) labels
    dummy_data = torch.randn(batch_size, gt_data_len).to(device).requires_grad_(True)
    dummy_label = torch.randn(batch_size, gt_onehot_label_len).to(device).requires_grad_(True)
    optimizer_dlg = torch.optim.LBFGS((dummy_data, dummy_label), max_iter=20)

    global opt_steps
    opt_steps = 0

    def closure():
        """Evaluate the squared gradient mismatch and backpropagate it (LBFGS needs a closure)."""
        global opt_steps
        opt_steps += 1

        optimizer_dlg.zero_grad()

        # gradient the network would produce for the current dummy batch
        soft_label = F.softmax(dummy_label, dim=1)
        guess_loss = criterion(net(dummy_data), soft_label)
        guess_grads = torch.autograd.grad(guess_loss, net.parameters(), create_graph=True)

        # squared error between guessed and original gradients
        mismatch = sum(((g_guess - g_true) ** 2).sum() for g_guess, g_true in zip(guess_grads, original_dy_dx))
        mismatch.backward()
        return mismatch

    for _ in range(100):
        optimizer_dlg.step(closure)

        # one extra evaluation purely for logging (it also counts towards opt_steps)
        current_loss = closure()
        loss_history.append(current_loss.item())

        if verbose:
            print(current_loss)

    return dummy_data, opt_steps, loss_history

In [None]:
#@title Batch-DLG with Adam
# note: single-update optimization should not be used on one optimizer since Adam requires gradient history to perform updates --> see code block below
# this appears to take longer to converge but may give better results than LBFGS on batches (without using optimization)

# DLG algorithm on a given set of flowers and returns the hypothesized input with gradient losses
def batch_DLG_Adam(net, criterion, device, original_dy_dx, batch_size, gt_data_len, gt_onehot_label_len, epoch_mult = 500, w_decay = 1e-12, scheduler_k = 0.001, verbose = False):
    """Run the batch DLG attack with a single Adam optimizer.

    Randomly initialised dummy data/labels are optimised so that the gradient
    they induce on ``net`` matches ``original_dy_dx``.

    Args:
        net: model that is called on the dummy data; its parameters are read
            but their ``.grad`` fields are left untouched.
        criterion: loss called as ``criterion(pred, soft_label)``.
        device: device on which the dummy tensors are created.
        original_dy_dx: gradients to match, iterated in lockstep with the
            gradients taken w.r.t. ``net.parameters()``.
        batch_size: number of dummy samples to reconstruct.
        gt_data_len: number of columns of the dummy data tensor.
        gt_onehot_label_len: number of columns of the dummy label tensor.
        epoch_mult: the attack runs ``epoch_mult * batch_size`` Adam steps.
        w_decay: weight decay passed to Adam.
        scheduler_k: overall factor the learning rate has been multiplied by
            once all steps are done (it decays geometrically every step).
        verbose: if true, print the gradient-matching loss at every step.

    Returns:
        Tuple ``(dummy_data, losses)``: the optimised dummy data tensor and the
        gradient-matching loss (as floats) recorded at every step.
    """
    losses = []
    opt_steps = epoch_mult * batch_size

    # identify (data, label) using Adam on the squared difference between the original and guessed gradient
    dummy_data = torch.randn(batch_size, gt_data_len).to(device).requires_grad_(True)
    dummy_label = torch.randn(batch_size, gt_onehot_label_len).to(device).requires_grad_(True)
    optimizer_dlg = torch.optim.Adam((dummy_data, dummy_label), lr=1, weight_decay=w_decay) # optimal learning rate seems to depend on the batch size of the dlg

    # per-step factor, chosen so that the lr is scaled by scheduler_k after opt_steps steps
    lr_factor = scheduler_k ** (1 / opt_steps) if opt_steps > 0 else 1.0
    scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer_dlg, lambda epoch: lr_factor)

    for _ in trange(opt_steps):
        pred = net(dummy_data)
        dummy_onehot_label = F.softmax(dummy_label, dim=1)
        dummy_loss = criterion(pred, dummy_onehot_label)
        dummy_dy_dx = torch.autograd.grad(dummy_loss, net.parameters(), create_graph=True)

        # compute loss function, i.e. the SE of the gradients
        grad_diff = sum(((gx - gy) ** 2).sum() for gx, gy in zip(dummy_dy_dx, original_dy_dx))

        # Differentiate only w.r.t. the dummy tensors: calling grad_diff.backward()
        # would also accumulate into the .grad of every network parameter and
        # corrupt the gradients of a training loop that runs this attack.
        data_grad, label_grad = torch.autograd.grad(grad_diff, (dummy_data, dummy_label), allow_unused=True)
        dummy_data.grad = data_grad
        dummy_label.grad = label_grad

        # NOTE: masking the gradients so that only one sample is updated per step
        # does not work well here - Adam depends on past updates and the loss
        # then fluctuates significantly.
        optimizer_dlg.step()
        scheduler.step()
        losses.append(grad_diff.item())

        if verbose:
            print(grad_diff)

    return dummy_data, losses

print('defined: batch_DLG_Adam(net, criterion, device, original_dy_dx, batch_size, gt_data_len, gt_onehot_label_len, epoch_mult = 500, w_decay = 1e-12, scheduler_k = 0.001, verbose = False): dummy_data, losses')

defined: batch_DLG_Adam(net, criterion, device, original_dy_dx, batch_size, gt_data_len, gt_onehot_label_len, epoch_mult = 500, w_decay = 1e-12, scheduler_k = 0.001, verbose = False): dummy_data, losses


In [None]:
#@title Batch-DLG with Adam - individual optimizers (not updated)
# this doesn't seem to converge nearly as well as a single Adam optimizer

# DLG algorithm on a given set of flowers and returns the hypothesized input
def batch_DLG_Adam2(original_dy_dx, batch_size, gt_data_len, gt_onehot_label_len, verbose = False):
    """Run the batch DLG attack with one Adam optimizer per dummy sample.

    At every step only one sample (chosen round-robin) is updated, each sample
    having its own optimizer so that Adam's history is kept per sample.

    Relies on the notebook-level globals ``net``, ``criterion`` and ``device``.
    Note that ``backward()`` here also accumulates into the ``.grad`` of the
    network parameters.

    Args:
        original_dy_dx: gradients to match, iterated in lockstep with the
            gradients taken w.r.t. ``net.parameters()``.
        batch_size: number of dummy samples to reconstruct.
        gt_data_len: number of features of each dummy sample.
        gt_onehot_label_len: number of classes of each dummy label.
        verbose: if true, print the gradient-matching loss at every step.

    Returns:
        Tuple ``(dummy_data, opt_steps, losses)``: the list of optimised
        ``(1, gt_data_len)`` dummy tensors, the number of steps performed, and
        the gradient-matching loss (as floats) recorded at every step.
    """
    losses = []

    # one (data, label) pair per sample, sized from the arguments rather than hard-coded
    dummy_data = [torch.randn(1, gt_data_len).to(device).requires_grad_(True) for _ in range(batch_size)]
    dummy_label = [torch.randn(1, gt_onehot_label_len).to(device).requires_grad_(True) for _ in range(batch_size)]
    optimizer_dlg = [torch.optim.Adam((data, label), lr=1, weight_decay=1e-9) for data, label in zip(dummy_data, dummy_label)]

    opt_steps = 500 * batch_size
    for epoch in trange(opt_steps):
        idx = epoch % batch_size
        # stale gradients accumulated on this sample by earlier steps are dropped here
        optimizer_dlg[idx].zero_grad()

        pred = [net(data) for data in dummy_data]
        dummy_onehot_labels = [F.softmax(label, dim=1) for label in dummy_label]
        dummy_loss = sum([criterion(p, l) for p, l in zip(pred, dummy_onehot_labels)]) / batch_size
        dummy_dy_dx = torch.autograd.grad(dummy_loss, net.parameters(), create_graph=True)

        # compute loss function, i.e. the SE of the gradients
        grad_diff = 0
        for gx, gy in zip(dummy_dy_dx, original_dy_dx):
            grad_diff += ((gx - gy) ** 2).sum()

        grad_diff.backward()
        optimizer_dlg[idx].step()
        losses.append(grad_diff.item())

        if verbose:
            print(grad_diff)

    return dummy_data, opt_steps, losses

In [None]:
#@title Batch-DLG with SGD (not updated)
# this doesn't seem to converge nearly as well as LBFGS even with single-update optimization

# DLG algorithm on a given set of flowers and returns the hypothesized input
def batch_DLG_SGD(original_dy_dx, batch_size, gt_data_len, gt_onehot_label_len, verbose = False):
    """Run the batch DLG attack with SGD, updating one dummy sample per step.

    Relies on the notebook-level globals ``net``, ``criterion`` and ``device``.
    Note that ``backward()`` here also accumulates into the ``.grad`` of the
    network parameters.

    Args:
        original_dy_dx: gradients to match, iterated in lockstep with the
            gradients taken w.r.t. ``net.parameters()``.
        batch_size: number of dummy samples to reconstruct.
        gt_data_len: number of columns of the dummy data tensor.
        gt_onehot_label_len: number of columns of the dummy label tensor.
        verbose: if true, print the gradient-matching loss at every step.

    Returns:
        Tuple ``(dummy_data, opt_steps, losses)``: the optimised dummy data
        tensor, the number of steps performed (``500 * batch_size``), and the
        gradient-matching loss (as floats) recorded at every step.
    """
    losses = []
    opt_steps = 500 * batch_size

    # identify (data, label) using SGD on the squared difference between the original and guessed gradient
    dummy_data = torch.randn(batch_size, gt_data_len).to(device).requires_grad_(True)
    dummy_label = torch.randn(batch_size, gt_onehot_label_len).to(device).requires_grad_(True)
    optimizer_dlg = torch.optim.SGD((dummy_data, dummy_label), lr=1)
    # geometric decay: the lr has been multiplied by 0.001 once all steps are done
    scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer_dlg, lambda epoch: 0.001 ** (1/opt_steps))

    for epoch in trange(opt_steps):
        optimizer_dlg.zero_grad()
        pred = net(dummy_data)
        dummy_onehot_label = F.softmax(dummy_label, dim=1)
        dummy_loss = criterion(pred, dummy_onehot_label)
        dummy_dy_dx = torch.autograd.grad(dummy_loss, net.parameters(), create_graph=True)

        # compute loss function, i.e. the SE of the gradients
        grad_diff = 0
        for gx, gy in zip(dummy_dy_dx, original_dy_dx):
            grad_diff += ((gx - gy) ** 2).sum()

        grad_diff.backward()

        # only update a single dummy_data/dummy_label row at a time (round-robin);
        # the mask must live on the same device as the gradients it multiplies
        mult = torch.zeros([batch_size, 1], device=dummy_data.device)
        mult[epoch % batch_size, 0] = 1
        dummy_data.grad *= mult
        dummy_label.grad *= mult

        optimizer_dlg.step()
        scheduler.step()
        losses.append(grad_diff.item())

        if verbose:
            print(grad_diff)

    return dummy_data, opt_steps, losses

# Model Training

In [None]:
# auxiliary functions for NN - convert to onehot and loss function
def label_to_onehot(target, n_classes):
    """Convert integer class labels into a one-hot float tensor.

    Args:
        target: tensor of class indices, either a column of shape ``(N, 1)``
            or a flat vector of shape ``(N,)``; any integer dtype is accepted.
        n_classes: number of columns of the result.

    Returns:
        A ``(N, n_classes)`` tensor on the same device as ``target`` holding 1
        at each row's class index and 0 elsewhere.
    """
    # scatter_ requires an int64 index with the same number of dims as the output
    index = target.long()
    if index.dim() == 1:
        index = index.unsqueeze(1)

    onehot_target = torch.zeros(index.size(0), n_classes, device=index.device)
    onehot_target.scatter_(1, index, 1)
    return onehot_target

print('defined: label_to_onehot(target, n_classes): onehot_target')

defined: label_to_onehot(target, n_classes): onehot_target


In [None]:
# train NN without running batch dlg
# Note: converges sometimes to near-optimal predictions
def train_net_Adam(net, criterion, device, train_data, train_target, output_dim, batch_size = 32, epochs = 100):
    """Train ``net`` with Adam on random mini-batches, without running batch DLG.

    Each of the ``epochs`` iterations draws one random mini-batch, so an
    "epoch" here is a single optimisation step.

    Args:
        net: model to train; called on the float-cast mini-batch.
        criterion: loss called as ``criterion(output, onehot_label)``.
        device: device the mini-batches are moved to.
        train_data: training inputs, indexed by a list of row indices.
        train_target: training labels, indexed the same way.
        output_dim: number of classes used for the one-hot encoding.
        batch_size: number of samples drawn per step.
        epochs: number of optimisation steps.

    Returns:
        List with the detached loss tensor of every step.
    """
    n_samples = train_data.shape[0]
    adam = torch.optim.Adam(net.parameters(), lr=0.001, weight_decay=1e-5) # regularizer may not be necessary
    loss_log = []

    for _ in range(epochs):
        # draw the row indices of one random mini-batch
        batch_idx = list(RandomSampler(range(n_samples), num_samples=batch_size))
        batch_x = torch.tensor(train_data[batch_idx]).to(device)
        batch_y = torch.tensor(train_target[batch_idx]).to(device)
        batch_onehot = label_to_onehot(batch_y, n_classes = output_dim)

        # standard forward / backward / update
        adam.zero_grad()
        batch_loss = criterion(net(batch_x.float()), batch_onehot)
        batch_loss.backward()
        adam.step()

        loss_log.append(batch_loss.detach().clone())

    return loss_log

print('defined: train_net_Adam(net, criterion, device, train_data, train_target, output_dim, batch_size = 32, epochs = 100): losses')

defined: train_net_Adam(net, criterion, device, train_data, train_target, output_dim, batch_size = 32, epochs = 100): losses


In [None]:
def test_net(net, test_data, test_target):
    test_dst_len = test_data.shape[0]
    pred = net(torch.tensor(test_data).float())
    correct = torch.sum(torch.eq(torch.argmax(pred, dim=1, keepdim=True), torch.tensor(test_target)))
    print(f'score: {correct}/{test_dst_len}')

print('defined: test_net(net, test_data, test_target): prints accuracy')

defined: test_net(net, test_data, test_target): prints accuracy


In [None]:
#@title train with LBFGS optimizer

# NOTE: LBFGS isn't converging when using mini-batches
def train_net_LBFGS(train_data, train_target, batch_size = 16, epochs = 100, output_dim = 3):
    """Train the global ``net`` with LBFGS on random mini-batches.

    Relies on the notebook-level globals ``net``, ``criterion`` and ``device``.
    Prints the training data/targets once and the loss at every closure
    evaluation.

    Args:
        train_data: training inputs, indexed by a list of row indices.
        train_target: training labels, indexed the same way.
        batch_size: number of samples drawn per step.
        epochs: number of LBFGS steps (one random mini-batch each).
        output_dim: number of classes used for the one-hot encoding.
    """
    print(train_data); print(train_target)
    train_dst_len = train_data.shape[0]
    optimizer = torch.optim.LBFGS(net.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.MultiplicativeLR(optimizer, lambda epoch: 0.99)

    for epoch in range(epochs):
        rand_subset = list(RandomSampler(range(train_dst_len), num_samples=batch_size))
        gt_data = torch.tensor(train_data[rand_subset]).to(device)
        gt_label = torch.tensor(train_target[rand_subset]).to(device)
        # label_to_onehot's keyword is n_classes (num_classes raised a TypeError)
        gt_onehot_label = label_to_onehot(gt_label, n_classes=output_dim)

        def closure():
            """Recompute the mini-batch loss and its gradients (LBFGS may call this several times per step)."""
            optimizer.zero_grad()
            output = net(gt_data.float())
            loss = criterion(output, gt_onehot_label)
            loss.backward()
            print('loss: ', loss)
            return loss

        optimizer.step(closure)
        scheduler.step()

In [None]:
# NOTE: this function depends on batch_DLG_Adam and label_to_onehot from this notebook and on
# assign_guess, assign_best and compute_stats, which are defined by running dlg_stats.ipynb
def train_net_Adam_DLG(net, criterion, device, scaler, train_data, train_target, output_dim, batch_size = 32, epochs = 100, verbose = False, dlg_rate = 10, gt_data_len = -1, gt_onehot_label_len = -1, epoch_mult=100, w_decay=1e-12, scheduler_k = 0.01, leak_no_train = False):
    """Train ``net`` with Adam while periodically attacking the batch gradient with DLG.

    Every ``dlg_rate``-th step the gradient of the current mini-batch is handed
    to ``batch_DLG_Adam``; the reconstruction is mapped back through
    ``scaler.inverse_transform`` and compared with the true mini-batch.

    Args:
        net: model to train and attack.
        criterion: loss called as ``criterion(output, onehot_label)``.
        device: device the mini-batches are moved to.
        scaler: object whose ``inverse_transform`` undoes the data scaling
            (presumably a fitted scikit-learn scaler - confirm with the caller).
        train_data: training inputs, indexed by a list of row indices.
        train_target: training labels, indexed the same way.
        output_dim: number of classes used for the one-hot encoding.
        batch_size: number of samples drawn per step.
        epochs: number of optimisation steps (one random mini-batch each).
        verbose: print the training loss and forward verbosity to the
            assignment helpers.
        dlg_rate: run the attack on every step whose index is a multiple of this.
        gt_data_len: feature count for the attack's dummy data; a negative
            value means "use the second dimension of the mini-batch".
        gt_onehot_label_len: class count for the attack's dummy labels; a
            negative value means "use the width of the one-hot labels".
        epoch_mult: forwarded to ``batch_DLG_Adam``.
        w_decay: forwarded to ``batch_DLG_Adam``.
        scheduler_k: forwarded to ``batch_DLG_Adam``.
        leak_no_train: if true, never update the model (e.g. to attack the
            same random weights repeatedly).

    Returns:
        Tuple ``(losses, dlg_timestamps, AA_REs, AA_RRate, AA_cos_angles,
        CA_REs, CA_RRate, CA_cos_angles, grad_norms)``. ``losses`` has one
        detached loss per step; the other lists have one entry per attack.
        ``AA_*`` are the ``compute_stats`` results for the ``assign_guess``
        matching, ``CA_*`` those for the ``assign_best`` matching.
    """
    train_dst_len = train_data.shape[0]
    optimizer = torch.optim.Adam(net.parameters())

    # statistics
    losses = []; grad_norms = []
    dlg_timestamps = []
    AA_REs = []; CA_REs = []
    AA_RRate = []; CA_RRate = []
    AA_cos_angles = []; CA_cos_angles = []

    for epoch in trange(epochs):
        rand_subset = list(RandomSampler(range(train_dst_len), num_samples=batch_size))
        gt_data = torch.tensor(train_data[rand_subset]).to(device)
        gt_label = torch.tensor(train_target[rand_subset]).to(device)
        gt_onehot_label = label_to_onehot(gt_label, n_classes=output_dim)

        output = net(gt_data.float())
        loss = criterion(output, gt_onehot_label)

        # Perform DLG ---------------------------------------------------------------

        if epoch % dlg_rate == 0:
            # retain_graph so that the same loss can still be backpropagated for training below
            batch_dy_dx = torch.autograd.grad(loss, net.parameters(), retain_graph=True)
            batch_norm = sqrt(sum(torch.linalg.norm(g.detach())**2 for g in batch_dy_dx))
            grad_norms.append(batch_norm)
            original_dy_dx = [g.detach().clone() for g in batch_dy_dx] # share the gradients with other clients

            # resolve the -1 sentinels from the actual mini-batch
            data_len = gt_data_len if gt_data_len >= 0 else gt_data.shape[1]
            label_len = gt_onehot_label_len if gt_onehot_label_len >= 0 else gt_onehot_label.shape[1]
            guess, _ = batch_DLG_Adam(net, criterion, device, original_dy_dx, batch_size, data_len, label_len, epoch_mult, w_decay=w_decay, scheduler_k=scheduler_k, verbose=False)

            # rescale data for analysis (moved to the CPU first, since the scaler works on host data)
            gt_rescaled = torch.tensor(scaler.inverse_transform(gt_data.detach().clone().cpu()))
            guess = torch.tensor(scaler.inverse_transform(guess.detach().clone().cpu()))

            # assign all
            guess_perm = assign_guess(guess, gt_rescaled, batch_size, verbose)
            rel_errors, recovered_rate, cos_angles = compute_stats(guess_perm, gt_rescaled)
            AA_REs.append(rel_errors)
            AA_RRate.append(recovered_rate)
            AA_cos_angles.append(cos_angles)
            dlg_timestamps.append(epoch)

            # closest assignment
            best_match = assign_best(guess, gt_rescaled, batch_size, verbose)
            rel_errors, recovered_rate, cos_angles = compute_stats(guess, best_match)
            CA_REs.append(rel_errors)
            CA_RRate.append(recovered_rate)
            CA_cos_angles.append(cos_angles)

        # end of DLG ----------------------------------------------------------------

        # the attack may have accumulated gradients on the network parameters,
        # so clear them before the training backward pass
        optimizer.zero_grad()

        losses.append(loss.detach().clone())
        # *** if leak_no_train is set to true, then don't update model (e.g. for leaking on random weights) ***
        if leak_no_train:
            continue

        loss.backward()
        if verbose:
            print('current loss: ', loss)

        optimizer.step()

    return losses, dlg_timestamps, AA_REs, AA_RRate , AA_cos_angles, CA_REs, CA_RRate, CA_cos_angles, grad_norms

print('defined: train_net_Adam_DLG(net, criterion, device, scaler, train_data, train_target, output_dim, batch_size = 32, epochs = 100, verbose = False, dlg_rate = 10, gt_data_len = -1, gt_onehot_label_len = -1, epoch_mult=100, w_decay=1e-12, scheduler_k = 0.01, leak_no_train = False): losses, dlg_timestamps, AA_REs, AA_RRate , AA_cos_angles, CA_REs, CA_RRate, CA_cos_angles, grad_norms')
print(' --> leak_no_train allows for running DLG on the same set of random weights for statistical purposes')

defined: train_net_Adam_DLG(net, criterion, device, scaler, train_data, train_target, output_dim, batch_size = 32, epochs = 100, verbose = False, dlg_rate = 10, gt_data_len = -1, gt_onehot_label_len = -1, epoch_mult=100, w_decay=1e-12, scheduler_k = 0.01, leak_no_train = False): losses, dlg_timestamps, AA_REs, AA_RRate , AA_cos_angles, CA_REs, CA_RRate, CA_cos_angles
 --> leak_no_train allows for running DLG on the same set of random weights for statistical purposes
