In [1]:
%load_ext autoreload
%autoreload 2
import sys
import torch
import copy
from torch import nn
from torch.utils import data
sys.path.append('..')
# sys.path.append('/system/user/beck/pwbeck/projects/regularization/ml_utilities')
from omegaconf import OmegaConf
from ml_utilities.torch_models import get_model_class
from ml_utilities.torch_utils import gradients_to_vector
from erank.data.datasetgenerator import DatasetGenerator
# Device index passed to `.to(...)` for the batch and the models in later cells
# (the saved outputs show the resulting tensors on 'cuda:0').
gpu_id = 0

  from .autonotebook import tqdm as notebook_tqdm


## Setup

In [2]:
# model config
# YAML experiment config; it is parsed with OmegaConf at the end of this cell.
# - `${model.out_channels}` / `${model.out_units}` are OmegaConf interpolations
#   that refer back to the values defined directly under `model:`.
# - There are four conv layer configs; only the first two set `max_pool_kernel_size`.
# - NOTE(review): `data_root_path` is an absolute, machine-specific path and has
#   to be adjusted before running this notebook on another machine.
cfg = """
model:
  name: cnn2d
  out_channels: 64
  out_units: 10
  model_kwargs:
    image_size: 28
    in_channels: 1
    act_fn: relu
    layer_configs:
      - out_channels: ${model.out_channels}
        kernel_size: 3
        batch_norm: true
        stride: 1
        padding: 0
        max_pool_kernel_size: 2
      - out_channels: ${model.out_channels}
        kernel_size: 3
        batch_norm: true
        stride: 1
        padding: 0
        max_pool_kernel_size: 2
      - out_channels: ${model.out_channels}
        kernel_size: 3
        batch_norm: true
        stride: 1
        padding: 0
      - out_channels: ${model.out_channels}
        kernel_size: 3
        batch_norm: true
        stride: 1
        padding: 0
    linear_output_units:
      - ${model.out_units}

data:
  dataset: rotatedvision
  dataset_kwargs:
    data_root_path: /home/max/phd/data #/system/user/beck/pwbeck/data
    dataset: mnist
    rotation_angle: 0.0
  dataset_split:
    train_val_split: 0.8
"""
# `cfg` is rebound here: from the raw YAML string to the parsed config object.
cfg = OmegaConf.create(cfg)

In [3]:
# model
model_cfg = cfg.model
# Look up the model class for the configured name (`cnn2d` in the YAML config).
model_class = get_model_class(model_cfg.name)
# init model
# Newly constructed network; a later cell zeroes its weights and uses it as the
# addend in `add_models`.
model_ad = model_class(**model_cfg.model_kwargs)
# load model
# Checkpoint loaded onto the CPU.
# NOTE(review): relative path, so it depends on the notebook's working directory.
model_p = model_class.load('./../tmp/11.6_models/model_step_1550.p', device='cpu')

In [4]:
# data
# Build the dataset described by `cfg.data` and draw one fixed batch that both
# gradient computations below operate on.
batch_size = 64
data_cfg = cfg.data
dataset_generator = DatasetGenerator(dataset=data_cfg.dataset,
                                    dataset_kwargs=data_cfg.dataset_kwargs,
                                    dataset_split=data_cfg.dataset_split)
dataset_generator.generate_dataset()
train_set, val_set = dataset_generator.train_split, dataset_generator.val_split

# Use the `batch_size` defined above instead of repeating the literal, so that
# changing it in one place actually takes effect.
train_loader = data.DataLoader(train_set, batch_size=batch_size)
# No shuffling is requested, so this is the first batch in dataset order.
batch_x, batch_y = next(iter(train_loader))

## Playground

In [5]:
# add model together
def add_models(model1: nn.Module, model2: nn.Module, res_model: nn.Module = None) -> nn.Module:
    """Write the parameter-wise sum of ``model1`` and ``model2`` into a model and return it.

    Only entries yielded by ``named_parameters()`` are summed. Every other
    ``state_dict`` entry of the result model (e.g. buffers) keeps the value it
    already had in ``res_model``.

    The sum is computed under ``torch.no_grad()`` and copied into the result via
    ``load_state_dict``, so the result's parameters carry no autograd link back
    to ``model1`` or ``model2``.

    Args:
        model1: First addend. Also serves as the template (deep copy) for the
            result when ``res_model`` is not given.
        model2: Second addend. Must expose the same parameter names, in the
            same order, as ``model1``.
        res_model: Optional model that receives the summed parameters in place.
            If ``None``, a deep copy of ``model1`` is created and used.

    Returns:
        The model holding the summed parameters: ``res_model`` itself if it was
        passed, otherwise the newly created copy.

    Raises:
        AssertionError: If the two models differ in parameter count or names.
    """
    if res_model is None:
        res_model = copy.deepcopy(model1)

    params1 = list(model1.named_parameters())
    params2 = list(model2.named_parameters())
    # zip() alone would silently ignore surplus parameters of the longer model.
    assert len(params1) == len(params2), 'Models to add do not match!'

    state_dict = res_model.state_dict()
    with torch.no_grad():
        for (name_p1, p1), (name_p2, p2) in zip(params1, params2):
            assert name_p1 == name_p2, 'Models to add do not match!'
            state_dict[name_p1] = p1 + p2
    res_model.load_state_dict(state_dict)
    # Always hand the result back. The previous `if res_model is None` guard
    # could never be true here, so the freshly created copy was lost.
    return res_model

# set model weights to zero
def set_model_weights_to_zero(model : nn.Module) -> None:
    """Overwrite every parameter tensor of ``model`` with zeros, in place.

    The write happens under ``torch.no_grad()`` so it is not recorded by autograd.
    """
    with torch.no_grad():
        for weight in model.parameters():
            weight.zero_()

# Loss used for both the direct and the addition gradient computation.
loss_fn = nn.CrossEntropyLoss()
# Move the fixed sample batch to the device the models are moved to later.
batch_x = batch_x.to(gpu_id)
batch_y = batch_y.to(gpu_id)

In [6]:
## Direct gradient
# Work on a deep copy so that `model_p` itself stays untouched (it is reused as
# an addend for `add_models` further down).
model = copy.deepcopy(model_p)

In [7]:
# Move the copy to the same device the batch was moved to.
model = model.to(device=gpu_id)

In [8]:
# Forward pass of the copied checkpoint model on the fixed batch.
pred = model(batch_x)

In [9]:
# Cross-entropy loss of the predictions against the batch labels.
l = loss_fn(pred, batch_y)

In [10]:
# Backpropagate; the resulting gradients on `model`'s parameters are read in the next cell.
l.backward()

In [11]:
# Gradient of the loss w.r.t. the directly loaded weights, collected into one
# vector (displayed later as a 1-D tensor on cuda:0).
grad_direct = gradients_to_vector(model.parameters())

In [12]:
## Addition gradient
# `model2` is a newly constructed network that later receives the sum
# `model_p + model_ad` through `add_models`.
model2 = model_class(**model_cfg.model_kwargs)
# Zero `model_ad` in place so that the sum has the same parameter values as `model_p`.
set_model_weights_to_zero(model_ad)

In [13]:
# Sanity check: show the first (name, parameter) pair of `model_ad`; it should be all zeros now.
next(iter(model_ad.named_parameters()))

('cnn.0.0.weight',
 Parameter containing:
 tensor([[[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0., 0.]]],
 
 
         [[[0., 0., 0.],
           [0., 0., 0.],
           [0., 0

In [14]:
# Only `model2` is moved to the GPU; `model_p` was loaded with device='cpu'.
model2 = model2.to(device=gpu_id)

In [15]:
# Writes `model_p + model_ad` into `model2`'s parameters (via `load_state_dict`
# inside `add_models`). The sum is computed under `torch.no_grad()`, so `model2`
# has no autograd link back to `model_p` or `model_ad`.
add_models(model_p, model_ad, model2)

In [16]:
# Same forward / loss / backward sequence as for the direct gradient, now run
# through `model2`. Note that this rebinds `pred` and `l` from the direct path.
pred = model2(batch_x)
l = loss_fn(pred, batch_y)
l.backward()

In [17]:
# Sanity check: first (name, parameter) pair of `model2` after the addition.
next(iter(model2.named_parameters()))

('cnn.0.0.weight',
 Parameter containing:
 tensor([[[[ 0.0366,  0.1346, -0.3255],
           [-0.3141, -0.1886,  0.0856],
           [ 0.0753,  0.3080,  0.0117]]],
 
 
         [[[ 0.1961, -0.0537, -0.1676],
           [-0.5037, -0.3206, -0.1433],
           [ 0.0345,  0.2601,  0.2863]]],
 
 
         [[[-0.3473, -0.2177,  0.1239],
           [ 0.2187, -0.0845,  0.2903],
           [-0.0598,  0.0504,  0.3453]]],
 
 
         [[[-0.4789, -0.3427, -0.1798],
           [-0.3042,  0.0845, -0.3911],
           [-0.2564, -0.3690, -0.4676]]],
 
 
         [[[-0.0852,  0.3991,  0.2719],
           [ 0.2268,  0.1098, -0.1304],
           [-0.0134, -0.4354, -0.3453]]],
 
 
         [[[-0.2677,  0.1783,  0.2145],
           [-0.1276,  0.0574,  0.2516],
           [ 0.3509,  0.1684,  0.0743]]],
 
 
         [[[ 0.1849, -0.1525,  0.2097],
           [-0.3901, -0.3820, -0.3454],
           [ 0.2953,  0.2119, -0.2347]]],
 
 
         [[[ 0.0669,  0.1662, -0.0644],
           [ 0.0384,  0.0838,  0.186

In [18]:
# Gradient as seen from the zero-weight addend `model_ad`.
# The saved output is `None`: `model_ad` never took part in a forward/backward
# pass, because `add_models` only copied the summed values into `model2` under
# `torch.no_grad()`.
# NOTE(review): presumably `gradients_to_vector` returns None when the
# parameters have no `.grad` — confirm in ml_utilities.
grad_addition = gradients_to_vector(model_ad.parameters())
print(grad_addition)

None


In [19]:
# Display the direct gradient for comparison with the gradient of `model2` below.
grad_direct

tensor([-0.0046, -0.0070, -0.0045,  ..., -0.0059, -0.0016, -0.0011],
       device='cuda:0')

In [20]:
# `model_p` is only deep-copied and read inside `add_models` in the cells shown,
# never run through a backward pass; no output is shown for this cell.
gradients_to_vector(model_p.parameters())

In [21]:
# Gradient accumulated on `model2` by its backward pass; the values displayed in
# the saved output match those shown for `grad_direct`.
gradients_to_vector(model2.parameters())

tensor([-0.0046, -0.0070, -0.0045,  ..., -0.0059, -0.0016, -0.0011],
       device='cuda:0')

## Verify gradients are equal
Check whether the gradient of a model to which a zero-weight model has been added is the same as the gradient of the original model.