### Simple toy example
This notebook reproduces empirically the toy example in Fig.1 on the paper. We compute attributions for the only hidden layer of the model using different methods.

In [9]:
import torch
import torch.nn as nn
import numpy as np
import torch.nn.functional as F

# Define the model weights
w1 = torch.tensor(np.array([[-0.25, 1.0, 1.0, 1.0], [0.25, -1.0, 1.0, 1.0]])).float()
b1 =  torch.tensor(np.array([-0.,0.,0.,0.])).float()
w2 = torch.tensor(np.array([[2], [0.5], [0.5,], [0.]])).float()

# Generate some input data
x_test = np.array([[10*np.random.random(), 10*np.random.random()] for _ in range(100)])
y_test = np.array([[np.max(xi)] for xi in x_test])
x_test = torch.tensor(x_test).float()
y_test = torch.tensor(y_test).float()

In [10]:
def run(x):
    z = F.relu(torch.mm(x, w1) + b1)
    z.requires_grad = True
    y = torch.mm(z, w2)
    return z, y

def run_last(z):
    y = torch.mm(z, w2)
    return y

z, y = run(x_test)
loss = F.mse_loss(y, y_test)
print (f"Loss: {loss}")

# APoZ
print (f"APoZ scores: {(z > 0.).sum(0)}")

# Weight norm
print (f"Weight norm: {w1.abs().sum(0)}")

# Gradient and Taylor
loss.backward()
print (f"Gradient: {z.grad.abs().sum(0)}")
print (f"Taylor: {(z.grad * z).sum(0)}")

# SV
sv = np.array([0., 0., 0., 0.])
for _ in range(20000):
    _z = z.clone().detach()
    _y = run_last(_z)
    loss = F.mse_loss(_y, y_test)
    for i in np.random.permutation(4):
        _z.index_fill_(1, torch.tensor(np.array([i])), 0.0)
        _y = run_last(_z)
        new_loss = F.l1_loss(_y, y_test)
        delta = new_loss - loss
        sv[i] += delta.clone().detach().numpy()
        loss = new_loss
print (f"SV: {sv / 20000}")
        
        

Loss: 1.1722183227539062
APoZ scores: tensor([ 5056,  4944, 10000, 10000])
Weight norm: tensor([0.5000, 2.0000, 2.0000, 2.0000])
Gradient: tensor([4.0052, 1.0013, 1.0013, 0.2003])
Taylor: tensor([ -1.7014,  -1.6179, -11.7222,   2.3444], grad_fn=<SumBackward1>)
SV: [ 0.80446774  0.76193867  4.96361897 -1.044156  ]
