# Beyond Black Boxes Tutorial

## Part 1: Manual Understanding


In [None]:
import torch
import torch.nn as nn
import torchvision
import random
import matplotlib.pyplot as plt

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
# Later cells move models and tensors onto this device.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [None]:
from torchvision.models import googlenet

# Load GoogLeNet with pretrained weights and move it onto `device`.
# NOTE(review): recent torchvision releases deprecate `pretrained=` in favour
# of a `weights=` argument - confirm against the installed version.
model = googlenet(pretrained=True).to(device)

In [None]:
# GoogleNet architecture: https://github.com/pytorch/vision/blob/main/torchvision/models/googlenet.py

class NewModel(nn.Module):
    """Truncated view of a GoogLeNet-style network that stops after ``inception5a``.

    The wrapped network's sub-modules are applied one after another and the
    feature map produced by ``inception5a`` is returned, so individual
    channels of that layer can be inspected or optimized directly.

    Shapes after each stage, as annotated in the torchvision reference
    implementation for an ``N x 3 x 224 x 224`` input:

        conv1        N x 64 x 112 x 112
        maxpool1     N x 64 x 56 x 56
        conv2        N x 64 x 56 x 56
        conv3        N x 192 x 56 x 56
        maxpool2     N x 192 x 28 x 28
        inception3a  N x 256 x 28 x 28
        inception3b  N x 480 x 28 x 28
        maxpool3     N x 480 x 14 x 14
        inception4a  N x 512 x 14 x 14
        inception4b  N x 512 x 14 x 14
        inception4c  N x 512 x 14 x 14
        inception4d  N x 528 x 14 x 14
        inception4e  N x 832 x 14 x 14
        maxpool4     N x 832 x 7 x 7
        inception5a  N x 832 x 7 x 7   (returned)

    The remaining stages of the full network are deliberately skipped:
    ``inception5b`` (N x 1024 x 7 x 7), ``avgpool`` (N x 1024 x 1 x 1),
    ``torch.flatten(x, 1)`` (N x 1024), ``dropout`` and ``fc``
    (N x 1000, num_classes).
    """

    # Attribute names of the wrapped model's sub-modules, in execution order.
    _STAGES = (
        "conv1",
        "maxpool1",
        "conv2",
        "conv3",
        "maxpool2",
        "inception3a",
        "inception3b",
        "maxpool3",
        "inception4a",
        "inception4b",
        "inception4c",
        "inception4d",
        "inception4e",
        "maxpool4",
        "inception5a",
    )

    def __init__(self, model):
        """Store ``model``, the network whose early stages are reused."""
        super().__init__()
        self.model = model

    def forward(self, x):
        """Feed ``x`` through every stage in ``_STAGES`` and return the result."""
        for stage_name in self._STAGES:
            x = getattr(self.model, stage_name)(x)
        return x

In [None]:
# Wrap the pretrained network in the truncated model and switch it to
# evaluation mode (`eval()` returns the module itself, so it can be chained).
newmodel = NewModel(model).eval()

# Seed Python's and PyTorch's (CPU and CUDA) random number generators with
# one fixed value so the randomly initialised image is reproducible.
seed = 999
random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

In [None]:
def layer_gradient(model, input_tensor, lrs, sigmas, index=0, iterations=100):
    """Optimize an image so that one output channel of ``model`` activates strongly.

    Each iteration runs the model, takes a gradient-descent step on the image
    that pushes every activation of the selected channel towards a large
    constant (200), and then smooths the image with a 3 x 3 Gaussian blur.
    The step size and the blur sigma are interpolated linearly from their
    first to their second value over the course of the run.

    Args:
        model: Callable returning a 4-D tensor; the output is indexed as
            ``output[0, index, :, :]``, so only the first batch item is used.
        input_tensor: Starting image batch. It is detached and moved to the
            module-level ``device``; the caller's tensor is not modified.
        lrs: Two-element sequence ``(start, end)`` of step sizes.
        sigmas: Two-element sequence ``(start, end)`` of Gaussian blur sigmas.
        index: Channel of the model output to maximize.
        iterations: Number of optimization steps (default 100, the value that
            was previously hard-coded). With 0 the input is returned as is.

    Returns:
        The optimized image as a detached tensor on ``device``.
    """
    # Detach first: the loop needs a leaf tensor so that `.grad` is populated,
    # even when the caller hands in a tensor that already tracks gradients.
    input_tensor = input_tensor.detach().to(device)

    for i in range(iterations):
        # Linear schedules from the start value (i == 0) towards the end value.
        step_size = lrs[0] * (iterations - i) / iterations + lrs[1] * i / iterations
        blur_sigma = sigmas[0] * (iterations - i) / iterations + sigmas[1] * i / iterations

        input_tensor.requires_grad_(True)
        output = model(input_tensor)

        focus = output[0, index, :, :]  # the target to maximize the output
        # Large constant target of the same shape, created directly on the
        # activation's device instead of on the CPU and then copied.
        target = torch.full(focus.shape, 200.0, device=focus.device)
        loss = torch.sum(target - focus)

        # NOTE: backward() also accumulates gradients into any model
        # parameters that require grad; only the image gradient is used here.
        loss.backward()

        # Work on a detached copy so the update and the blur are not recorded
        # by autograd; the result is a fresh leaf for the next iteration.
        input_tensor = input_tensor.detach() - step_size * input_tensor.grad
        input_tensor = torchvision.transforms.functional.gaussian_blur(
            input_tensor, 3, sigma=blur_sigma)

    return input_tensor

In [None]:
# Channel of the truncated model's output (inception5a) to visualize.
index = 5

# Start from random noise: one 3-channel 299 x 299 image.
input_tensor = (torch.rand(1, 3, 299, 299))
# Arguments are lrs=[0.5, 0.4] and sigmas=[2.4, 0.8] (start and end values).
# NOTE: `ouput_tensor` is a misspelling of "output_tensor"; the name is kept
# because the rest of this cell refers to it.
ouput_tensor = layer_gradient(newmodel, input_tensor, [0.5, 0.4], [2.4, 0.8], index)

# Disabled multi-scale refinement: upscale the image and optimize again with
# smaller step sizes and blur sigmas.
# NOTE(review): as written, the first disabled stage resizes `input_tensor`
# (the initial noise) rather than `ouput_tensor` (the optimized image) -
# confirm the intent before re-enabling.
# input_tensor = torchvision.transforms.Resize([380, 380])(input_tensor)
# input_tensor = layer_gradient(newmodel, input_tensor, [0.4, 0.3], [1.5, 0.4], index)

# input_tensor = torchvision.transforms.Resize([460, 460])(input_tensor)
# ouput_tensor = layer_gradient(newmodel, input_tensor, [0.3, 0.2], [1.1, 0.3], index)

# Clip pixel values into [0, 1] before display.
ouput_tensor = torch.clamp(ouput_tensor, 0, 1)

# visualize
# Take the first image and reorder its axes from (C, H, W) to (H, W, C) for imshow.
plt.imshow(ouput_tensor.data[0].cpu().detach().numpy().transpose(1, 2, 0))
plt.axis('off')
plt.show()

## Part 2: Lucent library

<img src="https://raw.githubusercontent.com/greentfrapp/lucent/master/images/lucent_header.jpg" width="600"></img>

### Basics

In [None]:
!pip install --quiet git+https://github.com/greentfrapp/lucent.git

In [None]:
import torch

from lucent.optvis import render, param, transform, objectives
from lucent.modelzoo.util import get_model_layers

from torchvision.models import googlenet, resnet50

In [None]:
# Alternative backbone (disabled):
# model = resnet50(pretrained=True).to(device).eval()
# Load the pretrained GoogLeNet, move it to `device` (defined in Part 1) and
# put it in evaluation mode. This rebinds the `model` name used in Part 1.
model = googlenet(pretrained=True).to(device).eval()

Use `get_model_layers` to retrieve all the model layers.

In [None]:
# Print the last 10 layer names (the `[-10:]` slice takes the end of the list)
get_model_layers(model)[-10:]

In [None]:
import matplotlib.pyplot as plt
import numpy as np

def show_result(result):
    """Print the shape of the first entry of ``result`` and display it as an image.

    Args:
        result: Indexable collection (the call sites in this notebook pass
            the return value of ``render.render_vis``). Only element 0 is
            used; singleton dimensions are removed with ``squeeze()`` before
            it is shown in a 5 x 5 inch figure.
    """
    image = result[0].squeeze()
    print(image.shape)

    plt.figure(figsize=(5, 5))
    plt.imshow(image)
    plt.show()

Then we can just use `render.render_vis` to generate a visualization of a channel at a particular layer.

In [None]:
# Visualize channel 5 of the "inception5a" layer, given as a "layer:channel" string.
show_result(render.render_vis(model, "inception5a:5", show_inline=False))

We can also try to optimize for a particular label by just passing "labels" as the layer name, which will use the last layer in the model.

ImageNet label IDs are [here](https://gist.github.com/yrevar/942d3a0ac09ec9e5eb3a).

In [None]:
# We can try to optimize for "tiger cat", which has label ID 282 for this model.
# The "labels" layer name selects the model's last layer, as described above.
show_result(render.render_vis(model, "labels:282", show_inline=False))

Unfortunately we don't get very good results, because we are optimizing the output of a softmax layer.

To maximize a particular softmax output, the model can choose to maximize the corresponding input, minimize all other inputs, or do some combination of the two. We would get a much better visualization if we could directly maximize the logits instead, which we can for the `torchvision` models!

### Fiddling with the Knobs

Lucent splits visualizations into *objectives*, *parameterizations* and *transforms*:

* **Objectives** -- What do you want the model to visualize?
* **Parameterizations** -- How do you describe the image?
* **Transforms** -- What transformations do you want your visualization to be robust to?

#### Objectives

What loss function do we want to minimize? What part of the model do we want to understand? In essence, we are trying to generate an image that causes a particular neuron or filter to activate strongly. The objective allows us to select a specific neuron, channel or a mix!

In [None]:
# The default is optimizing for a channel/filter; here the objective is built
# explicitly for channel 248 of the "inception4a" layer.
obj = objectives.channel("inception4a", 248)
show_result(render.render_vis(model, obj, show_inline=False))

In [None]:
# Same kind of channel objective, this time for channel 261 of "inception4a"
obj = objectives.channel("inception4a", 261)
show_result(render.render_vis(model, obj, show_inline=False))

In [None]:
# Optimize for a single neuron instead of a whole channel: channel 261 of "inception4a".
# NOTE(review): the third argument (5) is presumably a spatial coordinate of
# the neuron - confirm against lucent's `objectives.neuron` signature.
obj = objectives.neuron("inception4a", 261, 5)
show_result(render.render_vis(model, obj, show_inline=False))

In [None]:
# We can sum objectives: adding the channel-261 and channel-248 objectives of
# "inception4a" (the two channels visualized above) gives a mix of the two images
channel = lambda n: objectives.channel("inception4a", n)
obj = channel(261) + channel(248)
show_result(render.render_vis(model, obj, show_inline=False))

#### Transformations

Another way to reduce high-frequency components in the visualization is by imposing constraints in the form of transformation robustness.

Read more about this in [The Enemy of Feature Visualization](https://distill.pub/2017/feature-visualization/#enemy-of-feature-vis) from Distill's Feature Visualization article.

In [None]:
# No transformations (an empty `transforms` list), similar to our example earlier.
# `obj` is reused unchanged by the transform examples in the following cells.
obj = objectives.channel("inception4a", 261)
show_result(render.render_vis(model, obj, transforms=[], show_inline=False))

In [None]:
# Adding jitter, notice that the visualization is much less noisy!
# `obj` is the channel objective defined in the previous cell.
jitter_only = [transform.jitter(8)]
show_result(render.render_vis(model, obj, transforms=jitter_only, show_inline=False))

In [None]:
# Adding a whole suite of transforms!
# `obj` is the channel objective defined in an earlier cell.
all_transforms = [
    transform.pad(16),
    transform.jitter(8),
    # Scale factors 0.80, 0.81, ..., 1.19
    transform.random_scale([n/100. for n in range(80, 120)]),
    # Values -10..9 once, -5..4 once more and -2..1 ten more times, so small
    # values make up most of the list (presumably one entry is drawn at random
    # per step, favouring small rotations - confirm in lucent.optvis.transform)
    transform.random_rotate(list(range(-10,10)) + list(range(-5,5)) + 10*list(range(-2,2))),
    transform.jitter(2),
]

show_result(render.render_vis(model, obj, transforms=all_transforms, show_inline=False))

And that's it for now!

Credit and reference: https://github.com/greentfrapp/lucent