# Load and visualize models

Install prerequisites:

```bash
# huggingface transformers
pip install transformers

# visualizer
conda install -y graphviz python-graphviz
pip install torchlens torchview
```


In [None]:
import torch
from torchvision.models import ResNet18_Weights
from torchvision import models

from torchview import draw_graph
import torchlens as tl


## ResNet18

In [None]:
# Load ResNet18 with the ImageNet-1k (V1) pretrained weights.
# torchvision hands the model back in training mode; `.eval()` (which returns the
# module itself) switches it to inference mode so that tracing a forward pass on
# a dummy batch does not update the BatchNorm running statistics of the
# pretrained weights.
model: models.resnet.ResNet = models.resnet18(
    weights=ResNet18_Weights.IMAGENET1K_V1
).eval()


In [None]:
# Print the built-in help (signature and docstring) of torchlens'
# `log_forward_pass`, the function used for every trace in this notebook.
help(tl.log_forward_pass)


In [None]:
# Trace a single ResNet18 forward pass on an all-zeros batch of shape
# (2, 3, 224, 224) and let torchlens write the graph using the options below.
resnet_input = torch.zeros((2, 3, 224, 224))
resnet_vis_options = {
    "vis_opt": "rolled",
    "vis_direction": "topdown",
    "vis_fileformat": "svg",
    "vis_outpath": "resnet18.svg",
}
model_history = tl.log_forward_pass(model, resnet_input, **resnet_vis_options)
print(model_history)


## VisionTransformer

In [None]:
from transformers import AutoImageProcessor, AutoModelForImageClassification

# Hugging Face Hub id shared by the image processor and the classifier.
VIT_CHECKPOINT = "WinKawaks/vit-small-patch16-224"

processor = AutoImageProcessor.from_pretrained(VIT_CHECKPOINT)
vit_model = AutoModelForImageClassification.from_pretrained(VIT_CHECKPOINT)


In [None]:
# Print every config entry, one per line. The two label mappings are summarised
# by their length instead of being printed in full.
LABEL_MAP_KEYS = ("label2id", "id2label")

for key, value in vit_model.config.to_dict().items():
    shown = f"length {len(value)}" if key in LABEL_MAP_KEYS else f"{value}"
    print(f"{key:40s}: {shown}")


Inspect the model code to find out where the positional encoding is added, since that is not obvious from the model graph output.

It turns out that the position encodings are learned weights that are added at the end of `vit_model.vit.embeddings.forward`.

In the graph they show up as `add_1_6 params 1x197x384`:

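The sequence length is 197 = 1 learned "CLS" token + 14x14 = 196 image patches (a 224x224 image split into 16x16 patches gives 14 patches per side), and each of these 197 positions gets its own learned position embedding.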


In [None]:
import inspect

# Dump the source of the three nested `forward` methods, from the outer
# classifier down to the embedding layer, each followed by a separator line,
# and finish with the position-embedding parameter itself.
SEPARATOR = "========================================================"
forward_methods = (
    vit_model.forward,
    vit_model.vit.forward,
    vit_model.vit.embeddings.forward,
)

for forward_method in forward_methods:
    print(inspect.getsource(forward_method))
    print(SEPARATOR)

print(vit_model.vit.embeddings.position_embeddings)


In [None]:
# Bare expression as the last statement of the cell: the notebook renders the
# model's repr as the cell output.
vit_model


In [None]:
# Trace a single ViT forward pass on an all-zeros batch of shape
# (2, 3, 224, 224) and let torchlens write the graph using the options below.
vit_input = torch.zeros((2, 3, 224, 224))
vit_vis_options = {
    "vis_opt": "unrolled",
    "vis_direction": "topdown",
    "vis_fileformat": "svg",
    "vis_outpath": "vit_small.svg",
    "vis_nesting_depth": 99,
}
model_history = tl.log_forward_pass(vit_model, vit_input, **vit_vis_options)
print(model_history)


## Generative Language Transformer

With one forward pass we can generate one token ("word") at a time.

Here: "Hello my name" -> model -> " is"

For a longer output you would call the model in a loop, appending each generated token to the input. Hugging Face provides this functionality via `model.generate`.

Note: for a batch of texts with varying input lengths, you need to pad the inputs properly (pad on the left side for generative transformers) and pass an `attention_mask` so that the padding tokens are masked out.


In the graph you can see that the embedding consists of a token embedding (50257x768), which maps token ids to embedding vectors (the tokenizer has already mapped the text to token ids), and a position embedding (1024x768), of which only the first 3 positions are used.

In [None]:
# Load the tokenizer and the causal language model from the same checkpoint.
# NOTE: this rebinds `model`, which earlier in the notebook held the ResNet18.
from transformers import AutoModelForCausalLM, AutoTokenizer

GPT_CHECKPOINT = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(GPT_CHECKPOINT)
model = AutoModelForCausalLM.from_pretrained(GPT_CHECKPOINT)

# Tokenize the prompt into PyTorch tensors and run one forward pass over it.
tokens = tokenizer("Hello my name", return_tensors="pt")
prompt_output = model(tokens.input_ids)
output_logits = prompt_output.logits

print(output_logits.shape)


In [None]:
# Greedy decoding: take the highest-scoring vocabulary id along the last axis of
# the logits, then decode the id at the last position of the first sequence.
greedy_decoding = torch.argmax(output_logits, dim=-1)
last_token_id = greedy_decoding[0, -1]
tokenizer.decode(last_token_id, skip_special_tokens=False)


In [None]:
# Trace one forward pass of the language model on the tokenized prompt and let
# torchlens write the graph using the options below.
gpt_vis_options = {
    "vis_opt": "unrolled",
    "vis_direction": "topdown",
    "vis_fileformat": "svg",
    "vis_nesting_depth": 99,
    "vis_outpath": "distilgpt2.svg",
}
model_history = tl.log_forward_pass(model, tokens.input_ids, **gpt_vis_options)
print(model_history)
