In [None]:
import os
from pathlib import Path

import torch 
import numpy as np

from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM, GPTNeoForCausalLM

In [None]:
# One torch.cuda.device handle per visible GPU (kept for any later use of `available_gpus`).
available_gpus = [torch.cuda.device(i) for i in range(torch.cuda.device_count())]
# The handles themselves only render as opaque object reprs, so display a
# readable mapping of GPU index -> device name instead.
{i: torch.cuda.get_device_name(i) for i in range(len(available_gpus))}

In [None]:
# Preferred GPU index on the shared machine; machines with fewer GPUs fall back
# to the first GPU, and machines without CUDA fall back to the CPU.
PREFERRED_GPU = 7
if not torch.cuda.is_available():
    device = 'cpu'
elif PREFERRED_GPU < torch.cuda.device_count():
    device = f'cuda:{PREFERRED_GPU}'
else:
    device = 'cuda:0'

In [None]:
# Attention pattern used for the forward pass. The value is compared against
# "bidirectional" when the attention mask is built and is also used as the last
# path component of the output directories.
# attention_type = "causal"
attention_type = "bidirectional"

In [None]:
# Hugging Face access token (needed for gated checkpoints). Read it from the
# HF_TOKEN environment variable so the secret never gets saved in the notebook;
# falls back to the previous empty-string placeholder when the variable is unset.
token = os.environ.get("HF_TOKEN", "")

In [None]:
# Checkpoint to analyse; keep exactly one assignment uncommented.
# NOTE(review): the bidirectional-attention cell only patches the model for the
# GPT-Neo and the three Llama-family ids; for the Mistral ids it only builds the
# 4D mask — verify that the selected model actually honours that mask.
# model_name_or_path = "EleutherAI/gpt-neo-1.3B"
# model_name_or_path = "princeton-nlp/Sheared-LLaMA-1.3B"
# model_name_or_path = "meta-llama/Llama-2-7b-hf"
# model_name_or_path = "meta-llama/Llama-2-7b-chat-hf"
model_name_or_path = "mistralai/Mistral-7B-v0.1"
# model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.1"
# model_name_or_path = "mistralai/Mistral-7B-Instruct-v0.2"


In [None]:
# Cache directory passed as `cache_dir` when the pre-trained model is loaded;
# change it to point at a location with enough disk space.
CACHE_DIR = "/data/pre-trained-models-cache"

In [None]:
# Load the tokenizer and the causal LM for the selected checkpoint.
# Both calls share the same access token and the same cache directory, so the
# tokenizer files are stored in CACHE_DIR next to the model weights instead of
# the default per-user cache.
tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, token=token, cache_dir=CACHE_DIR)
lm = AutoModelForCausalLM.from_pretrained(model_name_or_path, token=token, cache_dir=CACHE_DIR)

In [None]:
# Put the model into evaluation mode. As the last expression of the cell, the
# value returned by eval() is also displayed.
lm.eval()

In [None]:
# Sample passage about Montreal that serves as the single model input.
text = 'Montreal is the second most populous city in Canada, the tenth most populous city in North America, and the most populous city in the province of Quebec. Founded in 1642 as Ville-Marie, or "City of Mary",[15] it is named after Mount Royal,[16] the triple-peaked hill around which the early city of Ville-Marie was built.[17] The city is centred on the Island of Montreal, which obtained its name from the same origin as the city,[18][19] and a few much smaller peripheral islands, the largest of which is Île Bizard. The city is 196 km (122 mi) east of the national capital, Ottawa, and 258 km (160 mi) southwest of the provincial capital, Quebec City.'

# Encode once and keep both the ids and their token strings for later inspection.
ids = tokenizer.encode(text)
tokens = tokenizer.convert_ids_to_tokens(ids)

# Prepend a batch axis of size 1 so the ids form a (1, seq_len) tensor.
input_ids = torch.tensor(ids).unsqueeze(0)
for shown in (input_ids.shape, input_ids, tokens):
    print(shown)

In [None]:
# Build the attention mask for the forward pass; None leaves the model's own masking in place.
attention_mask = None
if attention_type == "bidirectional":
    # all-ones 4D mask of shape (batch_size, 1, seq_len, seq_len), moved to the target device
    seq_len = len(ids)
    attention_mask = torch.ones(size=(1, 1, seq_len, seq_len)).to(device)

    llama_family = (
        "princeton-nlp/Sheared-LLaMA-1.3B",
        "meta-llama/Llama-2-7b-hf",
        "meta-llama/Llama-2-7b-chat-hf",
    )
    if model_name_or_path in llama_family:
        # replace the model's `_update_causal_mask` with a pass-through so the
        # mask built above is returned unchanged
        def _keep_mask_unchanged(attention_mask, _):
            return attention_mask

        lm.model._update_causal_mask = _keep_mask_unchanged

    if model_name_or_path == "EleutherAI/gpt-neo-1.3B":
        gpt_neo_max_length = 2048
        bi_mask = torch.ones((1, 1, gpt_neo_max_length, gpt_neo_max_length), dtype=torch.bool)

        # overwrite the per-layer `bias` attribute (the causal mask) with the all-True mask
        for block in lm.transformer.h:
            block.attn.attention.bias = bi_mask

    # NOTE(review): no model patch is applied for any other checkpoint id —
    # confirm the 4D mask alone gives bidirectional attention there.

In [None]:
# Move the model and the token ids onto the selected device.
lm = lm.to(device)
input_ids = input_ids.to(device=device)

In [None]:
# Single forward pass that also returns the per-layer attention maps and hidden states.
# The inputs double as labels so the model additionally reports a loss.
labels = input_ids
# Pure inference: disable autograd so no graph is kept alive for the returned
# logits, attentions and hidden states (large memory saving on multi-billion
# parameter models). Calling the module itself rather than `.forward` also
# runs any registered module hooks.
with torch.no_grad():
    output = lm(
        input_ids=input_ids,
        labels=labels,
        attention_mask=attention_mask,
        output_attentions=True,
        output_hidden_states=True,
    )

----

In [None]:
# Inspect one attention matrix from the final entry of `output.attentions`.
last_layer_attention = output.attentions[-1].squeeze()
# last slice along the first remaining axis (presumably the last head — confirm)
A = last_layer_attention[-1].detach().cpu().numpy()
# strictly-upper triangle = weights on later ("future") positions
future_weights = np.triu(A, 1)
print(future_weights)

In [None]:
# Write every layer's attention tensor to its own .npy file.

# TODO(mm): save to /local
data_path = f"/projects/transformers/attention_data/{model_name_or_path.split('/')[-1]}/{attention_type}"

# make sure the target directory (and its parents) exists
os.makedirs(data_path, exist_ok=True)

for layer, layer_attention in enumerate(output.attentions):
    # drop size-1 axes and copy to host memory before serialising
    A = layer_attention.squeeze().detach().cpu().numpy()
    file_name = f"A_layer{layer}.npy"
    np.save(os.path.join(data_path, file_name), A)

----

In [None]:
# Loss reported by the forward pass (labels were supplied in that call).
output.loss

In [None]:
# Drop all size-1 axes (here the batch axis) from the logits and report the resulting shape.
logits = torch.squeeze(output.logits)
print(logits.size())

In [None]:
# Highest-scoring index along axis 1 of the logits for every row.
preds = logits.argmax(dim=1)
print(preds.size())

In [None]:
# Map the predicted ids back to token strings. The tokenizer API is documented
# for an int or a list of ints, so convert the (possibly GPU-resident) tensor to
# a plain Python list first instead of relying on element-wise tensor coercion.
preds_tokens = tokenizer.convert_ids_to_tokens(preds.tolist())

In [None]:
# Display the input tokens, for comparison with the predicted tokens.
tokens

In [None]:
# Display the predicted tokens.
# NOTE(review): for a causal LM the prediction at position i presumably targets
# the token at position i+1, so this list is offset by one w.r.t. `tokens` — confirm.
preds_tokens

In [None]:
# Number of hidden-state tensors returned by the forward pass
# (presumably the embedding output plus one per layer — confirm for this model).
len(output.hidden_states)

In [None]:
# Shape of the final entry of `output.hidden_states`.
output.hidden_states[-1].shape

In [None]:
# Shape of the slice at index 10 (first remaining axis after squeeze()) of the
# final hidden state — presumably the vector for the token at position 10; confirm.
output.hidden_states[-1].squeeze()[10].shape

In [None]:
# Write every entry of `output.hidden_states` to its own .npy file.

# TODO(mm): save to /local
data_path = f"/projects/transformers/hidden_states_data/{model_name_or_path.split('/')[-1]}/{attention_type}"

# make sure the target directory (and its parents) exists
os.makedirs(data_path, exist_ok=True)

for layer, layer_hidden in enumerate(output.hidden_states):
    # unlike the attention dump, no squeeze() is applied here, so size-1 axes are kept
    A = layer_hidden.detach().cpu().numpy()
    file_name = f"H_layer{layer}.npy"
    np.save(os.path.join(data_path, file_name), A)