<h1> BERT Visualization Techniques</h1>

Aims

1. Experiment with various visualization techniques and modules specific to BERT for PyTorch.
2. Assess which methods suit the project best.

Initial sources:
    
https://towardsdatascience.com/deconstructing-bert-part-2-visualizing-the-inner-workings-of-attention-60a16d86b5c1 
    
https://github.com/jessevig/bertviz#neuron-view  

<h2> Techniques to Try </h2>

1. Attention-head view (see https://github.com/tensorflow/tensor2tensor/tree/master/tensor2tensor/visualization) <br>
2. Model view <br>
3. Neuron view

In [1]:
import os

import numpy as np
import pandas as pd
import torch
from torch.utils.data import DataLoader, RandomSampler, SequentialSampler

from bertviz import head_view, model_view
from bertviz.neuron_view import show
from transformers import BertTokenizer, BertModel, BertForSequenceClassification

#import functions from other notebooks
from ipynb.fs.defs.BERT_Initial_Training import ClassificationDataset

ModuleNotFoundError: No module named 'ipynb.fs.defs.BERT_Initial_Training'

In [None]:
# --- Paths -----------------------------------------------------------------
# The project root can be overridden with the BERT_VIZ_PROJECT_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original location is used.
project_dir = os.environ.get("BERT_VIZ_PROJECT_DIR", "/Users/paulp/Desktop/UEF/Thesis")
data_dir = os.path.join(project_dir, 'Data')
model_dir = os.path.join(project_dir, 'Models/classifier')
model_path = 'CLS_2ep_256_e-5_model.pt'
test_set_path = 'test.csv'

# Relative paths used later in the notebook resolve against the project root.
os.chdir(project_dir)
test_set = pd.read_csv(os.path.join(data_dir, test_set_path))

# --- Device ----------------------------------------------------------------
# Prefer Apple's Metal backend, but fall back to the CPU on machines where it
# is not available instead of handing out an unusable device.
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

# --- Tokenizer -------------------------------------------------------------
spec_tokens = ['<?>', '<*>', '<R>', #one of the corpora uses these
               '<MISC>', 
               '<LOC>', 
               '<PER>', 
               '<ORG>'] # these will mask named entities later if needed

tokenizer = BertTokenizer.from_pretrained('bert-base-cased', 
                                          additional_special_tokens = spec_tokens,
                                         )

In [None]:
# custom dataset class wrapped around torch.utils dataset

class ClassificationDataset(torch.utils.data.Dataset):
    """Text-classification dataset built from a DataFrame.

    Every row of ``df`` is tokenized once, at construction time, and the
    encodings are kept in memory together with the integer class labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a ``'Text'`` column (raw strings) and a ``'Target'``
        column (class names that are keys of ``label_map``).
    label_map : mapping, optional
        Class name -> integer index. When omitted, the notebook-level
        ``target_idx`` is used, which must already be defined.
    encoder : optional
        Object exposing ``encode_plus(text, **kwargs)``. When omitted, the
        notebook-level ``tokenizer`` is used.
    max_length : int, optional
        Length every sequence is truncated/padded to (default 512).

    Raises
    ------
    KeyError
        If a value in ``df['Target']`` is missing from the label mapping.
    """

    def __init__(self, df, label_map=None, encoder=None, max_length=512):
        # Fall back to the notebook globals so existing single-argument calls
        # keep working exactly as before.
        if label_map is None:
            label_map = target_idx
        if encoder is None:
            encoder = tokenizer

        self.labels = [label_map[label] for label in df['Target']]
        self.texts = [
            encoder.encode_plus(
                text,
                add_special_tokens=True,
                max_length=max_length,
                truncation=True,
                padding='max_length',
                return_attention_mask=True,
                return_tensors='pt',
            )
            for text in df.loc[:, 'Text']
        ]

    def classes(self):
        """Return the list of integer labels, one per example."""
        return self.labels

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Return the label(s) selected by ``idx`` wrapped in a NumPy array."""
        return np.array(self.labels[idx])

    def get_batch_texts(self, idx):
        """Return the stored encoding(s) selected by ``idx``."""
        return self.texts[idx]

    def __getitem__(self, idx):
        """Return the ``(encoding, label)`` pair for ``idx``."""
        batch_texts = self.get_batch_texts(idx)
        batch_y = self.get_batch_labels(idx)

        return batch_texts, batch_y

In [None]:
# Dataset
# ClassificationDataset.__init__ looks up the module-level `target_idx`, so the
# label -> index mapping has to exist before the dataset is built; otherwise a
# fresh "Restart & Run All" fails here with a NameError.
target_idx = {label: idx for idx, label in enumerate(test_set['Target'].unique())}
ds_test = ClassificationDataset(test_set)


In [None]:
# Data loader: walks the test set in its stored order, one example per batch.
test_dataloader = DataLoader(
    dataset=ds_test,
    batch_size=1,
    sampler=SequentialSampler(ds_test),
)

In [None]:
# Label vocabulary: class name -> integer index, plus the reverse lookup.
# NOTE(review): indices follow first-appearance order in the test set;
# presumably this has to match the mapping the classifier was trained with —
# confirm against the training notebook.
target_idx = {
    label: position
    for position, label in enumerate(test_set['Target'].unique())
}
idx_target = {position: label for label, position in target_idx.items()}

# Load the fine-tuned classifier from `model_dir`, asking it to return the
# attention weights (needed by the visualizations) but not the hidden states.
classification_model = BertForSequenceClassification.from_pretrained(
    model_dir,
    num_labels=len(target_idx),
    output_attentions=True,
    output_hidden_states=False,
)

# Disabled alternatives, kept for reference: load weights from a raw checkpoint.
#pretrained_state_dict = torch.load('pretrained_2ep_1.pt')
#classification_model.load_state_dict(
#    torch.load(os.path.join(model_dir, 'classification_2ep.pt')),
#    strict=False)

In [None]:
# Starts out empty; nothing in the visible cells appends to it yet.
saved_examples = list()

In [None]:
# Draw one random test example and encode it to a fixed length of 30 tokens.
sample = test_set.sample(1)
target = sample['Target'].item()
inputs = tokenizer.encode_plus(sample['Text'].item(),
          return_tensors='pt',
          max_length=30, 
          truncation=True, 
          padding='max_length',
          return_attention_mask=True,
          )

input_ids = inputs['input_ids']
token_type_ids = inputs['token_type_ids']
# The sequence is padded to max_length, so the mask must be forwarded;
# otherwise the [PAD] positions are attended to like real tokens and distort
# both the prediction and the attention weights being visualized.
attention_mask = inputs['attention_mask']

# Inference only: no gradients are needed for visualization.
with torch.no_grad():
    outs = classification_model(input_ids,
                                token_type_ids=token_type_ids,
                                attention_mask=attention_mask)
# Read the attentions by name rather than by position in the output tuple.
attention = outs.attentions

input_id_list = input_ids[0].tolist()
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
pred = idx_target[outs.logits.argmax().item()]

In [None]:
# Summary of the sampled example: gold label, predicted label, raw logits and
# the token sequence (`tokens` was computed from `input_id_list` in the cell above).
print(
    'target: ', target, ', ',
    'prediction: ', pred,
    '\n\n logits: ', outs.logits,
    '\n\n text: ', tokens,
)

In [None]:
# Model view of the sampled example's attention weights.
model_view(attention=attention, tokens=tokens)

In [None]:
# Attention-head view of the sampled example's attention weights.
head_view(attention=attention, tokens=tokens)

In [None]:
# Neuron view for layer 2, head 0 of the sampled sentence.
# NOTE(review): the BertViz README describes the neuron view as requiring the
# model/tokenizer classes from bertviz.transformers_neuron_view — confirm that
# it accepts this transformers BertForSequenceClassification instance.
show(
    model=classification_model,
    model_type='bert',
    tokenizer=tokenizer,
    sentence_a=sample['Text'].item(),
    layer=2,
    head=0,
)

In [None]:
# Print the documentation of bertviz's neuron-view `show` function.
# NOTE(review): looks like leftover interactive exploration — consider removing
# this cell once the call in the neuron-view cell is settled.
help(show)

In [None]:
# Display the raw text of the currently sampled example.
sample.loc[:, 'Text'].item()