In [1]:
import string

In [2]:
from src.lit.lit_nlp.api.dataset import Dataset, types
from lit_nlp.api import types as lit_types

from datasets import load_dataset

from lit_nlp.api import model as lit_model
from lit_nlp.api import types as lit_types
from lit_nlp.api.dataset import Dataset
from datasets import load_dataset
from lit_nlp.lib import utils

In [3]:
import numpy as np
import torch

from transformers import AutoModelForMultipleChoice, AutoTokenizer
from datasets import load_dataset

from src.utils_multiple_choice import convert_examples_to_features, InputExample

In [4]:
model = AutoModelForMultipleChoice.from_pretrained("../assets/models/bb_race_m/")
tokenizer = AutoTokenizer.from_pretrained("../assets/models/bb_race_m")

dataset = load_dataset("race", "middle")
test = dataset['test']

label_list = ["0", "1", "2", "3"]
label_map = {
    0: "A",
    1: "B",
    2: "C",
    3: "D"
}
max_seq_length = 128

Reusing dataset race (/home/marcos/.cache/huggingface/datasets/race/middle/0.1.0/a7d1fac780e70c0e75bca35e9f2f8cfc1411edd18ffd6858ddce56f70dfb1e7c)


In [11]:
a = test.shard(100, 0)

In [13]:
a[0]

{'answer': 'C',
 'article': 'Take a class at Dulangkou School, and you\'ll see lots of things different from other schools, You can see the desks are not in rows and students sit in groups. They put their desks together so they\'re facing each other. How can they see the blackboard? There are three blackboards on the three walls of the classroom!\nThe school calls the new way of learning "Tuantuanzuo", meaning sitting in groups. Wei Liying, a Junior 3 teacher, said it was to give students more chances to communicate.\nEach group has five or six students, according to Wei, and they play different roles .There is a team leader who takes care of the whole group. There is a "study leader"who makes sure that everyone finishes their homework. And there is a discipline leader who makes sure that nobody chats in class.\nWang Lin is a team leader. The 15-year-old said that having to deal with so many things was tiring.\n"I just looked after my own business before,"said Wang. "But now I have to 

In [8]:
class RaceData(Dataset):
    def __init__(self):
        dataset = load_dataset("race", "middle")
        test = dataset['test'].shard(10, 0)
        
        # Store as a list of dicts, conforming to self.spec()
        self._examples = [{
          'article': row['article'],
          'question': row['question'],
          'option_0': row['options'][0],
          'option_1': row['options'][1],
          'option_2': row['options'][2],
          'option_3': row['options'][3],
          'answer': row['answer'],
        } for row in test]

    def spec(self):
        return {
          'article': lit_types.TextSegment(),
          'question': lit_types.TextSegment(),
          'option_0': lit_types.TextSegment(),
          'option_1': lit_types.TextSegment(),
          'option_2': lit_types.TextSegment(),
          'option_3': lit_types.TextSegment(),
          'answer': lit_types.TextSegment()
        }

In [9]:
race_data = RaceData()

Reusing dataset race (/home/marcos/.cache/huggingface/datasets/race/middle/0.1.0/a7d1fac780e70c0e75bca35e9f2f8cfc1411edd18ffd6858ddce56f70dfb1e7c)


In [10]:
race_data.spec()

{'article': TextSegment(required=True, default=''),
 'question': TextSegment(required=True, default=''),
 'option_0': TextSegment(required=True, default=''),
 'option_1': TextSegment(required=True, default=''),
 'option_2': TextSegment(required=True, default=''),
 'option_3': TextSegment(required=True, default=''),
 'answer': TextSegment(required=True, default='')}

In [55]:
class RaceModel(lit_model.Model):
    """ RACE QA model."""

    label_list = ["0", "1", "2", "3"]
    label_map = {
        0: "A",
        1: "B",
        2: "C",
        3: "D"
    }

    def __init__(self):
        # Load already pretrained model
        self.model = AutoModelForMultipleChoice.from_pretrained("../assets/models/bb_race_m/")
        self.tokenizer = AutoTokenizer.from_pretrained("../assets/models/bb_race_m")

    def max_minibatch_size(self):
        # Due to hardware limitations
        return 1

    def predict_minibatch(self, inputs):
        examples = [InputExample(
            example_id="pred",
            question=inputs[0]["question"],
            contexts=[inputs[0]["article"], inputs[0]["article"], inputs[0]["article"], inputs[0]["article"]],  # this is not efficient but convenient
            endings=[
                   inputs[0]["option_0"],
                   inputs[0]["option_1"],
                   inputs[0]["option_2"],
                   inputs[0]["option_3"]],
            label=str(ord(inputs[0]['answer']) - ord("A"))
        )]
        options = [
            inputs[0]["option_0"],
            inputs[0]["option_1"],
            inputs[0]["option_2"],
            inputs[0]["option_3"]
        ]
        answer = ord(inputs[0]['answer']) - ord("A")

        feature = convert_examples_to_features(
            examples,
            self.label_list,
            128,
            self.tokenizer
        )[0]

        features = {
            'input_ids': torch.tensor([feature.input_ids]),
            'attention_mask': torch.tensor([feature.attention_mask]),
            'token_type_ids': torch.tensor([feature.token_type_ids]),
        } 
        out = self.model.forward(features['input_ids'], features['attention_mask'], 
                               features['token_type_ids'], 
                               output_attentions=True, output_hidden_states=True, return_dict=True)
        
        output = {
            "probas": torch.nn.functional.softmax(out.logits, dim=-1),
            "cls_emb": out.hidden_states[-1][answer, 0],  # last layer, first token
        }
        for i in range(12):
            output[f"layer_{i}/attention"] = out.attentions[i][answer]
            
        layer = out['hidden_states'][0]
        layer.retain_grad()
        out['logits'][0][answer].backward(retain_graph=True)
        grads = layer.grad[answer].detach().numpy()
        embs = layer[answer].detach().numpy()
        output = {k: v.detach().numpy() for k, v in output.items()}
        
        # Tokens
        for option in range(4):
            input_id_list = feature.input_ids[option]
            full_tokens = self.tokenizer.convert_ids_to_tokens(input_id_list) 

            q_start = feature.token_type_ids[option].index(1)
            tokens = full_tokens[q_start:]
            idx_end = 0
            for tok in reversed(tokens):
                if tok not in list(string.punctuation) + ["[SEP]"]:
                    break
                else:
                    idx_end += 1
            option_start = len(tokens)-len(self.tokenizer.tokenize(options[option]))-idx_end
            output.update({
                f"tokens_option_{option}": tokens[option_start:],
                f"input_embs_option_{option}": embs[q_start+option_start:],
                f"token_grad_option_{option}": grads[q_start+option_start:]
            })
            
            if option == answer:
                output.update({
                    "tokens": full_tokens,
                    "tokens_article": full_tokens[:q_start],
                    "tokens_question": tokens[:option_start],
                    "input_embs_question": embs[q_start:q_start+option_start],
                    "token_grad_question": grads[q_start:q_start+option_start]
                })
        
        yield output

    def input_spec(self) -> lit_types.Spec:
        return {
            'article': lit_types.TextSegment(),
            'question': lit_types.TextSegment(),
            'option_0': lit_types.TextSegment(),
            'option_1': lit_types.TextSegment(),
            'option_2': lit_types.TextSegment(),
            'option_3': lit_types.TextSegment(),
            'answer': lit_types.TextSegment(),
            "input_embs_question": lit_types.TokenEmbeddings(align="tokens_question", required=False),
            "input_embs_option_0": lit_types.TokenEmbeddings(align="tokens_option_0", required=False),
            "input_embs_option_1": lit_types.TokenEmbeddings(align="tokens_option_1", required=False),
            "input_embs_option_2": lit_types.TokenEmbeddings(align="tokens_option_2", required=False),
            "input_embs_option_3": lit_types.TokenEmbeddings(align="tokens_option_3", required=False)
        }

    def output_spec(self) -> lit_types.Spec:
        ret = {
            "probas": lit_types.MulticlassPreds(parent="label", vocab=self.label_list),
            "cls_emb": lit_types.Embeddings(),
            "cls_grad": lit_types.Gradients(grad_for="cls_emb"),
            "tokens": lit_types.Tokens(),
            "tokens_article": lit_types.Tokens(),
            "tokens_question": lit_types.Tokens(),
            "tokens_option_0": lit_types.Tokens(),
            "tokens_option_1": lit_types.Tokens(),
            "tokens_option_2": lit_types.Tokens(),
            "tokens_option_3": lit_types.Tokens(),
            "input_embs_question": lit_types.TokenEmbeddings(align="tokens_question"),
            "input_embs_option_0": lit_types.TokenEmbeddings(align="tokens_option_0"),
            "input_embs_option_1": lit_types.TokenEmbeddings(align="tokens_option_1"),
            "input_embs_option_2": lit_types.TokenEmbeddings(align="tokens_option_2"),
            "input_embs_option_3": lit_types.TokenEmbeddings(align="tokens_option_3"),
            "token_grad_question": lit_types.TokenGradients(align="tokens_question", grad_for="input_embs_question"),
            "token_grad_option_0": lit_types.TokenGradients(align="tokens_option_0", grad_for="input_embs_option_0"),
            "token_grad_option_1": lit_types.TokenGradients(align="tokens_option_1", grad_for="input_embs_option_1"),
            "token_grad_option_2": lit_types.TokenGradients(align="tokens_option_2", grad_for="input_embs_option_2"),
            "token_grad_option_3": lit_types.TokenGradients(align="tokens_option_3", grad_for="input_embs_option_3")
        }
        for i in range(12):
            ret[f"layer_{i}/attention"] = lit_types.AttentionHeads(align_in="tokens", align_out="tokens")
        
        return ret

In [56]:
model = RaceModel()

In [57]:
a = model.predict(race_data.examples[0:10])

In [58]:
result = next(a)
result

convert examples to features: 1it [00:00, 59.57it/s]


{'probas': array([[0.304835  , 0.17968455, 0.1311935 , 0.38428694]], dtype=float32),
 'cls_emb': array([-9.08100903e-02,  1.14601687e-01, -2.06823125e-02,  3.07100803e-01,
        -3.72100025e-01, -7.40069687e-01,  9.07352746e-01,  6.17240131e-01,
        -7.97066450e-01,  5.52764200e-02, -3.88899073e-02, -4.05825436e-01,
        -3.75631332e-01, -6.20702468e-02, -2.71022767e-01,  3.72186035e-01,
         4.61284995e-01,  9.06136811e-01,  1.21293887e-01,  2.53430903e-01,
        -1.01513374e+00,  2.72488475e-01,  3.07474405e-01, -3.76292497e-01,
        -8.29778135e-01,  4.26090449e-01,  9.79065597e-01,  8.06021750e-01,
        -6.15069875e-03, -4.70660955e-01,  3.57491404e-01,  5.01916278e-03,
         1.06517158e-01,  4.01436597e-01, -1.26571268e-01, -6.85578704e-01,
         2.85255164e-01,  3.58678922e-02,  4.59056139e-01, -1.45309493e-01,
        -6.14762664e-01, -4.84846644e-02, -7.55042851e-01,  5.01708388e-01,
        -9.70798969e-01, -7.58725405e-01, -8.63708735e-01,  4.407190

In [50]:
result["token_grad_question"]

array([[-4.92828549e-04,  1.32821826e-03, -3.11596855e-03, ...,
         2.46420410e-03, -1.42716942e-03,  3.35552427e-03],
       [-1.17675634e-04, -1.06106084e-02, -2.07962585e-03, ...,
         9.12060216e-03,  2.57401038e-02, -1.02929315e-02],
       [ 2.67284038e-03,  2.63056788e-03, -3.11346073e-03, ...,
        -1.07147209e-02,  1.14526981e-02, -1.68487467e-02],
       [-1.83990947e-03,  1.91635918e-05, -4.46358882e-03, ...,
        -1.17082999e-03,  1.07807736e-03, -1.24513009e-03],
       [ 7.92705175e-03,  1.24382060e-02, -3.56299640e-03, ...,
        -8.90301424e-04,  9.20848362e-03, -2.21793773e-03],
       [-9.26447508e-04,  4.26599709e-03, -6.53894444e-04, ...,
         1.68360141e-03,  7.28564919e-05, -1.91023725e-03]], dtype=float32)

In [51]:
result["input_embs_question"]

array([[ 0.2794399 ,  0.22079505,  0.60954565, ..., -0.22324286,
         0.3968126 ,  0.42544302],
       [ 0.43792218, -0.68361676, -0.6623697 , ..., -0.4881795 ,
        -0.716595  , -0.5629601 ],
       [-0.35977823,  0.52905387,  0.7917247 , ..., -0.18671656,
         0.01339419, -0.5697215 ],
       [-0.62283885, -0.2543828 ,  0.04579892, ..., -0.09162086,
         0.0975673 ,  0.4917051 ],
       [-0.03752726, -1.2916431 ,  0.29239067, ...,  0.00999428,
        -0.23198265,  0.01094279],
       [ 0.20892411,  0.00926525,  0.4843903 , ..., -0.03264855,
         0.14141315,  0.33522248]], dtype=float32)

In [53]:
for i in range(4):
    print(result[f"input_embs_option_{i}"].shape, result[f"token_grad_option_{i}"].shape,)

(8, 768) (8, 768)
(8, 768) (8, 768)
(10, 768) (10, 768)
(12, 768) (12, 768)


In [23]:
result['layer_0/attention'].shape

(12, 128, 128)