In [None]:
#!pip install torch git+https://github.com/martijnvanbeers/transformers@feature/attention-transformers pandas seaborn matplotlib numpy scikit-learn spacy==2.3.7 https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz

In [1]:
import itertools

import h5py
import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy
import pandas
import seaborn
import spacy
import torch
from transformers import (
    AutoConfig,
    AutoTokenizer,
    AutoModelForSequenceClassification, AutoModelForMaskedLM
)

from valuezeroing import calculate_scores

In [2]:
## Device selection: use CUDA when torch reports it as available, otherwise the CPU.
use_gpu = torch.cuda.is_available()
device = torch.device("cuda" if use_gpu else "cpu")
if not use_gpu:
    print('No GPU available, using the CPU instead.')
else:
    print('We will use the GPU:', torch.cuda.get_device_name("cuda"))



We will use the GPU: NVIDIA RTX A4000 Laptop GPU


In [3]:
# Read the sentence file into a single-column frame named "line".
# The file has no header row and is split on tabs.
# NOTE(review): read_csv treats '"' as a quote character by default; confirm
# that is the intended handling for raw sentences containing double quotes.
corpus = pandas.read_csv(
    "firsthalf.txt",
    sep="\t",
    header=None,
    names=["line"],
)

In [4]:
# Load the cached arrays from the HDF5 file; `h5py` comes from the notebook's
# import cell. Each dataset is read through a full slice (`[:]`) while the
# file is open, and the file is closed when the `with` block exits.
# NOTE(review): `scores_matrix` is reassigned inside the per-sentence scoring
# loop further down, so the value loaded here is overwritten there — confirm
# whether these cached arrays are still needed.
with h5py.File('examplewise.h5', 'r') as hf:
    hidden_states = hf['hidden_states'][:]
    attentions = hf['attentions'][:]
    scores_matrix = hf['raw_scores'][:]


In [5]:
# Show the first ten sentences; widen the column display so long lines are
# not truncated in the rendered table.
with pandas.option_context("display.max_colwidth", 200):
    display(corpus.iloc[:10])

Unnamed: 0,line
0,"Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29."
1,"Mr. Vinken is chairman of Elsevier N.V., the Dutch publishing group."
2,"Rudolph Agnew, 55 years old and former chairman of Consolidated Gold Fields PLC, was named a nonexecutive director of this British industrial conglomerate."
3,"A form of asbestos once used to make Kent cigarette filters has caused a high percentage of cancer deaths among a group of workers exposed to it more than 30 years ago, researchers reported."
4,"The asbestos fiber, crocidolite, is unusually resilient once it enters the lungs, with even brief exposures to it causing symptoms that show up decades later, researchers said."
5,"Lorillard Inc., the unit of New York-based Loews Corp. that makes Kent cigarettes, stopped using crocidolite in its Micronite cigarette filters in 1956."
6,"Although preliminary findings were reported more than a year ago, the latest results appear in today's New England Journal of Medicine, a forum likely to bring new attention to the problem."
7,"A Lorillard spokewoman said, 'This is an old story."
8,We're talking about years ago before anyone heard of asbestos having any questionable properties.
9,There is no asbestos in our products now.'


In [6]:
class TransformerTokenizer:
    """spaCy tokenizer replacement that uses the transformer tokenizer's word boundaries.

    Sub-word tokens sharing a word id are merged back into one word, so the
    resulting ``Doc`` has exactly one spaCy token per tokenizer word.

    Parameters
    ----------
    vocab
        Vocabulary handed to ``spacy.tokens.Doc``.
    tokenizer
        Object exposing a ``_tokenizer.encode(text)`` method whose result has
        ``word_ids`` and ``offsets`` attributes.
    """

    def __init__(self, vocab, tokenizer):
        self.vocab = vocab
        self._tokenizer = tokenizer

    def __call__(self, text):
        """Split ``text`` into words and return them as a ``spacy.tokens.Doc``.

        The first and last encoded positions are skipped (the notebook treats
        them as the [CLS]/[SEP] special tokens).
        """
        result = self._tokenizer._tokenizer.encode(text)
        # Computed once up front instead of once per word inside the loop.
        word_ids = result.word_ids[1:-1]
        last_word = max(word_ids, default=None)  # None only when there are no words
        words = []
        spaces = []
        for wordix, group in itertools.groupby(enumerate(word_ids), key=lambda pair: pair[1]):
            positions = [position for position, _ in group]
            first_token, last_token = positions[0], positions[-1]
            # +1 converts a position in the stripped list to one in the full encoding.
            start = result.offsets[first_token + 1][0]
            end = result.offsets[last_token + 1][1]
            words.append(text[start:end])
            if wordix < last_word:
                # If next start != current end we assume a space in between
                next_start, _ = result.offsets[last_token + 2]
                spaces.append(next_start > end)
            else:
                # Last word: anything left in the text after it counts as a trailing space.
                spaces.append(end < len(text))
        return spacy.tokens.Doc(self.vocab, words=words, spaces=spaces)

In [7]:
# Checkpoint name shared by the config, tokenizer and model below.
transformer = "bert-base-uncased"

# Ask the model to return attention weights (attentions_with_qk=True is left disabled).
config = AutoConfig.from_pretrained(transformer, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(transformer)

# Load the masked-LM model, move it to the selected device and switch to eval mode.
model = AutoModelForMaskedLM.from_pretrained(transformer, config=config).to(device).eval()

# spaCy pipeline whose tokenizer is swapped for the transformer-aligned one.
nlp = spacy.load("en_core_web_sm")
nlp.tokenizer = TransformerTokenizer(nlp.vocab, tokenizer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [49]:
# Parse the sentence here so this inspection cell also runs on a fresh kernel;
# previously it relied on `doc` being left over from the scoring loop further
# down. Row 9 is the sentence shown in this cell's recorded output.
doc = nlp(corpus.loc[9, 'line'])
doc.text

"There is no asbestos in our products now.'"

In [52]:
# Per-token text, POS tag, dependency label and ancestor chain for `doc`
# (`doc` must already hold a parsed spaCy Doc when this cell runs).
[
    (token.text, token.pos_, token.dep_, list(token.ancestors))
    for token in doc
]

[('There', 'PRON', 'expl', [is]),
 ('is', 'AUX', 'ROOT', []),
 ('no', 'DET', 'det', [asbestos, is]),
 ('asbestos', 'NOUN', 'attr', [is]),
 ('in', 'ADP', 'prep', [asbestos, is]),
 ('our', 'DET', 'poss', [products, in, asbestos, is]),
 ('products', 'NOUN', 'pobj', [in, asbestos, is]),
 ('now', 'ADV', 'advmod', [is]),
 ('.', 'PUNCT', 'punct', [is]),
 ("'", 'PUNCT', 'punct', [is])]

In [8]:
# Category labels for the from_pos / to_pos columns: the two special-token
# labels first, then the POS tags. The order is also the order in which the
# labels appear in the selection widgets.
# (An earlier, shorter list was: "[CLS]", "[SEP]", "CCONJ", "PROPN", "PRON",
#  "AUX", "VERB", "ADP", "NOUN", "SYM", "NUM", "DET", "PUNCT".)
poslist = [
    # special tokens
    "[CLS]", "[SEP]",
    # part-of-speech tags
    "ADJ", "ADP", "ADV", "AUX",
    "CONJ", "CCONJ", "DET", "INTJ",
    "NOUN", "NUM", "PART", "PRON",
    "PROPN", "PUNCT", "SCONJ", "SYM",
    "VERB", "X",
    # remaining labels
    "EOL", "SPACE",
]

In [9]:
# Number of category labels in `poslist` (special-token labels included).
len(poslist)

22

In [10]:
# Build one long-format table with a row per (layer, head, from-token,
# to-token) for the first ten sentences, holding the three score types plus
# the POS tag of each token and the sentence number.
#
# Per-sentence frames are collected in a list and concatenated once at the
# end; concatenating inside the loop re-copies the growing frame every time.
#
# Layer and head counts are read from the model config instead of being
# hard-coded.
# NOTE(review): assumes calculate_scores returns arrays laid out as
# (layers, heads, tokens, tokens), matching the index built below — confirm.
n_layers = config.num_hidden_layers
n_heads = config.num_attention_heads

combined_df = None
sentence_frames = []
token_count = 0  # total number of tokens (special tokens included) over all sentences
for i, row in corpus.head(10).iterrows():
    doc = nlp(row['line'])
    scores_matrix, rollout_matrix, att_matrix = calculate_scores(config, model, "bert", tokenizer, doc.text)
    att_matrix = att_matrix.detach().cpu().numpy()
    token_count += scores_matrix.shape[-1]
    result = tokenizer(doc.text, return_special_tokens_mask=True, return_offsets_mapping=True)
    all_tokens = result.tokens()
    # One POS label per sub-word token: each token inherits the tag of its word.
    docpos = ["[CLS]"] + [doc[t].pos_ for t in result.word_ids()[1:-1]] + ["[SEP]"]
    index = pandas.MultiIndex.from_product(
        [numpy.arange(n_layers)+1, numpy.arange(n_heads)+1, all_tokens, all_tokens],
        names=['layer','head','from','to'],
    )
    score_df = pandas.DataFrame(
        numpy.hstack([scores_matrix.reshape(-1, 1), rollout_matrix.reshape(-1,1), att_matrix.reshape(-1,1)]),
        index=index,
        columns=["valuezeroing", "rollout_vz", "raw_attention"],
    ).reset_index()
    # 'from' varies slowest within a (layer, head) block, 'to' fastest, hence
    # repeat-then-tile for from_pos and plain tile for to_pos.
    score_df['from_pos'] = pandas.Categorical(numpy.tile(numpy.repeat(numpy.array(docpos), len(all_tokens)), n_layers*n_heads), categories=poslist)
    score_df['to_pos'] = pandas.Categorical(numpy.tile(numpy.array(docpos), len(all_tokens)*n_layers*n_heads), categories=poslist)
    score_df['sent'] = i
    # Count how many token pairs share each (from_pos, to_pos) combination in
    # this sentence; one (layer, head) slice is enough since all are identical.
    counts = ((score_df[(score_df['layer'] == 1) & (score_df['head'] == 1)]
                    .groupby(["from_pos", "to_pos"])
                    .agg({"from": "count"}))
                    .rename(columns={'from': 'combo_count'})
                    .reset_index()
            )
    score_df = score_df.merge(counts, how="left", on=["from_pos", "to_pos"])
    sentence_frames.append(score_df)

# Stays None when no sentence was processed.
if sentence_frames:
    combined_df = pandas.concat(sentence_frames)

In [11]:
# Peek at the first fifty rows of the combined score table.
combined_df.head(50)

Unnamed: 0,layer,head,from,to,valuezeroing,rollout_vz,raw_attention,from_pos,to_pos,sent,combo_count
0,1,1,[CLS],[CLS],0.038026,0.038026,0.028996,[CLS],[CLS],0,1
1,1,1,[CLS],pierre,0.008999,0.008999,0.01493,[CLS],PROPN,0,5
2,1,1,[CLS],vin,0.009869,0.009869,0.022651,[CLS],PROPN,0,5
3,1,1,[CLS],##ken,0.02119,0.02119,0.025385,[CLS],PROPN,0,5
4,1,1,[CLS],",",0.024093,0.024093,0.02708,[CLS],PUNCT,0,3
5,1,1,[CLS],61,0.017997,0.017997,0.040306,[CLS],NUM,0,2
6,1,1,[CLS],years,0.02119,0.02119,0.038072,[CLS],NOUN,0,3
7,1,1,[CLS],old,0.014804,0.014804,0.026697,[CLS],ADJ,0,5
8,1,1,[CLS],",",0.023222,0.023222,0.028038,[CLS],PUNCT,0,3
9,1,1,[CLS],will,0.011901,0.011901,0.024817,[CLS],VERB,0,2


In [12]:
# List every NOUN -> ADJ token pair for layer 1, head 1, without row truncation.
with pandas.option_context("display.max_rows", None):
    display(
        combined_df.loc[
            combined_df['layer'].eq(1)
            & combined_df['head'].eq(1)
            & combined_df['from_pos'].eq("NOUN")
            & combined_df['to_pos'].eq("ADJ")
        ]
    )

Unnamed: 0,layer,head,from,to,valuezeroing,rollout_vz,raw_attention,from_pos,to_pos,sent,combo_count
157,1,1,years,old,0.032828,0.032828,0.038219,NOUN,ADJ,0,15
165,1,1,years,none,0.016622,0.016622,0.033203,NOUN,ADJ,0,15
166,1,1,years,##x,0.070019,0.070019,0.047244,NOUN,ADJ,0,15
167,1,1,years,##ec,0.005818,0.005818,0.01472,NOUN,ADJ,0,15
168,1,1,years,##utive,0.123935,0.123935,0.08283,NOUN,ADJ,0,15
307,1,1,board,old,0.013224,0.013224,0.026809,NOUN,ADJ,0,15
315,1,1,board,none,0.015834,0.015834,0.035025,NOUN,ADJ,0,15
316,1,1,board,##x,0.308074,0.308074,0.101465,NOUN,ADJ,0,15
317,1,1,board,##ec,0.029581,0.029581,0.03623,NOUN,ADJ,0,15
318,1,1,board,##utive,0.072125,0.072125,0.06637,NOUN,ADJ,0,15


In [56]:
# Text of the first sentence in the corpus.
corpus.at[0, 'line']

'Pierre Vinken, 61 years old, will join the board as a nonexecutive director Nov. 29.'

In [58]:
# ADJ -> ADJ token pairs of sentence 0 for layer 3, head 1.
combined_df.loc[
    combined_df['layer'].eq(3)
    & combined_df['head'].eq(1)
    & combined_df['sent'].eq(0)
    & combined_df['from_pos'].eq("ADJ")
    & combined_df['to_pos'].eq("ADJ")
]

Unnamed: 0,layer,head,from,to,valuezeroing,rollout_vz,raw_attention,from_pos,to_pos,sent,combo_count,adjusted_attention,adjusted_vz,adjusted_rollout_vz
15182,3,1,old,old,0.00021,0.01701,2.320613e-06,ADJ,ADJ,0,25,9.282452e-08,8.398509e-06,0.00068
15190,3,1,old,none,0.00021,0.026031,1.252067e-16,ADJ,ADJ,0,25,5.00827e-18,8.398509e-06,0.001041
15191,3,1,old,##x,0.00021,0.042115,4.509617e-12,ADJ,ADJ,0,25,1.803847e-13,8.398509e-06,0.001685
15192,3,1,old,##ec,0.00021,0.014327,2.585383e-11,ADJ,ADJ,0,25,1.034153e-12,8.398509e-06,0.000573
15193,3,1,old,##utive,0.00021,0.040867,6.051622e-10,ADJ,ADJ,0,25,2.420649e-11,8.398509e-06,0.001635
15382,3,1,none,old,0.0,0.020266,1.671798e-13,ADJ,ADJ,0,25,6.687192e-15,0.0,0.000811
15390,3,1,none,none,0.0,0.029798,5.724209e-09,ADJ,ADJ,0,25,2.289683e-10,0.0,0.001192
15391,3,1,none,##x,1.0,0.027072,0.999998,ADJ,ADJ,0,25,0.03999992,0.04,0.001083
15392,3,1,none,##ec,0.0,0.014201,1.94836e-06,ADJ,ADJ,0,25,7.793441e-08,0.0,0.000568
15393,3,1,none,##utive,0.0,0.022025,1.847001e-08,ADJ,ADJ,0,25,7.388003e-10,0.0,0.000881


In [13]:
# Per (layer, head, from_pos, to_pos): sum each raw score column and divide
# by the total token count, then drop groups with missing values.
g = (
    combined_df
    .groupby(["layer", "head", "from_pos", "to_pos"])
    .agg(dict.fromkeys(
        ["raw_attention", "valuezeroing", "rollout_vz"],
        lambda values: numpy.sum(values) / token_count,
    ))
    .dropna()
    .reset_index()
)
 

In [14]:
# Divide every score by the number of token pairs that share its
# (from_pos, to_pos) combination within the sentence.
combined_df['adjusted_attention'] = combined_df['raw_attention'].div(combined_df['combo_count'])
combined_df['adjusted_vz'] = combined_df['valuezeroing'].div(combined_df['combo_count'])
combined_df['adjusted_rollout_vz'] = combined_df['rollout_vz'].div(combined_df['combo_count'])


In [15]:
# Same aggregation as for the raw scores, applied to the count-adjusted columns.
ga = (
    combined_df
    .groupby(["layer", "head", "from_pos", "to_pos"])
    .agg(dict.fromkeys(
        ["adjusted_attention", "adjusted_vz", "adjusted_rollout_vz"],
        lambda values: numpy.sum(values) / token_count,
    ))
    .dropna()
    .reset_index()
)
 

In [20]:
def show_head(ignores=[], sortby="valuezeroing", layer=1, head=1, top_n=5):
    """Display the top POS combinations for one attention head.

    Two tables are shown: the first taken from ``g`` and sorted by ``sortby``,
    the second taken from ``ga`` and sorted by the matching adjusted column.

    Parameters
    ----------
    ignores : list-like
        POS labels to exclude, both as source and as target.
    sortby : str
        One of 'raw_attention', 'valuezeroing', 'rollout_vz'.
    layer, head : int
        Layer and head number to show.
    top_n : int
        Number of rows to keep from each table.
    """
    adjusted_column = {
        'raw_attention': "adjusted_attention",
        'valuezeroing': "adjusted_vz",
        'rollout_vz': "adjusted_rollout_vz",
    }

    def top_rows(table, column):
        # Rows of the requested head whose endpoints are not ignored, best first.
        keep = (
            ~table['from_pos'].isin(ignores)
            & ~table['to_pos'].isin(ignores)
            & (table['layer'] == layer)
            & (table['head'] == head)
        )
        return table[keep].sort_values(column, ascending=False).head(top_n)

    display(top_rows(g, sortby))
    display(top_rows(ga, adjusted_column[sortby]))

In [21]:
# Interactive controls for show_head: POS labels to ignore, the score to sort
# by, the layer/head to inspect and how many rows to show.
w = widgets.interactive(
    show_head,
    ignores=widgets.SelectMultiple(
        options=poslist,
        value=['[CLS]', '[SEP]'],
        description='Ignored POS',
        rows=25,
        disabled=False,
    ),
    sortby=widgets.RadioButtons(
        options=['raw_attention', 'valuezeroing', 'rollout_vz'],
        value='valuezeroing',
        layout={'width': 'max-content'},  # keeps long option names on one line
        description='sort by',
    ),
    layer=widgets.IntSlider(min=1, max=12, value=1, step=1),
    head=widgets.IntSlider(min=1, max=12, value=1, step=1),
    top_n=widgets.IntSlider(min=3, max=20, value=10, step=1),
)
display(w)

interactive(children=(SelectMultiple(description='Ignored POS', index=(0, 1), options=('[CLS]', '[SEP]', 'ADJ'…

In [45]:
def show_combo(from_pos, to_pos, sortby):
    """Display all rows for one (from_pos, to_pos) pair, sorted by ``sortby``.

    The rows of ``g`` for the pair are placed side by side with the three
    adjusted columns of the matching rows in ``ga``; the result is re-indexed
    from zero, sorted in descending order and shown with up to 150 rows.
    """
    adjusted_cols = ['adjusted_attention', 'adjusted_vz', 'adjusted_rollout_vz']
    raw_part = g[(g['from_pos'] == from_pos) & (g['to_pos'] == to_pos)]
    adjusted_part = ga[(ga['from_pos'] == from_pos) & (ga['to_pos'] == to_pos)][adjusted_cols]
    table = pandas.concat([raw_part, adjusted_part], axis=1)
    table = table.reset_index(drop=True).sort_values(sortby, ascending=False)
    with pandas.option_context("display.max_rows", 150):
        display(table)

In [46]:
# Interactive controls for show_combo: source POS, target POS and the column
# to sort the resulting table by.
w = widgets.interactive(
    show_combo,
    from_pos=widgets.Select(options=poslist, value='NOUN'),
    to_pos=widgets.Select(options=poslist, value='NOUN'),
    sortby=widgets.RadioButtons(
        options=[
            'raw_attention', 'valuezeroing', 'rollout_vz',
            'adjusted_attention', 'adjusted_vz', 'adjusted_rollout_vz',
        ],
        value='valuezeroing',
        layout={'width': 'max-content'},  # keeps long option names on one line
        description='sort by',
    ),
)
display(w)

interactive(children=(Select(description='from_pos', index=10, options=('[CLS]', '[SEP]', 'ADJ', 'ADP', 'ADV',…