In [1]:
from allennlp.interpret.saliency_interpreters import SimpleGradient
from allennlp.predictors import Predictor

In [35]:
# Masked sentence under study; [MASK] is the slot the language model fills in.
# Alternative probes that can be swapped in:
#   "The nurse ran to the emergency room to see [MASK] patient."
#   "some [MASK] string"
#   "The [MASK] taught the students in class."
sentence = "The doctor ran to the emergency room to see [MASK] patient."

# Show the sentence together with its number of space-separated words.
(sentence, len(sentence.split(" ")))

('The doctor ran to the emergency room to see [MASK] patient.', 11)

In [36]:
# JSON-style payload: the same sentence under the "sentence" key. It is
# passed to the saliency interpreter further down.
inputs = dict(sentence=sentence)
inputs

{'sentence': 'The doctor ran to the emergency room to see [MASK] patient.'}

In [37]:
# archive = (
#     "https://storage.googleapis.com/allennlp-public-models/bert-masked-lm-2020-10-07.tar.gz"
# )

### Model - AllenNLP

In [38]:
# Local path of the BERT masked-LM model the predictor is loaded from.
masked_lm_archive = "models/bert-masked-lm-2020-10-07/"

predictor = Predictor.from_path(masked_lm_archive)
predictor

Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<allennlp_models.lm.predictors.masked_language_model.MaskedLanguageModelPredictor at 0x7fe661b58ca0>

In [39]:
# Run the masked-LM predictor on `sentence`. The returned dict (shown as the
# cell output) carries the keys 'probabilities', 'top_indices', 'token_ids',
# 'words' and 'tokens'; 'words' holds the candidate fillers for [MASK].
preds = predictor.predict(sentence)
preds

{'probabilities': [[0.3828469514846802,
   0.3691021203994751,
   0.08115176111459732,
   0.07295342534780502,
   0.059848736971616745]],
 'top_indices': [[1117, 1103, 1330, 170, 1123]],
 'token_ids': [101,
  1109,
  3995,
  1868,
  1106,
  1103,
  5241,
  1395,
  1106,
  1267,
  103,
  5351,
  119,
  102],
 'words': [['his', 'the', 'another', 'a', 'her']],
 'tokens': ['[CLS]',
  'The',
  'doctor',
  'ran',
  'to',
  'the',
  'emergency',
  'room',
  'to',
  'see',
  '[MASK]',
  'patient',
  '.',
  '[SEP]']}

### Model - RoBERTa-Base

In [7]:
from transformers.models.auto import AutoConfig, AutoModel
from transformers.models.auto.tokenization_auto import AutoTokenizer

# Local directory holding the Hugging Face RoBERTa-base checkpoint.
model_uri = "models/roberta-base/"

# The tokenizer and the config are loaded independently from that directory.
tokenizer = AutoTokenizer.from_pretrained(model_uri)
config = AutoConfig.from_pretrained(model_uri)

# Despite its name, `classifier` is the bare encoder returned by AutoModel
# (the load log reports a RobertaModel with the lm_head weights left unused).
classifier = AutoModel.from_pretrained(model_uri, config=config)

Some weights of the model checkpoint at models/roberta-base/ were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Name and target directory for the re-saved copy of the checkpoint; the
# AllenNLP wrappers in the following cells all load from `model_path`.
model_name = "allennlp-roberta-base"
model_path = "models/allennlp-roberta-base"

# Write the encoder weights/config first ...
classifier.save_pretrained(model_path)
# ... and the tokenizer last, so the tuple of tokenizer files it wrote is
# what the cell displays.
tokenizer.save_pretrained(model_path)

('models/allennlp-roberta-base/tokenizer_config.json',
 'models/allennlp-roberta-base/special_tokens_map.json',
 'models/allennlp-roberta-base/vocab.json',
 'models/allennlp-roberta-base/merges.txt',
 'models/allennlp-roberta-base/added_tokens.json',
 'models/allennlp-roberta-base/tokenizer.json')

In [5]:
from allennlp.data.vocabulary import Vocabulary

# AllenNLP vocabulary derived from the pretrained transformer saved at
# `model_path` (no namespace is given, so the library default applies).
transformer_vocab = Vocabulary.from_pretrained_transformer(model_name=model_path)

In [6]:
from allennlp.data.tokenizers.pretrained_transformer_tokenizer import PretrainedTransformerTokenizer
from allennlp.data.token_indexers.pretrained_transformer_indexer import PretrainedTransformerIndexer

# AllenNLP indexer and tokenizer wrappers, both pointed at the same saved
# checkpoint so that they agree on the vocabulary.
token_indexer = PretrainedTransformerIndexer(model_name=model_path)
transformer_tokenizer = PretrainedTransformerTokenizer(model_name=model_path)

In [7]:
from allennlp.modules.text_field_embedders import BasicTextFieldEmbedder
from allennlp.modules.token_embedders.pretrained_transformer_embedder import PretrainedTransformerEmbedder

# Embedder backed by the pretrained transformer saved at `model_path`.
pretrained_embedder = PretrainedTransformerEmbedder(model_name=model_path)

# Text-field embedder with a single entry: the "tokens" key is handled by
# the pretrained transformer embedder above.
token_embedder = BasicTextFieldEmbedder(
    token_embedders={"tokens": pretrained_embedder},
)

In [9]:
from allennlp.modules.seq2vec_encoders.bert_pooler import BertPooler

# Pooler built from the checkpoint saved at `model_path`.
# NOTE(review): BertPooler is imported from `seq2vec_encoders`, so it
# presumably reduces a sequence to a single vector -- confirm before using it
# anywhere a per-token (seq2seq) encoder is expected.
transformer_encoder = BertPooler(model_path)

In [14]:
from allennlp.models import MultiTaskModel
# 
# model = BasicClassifier(vocab=transformer_vocab, 
#                         text_field_embedder=token_embedder, 
#                         seq2vec_encoder=transformer_encoder, 
#                         dropout=0.1, 
#                         num_labels=5)

In [15]:
# Build an AllenNLP masked language model from the RoBERTa components
# assembled in the cells above.
#
# What was wrong with the previous version of this cell:
#   * the call contained the constructor *signature* (`vocab: Vocabulary`,
#     `dropout: float = 0.1`, ...) instead of argument values, which is a
#     SyntaxError;
#   * `masked_language_model` is a module, not a class, so the `MLM` alias
#     could not be called -- the `MaskedLanguageModel` class is imported here;
#   * no language-model head was ever constructed;
#   * `transformer_encoder` (a BertPooler, imported from `seq2vec_encoders`)
#     was passed as `contextualizer`. The pretrained transformer embedder
#     inside `token_embedder` already yields per-token representations, so no
#     extra contextualizer is used here.
from allennlp_models.lm.models.masked_language_model import MaskedLanguageModel
from allennlp_models.lm.modules.language_model_heads import LinearLanguageModelHead

# Namespace the head and the model read the output vocabulary from.
# `transformer_vocab` was created without an explicit namespace, so this must
# be the default of `Vocabulary.from_pretrained_transformer`.
# NOTE(review): "tokens" is that default in AllenNLP 2.x -- confirm against
# the installed version. The previous value "roberta" matched no namespace
# created in this notebook.
TARGET_NAMESPACE = "tokens"

# Linear projection from the embedder's output size to the vocabulary size.
# This head is freshly initialised: the pretrained `lm_head.*` weights were
# dropped when the checkpoint was loaded with AutoModel (see the load log
# above), so the model must be trained before its predictions mean anything.
language_model_head = LinearLanguageModelHead(
    vocab=transformer_vocab,
    input_dim=token_embedder.get_output_dim(),
    vocab_namespace=TARGET_NAMESPACE,
)

model = MaskedLanguageModel(
    vocab=transformer_vocab,
    text_field_embedder=token_embedder,
    language_model_head=language_model_head,
    contextualizer=None,  # the transformer embedder is already contextual
    target_namespace=TARGET_NAMESPACE,
    dropout=0.1,
)

### Applying SimpleGradient for Word Importance

In [40]:
# Gradient-based saliency interpreter wrapped around the AllenNLP predictor
# loaded earlier.
interpreter = SimpleGradient(predictor=predictor)
interpreter

<allennlp.interpret.saliency_interpreters.simple_gradient.SimpleGradient at 0x7fe597121f30>

In [41]:
# Compute saliency scores for the JSON payload. The result is a nested dict
# of the form {'instance_1': {'grad_input_1': [...]}} (see the printed output).
interpretation = interpreter.saliency_interpret_from_json(inputs=inputs)
print(interpretation)



{'instance_1': {'grad_input_1': [0.00634726315979337, 0.02781314066772619, 0.1904769153366855, 0.07233389772839216, 0.03771242259156962, 0.06416733700697969, 0.04216160963962355, 0.030878413385859126, 0.014115116700583023, 0.14565831065275972, 0.0067774734223451346, 0.2893705709912004, 0.04854366943842647, 0.023643730871020476]}}


In [42]:
# Unwrap the per-token scores of the single interpreted instance.
first_instance = interpretation["instance_1"]
word_importances = first_instance["grad_input_1"]
word_importances

[0.00634726315979337,
 0.02781314066772619,
 0.1904769153366855,
 0.07233389772839216,
 0.03771242259156962,
 0.06416733700697969,
 0.04216160963962355,
 0.030878413385859126,
 0.014115116700583023,
 0.14565831065275972,
 0.0067774734223451346,
 0.2893705709912004,
 0.04854366943842647,
 0.023643730871020476]

In [43]:
# Tokens reported by the predictor. The list includes the added '[CLS]' and
# '[SEP]' entries and a separate '.' token, which is why it has 14 items for
# an 11-word sentence.
preds["tokens"]

['[CLS]',
 'The',
 'doctor',
 'ran',
 'to',
 'the',
 'emergency',
 'room',
 'to',
 'see',
 '[MASK]',
 'patient',
 '.',
 '[SEP]']

In [44]:
# Sanity check: there should be one importance score per predictor token.
predictor_tokens = preds["tokens"]
print(predictor_tokens)
(len(predictor_tokens), len(word_importances))

['[CLS]', 'The', 'doctor', 'ran', 'to', 'the', 'emergency', 'room', 'to', 'see', '[MASK]', 'patient', '.', '[SEP]']


(14, 14)

In [45]:
# Header line: the input sentence next to the predictor's candidate fillers.
report_header = f"Original Sentence: {sentence} --- Predictions: {preds['words']}\n"
print(report_header)

# One line per token, pairing each token with its saliency score. zip()
# stops at the shorter of the two sequences.
token_score_pairs = zip(preds["tokens"], word_importances)
for token, word_imp in token_score_pairs:
    print(f"Token: {token} --- Word Importance: {word_imp}")

Original Sentence: The doctor ran to the emergency room to see [MASK] patient. --- Predictions: [['his', 'the', 'another', 'a', 'her']]

Token: [CLS] --- Word Importance: 0.00634726315979337
Token: The --- Word Importance: 0.02781314066772619
Token: doctor --- Word Importance: 0.1904769153366855
Token: ran --- Word Importance: 0.07233389772839216
Token: to --- Word Importance: 0.03771242259156962
Token: the --- Word Importance: 0.06416733700697969
Token: emergency --- Word Importance: 0.04216160963962355
Token: room --- Word Importance: 0.030878413385859126
Token: to --- Word Importance: 0.014115116700583023
Token: see --- Word Importance: 0.14565831065275972
Token: [MASK] --- Word Importance: 0.0067774734223451346
Token: patient --- Word Importance: 0.2893705709912004
Token: . --- Word Importance: 0.04854366943842647
Token: [SEP] --- Word Importance: 0.023643730871020476
