Documentation:
    https://huggingface.co/transformers/model_doc/bart.html
    
Tokenizer: https://arxiv.org/pdf/1910.13461.pdf

In [7]:
from transformers import BartForConditionalGeneration, BartTokenizerFast
# BartForConditionalGeneration.generate??

In [8]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')
tok = BartTokenizerFast.from_pretrained('facebook/bart-base')

In [9]:
text = 'Avril Haines, President-elect Biden\'s \
nominee for director of national intelligence, \
pledged at her Senate <mask> \
to conduct a public assessment of the threat of the far-right QAnon conspiracy theory.'' '
# text = 'I yesteday school went '

In [14]:
batch = tok(text, return_tensors='pt', padding=True,
            truncation=True,max_length = 200)
batch, tok.batch_decode(batch['input_ids'])

({'input_ids': tensor([[    0, 23389, 20447,   289,  1851,   293,     6,   270,    12,  6930,
          15478,    18,  6615,    13,   736,     9,   632,  2316,     6,  7114,
             23,    69,  1112, 50264,     7,  2883,    10,   285,  4990,     9,
              5,  1856,     9,     5,   444,    12,  4070,  1209,  4688,   261,
           6556,  6680,     4,  1437,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
          1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 ["<s>Avril Haines, President-elect Biden's nominee for director of national intelligence, pledged at her Senate<mask> to conduct a public assessment of the threat of the far-right QAnon conspiracy theory. </s>"])

In [11]:
generated_ids = model.generate(batch['input_ids'])
generated_ids

tensor([[    2,     0, 23389, 20447,   289,  1851,   293,     6,   270,    12,
          6930, 15478,    18,  6615,    13,   736,     9,   632,  2316,     2]])

In [39]:
import torch
t = torch.cat((generated_ids[0], torch.ones(10, dtype = int)*-100))

In [40]:
tok.batch_decode([t])

OverflowError: can't convert negative int to unsigned

In [32]:
tok.batch_decode(generated_ids, skip_special_tokens=True)

["Avril Haines, President-elect Biden's nominee for director of national intelligence"]

In [38]:
tok.batch_decode([t], skip_special_tokens=True)

["Avril Haines, President-elect Biden's nominee for director of national intelligence"]

# Summarization

In [10]:
model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
tok = BartTokenizerFast.from_pretrained('facebook/bart-large-cnn')

In [11]:
# text = "BLEURT is a novel, machine learning-based automatic metric \
# that can capture non-trivial semantic similarities between sentences.\
# It is trained on a public collection of ratings (the WMT Metrics Shared Task dataset) as well as \
# additional ratings provided by the user. Three candidate sentences rated by BLEURT."

text = "My friends are cool but they eat too many carbs."
summary = "BLEURT - the novel metric"

In [12]:
inputs = tok(text, max_length=1024, return_tensors='pt', truncation = True)
outputs = tok(summary, max_length=1024, return_tensors='pt', padding=True)
inputs, tok.batch_decode(inputs['input_ids'])

({'input_ids': tensor([[    0,  2387,   964,    32,  3035,    53,    51,  3529,   350,   171,
          33237,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])},
 ['<s>My friends are cool but they eat too many carbs.</s>'])

In [13]:
summary_ids = model.generate(inputs['input_ids'], num_beams = 4, max_length = 20, early_stopping = True)
print(summary_ids)
tok.batch_decode(summary_ids)

tensor([[    2,     0,     0,     0,  2387,   964,    32,  3035,    53,    51,
          3529,   350,   171, 33237,     4,    38,   437,    45,    10,     3]])


["</s><s><s><s>My friends are cool but they eat too many carbs. I'm not a<unk>"]

In [29]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig

model = BartForConditionalGeneration.from_pretrained('facebook/bart-large')
tokenizer = BartTokenizer.from_pretrained('facebook/bart-large')

ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='pt')

# Generate Summary
summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=5, early_stopping=True)
print([tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=False) for g in summary_ids])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


['MyMy friends']


In [66]:
from transformers.models.bart.modeling_bart import shift_tokens_right
labels = outputs['input_ids']
decoder_input_ids = shift_tokens_right(labels, model.config.pad_token_id)

In [67]:
tok.batch_decode(labels)

['<s>BLEURT - the novel metric</s>']

In [69]:
tok.batch_decode(decoder_input_ids) #shifting is handled in the code no need to pass now

['</s><s>BLEURT - the novel metric']

## Forward

In [70]:
result = model.forward(inputs['input_ids'],
             labels= labels)
result

Seq2SeqLMOutput(loss=tensor(3.9699, grad_fn=<NllLossBackward>), logits=tensor([[[12.0802,  1.0038,  3.9200,  ...,  1.2741,  1.1727,  1.1345],
         [12.0802,  1.0038,  3.9200,  ...,  1.2741,  1.1727,  1.1345],
         [-0.4274,  0.7113,  2.7261,  ...,  0.8051,  0.7094,  0.9495],
         ...,
         [-5.2528,  0.4868,  3.2654,  ...,  0.3953,  0.2706,  0.1126],
         [-4.3235,  0.4207,  2.8046,  ...,  0.4463,  0.0464,  0.1862],
         [-4.2121,  0.0172,  3.9347,  ...,  0.0976, -0.1799,  0.0361]]],
       grad_fn=<AddBackward0>), past_key_values=None, decoder_hidden_states=None, decoder_attentions=None, cross_attentions=None, encoder_last_hidden_state=tensor([[[ 0.0165,  0.0319,  0.0272,  ...,  0.0038, -0.0012, -0.0027],
         [ 0.2046, -0.1909,  0.1713,  ..., -0.2233,  0.1544,  0.1428],
         [ 0.0459,  0.0306,  0.4696,  ...,  0.0365,  0.1092,  0.3024],
         ...,
         [-0.2216,  0.2186, -0.5687,  ...,  0.2229, -0.1042,  0.1191],
         [-0.0073,  0.0120,  0.01

In [71]:
result.loss

tensor(3.9699, grad_fn=<NllLossBackward>)

In [76]:
model.model.encoder.embed_positions

BartLearnedPositionalEmbedding(1026, 1024, padding_idx=1)

In [77]:
tok?

In [78]:
inputs

{'input_ids': tensor([[    0, 30876,  2492,   565,    16,    10,  5808,     6,  3563,  2239,
            12,   805,  8408, 14823,    14,    64,  5604,   786,    12,    90,
         16936,  2617, 46195, 20097,   227, 11305,     4,   243,    16,  5389,
            15,    10,   285,  2783,     9,  2945,    36,   627,   305, 11674,
          4369, 18715, 38559, 12927, 41616,    43,    25,   157,    25,   943,
          2945,  1286,    30,     5,  3018,     4,  2873,  1984, 11305,  5211,
            30,   163,  3850,  2492,   565,     4,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}