# Summarization with BART - using Pipeline

In [2]:
from transformers import BartTokenizer, BartForConditionalGeneration, BartConfig
from transformers import pipeline

# Load the distilled BART summarization checkpoint and its tokenizer.
# Hub identifiers have the form "<owner>/<name>"; the model must use the same
# 'sshleifer/distilbart-cnn-12-6' id as the tokenizer (the previous
# hyphen-joined spelling is not a valid Hub id).
model = BartForConditionalGeneration.from_pretrained('sshleifer/distilbart-cnn-12-6')
tokenizer = BartTokenizer.from_pretrained('sshleifer/distilbart-cnn-12-6')

# Bundle model and tokenizer into a ready-to-call summarization pipeline.
nlp = pipeline("summarization", model=model, tokenizer=tokenizer)

In [3]:
import pprint
pp = pprint.PrettyPrinter(indent=0, width=100)

In [4]:
# Sample customer review used as the summarization input.
text='''
We order two different types of jewelry from this
company the other jewelry we order is perfect.
However with this jewelry I have a few things I
don't like. The little Stone comes out of these
and customers are complaining and bringing them
back and we are having to put new jewelry in their
holes. You cannot sterilize these in an autoclave
as well because it heats up too much and the glue
does not hold up so the second group of these that
we used I did not sterilize them that way and the
stones still came out. When I use a dermal clamp
to put the top on the stones come out immediately.
DO not waste your money on this particular product
buy the three mm. that has the claws that hold the
jewelry in those are perfect. So now I'm stuck
with jewelry that I can't sell not good for
business.
'''
# The pipeline result is indexed as q[0]['summary_text'] in the next cell.
q=nlp(text)

In [5]:
pp.pprint(q[0]['summary_text'])

(' The little Stone comes out of these little stones and customers are complaining and bringing '
 'them back and we are having to put new jewelry in their holes . You cannot sterilize these in an '
 'autoclave because it heats up too much and the glue does not hold up so the second group of '
 'these that we used I did not sterilize them that way and the stones still came out .')


# Training AR Models

## Getting Training Data

In [12]:
!wget https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt

--2021-11-10 16:44:16--  https://raw.githubusercontent.com/teropa/nlp/master/resources/corpora/gutenberg/austen-emma.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 887071 (866K) [text/plain]
Saving to: ‘austen-emma.txt.1’


2021-11-10 16:44:18 (1.08 MB/s) - ‘austen-emma.txt.1’ saved [887071/887071]



## Training (BPE) Tokenizer

In [1]:
from tokenizers.models import BPE
from tokenizers import Tokenizer
from tokenizers.decoders import ByteLevel as ByteLevelDecoder
from tokenizers.normalizers import NFKC, Sequence, Lowercase
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer

In [2]:
# Assemble the tokenizer pipeline stage by stage:
#   model         -> BPE (untrained here; fitted by the trainer further down)
#   normalizer    -> lowercase all input text
#   pre-tokenizer -> byte-level splitting
#   decoder       -> byte-level decoding, the counterpart of the pre-tokenizer
tokenizer = Tokenizer(BPE())
tokenizer.normalizer = Sequence([Lowercase()])
tokenizer.pre_tokenizer = ByteLevel()
tokenizer.decoder = ByteLevelDecoder()

In [3]:
# BPE trainer: at most 50k vocabulary entries, seeded with the full byte-level
# alphabet. The special tokens are registered here so they are part of the
# trained vocabulary (the sanity checks further down show "<s>" encoded as
# id 0 and "</s>" as id 2).
trainer = BpeTrainer(
    vocab_size=50000,
    initial_alphabet=ByteLevel.alphabet(),
    special_tokens=["<s>", "<pad>", "</s>", "<unk>", "<mask>"],
)

In [4]:
tokenizer.train(["austen-emma.txt"], trainer)






In [5]:
print(f"Trained vocab size: {tokenizer.get_vocab_size()}" )

Trained vocab size: 11954


## Saving Tokenizer

In [None]:
!mkdir tokenizer_gpt

In [6]:
tokenizer.save("tokenizer_gpt/tokenizer.json")

In [9]:
# A raw `tokenizers.Tokenizer` has no `save_pretrained` method. Wrapping it in
# a transformers fast tokenizer gives access to it. Note that
# `save_pretrained` expects a directory, not a path to tokenizer.json.
from transformers import PreTrainedTokenizerFast

PreTrainedTokenizerFast(tokenizer_object=tokenizer).save_pretrained("tokenizer_gpt1")

## Loading Tokenizer

In [9]:
from transformers import GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel

In [7]:
tokenizer_gpt = GPT2TokenizerFast.from_pretrained("tokenizer_gpt")

file tokenizer_gpt/config.json not found
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


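The loaded tokenizer does not yet know which vocabulary entries play the special-token roles (EOS, BOS, UNK, PAD, MASK), so we register them explicitly: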

In [8]:
# Assign each special-token role to its string. The call returns the number of
# tokens that were newly added to the vocabulary, which is displayed as the
# cell output.
tokenizer_gpt.add_special_tokens(
    dict(
        eos_token="</s>",
        bos_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
    )
)

0

In [9]:
# sanity check
tokenizer_gpt.eos_token_id

2

In [10]:
# sanity check
tokenizer_gpt.encode("<s> this is </s>")

[0, 469, 361, 225, 2]

## Saving Tokenizer

In [41]:
tokenizer_gpt.save_pretrained("tokenizer_gpt_auto/")

('tokenizer_gpt_auto/tokenizer_config.json',
 'tokenizer_gpt_auto/special_tokens_map.json',
 'tokenizer_gpt_auto/vocab.json',
 'tokenizer_gpt_auto/merges.txt',
 'tokenizer_gpt_auto/added_tokens.json',
 'tokenizer_gpt_auto/tokenizer.json')

## Loading Pre-trained Tokenizer

In [5]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("tokenizer_gpt_auto")

## Creating Configuration and Model

We first create the model configuration and then instantiate an untrained model from it:

In [11]:
# Size the embedding table with len(tokenizer_gpt), which counts the full
# vocabulary including any added tokens; `vocab_size` only counts the base
# vocabulary, so ids of added tokens could fall outside the embedding matrix.
# BOS/EOS ids are taken from the tokenizer so model and tokenizer agree.
config = GPT2Config(
  vocab_size=len(tokenizer_gpt),
  bos_token_id=tokenizer_gpt.bos_token_id,
  eos_token_id=tokenizer_gpt.eos_token_id
)
# Randomly initialised GPT-2 with a language-modelling head (no pre-trained weights).
model = GPT2LMHeadModel(config)

In [12]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(11954, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [13]:
config

GPT2Config {
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 0,
  "embd_pdrop": 0.1,
  "eos_token_id": 2,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.11.3",
  "use_cache": true,
  "vocab_size": 11954
}

## Loading Data for Pre-training

In [14]:
# Load the corpus as a list of lines (each line keeps its trailing newline).
with open("austen-emma.txt", mode="r", encoding="utf-8") as corpus_file:
    content = list(corpus_file)

In [15]:
# Drop short lines (10 characters or fewer, measured before stripping), strip
# the rest, then join everything into one document terminated by the EOS token.
# Both steps share one cell and the intermediate list has its own name, so the
# cell can be re-run safely: `content_p` is always rebuilt from `content`
# rather than being joined (and EOS-suffixed) a second time.
kept_lines = [line.strip() for line in content if len(line) > 10]
content_p = " ".join(kept_lines) + tokenizer_gpt.eos_token

In [17]:
content_p[-20:]

'ss of the union.</s>'

## Tokenizing Data

In [18]:
tokenized_content = tokenizer_gpt.encode(content_p)

In [19]:
tokenized_content[:10], tokenized_content[-10:]

([63, 1888, 440, 594, 11583, 11074, 65, 394, 575, 16],
 [2201, 301, 275, 931, 1127, 288, 275, 5034, 18, 2])

## Preparing Samples

In [20]:
block_size = 100    # tokens per training window
BUFFER_SIZE = 1000  # not referenced in the cells shown here; kept for compatibility

# Sliding windows with stride 1. The range stops at the last position where a
# full `block_size` window still fits, so every example has the same length.
# Iterating up to len(tokenized_content) would append progressively shorter
# tail windows, which cannot be stacked into a rectangular tensor.
examples = [
    tokenized_content[start:start + block_size]
    for start in range(len(tokenized_content) - block_size + 1)
]

In [21]:
# Next-token prediction pairs: the input is every token but the last, the
# label is the same window shifted left by one position.
train_data = [window[:-1] for window in examples]
train_labels = [window[1:] for window in examples]

In [22]:
type(train_data[0])

list

In [23]:
import torch
# Convert the first 195120 input/label sequences into tensors.
train_data, train_labels = (
    torch.tensor(sequences[:195120]) for sequences in (train_data, train_labels)
)


In [24]:
len(train_data)

195120

## Passing One Sample Through the Model

In [25]:
outputs = model(train_data[0]) 

In [26]:
outputs.logits.shape

torch.Size([99, 11954])

## Pytorch Dataset

In [27]:
batch_size=4

In [28]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

In [29]:
# Only the first 10 samples are used so the demo runs quickly; widen or remove
# the [:10] slices to train on more (or all) of the data.
train_dataset = TensorDataset(train_data[:10], train_labels[:10])
train_sampler = RandomSampler(train_dataset)
train_loader = DataLoader(
    dataset=train_dataset,
    sampler=train_sampler,
    batch_size=batch_size,
)

## Training Loop

In [30]:
import time
import datetime

import torch
import torch.nn as nn

import pytorch_lightning as pl
from pytorch_lightning.loggers import CSVLogger
from pytorch_lightning.callbacks import ModelCheckpoint

import tableprint as tp
import torchmetrics

In [31]:
class Model_TL(pl.LightningModule):
    """Lightning wrapper that trains a language model on next-token prediction.

    The wrapped model is called with a batch of token ids and must return an
    object exposing `.logits`. Loss is token-level cross-entropy; accuracy is
    tracked with torchmetrics. After each training epoch one row (accuracy,
    loss, wall-clock time) is printed to a tableprint table and sent to the
    trainer's logger.
    """

    def __init__(self, model):
        """Store the wrapped model and initialise loss, metrics and bookkeeping."""
        super().__init__()
        self.model = model
        # Per-epoch aggregates shown in the progress table.
        self.avg_train_loss = 0.
        self.avg_valid_loss = 0.
        self.train_acc = 0.
        self.valid_acc = 0.
        # Created lazily on the first epoch end (see on_train_epoch_end).
        self.table_context = None
        self.loss_fn = nn.CrossEntropyLoss()
        # Epoch timing.
        self.start_time = 0
        self.end_time = 0
        self.epoch_mins = 0
        self.epoch_secs = 0
        self.train_accm = torchmetrics.Accuracy()
        self.valid_accm = torchmetrics.Accuracy()

    def configure_optimizers(self):
        """Return an Adam optimizer over all parameters (lr=3e-5)."""
        return torch.optim.Adam(self.parameters(), lr=3e-5)

    def _shared_step(self, batch, accuracy_metric):
        """Run one forward pass and update `accuracy_metric`.

        Args:
            batch: `(input_ids, labels)` pair of token-id tensors.
            accuracy_metric: metric object updated with this batch.

        Returns:
            dict with the batch loss (`"loss"`), the arg-max predictions
            (`"p"`) and the labels (`"y"`).
        """
        input_ids, labels = batch
        logits = self.model(input_ids).logits
        # Most likely token at every position (class scores live on dim 2).
        predictions = torch.argmax(logits, dim=2)
        accuracy_metric(predictions, labels)
        # CrossEntropyLoss expects the class dimension at index 1.
        loss = self.loss_fn(logits.permute(0, 2, 1), labels)
        return {"loss": loss, "p": predictions, "y": labels}

    def training_step(self, batch, batch_idx):
        """Compute loss and accuracy for one training batch."""
        return self._shared_step(batch, self.train_accm)

    def validation_step(self, batch, batch_idx):
        """Compute loss and accuracy for one validation batch."""
        return self._shared_step(batch, self.valid_accm)

    def on_train_epoch_start(self):
        """Record the wall-clock start time of the epoch."""
        self.start_time = time.time()

    def validation_epoch_end(self, outputs):
        """Aggregate validation loss/accuracy and log them (skipped during sanity check)."""
        if self.trainer.sanity_checking:
            return

        self.avg_valid_loss = torch.stack([x['loss'] for x in outputs]).mean().item()
        self.valid_acc = (self.valid_accm.compute() * 100).item()
        self.valid_accm.reset()
        # logger=False: these values are only registered as callback metrics;
        # the explicit log_metrics call in on_train_epoch_end feeds the logger.
        self.log("epoch_num", int(self.current_epoch+1), on_step=False, on_epoch=True, prog_bar=False, logger=False)
        self.log("val_loss", self.avg_valid_loss, on_step=False, on_epoch=True, prog_bar=False, logger=False)
        self.log("val_acc", self.valid_acc, on_step=False, on_epoch=True, prog_bar=False, logger=False)

    def training_epoch_end(self, outputs):
        """Aggregate the epoch's training loss and accuracy (accuracy in percent)."""
        self.avg_train_loss = torch.stack([x['loss'] for x in outputs]).mean().item()
        self.train_acc = (self.train_accm.compute() * 100).item()
        self.train_accm.reset()

    def on_train_epoch_end(self):
        """Print one table row for the finished epoch and forward metrics to the logger."""
        self.end_time = time.time()
        time_int = self.format_time(self.start_time, self.end_time)

        metrics = {'epoch': self.current_epoch+1, 'Train Acc': self.train_acc, 'Train Loss': self.avg_train_loss,  'Valid Acc': self.valid_acc, 'Valid Loss': self.avg_valid_loss}
        if self.table_context is None:
            # Open the table on the first epoch; it stays open across epochs.
            self.table_context = tp.TableContext(headers=['epoch', 'Train Acc', 'Train Loss', 'Valid Acc', 'Valid Loss', 'Time'])
            self.table_context.__enter__()
        self.table_context([self.current_epoch+1, self.train_acc, self.avg_train_loss, self.valid_acc, self.avg_valid_loss, time_int])
        self.logger.log_metrics(metrics)

        if self.current_epoch == self.trainer.max_epochs - 1:
            # Close the table after the last epoch, passing the standard
            # "no exception" arguments of the context-manager protocol.
            self.table_context.__exit__(None, None, None)

    def format_time(self, start_time, end_time):
        """Return the elapsed time between two timestamps as an `H:MM:SS` string."""
        elapsed_rounded = int(round(end_time - start_time))
        return str(datetime.timedelta(seconds=elapsed_rounded))

In [32]:
plmodel = Model_TL(model)

In [33]:
# Checkpoint callback: writes ./model.ckpt and ranks checkpoints by the
# highest 'val_acc'.
checkpoint_callback = ModelCheckpoint(
    dirpath='./',
    filename='model',
    monitor='val_acc',
    mode='max',
)
# CSV logger writing under csv_logs/Ch4/version_0.
csvlogger = CSVLogger(save_dir='csv_logs', name='Ch4', version=0)

  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


In [34]:
# Single-epoch CPU run, no sanity-check validation pass, metrics logged every step.
trainer = pl.Trainer(
    max_epochs=1,
    num_sanity_val_steps=0,
    logger=csvlogger,
    gpus=0,
    callbacks=[checkpoint_callback],
    log_every_n_steps=1,
)
# Only a training dataloader is supplied, so no validation loop runs.
trainer.fit(model=plmodel, train_dataloaders=train_loader)

GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
  rank_zero_warn(f"you defined a {step_name} but have no {loader_name}. Skipping {stage} loop")
  rank_zero_warn(

  | Name       | Type             | Params
------------------------------------------------
0 | model      | GPT2LMHeadModel  | 95.0 M
1 | loss_fn    | CrossEntropyLoss | 0     
2 | train_accm | Accuracy         | 0     
3 | valid_accm | Accuracy         | 0     
------------------------------------------------
95.0 M    Trainable params
0         Non-trainable params
95.0 M    Total params
380.092   Total estimated model params size (MB)
  rank_zero_warn(


Training: -1it [00:00, ?it/s]

╭─────────────┬─────────────┬─────────────┬─────────────┬─────────────┬─────────────╮
│       epoch │   Train Acc │  Train Loss │   Valid Acc │  Valid Loss │        Time │
├─────────────┼─────────────┼─────────────┼─────────────┼─────────────┼─────────────┤
│           1 │      3.3333 │       8.908 │           0 │           0 │     0:00:07 │
╰─────────────┴─────────────┴─────────────┴─────────────┴─────────────┴─────────────╯


## Saving Model

In [38]:
!mkdir my_gpt-2

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [39]:
model.save_pretrained("my_gpt-2/")

## Load Our Model

In [10]:
model_reloaded = GPT2LMHeadModel.from_pretrained("my_gpt-2/")

In [11]:
model_reloaded

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(11954, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [1]:
# AutoModel resolves this checkpoint to the bare GPT2Model, which has no
# language-modelling head, so `lm_head.weight` from the checkpoint is skipped
# (see the warning printed below). Loading with GPT2LMHeadModel, as in the
# previous section, or with AutoModelForCausalLM keeps the head.
# Note: this rebinds `model`, replacing the GPT2LMHeadModel trained above.
from transformers import AutoModel
model = AutoModel.from_pretrained("my_gpt-2/") 

Some weights of the model checkpoint at my_gpt-2/ were not used when initializing GPT2Model: ['lm_head.weight']
- This IS expected if you are initializing GPT2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
model

GPT2Model(
  (wte): Embedding(11954, 768)
  (wpe): Embedding(1024, 768)
  (drop): Dropout(p=0.1, inplace=False)
  (h): ModuleList(
    (0): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP(
        (c_fc): Conv1D()
        (c_proj): Conv1D()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
    (1): GPT2Block(
      (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (attn): GPT2Attention(
        (c_attn): Conv1D()
        (c_proj): Conv1D()
        (attn_dropout): Dropout(p=0.1, inplace=False)
        (resid_dropout): Dropout(p=0.1, inplace=False)
      )
      (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (mlp): GPT2MLP

# Generating Text

In [17]:
def generate(start, max_length=30, num_beams=5, no_repeat_ngram_size=2):
    """Continue `start` with beam search using the reloaded model.

    Args:
        start: Prompt text to continue.
        max_length: Maximum total length (prompt + continuation) in tokens.
        num_beams: Number of beams explored by beam search.
        no_repeat_ngram_size: n-grams of this size may not occur twice.

    Returns:
        The decoded text of the best beam (special tokens included).
    """
    input_token_ids = tokenizer.encode(start, return_tensors='pt')

    # Pass the pad id explicitly. generate() otherwise falls back to the EOS id
    # itself and prints a notice on every call; the resulting id is the same.
    config = model_reloaded.config
    pad_id = config.pad_token_id if config.pad_token_id is not None else config.eos_token_id

    # `temperature` is intentionally not passed: it only takes effect when
    # sampling (do_sample=True) and is ignored by plain beam search.
    output = model_reloaded.generate(
        input_token_ids,
        max_length=max_length,
        num_beams=num_beams,
        no_repeat_ngram_size=no_repeat_ngram_size,
        num_return_sequences=1,
        pad_token_id=pad_id,
    )
    return tokenizer.decode(output[0])

In [18]:
generate(" ")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


',, of of, a of a, and of her, her of had of the of indulgent ofatus of and, had, the,'

In [14]:
generate("wetson was very good")

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


'wetson was very good,, of,'