In [1]:
# !pip install accelerate -U
# !pip install evaluate
# !pip install protobuf==3.20.0
# ! pip install datasets
# !pip install wandb -U
# !pip uninstall torch-gpu
# !pip install sentencepiece

In [2]:
from transformers import pipeline, set_seed
from datasets import load_dataset, load_from_disk
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from transformers import AutoTokenizer
import evaluate
import nltk
import tqdm as tqdm
import torch
nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\papu_\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

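### The original plan was to use Google's Pegasus model for the text summarization task; Pegasus is pre-trained with Masked Language Modelling and Gap-Sentence Generation objectives. https://huggingface.co/google/pegasus-xsum — Because that checkpoint is large, the code below loads the smaller `google/mt5-small` model instead.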

In [3]:
# Pick the compute device: use the GPU when PyTorch reports CUDA support,
# otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
device

'cpu'

In [4]:
from transformers import AutoModelForSeq2SeqLM

# 'google/pegasus-xsum' was the first choice, but it is a large checkpoint,
# so a smaller model is tried instead.
# model_name = 'google/pegasus-xsum'
model_name = 'google/mt5-small'

# The tokenizer and the seq2seq model are both loaded from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model_mt = AutoModelForSeq2SeqLM.from_pretrained(model_name)
model_mt = model_mt.to('cpu')  # keep the weights on the CPU

  "The sentencepiece tokenizer that you are converting to a fast tokenizer uses the byte fallback option"


In [5]:
# Load the SAMSum dataset; the model is fine-tuned on its dialogue/summary pairs
data = load_dataset('samsum')

Found cached dataset samsum (C:/Users/papu_/.cache/huggingface/datasets/samsum/samsum/0.0.0/f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e)
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3/3 [00:00<00:00, 46.12it/s]


In [6]:
data

# To get PyTorch tensors back from the dataset (NOTE(review): presumably this only changes
# the output format and does not itself place data on a GPU — confirm):
# ds.set_format("pt")

DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary'],
        num_rows: 818
    })
})

In [7]:
# Show the column names and one example (row 10) of a dialogue with its reference summary
print('Features in dataset: {}'.format(data['train'].column_names))
print('Dialogue: {}'.format(data['train'][10]['dialogue']))
print('Summary: {}'.format(data['train'][10]['summary']))



Features in dataset: ['id', 'dialogue', 'summary']
Dialogue: Lucas: Hey! How was your day?
Demi: Hey there! 
Demi: It was pretty fine, actually, thank you!
Demi: I just got promoted! :D
Lucas: Whoa! Great news!
Lucas: Congratulations!
Lucas: Such a success has to be celebrated.
Demi: I agree! :D
Demi: Tonight at Death & Co.?
Lucas: Sure!
Lucas: See you there at 10pm?
Demi: Yeah! See you there! :D
Summary: Demi got promoted. She will celebrate that with Lucas at Death & Co at 10 pm.


## Use of text_target in the tokenizer: This is because the languages you're translating between (English and German in this case) have different tokenization vocabularies. This implies that the same text will get tokenized differently on the source and target sides. MarianMT models have seq2seq (encoder-decoder) architectures, and the encoder and decoder each have their own embedding matrix. This means that the encoder may have an embedding vector for the token '▁doctor', whereas the decoder will learn an embedding vector for the token '▁do', an embedding vector for the token 'ctor', etc. (Note: this explanation comes from a translation example; in this notebook the dialogues and summaries are both English and are tokenized with the same tokenizer.)

In [8]:
# Tokenize the data by passing in batches

def tokenize_text_batches(batch_data):
    """Tokenize a batch of dialogues and their reference summaries.

    Uses the module-level ``tokenizer``.

    Args:
        batch_data: Mapping with a ``'dialogue'`` and a ``'summary'`` entry, each a
            list of strings (the batch handed over by ``map(..., batched=True)``).

    Returns:
        dict with ``'input_ids'`` and ``'attention_mask'`` for the dialogues
        (truncated to at most 1024 tokens) and ``'labels'`` holding the summary
        token ids (truncated to at most 512 tokens). Sequences are left unpadded.
    """
    # No padding here: padding a whole map() batch to its longest row wastes memory
    # and leaves pad-token ids inside 'labels', where the loss would count them.
    # The seq2seq data collator pads every training batch instead and fills label
    # padding with the ignore index.
    input_encodings = tokenizer(text = batch_data['dialogue'], truncation=True, max_length=1024)

    # `text_target` replaces the deprecated `as_target_tokenizer()` context manager.
    target_encodings = tokenizer(text_target = batch_data['summary'], truncation=True, max_length=512)

    # Need to pass target encodings within the dict as labels key for the transformers model input
    return {
        'input_ids': input_encodings['input_ids'],
        'attention_mask': input_encodings['attention_mask'],
        'labels': target_encodings['input_ids']
    }



In [9]:
# with tokenizer.as_target_tokenizer():
#     a = tokenizer( text = data['train'][10]['dialogue'], padding=True, truncation=True, max_length=1024)
# a    

In [10]:
# Tokenize every split; batched=True hands tokenize_text_batches up to 500 rows per call
data_enc = data.map(tokenize_text_batches, batched=True, batch_size = 500)


Loading cached processed dataset at C:\Users\papu_\.cache\huggingface\datasets\samsum\samsum\0.0.0\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\cache-707f35c8110f794c.arrow
Loading cached processed dataset at C:\Users\papu_\.cache\huggingface\datasets\samsum\samsum\0.0.0\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\cache-340e266951b38132.arrow
Loading cached processed dataset at C:\Users\papu_\.cache\huggingface\datasets\samsum\samsum\0.0.0\f1d7c6b7353e6de335d444e424dc002ef70d1277109031327bc9cc6af5d3d46e\cache-64b8009eda7821ec.arrow


In [11]:
# Check the type returned by map() and the columns now present in each split
print(type(data_enc), data_enc)


<class 'datasets.dataset_dict.DatasetDict'> DatasetDict({
    train: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 14732
    })
    test: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['id', 'dialogue', 'summary', 'input_ids', 'attention_mask', 'labels'],
        num_rows: 818
    })
})


In [12]:
# Load the ROUGE metric through the `evaluate` library; it is passed to compute_metrics later on
rouge = evaluate.load("rouge")

## Training model

## Use DataCollatorForSeq2Seq to assemble the data into batches
Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset.

To be able to build batches, data collators may apply some processing (like padding). Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation (like random masking) on the formed batch.


In [13]:
from transformers import DataCollatorForSeq2Seq

# Collator that builds training batches; it is given both the tokenizer and the model.
seq2seq_dc = DataCollatorForSeq2Seq(tokenizer, model = model_mt)
# Displaying the collator also prints the full repr of the model it holds.
seq2seq_dc

DataCollatorForSeq2Seq(tokenizer=T5TokenizerFast(name_or_path='google/mt5-small', vocab_size=250100, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True), model=MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (l

In [14]:
from transformers import TrainingArguments, Trainer

# Settings for a single fine-tuning epoch; 'mt5_model_files' is the output directory.
training_args = TrainingArguments(
    output_dir = 'mt5_model_files',
    num_train_epochs = 1,
    warmup_steps = 100,
    per_device_train_batch_size = 10,
    per_device_eval_batch_size = 10,
    weight_decay = 0.02,
    logging_steps = 10,
)


In [15]:
# Fine-tune on the training split and monitor on the validation split.
# The test split is deliberately not passed here: training on it would leak the
# held-out examples and make any later test-set score meaningless.
trainer = Trainer(model = model_mt.to('cpu'), args = training_args, tokenizer = tokenizer, data_collator = seq2seq_dc,
                 train_dataset = data_enc['train'], eval_dataset = data_enc['validation'])

In [15]:
# trainer.train()


Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mpaveethran-s10[0m ([33mlegolas-team[0m). Use [1m`wandb login --relogin`[0m to force relogin


You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: [enforce fail at C:\actions-runner\_work\pytorch\pytorch\builder\windows\pytorch\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 786432000 bytes.

### Evaluate: Custom way of inference

In [34]:
# Display the model held by the trainer (prints the module tree)
trainer.model

MT5ForConditionalGeneration(
  (shared): Embedding(250112, 512)
  (encoder): MT5Stack(
    (embed_tokens): Embedding(250112, 512)
    (block): ModuleList(
      (0): MT5Block(
        (layer): ModuleList(
          (0): MT5LayerSelfAttention(
            (SelfAttention): MT5Attention(
              (q): Linear(in_features=512, out_features=384, bias=False)
              (k): Linear(in_features=512, out_features=384, bias=False)
              (v): Linear(in_features=512, out_features=384, bias=False)
              (o): Linear(in_features=384, out_features=512, bias=False)
              (relative_attention_bias): Embedding(32, 6)
            )
            (layer_norm): MT5LayerNorm()
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (1): MT5LayerFF(
            (DenseReluDense): MT5DenseGatedActDense(
              (wi_0): Linear(in_features=512, out_features=1024, bias=False)
              (wi_1): Linear(in_features=512, out_features=1024, bias=False)
          

In [83]:
def compute_metrics(dataset, metric, model, tokenizer, device, batch_size = 20, max_len=200):
    """Generate summaries for ``dataset`` in batches and score them with ``metric``.

    Args:
        dataset: Mapping-style object whose ``'dialogue'`` and ``'summary'`` entries
            are equally long, sliceable sequences of strings.
        metric: Object exposing ``add_batch(predictions=..., references=...)`` and
            ``compute(...)``; ``compute`` is called with ``use_stemmer=True``.
        model: Model exposing ``generate``. Only the input tensors are moved to
            ``device`` here, so the model is expected to already be on that device.
        tokenizer: Tokenizer used both to encode the dialogues and to decode the
            generated token ids.
        device: Device the encoded inputs are moved to before generation.
        batch_size: Number of dialogues generated per ``model.generate`` call.
        max_len: ``max_length`` passed to ``model.generate``.

    Returns:
        Whatever ``metric.compute(use_stemmer=True)`` returns.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError('batch_size must be a positive integer, got {}'.format(batch_size))

    # Read each column once. Indexing the column inside the loop would fetch the
    # whole column again for every batch.
    dialogues = dataset['dialogue']
    summaries = dataset['summary']
    batch_starts = range(0, len(dialogues), batch_size)

    # Generate in eval mode so dropout cannot perturb the predictions (the model is
    # in train mode after fine-tuning); the previous mode is restored afterwards.
    was_training = getattr(model, 'training', False)
    if was_training:
        model.eval()
    try:
        for start in tqdm.tqdm(batch_starts, total = len(batch_starts)):
            content = dialogues[start:start + batch_size]
            target = summaries[start:start + batch_size]

            content_enc = tokenizer(text = content, padding=True, truncation=True, max_length=1024,
                                    return_tensors = 'pt')

            # NOTE(review): length_penalty presumably only matters for beam-based
            # generation; no num_beams is set here — confirm the intent.
            prediction_tokens = model.generate(input_ids = content_enc['input_ids'].to(device),
                                               attention_mask = content_enc['attention_mask'].to(device),
                                               length_penalty = 0.6, max_length = max_len)

            # Decode the generated token ids back to text, one string per dialogue
            prediction_summary = tokenizer.batch_decode(prediction_tokens, skip_special_tokens=True,
                                                        clean_up_tokenization_spaces = True)

            # Accumulate this batch for the final score
            metric.add_batch(predictions = prediction_summary, references=target)
    finally:
        if was_training:
            model.train()

    return metric.compute(use_stemmer=True)

In [84]:
# Score model_mt on the first 100 validation examples, 20 dialogues per generate call.
# NOTE(review): trainer.train() is commented out above, so this appears to evaluate the
# checkpoint without any fine-tuning — confirm before reading much into the scores.
compute_metrics(data['validation'][:100], rouge, model_mt, tokenizer, device = 'cpu', batch_size = 20, max_len = 200)


5



  0%|                                                                                                                      | 0/5 [00:00<?, ?it/s][A
 20%|██████████████████████                                                                                        | 1/5 [00:12<00:50, 12.72s/it][A
 40%|████████████████████████████████████████████                                                                  | 2/5 [00:20<00:30, 10.02s/it][A
 60%|██████████████████████████████████████████████████████████████████                                            | 3/5 [00:29<00:18,  9.27s/it][A
 80%|████████████████████████████████████████████████████████████████████████████████████████                      | 4/5 [00:38<00:09,  9.11s/it][A
100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5/5 [00:44<00:00,  8.94s/it][A


{'rouge1': 0.005072661600525378,
 'rouge2': 0.0009090909090909091,
 'rougeL': 0.004950136035275355,
 'rougeLsum': 0.004953325827938831}

### Pipeline way of inference

In [86]:
from transformers import pipeline
# Build a summarization pipeline from the "google/mt5-small" checkpoint name.
# NOTE(review): the pipeline is given the checkpoint name rather than `model_mt`, so it
# presumably loads its own copy of the pretrained weights — confirm that is intended.
summarizer = pipeline("summarization", model="google/mt5-small")

In [90]:
# Print each of the first 10 test dialogues together with the pipeline's summary.
# The loop variable `i` stays bound to the last dialogue and is reused by the next cell.
for i in data['test']['dialogue'][:10]:
    print('Dialogue: {}, Summary: {} '.format(i, summarizer(i)[0]['summary_text']))

Hannah: Hey, do you have Betty's number?
Amanda: Lemme check
Hannah: <file_gif>
Amanda: Sorry, can't find it.
Amanda: Ask Larry
Amanda: He called her last time we were at the park together
Hannah: I don't know him well
Hannah: <file_gif>
Amanda: Don't be shy, he's very nice
Hannah: If you say so..
Hannah: I'd rather you texted him
Amanda: Just text him 🙂
Hannah: Urgh.. Alright
Hannah: Bye
Amanda: Bye bye


TypeError: list indices must be integers or slices, not str

In [91]:
# Inspect the raw pipeline output for the dialogue still held in `i` from the loop above
summarizer(i)

[{'summary_text': '<extra_id_0>.'}]