# General Imports

In [None]:
!pip install bert_score
from bert_score import score
import numpy as np
import pandas as pd

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# T5 Transformer

T5 is a pre-trained model structure, so maybe we can use this as a upper benchmark?

[BERTScore Paper](https://arxiv.org/pdf/1904.09675.pdf)<br>
[T5 Paper](https://arxiv.org/pdf/1910.10683v3.pdf)

In [None]:
!pip install datsets transformers[sentencepiece]
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement datsets (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for datsets[0m[31m
[0mLooking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch import cuda

In [None]:
!nvidia-smi

Fri Jun  2 10:06:08 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P8     9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
device = 'cuda' if cuda.is_available() else 'cpu'

In [None]:
class CustomDataset(Dataset): # https://www.learnpytorch.io/04_pytorch_custom_datasets/

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.original = self.data.src
        self.summary = self.data.tgt

    def __len__(self):
        return len(self.original)

    def __getitem__(self, index):
        summary = str(self.summary[index])
        summary = ' '.join(summary.split())

        original = str(self.original[index])
        original = ' '.join(original.split())

        source = self.tokenizer.batch_encode_plus([summary], max_length= self.source_len, pad_to_max_length=True,return_tensors='pt')
        target = self.tokenizer.batch_encode_plus([original], max_length= self.summ_len, pad_to_max_length=True,return_tensors='pt')

        source_ids = source['input_ids'].squeeze()
        source_mask = source['attention_mask'].squeeze()
        target_ids = target['input_ids'].squeeze()
        target_mask = target['attention_mask'].squeeze()

        return {
            'source_ids': source_ids.to(dtype=torch.long), 
            'source_mask': source_mask.to(dtype=torch.long), 
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer): # https://www.learnpytorch.io/06_pytorch_transfer_learning/
    model.train()
    for _,data in enumerate(loader, 0):
        y = data['target_ids'].to(device, dtype = torch.long)
        y_ids = y[:, :-1].contiguous()
        lm_labels = y[:, 1:].clone().detach()
        lm_labels[y[:, 1:] == tokenizer.pad_token_id] = -100
        ids = data['source_ids'].to(device, dtype = torch.long)
        mask = data['source_mask'].to(device, dtype = torch.long)

        outputs = model(input_ids = ids, attention_mask = mask, decoder_input_ids=y_ids, labels=lm_labels)
        loss = outputs[0]
        
        if _%500==0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
EPOCHS = 2
MAX_LEN = 1024
SUMMARY_LEN = 300 

tokenizer = T5Tokenizer.from_pretrained("t5-base")
df = pd.read_csv('train_cut.csv',encoding='latin-1')
df = df[['src','tgt']]
df.src = 'summarize: ' + df.src

train_size = 0.8
train_dataset=df.sample(frac=train_size).reset_index(drop=True)
val_dataset=df.drop(train_dataset.index).reset_index(drop=True)

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

train_params = {
    'batch_size': EPOCHS,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': EPOCHS,
    'shuffle': False,
    'num_workers': 0
    }

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

for epoch in range(EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  15.12225341796875
Epoch: 1, Loss:  3.128291606903076


In [None]:
def predict(text, tokenizer, model, device, summary_len=300):
    model.eval()
    text = 'summarize: ' + text
    encoding = tokenizer.encode_plus(text, pad_to_max_length=True, return_tensors="pt")
    input_ids, attention_masks = encoding["input_ids"].to(device), encoding["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=summary_len,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    output_str = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return output_str

In [None]:
df_test = pd.read_csv('test_cut.csv')
df_test['y_pred'] = ""

for i in range(len(df_test)):
  df_test.iloc[i]['y_pred'] = predict(df_test.iloc[i]['src'], tokenizer, model, device, summary_len=300)


df_test.to_csv('test_cut_gigi.csv', index=False)

Token indices sequence length is longer than the specified maximum sequence length for this model (1176 > 512). Running this sequence through the model will result in indexing errors
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_test.iloc[i]['y_pred'] = predict(df_test.iloc[i]['src'], tokenizer, model, device, summary_len=300)


In [None]:
df_test['y_pred'].iloc[0]

''

In [None]:
predict(df_test.iloc[2]['src'], tokenizer, model, device, summary_len=300)

'a photo with Sly Stallone cost $395, an autograph was $395 and a photo with him was $445. A woman selling tickets for Stallone\'s signature said she had only 20 tickets left unsold at the time of the event. Then there were those who paid more than $500 to pose with their hero. And that\'s not all. There are plenty of other celebrities who have been booked by Comic Con. Here\'s a price list of some of the stars who took photos with Stallone. "He\'s flying here straight from Bulgaria," said a woman selling tickets for Stallone\'s autograph. "This is a very limited opportunity," she said. "This is a very limited opportunity." At 3:30 pm, there were only 20 tickets left unsold, though she couldn\'t say out of how many available.'

In [None]:
%pip install rouge_score

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=2c34692abce68de1b40de17ea7b7c45e332dbcf9b4ffc6f73bde06ecf674bcdc
  Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2


In [None]:
from rouge_score import rouge_scorer
from tqdm import tqdm

{'rougeL': Score(precision=0.265625, recall=0.1717171717171717, fmeasure=0.2085889570552147)}

In [None]:
scores = []
for i in tqdm(range(len(df_test))):
  scores.append(rouge_scorer.RougeScorer(['rougeL'], use_stemmer=True).score(df_test.iloc[i]['tgt'], predict(df_test.iloc[i]['src'], tokenizer, model, device, summary_len=300)))

100%|██████████| 307/307 [16:35<00:00,  3.24s/it]


In [None]:
# print average score
precision = []
recall = []
fmeasure = []

for i in range(len(scores)):
    precision.append(scores[i]['rougeL'].precision)
    recall.append(scores[i]['rougeL'].recall)
    fmeasure.append(scores[i]['rougeL'].fmeasure)

print("Precision: " + str(np.mean(precision)))
print("Recall: " + str(np.mean(recall)))
print("F1: " + str(np.mean(fmeasure)))

Precision: 0.30209967686669587
Recall: 0.1879199834748391
F1: 0.22349411951337914


In [None]:
# create dataset pandas with precision, recall, f1 to send to Niko to plot :)
df_scores = pd.DataFrame(list(zip(precision, recall, fmeasure)), columns =['precision', 'recall', 'f1'])
df_scores.to_csv('scores.csv', index=False)


NameError: ignored