In [None]:
!pip install bert_score
!pip install datsets transformers[sentencepiece]
!pip install sentencepiece

In [None]:
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from bert_score import score
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
from torch import cuda

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
if cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

In [None]:
class CustomDataset(Dataset): # https://www.learnpytorch.io/04_pytorch_custom_datasets/
    """Map-style dataset that tokenizes (source text, summary) pairs on access.

    Args:
        dataframe: pandas DataFrame with a ``src`` column (the text fed to the
            model) and a ``tgt`` column (the reference summary).
        tokenizer: Hugging Face tokenizer; it is invoked as
            ``tokenizer([text], max_length=..., padding=..., ...)``.
        source_len: Fixed token length every source text is padded or
            truncated to.
        summ_len: Fixed token length every summary is padded or truncated to.

    Each item is a dict of 1-D ``torch.long`` tensors with the keys
    ``source_ids``, ``source_mask``, ``target_ids`` and ``target_ids_y``
    (the last two hold the same token ids).
    """

    def __init__(self, dataframe, tokenizer, source_len, summ_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.summ_len = summ_len
        self.original = self.data.src
        self.summary = self.data.tgt

    def __len__(self):
        """Return the number of (source, summary) pairs."""
        return len(self.original)

    def _encode(self, text, max_length):
        """Tokenize one string to exactly ``max_length`` tokens (pad + truncate)."""
        # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`;
        # truncation is requested explicitly instead of relying on the implicit
        # fallback the library warns about.
        return self.tokenizer(
            [text],
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt',
        )

    def __getitem__(self, index):
        """Return the tokenized pair stored at position ``index``."""
        # Positional lookup, so the dataset also works when the frame's index
        # is not 0..n-1 (e.g. a sampled frame whose index was not reset).
        # Runs of whitespace are collapsed to single spaces before tokenizing.
        original = ' '.join(str(self.original.iloc[index]).split())
        summary = ' '.join(str(self.summary.iloc[index]).split())

        # The source text is the model input and the summary is the target.
        # (These two were previously swapped.)
        source = self._encode(original, self.source_len)
        target = self._encode(summary, self.summ_len)

        # Drop only the batch axis; a bare squeeze() would also collapse a
        # sequence axis of length 1.
        source_ids = source['input_ids'].squeeze(0)
        source_mask = source['attention_mask'].squeeze(0)
        target_ids = target['input_ids'].squeeze(0)

        return {
            'source_ids': source_ids.to(dtype=torch.long),
            'source_mask': source_mask.to(dtype=torch.long),
            'target_ids': target_ids.to(dtype=torch.long),
            'target_ids_y': target_ids.to(dtype=torch.long)
        }

In [None]:
def train(epoch, tokenizer, model, device, loader, optimizer): # https://www.learnpytorch.io/06_pytorch_transfer_learning/
    """Run one training epoch over ``loader``, updating ``model`` in place.

    Args:
        epoch: Epoch number; used only in the progress message.
        tokenizer: Object exposing ``pad_token_id``.
        model: Seq2seq model called with ``input_ids``, ``attention_mask`` and
            ``labels``; the first element of its output must be the loss.
        device: Device the batch tensors are moved to.
        loader: Iterable of batches (dicts) with the keys ``source_ids``,
            ``source_mask`` and ``target_ids``.
        optimizer: Optimizer exposing ``zero_grad()`` and ``step()``.

    Returns:
        None. The loss is printed every 500 steps.
    """
    model.train()
    for step, batch in enumerate(loader):
        ids = batch['source_ids'].to(device, dtype=torch.long)
        mask = batch['source_mask'].to(device, dtype=torch.long)
        target = batch['target_ids'].to(device, dtype=torch.long)

        # Padding must not contribute to the loss: -100 is the label value the
        # Hugging Face seq2seq loss ignores. Work on a copy so the batch
        # tensor itself is left untouched.
        labels = target.clone()
        labels[labels == tokenizer.pad_token_id] = -100

        # Pass the full target as `labels` and no `decoder_input_ids`:
        # T5ForConditionalGeneration (the model used in this notebook) then
        # builds the decoder input by shifting the labels right behind its
        # start token. The earlier hand-made shift (y[:, :-1] / y[:, 1:]) never
        # fed the start token and never trained on the first target token.
        outputs = model(input_ids=ids, attention_mask=mask, labels=labels)
        loss = outputs[0]

        if step % 500 == 0:
            print(f'Epoch: {epoch}, Loss:  {loss.item()}')

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [None]:
# --- Configuration ----------------------------------------------------------
EPOCHS = 2
BATCH_SIZE = 2      # the loaders previously reused EPOCHS as the batch size
MAX_LEN = 1024      # token length source texts are padded/truncated to
SUMMARY_LEN = 300   # token length summaries are padded/truncated to
SEED = 42           # makes the split and the batch shuffling reproducible

torch.manual_seed(SEED)
np.random.seed(SEED)

# --- Data -------------------------------------------------------------------
# t5-base defaults to a 512-token limit; state the longer limit explicitly,
# as the tokenizer's own warning recommends for sequences longer than 512.
tokenizer = T5Tokenizer.from_pretrained("t5-base", model_max_length=MAX_LEN)
df = pd.read_csv('train_cut.csv',encoding='latin-1')
# Keep the two columns used here and add the T5 task prefix to every source.
df = df[['src','tgt']].assign(src=lambda frame: 'summarize: ' + frame['src'])

train_size = 0.8
train_sample = df.sample(frac=train_size, random_state=SEED)
# Remove the sampled rows by their ORIGINAL labels, and only then reset the
# indices. Resetting first (as before) made `drop` discard rows 0..n_train-1
# instead of the sampled rows, so validation rows overlapped the training set.
val_dataset = df.drop(train_sample.index).reset_index(drop=True)
train_dataset = train_sample.reset_index(drop=True)
assert len(train_dataset) + len(val_dataset) == len(df)

training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)
val_set = CustomDataset(val_dataset, tokenizer, MAX_LEN, SUMMARY_LEN)

train_params = {
    'batch_size': BATCH_SIZE,
    'shuffle': True,
    'num_workers': 0
    }

val_params = {
    'batch_size': BATCH_SIZE,
    'shuffle': False,
    'num_workers': 0
    }

training_loader = DataLoader(training_set, **train_params)
val_loader = DataLoader(val_set, **val_params)

# --- Model and optimizer ----------------------------------------------------
model = T5ForConditionalGeneration.from_pretrained("t5-base")
model = model.to(device)

optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-4)

# --- Fine-tuning ------------------------------------------------------------
for epoch in range(EPOCHS):
    train(epoch, tokenizer, model, device, training_loader, optimizer)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Epoch: 0, Loss:  15.12225341796875
Epoch: 1, Loss:  3.128291606903076


In [None]:
def predict(text, tokenizer, model, device, summary_len=300, source_len=1024):
    """Generate a summary of ``text`` with beam search.

    Args:
        text: Raw input string; the ``'summarize: '`` task prefix is added here.
        tokenizer: Hugging Face tokenizer used to encode the input and decode
            the generated ids.
        model: Model exposing ``eval()`` and ``generate(...)``.
        device: Device the encoded input is moved to.
        summary_len: Maximum length passed to ``generate`` as ``max_length``.
        source_len: Maximum number of input tokens; longer inputs are
            truncated. The default mirrors the notebook's ``MAX_LEN`` used
            when building the training set.

    Returns:
        The decoded first generated sequence, with special tokens removed.
    """
    model.eval()
    text = 'summarize: ' + text
    # A single sequence needs no padding. The deprecated
    # `pad_to_max_length=True` padded it up to the tokenizer's maximum length
    # and never truncated over-long inputs; truncate explicitly instead.
    encoding = tokenizer(
        text,
        max_length=source_len,
        truncation=True,
        return_tensors="pt",
    )
    input_ids = encoding["input_ids"].to(device)
    attention_masks = encoding["attention_mask"].to(device)

    outputs = model.generate(
        input_ids=input_ids,
        attention_mask=attention_masks,
        max_length=summary_len,
        num_beams=2,
        repetition_penalty=2.5,
        length_penalty=1.0,
        early_stopping=True
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)