In [1]:
import pandas as pd
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import T5Tokenizer, T5ForConditionalGeneration, AdamW
from datasets import load_dataset


class SummarizationDataset(Dataset):
    def __init__(self, dataframe, tokenizer, source_len, target_len):
        self.tokenizer = tokenizer
        self.data = dataframe
        self.source_len = source_len
        self.target_len = target_len
        self.source_text = self.data['article']
        self.target_text = self.data['title']

    def __len__(self):
        return len(self.source_text)

    def __getitem__(self, index):
        source_text = str(self.source_text[index])
        target_text = str(self.target_text[index])

        source = self.tokenizer.batch_encode_plus(
            [source_text],
            max_length=self.source_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )
        target = self.tokenizer.batch_encode_plus(
            [target_text],
            max_length=self.target_len,
            padding='max_length',
            truncation=True,
            return_tensors="pt"
        )

        source_ids = source["input_ids"].squeeze()
        source_mask = source["attention_mask"].squeeze()
        target_ids = target["input_ids"].squeeze()
        target_mask = target["attention_mask"].squeeze()

        return {
            "source_ids": source_ids,
            "source_mask": source_mask,
            "target_ids": target_ids,
            "target_mask": target_mask,
        }


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
dataset = load_dataset("HooshvareLab/pn_summary")

In [3]:
tokenizer = T5Tokenizer.from_pretrained("google/mt5-small")
train_data = dataset['train']
test_data = dataset['test']
eval_data = dataset['validation']

# Extract articles and titles
train_articles = [train_data[i]['article'] for i in range(20000)]
train_titles = [train_data[i]['title'] for i in range(20000)]
test_articles = [test_data[i]['article'] for i in range(3000)]
test_titles = [test_data[i]['title'] for i in range(3000)]
eval_articles = [eval_data[i]['article'] for i in range(3000)]
eval_titles = [eval_data[i]['title'] for i in range(3000)]

# Create DataFrames
train_df = pd.DataFrame({
    'article': train_articles,
    'title': train_titles
})
test_df = pd.DataFrame({
    'article': test_articles,
    'title': test_titles
})
eval_df = pd.DataFrame({
    'article': eval_articles,
    'title': eval_titles
})

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [4]:
source_len = 768
target_len = 150
batch_size = 8
learning_rate = 3e-5
epochs = 4

# Create Dataset objects
train_dataset = SummarizationDataset(train_df, tokenizer, source_len, target_len)
test_dataset = SummarizationDataset(test_df, tokenizer, source_len, target_len)
eval_dataset = SummarizationDataset(eval_df, tokenizer, source_len, target_len)

# Create DataLoader objects
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

# Initialize the model
model = T5ForConditionalGeneration.from_pretrained("google/mt5-small")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

# Define optimizer
optimizer = AdamW(model.parameters(), lr=learning_rate)

You are using a model of type mt5 to instantiate a model of type t5. This is not supported for all configurations of models and can yield errors.


In [None]:
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()

        input_ids = batch["source_ids"].to(device, dtype=torch.long)
        attention_mask = batch["source_mask"].to(device, dtype=torch.long)
        labels = batch["target_ids"].to(device, dtype=torch.long)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch + 1}, Loss: {total_loss / len(train_loader)}")

    # Evaluation loop
    model.eval()
    total_eval_loss = 0
    with torch.no_grad():
        for batch in eval_loader:
            input_ids = batch["source_ids"].to(device, dtype=torch.long)
            attention_mask = batch["source_mask"].to(device, dtype=torch.long)
            labels = batch["target_ids"].to(device, dtype=torch.long)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            total_eval_loss += loss.item()

    print(f"Validation Loss: {total_eval_loss / len(eval_loader)}")

print("Training completed.")

In [None]:
# Prediction function
def predict(model, tokenizer, text, device, max_length=512):
    model.eval()
    inputs = tokenizer.encode_plus(text, return_tensors="pt", max_length=max_length, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)

    with torch.no_grad():
        outputs = model.generate(input_ids=input_ids, attention_mask=attention_mask, max_length=150, num_beams=4, early_stopping=True)
    
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)
    return prediction

# Example usage
sample_article = eval_df.iloc[0]['article']
print("Article:", sample_article)
predicted_title = predict(model, tokenizer, sample_article, device)
print("Predicted Title:", predicted_title)