In [None]:
import torch
from torch.utils.data import Dataset
from transformers import PegasusForConditionalGeneration, PegasusTokenizer, Trainer, TrainingArguments
import pandas as pd

class CustomDataset(Dataset):
    """Map-style dataset of (text, title) pairs tokenized for seq2seq fine-tuning.

    Rows are read from a CSV file that must provide a ``text`` column (model
    input) and a ``titles`` column (generation target). Each row is tokenized
    on access and padded/truncated to a fixed length, so every item has the
    same tensor shapes and can be stacked into a batch directly.
    """

    # Label value skipped by the cross-entropy loss; padded target positions
    # are set to this so they do not contribute to the training loss.
    IGNORE_INDEX = -100

    def __init__(self, tokenizer, data_file, max_input_length, max_target_length):
        """Load the CSV and remember the tokenization settings.

        Args:
            tokenizer: Callable tokenizer returning an encoding that exposes
                ``input_ids`` and ``attention_mask`` as attributes.
            data_file: Path (or buffer) accepted by ``pandas.read_csv``.
            max_input_length: Fixed token length of the encoded input text.
            max_target_length: Fixed token length of the encoded target text.
        """
        self.tokenizer = tokenizer
        self.data = pd.read_csv(data_file)
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize one row and return the tensors for a training example.

        Args:
            index: Positional row index into the loaded data.

        Returns:
            dict with 1-D tensors ``input_ids`` and ``attention_mask`` (length
            ``max_input_length``) and ``labels`` (length ``max_target_length``)
            where padded positions hold ``IGNORE_INDEX``.
        """
        # Fetch the row once instead of doing a positional lookup per column.
        row = self.data.iloc[index]

        # Tokenize the input and target texts
        inputs = self.tokenizer(
            row["text"],
            padding="max_length",
            truncation=True,
            max_length=self.max_input_length,
            return_tensors="pt",
        )
        targets = self.tokenizer(
            row["titles"],
            padding="max_length",
            truncation=True,
            max_length=self.max_target_length,
            return_tensors="pt",
        )

        # Padding is only there to reach a fixed length; mask it out of the
        # labels so the loss is computed on real target tokens only.
        labels = targets.input_ids[0].masked_fill(
            targets.attention_mask[0] == 0, self.IGNORE_INDEX
        )

        return {
            "input_ids": inputs.input_ids[0],
            "attention_mask": inputs.attention_mask[0],
            "labels": labels,
        }

# Prefer the GPU when PyTorch can see one; otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Load the pretrained Pegasus tokenizer and model, then move the model
# onto the selected device.
tokenizer = PegasusTokenizer.from_pretrained("google/pegasus-xsum")
model_pegasus = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
model_pegasus = model_pegasus.to(device)

# Build the training dataset from the CSV of (text, titles) pairs.
train_dataset = CustomDataset(
    tokenizer,
    "data/train.csv",
    max_input_length=128,
    max_target_length=128,
)

# Training configuration:
#   - 3 passes over the training data
#   - 4 examples per device per step
#   - a checkpoint written every 100 steps under output_dir
training_args = TrainingArguments(
    output_dir="./pegasus_trained_model",
    logging_dir="./logs",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    save_steps=100,
)

# Wire the model, configuration and data into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model_pegasus,
    train_dataset=train_dataset,
)

# Run fine-tuning.
trainer.train()

# Persist the fine-tuned weights and config to a standalone directory.
model_pegasus.save_pretrained("trained_pegasus_model")


In [None]:

# Generate summaries for the test data
batch_size = 4  # Adjust batch size as needed
test_data = pd.read_csv("data/test_text.csv")
# Ceiling division so a final, partially filled batch is still processed.
num_batches = (len(test_data) + batch_size - 1) // batch_size

generated_summaries = []

# Set the model to evaluation mode (disables dropout)
model_pegasus.eval()

# No gradients are needed for generation; skipping them saves memory.
with torch.no_grad():
    for i in range(num_batches):
        batch_data = test_data.iloc[i * batch_size: (i + 1) * batch_size]

        # Tokenize the input texts; padding=True pads to the longest text in
        # the batch, so the attention mask marks which positions are real.
        input_encodings = tokenizer(
            batch_data["text"].tolist(),
            truncation=True,
            padding=True,
            return_tensors="pt",
        ).to(device)

        # Generate summaries. The attention mask is passed explicitly so the
        # encoder ignores padding instead of relying on generate() to
        # re-derive the mask from pad token ids.
        outputs = model_pegasus.generate(
            input_ids=input_encodings["input_ids"],
            attention_mask=input_encodings["attention_mask"],
            max_length=50,
            num_beams=4,
            early_stopping=True,
        )

        # Decode the generated summaries
        decoded_summaries = tokenizer.batch_decode(outputs, skip_special_tokens=True)

        generated_summaries.extend(decoded_summaries)

# Every test row must have exactly one generated title before pairing with IDs.
assert len(generated_summaries) == len(test_data), "summary count does not match test rows"

# Create a DataFrame for the submission
submission_df = pd.DataFrame({"ID": test_data["ID"], "titles": generated_summaries})

# Save the DataFrame to a CSV file
submission_df.to_csv("pegasus.csv", index=False)