In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW
from sklearn.model_selection import train_test_split
import pandas as pd
from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()

# Load and preprocess the CNN/Daily Mail dataset
class CNNDailyMailDataset(Dataset):
    def __init__(self, articles, highlights, tokenizer, max_length=512):
        self.articles = articles
        self.highlights = highlights
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.articles)

    def __getitem__(self, idx):
        article = self.articles[idx]
        highlight = self.highlights[idx]
        encoding = self.tokenizer.encode_plus(
          article,
          highlight,
          add_special_tokens=True,
          max_length=256,  # Reduce this from 512
          return_tensors='pt',
          padding='max_length',
          truncation=True,
      )
        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': encoding['input_ids'].flatten(),  # Use input_ids as labels for language modeling
        }

# Load the dataset
def load_data():
    # Replace with actual path to the dataset CSV file
    df = pd.read_csv('/content/validation.csv')  # Adjust path as necessary
    articles = df['article'].tolist()  # Change 'article' to your article column
    highlights = df['highlights'].tolist()  # Change 'highlight' to your highlight column
    return train_test_split(articles, highlights, test_size=0.1, random_state=42)

# Load pre-trained GPT-2 model and tokenizer
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token
model = GPT2LMHeadModel.from_pretrained(model_name)  # Use the language model head for generation

# Load the dataset
train_articles, val_articles, train_highlights, val_highlights = load_data()

# Create DataLoaders
train_dataset = CNNDailyMailDataset(train_articles, train_highlights, tokenizer)
val_dataset = CNNDailyMailDataset(val_articles, val_highlights, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=4, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=4)

# Set up optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Training Loop
num_epochs = 3


accumulation_steps = 4  # Adjust based on your needs

for epoch in range(num_epochs):
    model.train()
    total_loss = 0

    for i, batch in enumerate(train_loader):
        optimizer.zero_grad()
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss / accumulation_steps  # Scale the loss
        loss.backward()

        if (i + 1) % accumulation_steps == 0:
            optimizer.step()

        total_loss += loss.item() * accumulation_steps  # Scale back the total loss

    avg_train_loss = total_loss / len(train_loader)
    print(f'Epoch {epoch + 1}/{num_epochs}, Train Loss: {avg_train_loss:.4f}')

# Validation Loop
model.eval()
total_val_loss = 0
with torch.no_grad():
    for batch in val_loader:
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        labels = batch['labels']

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        total_val_loss += outputs.loss.item()

avg_val_loss = total_val_loss / len(val_loader)
print(f'Validation Loss: {avg_val_loss:.4f}')

# Inference Function for Summarization
def summarize(text, model, tokenizer):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True)

        outputs = model.generate(
            input_ids=encoding,
            max_length=150,  # Adjust the max length for the summary
            num_beams=4,  # Use beam search
            early_stopping=True,
        )
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return summary

# Example Inference
test_article = "This is a test article to generate a summary."
generated_summary = summarize(test_article, model, tokenizer)
print(f'Generated Summary: {generated_summary}')

  scaler = GradScaler()
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'lon

KeyboardInterrupt: 

In [4]:
# Inference Function for Summarization
def summarize(text, model, tokenizer):
    model.eval()
    with torch.no_grad():
        encoding = tokenizer.encode(text, return_tensors='pt', max_length=512, truncation=True)

        outputs = model.generate(
            input_ids=encoding,
            max_length=150,  # Adjust the max length for the summary
            num_beams=4,  # Use beam search
            early_stopping=True,
        )
        summary = tokenizer.decode(outputs[0], skip_special_tokens=True)

    return summary

# Example Inference
test_article = "This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary"
generated_summary = summarize(test_article, model, tokenizer)
print(f'Generated Summary: {generated_summary}')

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Summary: This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.This is a test article to generate a summary.
