In [None]:
from transformers import EncoderDecoderModel, BartTokenizer, BertTokenizer

# Load tokenizer (we'll use BART's tokenizer consistently for both sides)
from transformers import BartTokenizer, BertTokenizer

# Separate tokenizers for decoder and encoder
decoder_tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")
encoder_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")


# Load full encoder-decoder model
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    "bert-base-uncased", "facebook/bart-base"
)

# Configure special token IDs
model.config.decoder_start_token_id = tokenizer.bos_token_id
model.config.eos_token_id = tokenizer.eos_token_id
model.config.pad_token_id = tokenizer.pad_token_id


  from .autonotebook import tqdm as notebook_tqdm
Some weights of BartForCausalLM were not initialized from the model checkpoint at facebook/bart-base and are newly initialized: ['lm_head.weight', 'model.decoder.embed_tokens.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [2]:
import pandas as pd

train_df = pd.read_csv('samsum_csv_data/train.csv')
validate_df = pd.read_csv('samsum_csv_data/validation.csv')

In [3]:
from transformers import BartTokenizer
import torch

# Load BART tokenizer
tokenizer = BartTokenizer.from_pretrained("facebook/bart-base")

# Slice the DataFrame
sample_df = train_df.iloc[:100]
input_texts = sample_df["dialogue"].tolist()
target_texts = sample_df["summary"].tolist()

# Tokenize encoder inputs (dialogue)
encoder_inputs = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=512,
    return_tensors="pt"
)

# Tokenize decoder targets (summary)
with tokenizer.as_target_tokenizer():
    decoder_inputs = tokenizer(
        target_texts,
        padding=True,
        truncation=True,
        max_length=64,
        return_tensors="pt"
    )

# Prepare decoder labels (masking pad tokens)
labels = decoder_inputs["input_ids"].clone()
labels[labels == tokenizer.pad_token_id] = -100



In [4]:
from transformers import EncoderDecoderModel, BartTokenizer
from torch.utils.data import Dataset, DataLoader
import torch
from torch.optim import AdamW
from tqdm import tqdm

# Dataset wrapper
class SummarizationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Wrap your data
train_dataset = SummarizationDataset(encoder_inputs, labels)

# Dataloader
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Move model to device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
device = torch.device("cpu")

# Move model to CPU
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train
epochs = 3
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    loop = tqdm(train_loader, desc="Training", leave=False)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        print({k: v.shape for k, v in batch.items()})
        print("Max label ID:", batch["labels"].max().item())
        print("Decoder vocab size:", model.config.decoder.vocab_size)
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())


Epoch 1


                                                 

{'input_ids': torch.Size([1, 512]), 'attention_mask': torch.Size([1, 512]), 'labels': torch.Size([1, 64])}
Max label ID: 32589
Decoder vocab size: 50265


IndexError: index out of range in self

In [5]:
print("Labels:", batch["labels"][0][:10])
shifted = batch["labels"][0][:-1]
print("Shifted decoder_input_ids:", shifted)
print("Any negatives?", (shifted < 0).any())


Labels: tensor([    0, 27298,    16,   567,     4, 15336,    40,   907,  5306,    92])
Shifted decoder_input_ids: tensor([    0, 27298,    16,   567,     4, 15336,    40,   907,  5306,    92,
        17753,    25,    37, 15033,   106, 32589,     4,  4769,  2162,  2864,
           10,    92,  2125,     9,  1637,  2104,     4,     2,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100])
Any negatives? tensor(True)


In [7]:
# Dataset wrapper
class SummarizationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Wrap your data
train_dataset = SummarizationDataset(encoder_inputs, labels)

# Dataloader
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Move model to device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
device = torch.device("cpu")

# Move model to CPU
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train
epochs = 3
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    loop = tqdm(train_loader, desc="Training", leave=False)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        decoder_input_ids = batch["labels"][:, :-1].clone()
        decoder_input_ids[decoder_input_ids == -100] = tokenizer.pad_token_id
        batch["decoder_input_ids"] = decoder_input_ids
        batch["labels"] = batch["labels"][:, 1:]
        print({k: v.shape for k, v in batch.items()})
        print("Max label ID:", batch["labels"].max().item())
        print("Decoder vocab size:", model.config.decoder.vocab_size)
        print("decoder_input_ids[0][:10]:", batch["decoder_input_ids"][0][:10])
        print("Any values >= vocab_size?", (batch["decoder_input_ids"] >= model.config.decoder.vocab_size).any())
        print("Any negatives?", (batch["decoder_input_ids"] < 0).any())

        outputs = model(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            decoder_input_ids=batch["decoder_input_ids"],
            labels=batch["labels"]
        )
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        loop.set_postfix(loss=loss.item())

Epoch 1


                                                 

{'input_ids': torch.Size([1, 512]), 'attention_mask': torch.Size([1, 512]), 'labels': torch.Size([1, 63]), 'decoder_input_ids': torch.Size([1, 63])}
Max label ID: 21169
Decoder vocab size: 50265
decoder_input_ids[0][:10]: tensor([    0, 21169,   300,    69,    78,  1361,  1886,     4,     2,     1])
Any values >= vocab_size? tensor(False)
Any negatives? tensor(False)




IndexError: index out of range in self

In [8]:
# Dataset wrapper
class SummarizationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx] for key, val in self.encodings.items()}
        item['labels'] = self.labels[idx]
        return item

    def __len__(self):
        return len(self.labels)

# Wrap your data
train_dataset = SummarizationDataset(encoder_inputs, labels)

# Dataloader
train_loader = DataLoader(train_dataset, batch_size=1, shuffle=True)

# Move model to device
# device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# model = model.to(device)
device = torch.device("cpu")

# Move model to CPU
model = model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Train
epochs = 3
model.train()

for epoch in range(epochs):
    print(f"Epoch {epoch+1}")
    loop = tqdm(train_loader, desc="Training", leave=False)
    for batch in loop:
        batch = {k: v.to(device) for k, v in batch.items()}
        decoder_input_ids = batch["labels"][:, :-1].clone()
        decoder_input_ids[decoder_input_ids == -100] = tokenizer.pad_token_id
        batch["decoder_input_ids"] = decoder_input_ids
        batch["labels"] = batch["labels"][:, 1:]
        encoder_outputs = model.encoder(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])


Epoch 1


                                                 

IndexError: index out of range in self

In [9]:
print("Max input ID:", batch["input_ids"].max().item())
print("Encoder vocab size:", model.config.encoder.vocab_size)
print("Any negatives?", (batch["input_ids"] < 0).any())


Max input ID: 50118
Encoder vocab size: 30522
Any negatives? tensor(False)
