In [1]:
import pandas as pd

# Read the SAMSum train and validation splits from their CSV exports.
train_df, validate_df = map(
    pd.read_csv,
    ('samsum_csv_data/train.csv', 'samsum_csv_data/validation.csv'),
)

In [4]:
from transformers import BertTokenizer, BertModel
import torch

# Pretrained uncased BERT-base: the encoder and its matching tokenizer
encoder = BertModel.from_pretrained("bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

# Demo batch: the first 200 dialogues of the training split
sample_dialogues = train_df["dialogue"].head(200).tolist()

# Pad every dialogue to the longest one in the batch, cut anything longer
# than 512 tokens, and return PyTorch tensors.
tokenized = tokenizer(
    sample_dialogues,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)

# Forward pass only, so gradient tracking is switched off
with torch.no_grad():
    encoder_outputs = encoder(
        attention_mask=tokenized["attention_mask"],
        input_ids=tokenized["input_ids"],
    )

# Per-token encoded representations: (batch_size, seq_len, hidden_dim)
print(encoder_outputs.last_hidden_state.shape)


torch.Size([200, 505, 768])


In [5]:
from transformers import BertTokenizer, BertModel, GPT2Tokenizer, GPT2LMHeadModel, EncoderDecoderModel

# Load the two pretrained halves of the sequence-to-sequence model.
encoder = BertModel.from_pretrained("bert-base-uncased")

# The GPT-2 checkpoint is decoder-only and has no cross-attention layers.
# They have to be requested at load time: handing an already-built decoder to
# EncoderDecoderModel does not create them, and the decoder could then not
# attend to the encoder's hidden states. The added cross-attention weights are
# newly initialised, so the combined model must be fine-tuned before use.
decoder = GPT2LMHeadModel.from_pretrained(
    "gpt2",
    is_decoder=True,
    add_cross_attention=True,
)

# GPT-2 ships without a padding token, so reuse EOS as the pad id.
decoder.config.pad_token_id = decoder.config.eos_token_id

# Tie it all together
model = EncoderDecoderModel(encoder=encoder, decoder=decoder)

# The wrapper keeps its own config. It needs both ids to build
# decoder_input_ids from labels (shift right, starting with the start token).
model.config.decoder_start_token_id = decoder.config.bos_token_id
model.config.pad_token_id = decoder.config.pad_token_id
