# Small BERT

### 1. Load Model

In [3]:
from transformers import BertTokenizer, EncoderDecoderModel, BertModel

# Build the tokenizer and a BERT2BERT encoder-decoder model, all starting from
# the same 'bert-base-uncased' checkpoint. The decoder's cross-attention
# weights are not in that checkpoint, so they are newly initialised and only
# become useful after fine-tuning.
tokenizer = BertTokenizer.from_pretrained(
    pretrained_model_name_or_path='bert-base-uncased',
)
model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    encoder_pretrained_model_name_or_path='bert-base-uncased',
    decoder_pretrained_model_name_or_path='bert-base-uncased',
)

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [4]:
# Print the module tree of the encoder-decoder model (layer types and sizes).
print(model)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [5]:
# Add up the element count of every tensor yielded by model.parameters().
total_params = 0
for param in model.parameters():
    total_params += param.numel()

# Report the total.
print(f"Total number of parameters: {total_params}")

Total number of parameters: 247363386


### 2. Load Fine-tuning Dataset

In [6]:
from datasets import load_dataset

# Fetch configuration '3.0.0' of the CNN/DailyMail summarization dataset.
dataset = load_dataset(path='cnn_dailymail', name='3.0.0')

In [7]:
# Register '[PAD]' as the tokenizer's padding token, then resize the encoder
# and decoder embedding matrices to the tokenizer's current vocabulary size.
# NOTE(review): the bert-base-uncased vocabulary presumably already contains
# '[PAD]', in which case both steps are no-ops — confirm before removing them.
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
vocab_size = len(tokenizer)
for sub_model in (model.encoder, model.decoder):
    sub_model.resize_token_embeddings(vocab_size)

# Tokenization of dataset
def tokenize_data(example, max_input_length=512, max_target_length=512, text_tokenizer=None):
    """Tokenize articles and their highlights into model inputs and labels.

    Works on a single example (strings) or on a batch (lists of strings), as
    passed by ``Dataset.map(..., batched=True)``.

    Args:
        example: Mapping with an 'article' entry (source text) and a
            'highlights' entry (target summary).
        max_input_length: Length articles are padded/truncated to.
        max_target_length: Length highlights are padded/truncated to.
        text_tokenizer: Tokenizer to use. Defaults to the notebook-level
            ``tokenizer``.

    Returns:
        The tokenized article encoding with an added 'labels' entry holding
        the highlight token ids, where every padding position is replaced by
        -100 so that the loss ignores it.
    """
    tok = tokenizer if text_tokenizer is None else text_tokenizer

    inputs = tok(
        example['article'],
        padding='max_length',
        max_length=max_input_length,
        truncation=True,
    )
    labels = tok(
        example['highlights'],
        padding='max_length',
        max_length=max_target_length,
        truncation=True,
    )

    pad_id = tok.pad_token_id

    def _mask_padding(token_ids):
        # -100 is the index the cross-entropy loss skips; leaving the pad id in
        # place would train the decoder to emit padding tokens.
        return [-100 if token_id == pad_id else token_id for token_id in token_ids]

    label_ids = labels['input_ids']
    if label_ids and isinstance(label_ids[0], (list, tuple)):
        # Batched call: one id sequence per example.
        inputs['labels'] = [_mask_padding(sequence) for sequence in label_ids]
    else:
        # Single example: one flat id sequence.
        inputs['labels'] = _mask_padding(label_ids)
    return inputs


In [8]:
# Take the first 10,000 training and first 500 validation examples and run
# tokenize_data over each subset in batches.
train_subset = dataset['train'].select(range(10000))
val_subset = dataset['validation'].select(range(500))
train_data = train_subset.map(tokenize_data, batched=True)
val_data = val_subset.map(tokenize_data, batched=True)

Map: 100%|██████████| 50/50 [00:00<00:00, 204.71 examples/s]
Map: 100%|██████████| 10/10 [00:00<00:00, 216.30 examples/s]


In [9]:
# Length of the label sequence of the first tokenized training example.
first_train_example = train_data[0]
len(first_train_example['labels'])

512

In [10]:
# Length of the input id sequence of the first tokenized validation example.
first_val_example = val_data[0]
len(first_val_example['input_ids'])

512

### 3. Fine-tune the Model

In [11]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# Collator that assembles seq2seq batches for the Trainer.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)

# The encoder-decoder config has to be told which tokenizer ids act as the
# special tokens: [CLS] starts decoding, [SEP] ends it, [PAD] pads.
# eos_token_id was previously never set, which leaves generation without a
# stop token so it can only end at the length limit.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id

# Hyper-parameters and bookkeeping options handed to the Trainer.
training_options = {
    'output_dir': './results',          # output directory for the model
    'num_train_epochs': 3,              # number of training epochs
    'per_device_train_batch_size': 4,   # training batch size
    'per_device_eval_batch_size': 4,    # evaluation batch size
    'warmup_steps': 500,                # number of warmup steps
    'weight_decay': 0.01,               # weight decay for the optimizer
    'logging_dir': './logs',            # directory for logging
    'logging_steps': 50,
    # Evaluate after every epoch.
    # NOTE(review): newer transformers releases rename this option to
    # `eval_strategy` — confirm against the installed version.
    'evaluation_strategy': "epoch",
}
training_args = TrainingArguments(**training_options)

# Wire the model, training arguments, collator and the tokenized
# train/validation splits into a Trainer.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_data,
    eval_dataset=val_data,
)

# Run fine-tuning.
trainer.train()

  0%|          | 0/39 [00:00<?, ?it/s]We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


KeyboardInterrupt: 

In [30]:
# Write the model and the tokenizer into the same directory so both can be
# reloaded from one path.
save_dir = './fine-tuned-bert2bert-summarization'
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)

('./fine-tuned-bert2bert-summarization/tokenizer_config.json',
 './fine-tuned-bert2bert-summarization/special_tokens_map.json',
 './fine-tuned-bert2bert-summarization/vocab.txt',
 './fine-tuned-bert2bert-summarization/added_tokens.json')

### 4. Test: Summarization

In [12]:
from transformers import BertTokenizer, EncoderDecoderModel

# Reload the fine-tuned model and its tokenizer from the saved directory.
fine_tuned_model = EncoderDecoderModel.from_pretrained('./fine-tuned-bert2bert-summarization')
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert2bert-summarization')

# Take the special-token ids from the tokenizer loaded in this cell, not from
# the training-time `tokenizer` variable, so the cell does not depend on the
# training cells having run in the same kernel session.
# [CLS] starts decoding, [SEP] ends it, [PAD] pads finished beams.
fine_tuned_model.generation_config.decoder_start_token_id = fine_tuned_tokenizer.cls_token_id
fine_tuned_model.generation_config.eos_token_id = fine_tuned_tokenizer.sep_token_id
fine_tuned_model.generation_config.pad_token_id = fine_tuned_tokenizer.pad_token_id

# Generate a summary
def generate_summary(article, max_input_length=512, max_summary_length=128, num_beams=5):
    """Generate a summary of `article` with the fine-tuned BERT2BERT model.

    Args:
        article: Source text to summarize.
        max_input_length: Articles longer than this many tokens are truncated.
        max_summary_length: Upper bound on the generated sequence length.
        num_beams: Beam width used for beam search.

    Returns:
        The decoded summary string with special tokens removed.
    """
    # Calling the tokenizer (rather than .encode) also yields the attention
    # mask, which is passed to generate() explicitly below.
    encoded = fine_tuned_tokenizer(
        article,
        return_tensors='pt',
        max_length=max_input_length,
        truncation=True,
    )
    # Keep the inputs on the same device as the model.
    encoded = encoded.to(fine_tuned_model.device)

    outputs = fine_tuned_model.generate(
        encoded['input_ids'],
        attention_mask=encoded['attention_mask'],
        max_length=max_summary_length,  # limit summary length
        num_beams=num_beams,            # beam search
        early_stopping=True,
    )

    # Only the first returned sequence is decoded.
    return fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)

# Summarize the first article of the test split and show the result.
sample_article = dataset['test'][0]['article']
summary = generate_summary(sample_article)
print("Generated Summary:", summary, sep="\n")

Generated Summary:
[unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [

In [16]:
# Pull the encoder and decoder sub-configurations out of the reloaded model's
# config to inspect their input length limits.
model_config = fine_tuned_model.config
encoder_config, decoder_config = model_config.encoder, model_config.decoder

# Report the maximum number of position embeddings on each side.
print(
    f"Encoder's max position embeddings: {encoder_config.max_position_embeddings}",
    f"Decoder's max position embeddings: {decoder_config.max_position_embeddings}",
    sep="\n",
)

Encoder's max position embeddings: 512
Decoder's max position embeddings: 512


In [13]:
from datasets import load_metric

# Load the ROUGE metric. trust_remote_code=True is passed explicitly: the
# library warns that this argument becomes mandatory for loading this metric
# in its next major release.
# NOTE(review): `load_metric` is deprecated in `datasets` in favour of the
# separate `evaluate` package — plan a migration when dependencies are updated.
rouge = load_metric("rouge", trust_remote_code=True)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


In [14]:
# Accumulators for the generated summaries and their reference summaries.
predictions, references = [], []
test_split = dataset['test']

# Evaluate the first 5 samples of the test split (widen the range for a
# larger evaluation).
for i in range(5):
    sample = test_split[i]
    reference_summary = sample['highlights']

    # Summarize the article with the fine-tuned model.
    generated_summary = generate_summary(sample['article'])

    # Record the pair and echo it for inspection.
    predictions.append(generated_summary)
    references.append(reference_summary)
    print(f"Generated Summary {i+1}:", generated_summary)
    print(f"Reference Summary {i+1}:", reference_summary)
    print("predictions length", len(predictions), ", references length", len(references))

# Score all collected predictions against their references.
results = rouge.compute(predictions=predictions, references=references)

# Report the mid F-measure of each ROUGE variant.
print("ROUGE Scores:")
print(f"ROUGE-1: {results['rouge1'].mid.fmeasure:.4f}")
print(f"ROUGE-2: {results['rouge2'].mid.fmeasure:.4f}")
print(f"ROUGE-L: {results['rougeL'].mid.fmeasure:.4f}")

Generated Summary 1: [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4]