# Small BERT

### 1. Load Model

In [13]:
from transformers import BertTokenizer, EncoderDecoderModel, BertModel

# Load BERT tokenizer and BERT2BERT model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased')

Some weights of BertLMHeadModel were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['bert.encoder.layer.0.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.0.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.0.crossattention.output.dense.bias', 'bert.encoder.layer.0.crossattention.output.dense.weight', 'bert.encoder.layer.0.crossattention.self.key.bias', 'bert.encoder.layer.0.crossattention.self.key.weight', 'bert.encoder.layer.0.crossattention.self.query.bias', 'bert.encoder.layer.0.crossattention.self.query.weight', 'bert.encoder.layer.0.crossattention.self.value.bias', 'bert.encoder.layer.0.crossattention.self.value.weight', 'bert.encoder.layer.1.crossattention.output.LayerNorm.bias', 'bert.encoder.layer.1.crossattention.output.LayerNorm.weight', 'bert.encoder.layer.1.crossattention.output.dense.bias', 'bert.encoder.layer.1.crossattention.output.dense.weight', 'bert.encoder.layer.1.crossattention.self.key.bias', 'bert.e

In [14]:
print(model)

EncoderDecoderModel(
  (encoder): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

In [15]:
# Calculate the number of parameters
total_params = sum(p.numel() for p in model.parameters())

# Print the number of parameters
print(f"Total number of parameters: {total_params}")

Total number of parameters: 247363386


### 2. Load Fine-tuning Dataset

In [7]:
from datasets import load_dataset

# Load a summarization dataset (CNN/DailyMail)
dataset = load_dataset('cnn_dailymail', '3.0.0')

In [18]:
# Set the `eos_token` as the `pad_token`
# tokenizer.pad_token = tokenizer.eos_token  # or use `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
model.encoder.resize_token_embeddings(len(tokenizer))
model.decoder.resize_token_embeddings(len(tokenizer))

# Tokenization of dataset
def tokenize_data(example):
    inputs = tokenizer(
        example['article'],
        padding='max_length',  # Use padding here
        max_length=512,
        truncation=True,
    )
    labels = tokenizer(
        example['highlights'],
        padding='max_length',  # Use padding here
        max_length=512,
        truncation=True,
    )
    # print(f"Input Length: {len(inputs['input_ids'])}, Label Length: {len(labels['input_ids'])}")
    inputs['labels'] = labels['input_ids']
    return inputs


In [25]:
# Tokenize dataset
train_data = dataset['train'].select(range(50)).map(tokenize_data, batched=True)
val_data = dataset['validation'].select(range(10)).map(tokenize_data, batched=True)


[A
Map: 100%|██████████| 50/50 [00:00<00:00, 184.61 examples/s]

Map: 100%|██████████| 10/10 [00:00<00:00, 188.49 examples/s]


In [20]:
len(train_data[0]['labels'])

512

In [21]:
len(val_data[0]['input_ids'])

512

### 3. Fine-tune the Model

In [29]:
from transformers import DataCollatorForSeq2Seq, TrainingArguments, Trainer

# Data collator to handle padding dynamically
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
model.config.pad_token_id = tokenizer.pad_token_id
model.config.decoder_start_token_id = tokenizer.cls_token_id

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',            # Output directory for the model
    num_train_epochs=3,                # Number of training epochs
    per_device_train_batch_size=4,     # Training batch size
    per_device_eval_batch_size=4,      # Evaluation batch size
    warmup_steps=5,                  # Number of warmup steps
    weight_decay=0.01,                 # Weight decay for optimizer
    logging_dir='./logs',              # Directory for logging
    logging_steps=10,
    evaluation_strategy="epoch"        # Evaluate after every epoch
)

# Trainer initialization
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=val_data,
    data_collator=data_collator
)

# Fine-tune the model
trainer.train()

  0%|          | 0/39 [06:49<?, ?it/s]
                                               
 26%|██▌       | 10/39 [02:10<06:31, 13.50s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 4.2647058823529415e-05, 'epoch': 0.77}


 33%|███▎      | 13/39 [02:46<05:21, 12.35s/it]
[A
[A
[A

                                               
[A                                            
 33%|███▎      | 13/39 [02:48<05:21, 12.35s/it]
[A

{'eval_loss': nan, 'eval_runtime': 1.8901, 'eval_samples_per_second': 5.291, 'eval_steps_per_second': 1.587, 'epoch': 1.0}


                                               
 51%|█████▏    | 20/39 [04:23<04:14, 13.41s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 2.7941176470588236e-05, 'epoch': 1.54}


 67%|██████▋   | 26/39 [05:39<02:43, 12.58s/it]
[A
[A
[A

                                               
[A                                            
 67%|██████▋   | 26/39 [05:41<02:43, 12.58s/it]
[A

{'eval_loss': nan, 'eval_runtime': 2.0173, 'eval_samples_per_second': 4.957, 'eval_steps_per_second': 1.487, 'epoch': 2.0}


                                               
 77%|███████▋  | 30/39 [06:33<01:59, 13.28s/it]

{'loss': 0.0, 'grad_norm': nan, 'learning_rate': 1.323529411764706e-05, 'epoch': 2.31}



[A
[A
[A

                                               
[A                                            
100%|██████████| 39/39 [08:40<00:00, 12.98s/it]
                                               
100%|██████████| 39/39 [08:40<00:00, 13.36s/it]

{'eval_loss': nan, 'eval_runtime': 2.2392, 'eval_samples_per_second': 4.466, 'eval_steps_per_second': 1.34, 'epoch': 3.0}
{'train_runtime': 520.902, 'train_samples_per_second': 0.288, 'train_steps_per_second': 0.075, 'train_loss': 0.0, 'epoch': 3.0}





TrainOutput(global_step=39, training_loss=0.0, metrics={'train_runtime': 520.902, 'train_samples_per_second': 0.288, 'train_steps_per_second': 0.075, 'total_flos': 92018115072000.0, 'train_loss': 0.0, 'epoch': 3.0})

In [30]:
# Save the fine-tuned model and tokenizer
model.save_pretrained('./fine-tuned-bert2bert-summarization')
tokenizer.save_pretrained('./fine-tuned-bert2bert-summarization')

('./fine-tuned-bert2bert-summarization/tokenizer_config.json',
 './fine-tuned-bert2bert-summarization/special_tokens_map.json',
 './fine-tuned-bert2bert-summarization/vocab.txt',
 './fine-tuned-bert2bert-summarization/added_tokens.json')

### 4. Test: Summarization

In [34]:
from transformers import BertTokenizer, EncoderDecoderModel

# Load the fine-tuned model and tokenizer
fine_tuned_model = EncoderDecoderModel.from_pretrained('./fine-tuned-bert2bert-summarization')
fine_tuned_tokenizer = BertTokenizer.from_pretrained('./fine-tuned-bert2bert-summarization')

fine_tuned_model.generation_config.decoder_start_token_id = tokenizer.cls_token_id

# Generate a summary
def generate_summary(article):
    # Tokenize the input article
    inputs = fine_tuned_tokenizer.encode(
        article, 
        return_tensors='pt', 
        max_length=512, 
        truncation=True
    )
    
    # Generate the summary
    outputs = fine_tuned_model.generate(
        inputs, 
        max_length=128,  # Limit summary length
        num_beams=5,     # Beam search for better results
        early_stopping=True
    )
    
    # Decode the output to get the summary text
    summary = fine_tuned_tokenizer.decode(outputs[0], skip_special_tokens=True)
    return summary

# Test the summarization
sample_article = dataset['test'][0]['article']
summary = generate_summary(sample_article)
print("Generated Summary:")
print(summary)

Generated Summary:
[unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [unused4] [