# Small BERT

### 0. Prepare

In [None]:
# Modify this
MODEL_NAME = 'checkpoint-15000' # model save path will be '/results/{MODEL_NAME}'
ENABLE_TRAIN = True
ENABLE_TEST = True
TRAIN_SIZE = 60000
VAL_SIZE = 6000
TEST_SIZE = 1000

In [None]:
import os
try:

    from google.colab import drive
    drive.mount('/content/gdrive')

    DRIVE_PATH = '/content/gdrive/MyDrive/BERT_scale/bert'
    DRIVE_PYTHON_PATH = DRIVE_PATH.replace('\\', '')
    if not os.path.exists(DRIVE_PYTHON_PATH):
      %mkdir $DRIVE_PATH

    ## the space in `My Drive` causes some issues,
    ## make a symlink to avoid this
    SYM_PATH = '/content/BERT_scale/bert'
    print(os.path.exists(SYM_PATH))
    if not os.path.exists(SYM_PATH):
      !ln -s $DRIVE_PATH $SYM_PATH

    running_in_colab = True

    # We already mounted in our google drive.
    # Enter the foler where you put files in:
    %cd '/content/gdrive/MyDrive/BERT_scale/bert'

    # Current working directory:
    !pwd

    # What files are there:
    !ls


except ModuleNotFoundError:
    running_in_colab = False
    print(
        "I guess you are running locally. If you get this message in Colab, check the files."
    )

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).
False
ln: failed to create symbolic link '/content/BERT_scale/bert': No such file or directory
/content/gdrive/MyDrive/BERT_scale/bert
/content/gdrive/MyDrive/BERT_scale/bert
fine-tuned-bert2bert-summarization  logs  results  wandb


In [None]:
!pip install transformers
!pip install datasets peft trl
!pip install evaluate
!pip install rouge_score



In [None]:
import torch
from transformers import (
    BertTokenizer,
    BartTokenizer,
    EncoderDecoderModel,
    Trainer,
    Seq2SeqTrainer,
    TrainingArguments,
    Seq2SeqTrainingArguments,
    DataCollatorForSeq2Seq,
    BartForConditionalGeneration
)
from datasets import load_dataset
from peft import get_peft_model, LoraConfig, TaskType, PeftModel, PeftConfig

### 1. Load Model

In [None]:
# Configure PEFT with LoRA
peft_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    # target_modules=target_modules
)

In [None]:
if ENABLE_TRAIN:
    # Load the tokenizer
    # tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base')

    # Initialize the encoder-decoder model
    # model = EncoderDecoderModel.from_encoder_decoder_pretrained(
    #     'bert-base-uncased', 'bert-base-uncased'
    # )
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

    if MODEL_NAME:
        model = PeftModel.from_pretrained(model, f'./results/{MODEL_NAME}')
        # Verify that only the LoRA parameters are trainable
        for name, param in model.named_parameters():
            if "lora" in name:
                param.requires_grad = True
            else:
                param.requires_grad = False
    else:
        model = get_peft_model(model, peft_config)

# Optionally, print trainable parameters
model.print_trainable_parameters()

In [None]:
# if ENABLE_TRAIN:
#   """ For BERT decoder only. BART don't need these settings. """
#   # Set special tokens
#   model.config.decoder_start_token_id = tokenizer.cls_token_id
#   model.config.eos_token_id = tokenizer.sep_token_id
#   model.config.pad_token_id = tokenizer.pad_token_id

#   # Important generation parameters
#   model.config.vocab_size = model.config.encoder.vocab_size
#   model.config.max_length = 128
#   model.config.min_length = 30
#   model.config.no_repeat_ngram_size = 3
#   model.config.early_stopping = True
#   model.config.length_penalty = 2.0
#   model.config.num_beams = 4

In [None]:
if ENABLE_TRAIN:
  # Calculate the number of parameters
  total_params = sum(p.numel() for p in model.parameters())

  # Print the number of parameters
  print(f"Total number of parameters: {total_params}")

Total number of parameters: 139420416


### 2. Load Fine-tuning Dataset

In [None]:
# Load the cnn_dailymail dataset
dataset = load_dataset('cnn_dailymail', '3.0.0')
if ENABLE_TRAIN:
  if TRAIN_SIZE > 0:
    dataset['train'] = dataset['train'].shuffle(seed=0).select(range(TRAIN_SIZE))
  if VAL_SIZE > 0:
    dataset['validation'] = dataset['validation'].shuffle(seed=0).select(range(VAL_SIZE))
if ENABLE_TEST and TEST_SIZE > 0:
  dataset['test'] = dataset['test'].shuffle(seed=0).select(range(TEST_SIZE))

README.md:   0%|          | 0.00/15.6k [00:00<?, ?B/s]

train-00000-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00001-of-00003.parquet:   0%|          | 0.00/257M [00:00<?, ?B/s]

train-00002-of-00003.parquet:   0%|          | 0.00/259M [00:00<?, ?B/s]

validation-00000-of-00001.parquet:   0%|          | 0.00/34.7M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/30.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/287113 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/13368 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/11490 [00:00<?, ? examples/s]

In [None]:
if ENABLE_TRAIN:
  def preprocess_function(examples):
    inputs = examples['article']
    targets = examples['highlights']
    model_inputs = tokenizer(
        inputs, max_length=512, truncation=True, padding='max_length'
    )

    # Setup the tokenizer for targets
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, max_length=128, truncation=True, padding='max_length'
        )

    model_inputs['labels'] = labels['input_ids']
    return model_inputs

In [None]:
if ENABLE_TRAIN:
  # Tokenize the dataset
  tokenized_datasets_train = dataset['train'].map(
      preprocess_function,
      batched=True,
      remove_columns=['article', 'highlights', 'id']
  )
  tokenized_datasets_val = dataset['validation'].map(
      preprocess_function,
      batched=True,
      remove_columns=['article', 'highlights', 'id']
  )

Map:   0%|          | 0/60000 [00:00<?, ? examples/s]



Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [None]:
print(f"There are {len(dataset['train'])} train data, {len(dataset['validation'])} validation data, {len(dataset['test'])} test data.")

There are 287113 train data, 13368 validation data in cnn_dailymail.


In [None]:
for k, v in val_data[0].items():
  print(f"'{k}': ", end = "")
  if type(v) == list:
    print(len(v))
  else:
    print(v)

'article': Bacon is a classic American breakfast staple that's best served hot off of the grill- or more surprisingly, a gun. Skilled marksman Dustin Ellerman of Texas decided to take a shot at making his own version of the popular food by cooking it on the end of his M16. Ellerman, a competitive shooter and director of a Christian camp called His way, is also known for winning the third season of the History Channel's shooting competition Top Shot. Scroll down for video . Skilled marksman Dustin Ellerman of Texas decided to take a shot at making his own version of the popular food by cooking it on the end of his M16 . Preparation: Ellerman wraps a piece of raw bacon on the end of an M16 that will cook as he shoots the gun . Perhaps with his bacon cooking skills he also should have been a contestant on Top Chef. A sizzle reel of his cooking bacon on a 'Gemtech TREK 5.56mm suppressor heated by firing 90ish rounds of wolf 223 on a registered pre-86 M16,' according to his YouTube, shows t

### 3. Fine-tune the Model

In [None]:
""" For BERT decoder only. BART don't need these settings. """
# num_encoder_layers = model.config.encoder.num_hidden_layers
# num_decoder_layers = model.config.decoder.num_hidden_layers

# # Generate target modules for encoder self-attention
# encoder_target_modules = []
# for i in range(num_encoder_layers):
#     encoder_target_modules.extend([
#         f'encoder.encoder.layer.{i}.attention.self.query',
#         f'encoder.encoder.layer.{i}.attention.self.key',
#         f'encoder.encoder.layer.{i}.attention.self.value',
#     ])

# # Generate target modules for decoder self-attention
# decoder_self_attn_target_modules = []
# for i in range(num_decoder_layers):
#     # decoder_self_attn_target_modules.extend([
#         f'decoder.bert.encoder.layer.{i}.attention.self.query',
#         f'decoder.bert.encoder.layer.{i}.attention.self.key',
#         f'decoder.bert.encoder.layer.{i}.attention.self.value',
#     ])

# # Generate target modules for decoder cross-attention
# decoder_cross_attn_target_modules = []
# for i in range(num_decoder_layers):
#     decoder_cross_attn_target_modules.extend([
#         f'decoder.bert.encoder.layer.{i}.crossattention.self.query',
#         f'decoder.bert.encoder.layer.{i}.crossattention.self.key',
#         f'decoder.bert.encoder.layer.{i}.crossattention.self.value',
#     ])

# # Combine all target modules
# target_modules = (
#     encoder_target_modules +
#     decoder_self_attn_target_modules +
#     decoder_cross_attn_target_modules
# )

In [None]:
if ENABLE_TRAIN:
  # data_collator = DataCollatorForSeq2Seq(
  #     tokenizer=tokenizer, model=model, padding='longest'
  # )

  # Set up training arguments
  training_args = Seq2SeqTrainingArguments(
      output_dir='./results',
      # overwrite_output_dir=True,
      num_train_epochs=2,
      per_device_train_batch_size=4,
      per_device_eval_batch_size=4,
      evaluation_strategy='epoch',
      save_strategy='epoch',
      logging_steps=1,
      save_total_limit=1,
      load_best_model_at_end=True,
      metric_for_best_model='eval_loss',
      greater_is_better=False,
  )

  # Initialize the Trainer
  trainer = Seq2SeqTrainer(
      model=model,
      args=training_args,
      train_dataset=tokenized_datasets_train,
      eval_dataset=tokenized_datasets_val,
      tokenizer=tokenizer
  )

  # Start training
  trainer.train()

  trainer = Seq2SeqTrainer(


Epoch,Training Loss,Validation Loss
1,4.3492,3.454865


Epoch,Training Loss,Validation Loss
1,4.3492,3.454865


### 4. Test: Summarization

In [None]:
if ENABLE_TEST:
  import warnings
  # Load the base model
  # base_model = EncoderDecoderModel.from_encoder_decoder_pretrained(
  #     'bert-base-uncased', 'bert-base-uncased'
  # )
  base_model = BartForConditionalGeneration.from_pretrained('facebook/bart-base')

  # Load the fine-tuned PEFT model
  if MODEL_NAME:
    fintuned_model = PeftModel.from_pretrained(base_model, f'./results/{MODEL_NAME}')
  else:
    raise Exception("`MODEL_NAME` must be specified for test mode")

  # Move model to device
  device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
  if device == 'cpu':
    warnings.warn("GPU is not available. Using CPU instead...", UserWarning)

  fintuned_model.to(device)
  fintuned_model.eval()

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): BartForConditionalGeneration(
      (model): BartModel(
        (shared): BartScaledWordEmbedding(50265, 768, padding_idx=1)
        (encoder): BartEncoder(
          (embed_tokens): BartScaledWordEmbedding(50265, 768, padding_idx=1)
          (embed_positions): BartLearnedPositionalEmbedding(1026, 768)
          (layers): ModuleList(
            (0-5): 6 x BartEncoderLayer(
              (self_attn): BartSdpaAttention(
                (k_proj): Linear(in_features=768, out_features=768, bias=True)
                (v_proj): lora.Linear(
                  (base_layer): Linear(in_features=768, out_features=768, bias=True)
                  (lora_dropout): ModuleDict(
                    (default): Dropout(p=0.1, inplace=False)
                  )
                  (lora_A): ModuleDict(
                    (default): Linear(in_features=768, out_features=8, bias=False)
                  )
                  (lora_B): ModuleDict(


In [None]:
""" For BERT decoder only. BART don't need these settings. """
# # Set special tokens
# fintuned_model.config.decoder_start_token_id = tokenizer.cls_token_id
# fintuned_model.config.eos_token_id = tokenizer.sep_token_id
# fintuned_model.config.pad_token_id = tokenizer.pad_token_id

# # Important generation parameters
# fintuned_model.config.vocab_size = fintuned_model.config.encoder.vocab_size
# fintuned_model.config.max_length = 128
# fintuned_model.config.min_length = 30
# fintuned_model.config.no_repeat_ngram_size = 3
# fintuned_model.config.early_stopping = True
# fintuned_model.config.length_penalty = 2.0
# fintuned_model.config.num_beams = 4

In [None]:
def generate_summary(article_text, eval_model=fintuned_model):
    # Define maximum lengths
    max_model_length = 512
    max_new_tokens = 150
    max_input_length = max_model_length - max_new_tokens  # 362 tokens

    # Truncate the article text to fit within the max_input_length
    article_tokens = tokenizer.tokenize(article_text)
    if len(article_tokens) > max_input_length - 50:  # Reserve space for prompt text
        article_tokens = article_tokens[:max_input_length - 50]
        article_text = tokenizer.convert_tokens_to_string(article_tokens)

    # Create a summarization prompt
    prompt = f"Summarize the following article:\n\n{article_text}\n\nSummary:"

    # Tokenize the prompt with truncation
    inputs = tokenizer(
        prompt,
        return_tensors='pt',
        truncation=True,
        max_length=max_input_length,
        padding=True
    ).to(device)

    # Generate the summary using max_new_tokens
    summary_ids = eval_model.generate(
        input_ids=inputs['input_ids'],
        attention_mask=inputs['attention_mask'],
        max_new_tokens=max_new_tokens,  # Generate up to 150 new tokens
        temperature=0.7,
        top_p=0.9,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        early_stopping=True,
        eos_token_id=tokenizer.eos_token_id,
    )

    # Decode the summary
    summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

    return summary

In [None]:
if ENABLE_TEST:
  from datasets import load_dataset

  # Load the test dataset
  test_dataset = load_dataset('cnn_dailymail', '3.0.0', split='test')

  # Select a subset for testing
  test_dataset = test_dataset.shuffle(seed=0).select(range(1000))

In [None]:
if ENABLE_TEST:
  # Lists to store the generated summaries and reference summaries
  generated_summaries = []
  reference_summaries = []

  # Iterate over the test dataset
  for example in test_dataset:
      article = example['article']
      reference = example['highlights']

      # Generate summary
      generated_summary = generate_summary(article, eval_model=fintuned_model)

      # Append to lists
      generated_summaries.append(generated_summary)
      reference_summaries.append(reference)



In [None]:
if ENABLE_TEST:
  import evaluate

  # Load the ROUGE metric
  rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [None]:
if ENABLE_TEST:
  # Compute ROUGE scores
  results = rouge.compute(
      predictions=generated_summaries,
      references=reference_summaries,
      use_stemmer=True
  )

  # Print the results
  print("ROUGE scores:")
  for key, value in results.items():
      print(f"{key}: {value:.4f}")

ROUGE scores:
rouge1: 0.3534
rouge2: 0.1500
rougeL: 0.2103
rougeLsum: 0.2828
