# Setup Development Environment

In [None]:
# Mount Google Drive at /drive so that the /drive/MyDrive/... paths used in the
# configuration cell below (dataset, cached splits, checkpoints) are reachable.
from google.colab import drive
drive.mount('/drive')

Mounted at /drive


In [None]:
!pip install -q --upgrade git+https://github.com/huggingface/transformers
!pip install -q --upgrade git+https://github.com/huggingface/peft.git
!pip install -q bitsandbytes accelerate datasets tensorboardX loralib

# Purpose of notebook: fine-tune LongT5 on exctracted sentences from studies, but using LoRA and bitsandbytes quantization

import os
import pickle
from pprint import pprint
import gc

import pandas as pd
from datasets import Dataset, load_dataset
from transformers import (
    LongT5ForConditionalGeneration,
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    BitsAndBytesConfig,
    DataCollatorForSeq2Seq,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType, prepare_model_for_kbit_training
import bitsandbytes as bnb
import torch
import numpy as np

# Choose the compute device in order of preference: Apple MPS, then CUDA, then CPU.
# The environment variables are set before any tensor is placed on the device.
if torch.backends.mps.is_available():
    # NOTE(review): "0.0" is understood to lift the MPS allocator's high-watermark
    # limit - confirm against the PyTorch MPS environment-variable docs.
    os.environ["PYTORCH_MPS_HIGH_WATERMARK_RATIO"] = "0.0"
    device = torch.device("mps")
    print("Using MPS device.")
elif torch.cuda.is_available():
    # Largest block size (in MB) the CUDA caching allocator may split.
    max_split_size_mb = 256
    os.environ.update(
        {
            "PYTORCH_CUDA_ALLOC_CONF": f"max_split_size_mb:{max_split_size_mb}",
            # NOTE(review): synchronous kernel launches help debugging but are
            # expected to slow training - confirm this is still wanted.
            "CUDA_LAUNCH_BLOCKING": "1",
        }
    )
    device = torch.device("cuda")
    print("Using CUDA device.")
else:
    device = torch.device("cpu")
    print("MPS/CUDA not available. Using CPU.")

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m290.1/290.1 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m23.7/23.7 MB[0m [31m62.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m823.6/823.6 kB[0m [31m60.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m14.1/14.1 MB[0m [31m91.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m731.7/731.7 MB[0m [31m1.9 MB/s[0

# Model Configuration

In [None]:
# -------- START CONFIG ----------
# Identifier passed to `from_pretrained` for both the tokenizer and the model
model_id = 'pszemraj/long-t5-tglobal-base-16384-book-summary'
# Root directory under which checkpoints, TensorBoard logs and the final adapter are written
output_dir = "/drive/MyDrive/lora3/training_history"  # Colab

# Gzip-compressed CSV read during dataset preparation (columns used: `review_id`, `summary`)
extracted_file_path = '/drive/MyDrive/lora3/biobert_extractive_only_training_dataset.csv.gz'  # Colab

# Directory holding the cached tokenized train/val datasets and the test dataset
# source_data_path = "data"
source_data_path = "/drive/MyDrive/lora3/data"  # Colab

# longT5 max token length is 16384; use half of that as the maximum tokenized input length
max_input_token_length = 8192
# max_input_token_length = 1024

# -------- END CONFIG ----------

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
# label_pad_token_id = tokenizer.pad_token_id
label_pad_token_id = -100  # special label token that gets ignored in loss calculations

train_data_path = os.path.join(source_data_path, 'train_tokenized_dataset')
val_data_path = os.path.join(source_data_path, 'val_tokenized_dataset')

if os.path.exists(train_data_path) and os.path.exists(val_data_path):
    # Both cached splits exist: reuse them instead of re-tokenizing.
    train_dataset = Dataset.load_from_disk(train_data_path)
    val_dataset = Dataset.load_from_disk(val_data_path)

else:
    ms2_dataset = load_dataset("allenai/mslr2022", "ms2", split="train")

    # Load the extractive summaries (one `summary` per `review_id`)
    df = pd.read_csv(extracted_file_path, compression='gzip')

    # # ---- if full extracted data is not available yet:
    # all_extracted_summaries = []
    # for fpath in os.listdir('../experiment_1/biobert_extractive_only_training_dataset'):
    #     all_extracted_summaries.append(
    #         pickle.load(open(os.path.join('../experiment_1/biobert_extractive_only_training_dataset', fpath), 'rb'))
    #     )
    # df = pd.DataFrame(all_extracted_summaries)
    # # ----

    target_texts = ms2_dataset['target']

    # Build a review_id -> summary lookup in a single pass instead of filtering the
    # DataFrame once per review. `setdefault` keeps the first summary seen for an id,
    # matching the previous "first matching row" behaviour.
    summary_by_review_id = {}
    for csv_review_id, csv_summary in zip(df['review_id'].tolist(), df['summary'].tolist()):
        summary_by_review_id.setdefault(csv_review_id, csv_summary)

    ms2_review_ids = [int(i) for i in ms2_dataset['review_id']]
    missing_review_ids = [rid for rid in ms2_review_ids if rid not in summary_by_review_id]
    if missing_review_ids:
        raise KeyError(
            f"{len(missing_review_ids)} review_id(s) from the MS2 train split have no extracted "
            f"summary in {extracted_file_path}; first few: {missing_review_ids[:5]}"
        )
    input_texts = [summary_by_review_id[rid] for rid in ms2_review_ids]
    dataset = Dataset.from_dict({'input_text': input_texts, 'target_text': target_texts})

    # Tokenize data
    def tokenize_function(examples):
        """Tokenize a batch of `input_text` / `target_text` pairs.

        Inputs are padded/truncated to `max_input_token_length` and targets to 256
        tokens. Padding positions in the labels are replaced with
        `label_pad_token_id` so that they are ignored in loss calculations.

        Returns the tokenizer output for the inputs with an added `labels` entry.
        """
        model_inputs = tokenizer(examples['input_text'], padding='max_length', truncation=True, max_length=max_input_token_length)
        # `text_target=` already tokenizes in target mode; the deprecated
        # `as_target_tokenizer()` context manager is not needed on top of it.
        labels = tokenizer(text_target=examples['target_text'], padding='max_length', truncation=True, max_length=256)
        model_inputs['labels'] = [
            [(token_id if token_id != tokenizer.pad_token_id else label_pad_token_id) for token_id in label_ids]
            for label_ids in labels["input_ids"]
        ]
        return model_inputs

    tokenized_datasets = dataset.map(tokenize_function, batched=True, num_proc=4, remove_columns=["input_text", "target_text"])
    print(f"Keys of tokenized dataset: {list(tokenized_datasets.features)}")

    # Split the dataset: seeded shuffle, then the first 80% for training and the rest for validation
    shuffle_dataset = tokenized_datasets.shuffle(seed=42)
    shuffle_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
    train_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10))
    val_dataset = shuffle_dataset.select(range(len(tokenized_datasets) * 8 // 10, len(tokenized_datasets)))

    # save to disk for easy loading
    train_dataset.save_to_disk(train_data_path)
    val_dataset.save_to_disk(val_data_path)

print(train_dataset["input_ids"].shape)
print(val_dataset["input_ids"].shape)
type(train_dataset["input_ids"][0])

loading file spiece.model from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/spiece.model
loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/tokenizer.json
loading file added_tokens.json from cache at None
loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/special_tokens_map.json
loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/tokenizer_config.json


torch.Size([11350, 8192])
torch.Size([2838, 8192])


torch.Tensor

In [None]:
# ANALYSIS: how many non-padding tokens does each training input contain?
all_tokens = train_dataset["input_ids"].numpy()
# Token id 0 is treated as padding here; count everything else per row.
non_pad_token_counts = (all_tokens != 0).sum(axis=1)
token_count_series = pd.Series(non_pad_token_counts)

# Summary statistics of the per-example token counts
display(token_count_series.describe())

# 95th percentile of the token counts
print("95% percentile is", np.percentile(non_pad_token_counts, 95))

# Percentile rank reached by examples that hold at most 8192 non-padding tokens
percentile_ranks = token_count_series.rank(pct=True)
rows_within_limit = np.where(non_pad_token_counts <= 8192)[0]
perc_8192 = percentile_ranks[rows_within_limit].max()
print(
    "If we truncated input_ids to 8192, this is the percentile it'll be at (anything at a higher percentile could risk losing information):",
    perc_8192
)
# Cross-check: the token count found at that percentile
print(np.percentile(non_pad_token_counts, perc_8192 * 100))

count    11350.000000
mean      3661.722291
std       2308.642882
min         71.000000
25%       1855.000000
50%       3050.000000
75%       5047.000000
max       8192.000000
dtype: float64

95% percentile is 8192.0
If we truncated input_ids to 8192, this is the percentile it'll be at (anything at a higher percentile could risk losing information): 0.9473568281938326
8192.0


In [None]:
# Model setup: load LongT5, wrap it with LoRA adapters via PEFT.
# bitsandbytes quantization is configured below but currently NOT applied
# (the `quantization_config` argument is commented out).
# Source notebooks:
# - https://colab.research.google.com/drive/1Vvju5kOyBsDr7RX_YAvp6ZsSOoSMjhKD?usp=sharing#scrollTo=E0Nl5mWL0k2T
# - https://colab.research.google.com/drive/1ge2F1QSK8Q7h0hn3YKuBCOAS0bK8E0wf?usp=sharing#scrollTo=HOWcL0LU3JYt
# More background info:
# - https://huggingface.co/blog/hf-bitsandbytes-integration

# Sub-directory of `output_dir` that receives this run's checkpoints and logs
checkpoint_path = "longt5-qlora"

# 4-bit NF4 quantization settings; unused unless `quantization_config=bnb_config`
# is re-enabled in the `from_pretrained` call below.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    # load_in_8bit=True,
)

# `base_model` is an unmodified copy kept for comparison in later cells;
# `model` is the copy that gets wrapped with LoRA and trained.
base_model = LongT5ForConditionalGeneration.from_pretrained(model_id)
model = LongT5ForConditionalGeneration.from_pretrained(
    model_id,
    # quantization_config=bnb_config,  # enable when in CUDA
    # device_map="auto",
)

# # BUG: `model` has its embeddings reinitiated. Copy over from `base_model` but retain data type
# reinited_params = ['encoder.embed_tokens.weight', 'decoder.embed_tokens.weight']
# for param_name in reinited_params:
#     model_param = model.get_parameter(param_name)
#     base_model_param = base_model.get_parameter(param_name)
#     model_param.data = (
#         base_model_param.data
#         .to(model_param.dtype)  # or, comment out to remain in 32-bit for accuracy
#         .to(device)
#     )

# use PEFT LoRA

# Rank-16 adapters on the `q` and `v` projections of layers 0-11
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    r=16,
    lora_alpha=32,
    # target_modules=["q", "v", "k"],
    target_modules=["q", "v"],
    # target_modules=["q"],
    layers_to_transform=list(range(0, 12)),  # 11 is max layer
    lora_dropout=0.05,
    bias="none",
)
# Order matters below: gradient checkpointing and input-grad hooks are enabled on
# the raw model first, then the model is wrapped, then the hooks are re-applied
# on the wrapped encoder/decoder.
model.gradient_checkpointing_enable()
# model = prepare_model_for_kbit_training(model)  # enable for 4bit or 8bit quantization
model.enable_input_require_grads()
model = get_peft_model(model, lora_config)
# Fix from this GitHub issue: https://github.com/huggingface/peft/issues/522#issuecomment-1705989330
model.base_model.model.encoder.enable_input_require_grads()
model.base_model.model.decoder.enable_input_require_grads()

# Leaves the wrapped model in training mode; cells that run inference on `model`
# before training will therefore see dropout unless they switch to eval mode.
model.train()
model.print_trainable_parameters()

# Training arguments
# Everything produced by this run (checkpoints and TensorBoard logs) lives under run_dir.
run_dir = os.path.join(output_dir, checkpoint_path)
logpath = os.path.join(run_dir, "logs")

# Step-based cadence: log and checkpoint every 100 steps, evaluate every 200.
step_schedule = dict(
    evaluation_strategy="steps",  # alternatively, "epoch"
    logging_strategy="steps",
    save_strategy="steps",
    eval_steps=200,
    logging_steps=100,
    save_steps=100,
)

# Settings for the real training run.
# For a quick debugging run use instead: num_train_epochs=1, batch sizes of 1,
# max_steps=20, eval_steps=2, logging_steps=2, save_steps=4, log_level="debug".
# For 4-bit or 8-bit quantization use instead: fp16=True, optim="paged_adamw_8bit".
optimisation = dict(
    learning_rate=1e-3,
    num_train_epochs=4,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    fp16=False,
)

training_args = Seq2SeqTrainingArguments(
    output_dir=run_dir,
    logging_dir=logpath,
    report_to="tensorboard",
    log_level="info",
    **step_schedule,
    **optimisation,
)

for message_prefix in ("Tensorboard log path:", "run this in terminal: tensorboard --logdir"):
    print(message_prefix, logpath)

# Build the trainer.
# The decoder cache is switched off on the model config before training starts.
model.config.use_cache = False

# Periodic evaluation runs on a fixed, seeded 200-example sample of the validation
# split rather than on all of it.
# (for debugging: val_dataset.select(range(10, 20)))
eval_subset = val_dataset.shuffle(seed=42).select(range(200))

# Collator with default settings; only the tokenizer is supplied.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer)

trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=eval_subset,
)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/config.json
Model config LongT5Config {
  "architectures": [
    "LongT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "early_stopping": true,
  "encoder_attention_type": "transient-global",
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "global_block_size": 16,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 0.8,
  "local_radius": 127,
  "max_length": 512,
  "min_length": 8,
  "model_type": "longt5",
  "n_positions": 4096,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_pas

trainable params: 1,769,472 || all params: 249,356,928 || trainable%: 0.7096141319161583
Tensorboard log path: /drive/MyDrive/lora3/training_history/longt5-qlora/logs
run this in terminal: tensorboard --logdir /drive/MyDrive/lora3/training_history/longt5-qlora/logs


# Model Weights

In [None]:
# Show the encoder token-embedding weights of the untouched base model,
# as a reference for the same parameter of the LoRA-wrapped model.
base_model.get_parameter("encoder.embed_tokens.weight")

Parameter containing:
tensor([[-0.5561,  0.4233,  0.8544,  ..., -0.9618,  0.6647,  0.9398],
        [ 0.4269,  1.6681,  4.5766,  ..., -2.2274, -0.5151,  2.1782],
        [-5.4195, -2.4177, -0.8740,  ..., -0.2788, -1.3139, -1.5880],
        ...,
        [ 1.5533,  0.5635,  1.6218,  ...,  1.9036,  0.7348,  0.1447],
        [ 0.2494,  0.8528, -0.6396,  ...,  0.1166, -1.1269,  0.8604],
        [ 0.8795, -0.3369, -1.7056,  ...,  0.4987,  1.2487,  0.6472]],
       requires_grad=True)

In [None]:
# Show the same embedding weights on the LoRA-wrapped model; they should match
# the base model's values (see the commented-out embedding workaround in the setup cell).
model.get_parameter("encoder.embed_tokens.weight")

Parameter containing:
tensor([[-0.5561,  0.4233,  0.8544,  ..., -0.9618,  0.6647,  0.9398],
        [ 0.4269,  1.6681,  4.5766,  ..., -2.2274, -0.5151,  2.1782],
        [-5.4195, -2.4177, -0.8740,  ..., -0.2788, -1.3139, -1.5880],
        ...,
        [ 1.5533,  0.5635,  1.6218,  ...,  1.9036,  0.7348,  0.1447],
        [ 0.2494,  0.8528, -0.6396,  ...,  0.1166, -1.1269,  0.8604],
        [ 0.8795, -0.3369, -1.7056,  ...,  0.4987,  1.2487,  0.6472]],
       device='cuda:0')

In [None]:
# LoRA "A" matrix of the query projection in the last encoder block: values, then shape.
lora_a_weight = model.get_parameter(
    "encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight"
)
for shown in (lora_a_weight, lora_a_weight.shape):
    print(shown)

Parameter containing:
tensor([[-0.0101, -0.0265, -0.0220,  ..., -0.0213,  0.0272,  0.0029],
        [ 0.0101,  0.0343, -0.0018,  ..., -0.0310, -0.0269,  0.0170],
        [-0.0231,  0.0304, -0.0076,  ..., -0.0079,  0.0180,  0.0260],
        ...,
        [-0.0268,  0.0280,  0.0079,  ..., -0.0042,  0.0326,  0.0029],
        [ 0.0194, -0.0282, -0.0063,  ..., -0.0314, -0.0312,  0.0103],
        [-0.0235,  0.0159, -0.0090,  ...,  0.0287,  0.0335, -0.0317]],
       device='cuda:0', requires_grad=True)
torch.Size([16, 768])


In [None]:
# LoRA "B" matrix of the query projection in the last encoder block: values, then shape.
lora_b_weight = model.get_parameter(
    "encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight"
)
for shown in (lora_b_weight, lora_b_weight.shape):
    print(shown)

Parameter containing:
tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], device='cuda:0', requires_grad=True)
torch.Size([768, 16])


In [None]:
# Sanity-check device placement: print where the dataset's label tensors live,
# then display (as the cell's value) the device the model is on.
print(train_dataset["labels"].device)
model.device

cpu


device(type='cuda', index=0)

In [None]:
# Baseline: summarise one training example with the untouched base model
# and print it next to the reference summary.
id_to_choose = 1
base_model = base_model.to(device)
inputs = train_dataset[id_to_choose: id_to_choose + 1]

encoder_inputs = {key: inputs[key].to(device) for key in ("input_ids", "attention_mask")}
output = base_model.generate(**encoder_inputs, max_new_tokens=128, num_beams=4)
generated_summary = tokenizer.decode(output[0], skip_special_tokens=True)

# Reference summary: the example's labels with the ignore-index padding removed
reference_labels = train_dataset[id_to_choose]['labels']
reference_summary = tokenizer.decode(
    reference_labels[reference_labels != label_pad_token_id], skip_special_tokens=True
)

pprint(generated_summary)
pprint(reference_summary)

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 512,
  "min_length": 8,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "pad_token_id": 0,
  "repetition_penalty": 3.5
}



('The aim of this study is to describe the safety and efficacy of a local '
 'analgese in patients suffering from hypertensives. All patients presented '
 'with prehypertension, stage1 hypertension, and stage2 hypertension. There '
 'were no significant differences in blood pressure between the groups except '
 'for those with elevated pulse rate. This study was performed at Shaikhzayed '
 'Medical Complex on May to December 2008. Out of these sixty patients, 10 '
 'have pre-hyptertension, 10 hadstage 1 hypertension; 10 had stage 2, '
 'hypertension where it slightly increased. Mean Pul speed increased from 3 to '
 '4 beat')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe f

In [None]:
# try inferring for a single example with the LoRA-wrapped model (before training)
id_to_choose = 1
inputs = train_dataset[id_to_choose: id_to_choose + 1]

# The wrapped model was left in training mode by the setup cell, so dropout would
# be active during generation and make the output random (and different from the
# base model even though the LoRA "B" matrices are still zero). Generate in eval
# mode and put the model back in whatever mode it was in afterwards.
was_training = trainer.model.training
trainer.model.eval()
try:
    output = trainer.model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=128, num_beams=4,
    )
finally:
    trainer.model.train(was_training)

pprint(tokenizer.decode(output[0], skip_special_tokens=True))
# Reference summary: labels with the ignore-index padding removed
reference_labels = train_dataset[id_to_choose]['labels']
pprint(tokenizer.decode(reference_labels[reference_labels != label_pad_token_id], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 512,
  "min_length": 8,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "pad_token_id": 0,
  "repetition_penalty": 3.5,
  "use_cache": false
}



('In this paper, we describe a method to assess the safety of two different '
 'types of local analization in patients with hypertension. We hypothesizes '
 'how fast blood pressure changes and pulse rate change after tooth extraction '
 'using a combination of 2 g/mL of ligonine and 1 mmoll of epitomephrine. This '
 'is sufficient information for us to conclude that these changes are not due '
 'to anti-hyperpensive drugs.')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe for cardiovascular '
 'compromised patients')


In [None]:
# Loss/logits for the chosen example when the padded label positions are removed.
# Run in eval mode and without autograd: in training mode dropout makes the loss
# noisy, so it could not be compared with the "with padding" variant, and the
# autograd graph for such a long input wastes memory in an inspection cell.
reference_labels = train_dataset[id_to_choose]['labels']
unpadded_labels = reference_labels[reference_labels != label_pad_token_id].unsqueeze(0)

was_training = model.training
model.eval()
try:
    with torch.no_grad():
        call_outputs = model(
            inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            labels=unpadded_labels.to(device),
        )
finally:
    model.train(was_training)
print("Without padding tokens")
print(call_outputs.loss)
print(call_outputs.logits)

Without padding tokens
tensor(3.9818, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[[-19.3257,  -5.1725,  -5.3487,  ..., -19.1243, -19.4162, -19.3571],
         [-27.8897,  -6.3095,  -4.4837,  ..., -27.2778, -27.9679, -27.6794],
         [-33.3910,  -6.4139, -11.6583,  ..., -32.8830, -33.3757, -33.1600],
         ...,
         [-27.2299,  -2.6808,  -6.7620,  ..., -26.6056, -27.2207, -27.2371],
         [-33.2471,  -5.8694,  -7.2000,  ..., -32.6066, -33.2561, -33.2425],
         [-28.5755,   1.7134,  -6.0601,  ..., -27.8876, -28.5938, -28.4779]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)


In [None]:
# Loss/logits for the chosen example with the full, padded label sequence
# (padded positions hold `label_pad_token_id`).
# Run in eval mode and without autograd: in training mode dropout makes the loss
# noisy, so it could not be compared with the "without padding" variant, and the
# autograd graph for such a long input wastes memory in an inspection cell.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        call_outputs = model(
            inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            labels=train_dataset[id_to_choose]['labels'].unsqueeze(0).to(device),
        )
finally:
    model.train(was_training)
print("With padding tokens in labels")
print(call_outputs.loss)
print(call_outputs.logits)

With padding tokens in labels
tensor(3.5110, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[[-17.5235,  -3.7862,  -5.2687,  ..., -17.2704, -17.4914, -17.4873],
         [-22.2564,  -4.6774,  -3.7386,  ..., -21.7536, -22.1952, -22.0019],
         [-24.6808,  -9.0822,  -8.0153,  ..., -24.3900, -24.6045, -24.5217],
         ...,
         [-19.1310,  -2.8402,  -4.2411,  ..., -18.7630, -19.0941, -19.0569],
         [-18.7427,  -2.2456,  -4.7864,  ..., -18.4570, -18.7313, -18.6435],
         [-17.9228,  -1.5371,  -4.3734,  ..., -17.5935, -17.9750, -17.8579]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)


In [None]:
# Print each parameter of the trainer's model with its requires_grad flag,
# to confirm which weights will be updated.
for param_name, weight in trainer.model.named_parameters():
    print(param_name, weight.requires_grad)

base_model.model.shared.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.base_layer.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.k.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.v.base_layer.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.v.lora_A.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.v.lora_B.default.weight True
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.o.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.relative_attention_bias.weight False
base_model.model.encoder.block.0.layer.0.TransientGlobalSelfAttention.global_relative_attention_bi

In [None]:
# Display the module tree of the model held by the trainer.
trainer.model

PeftModelForSeq2SeqLM(
  (base_model): LoraModel(
    (model): LongT5ForConditionalGeneration(
      (shared): Embedding(32128, 768)
      (encoder): LongT5Stack(
        (embed_tokens): Embedding(32128, 768)
        (block): ModuleList(
          (0): LongT5Block(
            (layer): ModuleList(
              (0): LongT5LayerTransientGlobalSelfAttention(
                (TransientGlobalSelfAttention): LongT5TransientGlobalAttention(
                  (q): lora.Linear(
                    (base_layer): Linear(in_features=768, out_features=768, bias=False)
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.05, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=768, out_features=16, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=16, out_features=768, bias=False)
                    )
   

# Train Model

In [None]:
# (If needed) Locate the most recent checkpoint of this run.
# Sets `resume_from_checkpoint` to the path of the highest-numbered
# "checkpoint-<step>" entry in the run directory, or to None when the directory
# does not exist yet or holds no such entry (previously that case raised).
run_dir = os.path.join(output_dir, checkpoint_path)

checkpoint_steps = []
if os.path.isdir(run_dir):
    for entry in os.listdir(run_dir):
        prefix, sep, step = entry.partition('-')
        # Only accept names of the exact form "checkpoint-<digits>"
        if prefix == 'checkpoint' and sep and step.isdigit():
            checkpoint_steps.append(int(step))

latest_checkpoint = max(checkpoint_steps, default=None)
if latest_checkpoint is not None:
    resume_from_checkpoint = os.path.join(run_dir, f"checkpoint-{latest_checkpoint}")
    print("Resuming from checkpoint:", resume_from_checkpoint)
else:
    resume_from_checkpoint = None
    print("No checkpoint found in", run_dir)

In [None]:
# Start training. `resume_from_checkpoint=False` is passed literally, so the
# `resume_from_checkpoint` path computed in the previous cell is NOT used here;
# pass that variable instead to continue an interrupted run.
trainer.train(resume_from_checkpoint=False)

***** Running training *****
  Num examples = 11,350
  Num Epochs = 4
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 5,676
  Number of trainable parameters = 1,769,472


Step,Training Loss,Validation Loss
200,3.0616,2.658275
400,2.9966,2.594736
600,2.9524,2.577842
800,2.979,2.566224
1000,2.8682,2.546876
1200,2.8535,2.537159
1400,2.8611,2.526041
1600,2.8134,2.522116
1800,2.8242,2.510067
2000,2.8559,2.510564


Saving model checkpoint to /drive/MyDrive/lora3/training_history/longt5-qlora/checkpoint-100
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/config.json
Model config LongT5Config {
  "architectures": [
    "LongT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "early_stopping": true,
  "encoder_attention_type": "transient-global",
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "global_block_size": 16,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 0.8,
  "local_radius": 127,
  "max_length": 512,
  "min_length": 8,
  "model_type": "longt5",
  "n_positions": 4096,
  "no_repeat_ngram_size": 3,
  "nu

Step,Training Loss,Validation Loss
200,3.0616,2.658275
400,2.9966,2.594736
600,2.9524,2.577842
800,2.979,2.566224
1000,2.8682,2.546876
1200,2.8535,2.537159
1400,2.8611,2.526041
1600,2.8134,2.522116
1800,2.8242,2.510067
2000,2.8559,2.510564


***** Running Evaluation *****
  Num examples = 200
  Batch size = 8
Saving model checkpoint to /drive/MyDrive/lora3/training_history/longt5-qlora/checkpoint-4200
loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/config.json
Model config LongT5Config {
  "architectures": [
    "LongT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "early_stopping": true,
  "encoder_attention_type": "transient-global",
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "global_block_size": 16,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 0.8,
  "local_radius": 127,
  "max_length": 512,
  "min_length": 8,
  "model_type

TrainOutput(global_step=5676, training_loss=2.7999307741322426, metrics={'train_runtime': 37266.9698, 'train_samples_per_second': 1.218, 'train_steps_per_second': 0.152, 'total_flos': 5.013768845131776e+17, 'train_loss': 2.7999307741322426, 'epoch': 4.0})

# Evaluate Model

In [None]:
# Evaluate on a custom slice of the train dataset (its first 10 examples)
# instead of the eval dataset the trainer was constructed with.
trainer.evaluate(train_dataset.select(range(0, 10)))

***** Running Evaluation *****
  Num examples = 10
  Batch size = 8


{'eval_loss': 2.4203221797943115,
 'eval_runtime': 2.2725,
 'eval_samples_per_second': 4.4,
 'eval_steps_per_second': 0.88,
 'epoch': 4.0}

In [None]:
# View results: evaluate on the eval dataset the trainer was constructed with.
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 200
  Batch size = 8


{'eval_loss': 2.4536397457122803,
 'eval_runtime': 42.7509,
 'eval_samples_per_second': 4.678,
 'eval_steps_per_second': 0.585,
 'epoch': 4.0}

In [None]:
# After training: LoRA "A" matrix of the query projection in the last encoder block
# (values, then shape), for comparison with the pre-training values.
trained_lora_a = model.get_parameter(
    "encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_A.default.weight"
)
for shown in (trained_lora_a, trained_lora_a.shape):
    print(shown)

Parameter containing:
tensor([[-0.0013,  0.0397, -0.0375,  ...,  0.0296,  0.0266, -0.0718],
        [-0.1112,  0.0887, -0.0115,  ..., -0.0763, -0.0597,  0.0898],
        [ 0.0289,  0.0019,  0.0370,  ..., -0.0213,  0.0014,  0.0523],
        ...,
        [-0.0602,  0.0623,  0.0565,  ..., -0.0050, -0.0475,  0.0181],
        [-0.1165, -0.0449, -0.0347,  ..., -0.0963, -0.0890,  0.0524],
        [ 0.0728,  0.0018,  0.0158,  ...,  0.0616,  0.0553, -0.0699]],
       device='cuda:0', requires_grad=True)
torch.Size([16, 768])


In [None]:
# After training: LoRA "B" matrix of the query projection in the last encoder block
# (values, then shape), for comparison with the pre-training values.
trained_lora_b = model.get_parameter(
    "encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight"
)
for shown in (trained_lora_b, trained_lora_b.shape):
    print(shown)

Parameter containing:
tensor([[ 0.0457,  0.0409,  0.0206,  ..., -0.0350,  0.0114,  0.0443],
        [ 0.0236,  0.0163,  0.0056,  ...,  0.0046, -0.0030,  0.0189],
        [-0.0067,  0.0025,  0.0173,  ..., -0.0040, -0.0092,  0.0102],
        ...,
        [-0.0105, -0.0284,  0.0229,  ..., -0.0418, -0.0465, -0.0166],
        [-0.0003,  0.0033,  0.0088,  ...,  0.0819,  0.0046, -0.0212],
        [ 0.0027,  0.0263, -0.0178,  ..., -0.0667, -0.0055,  0.0349]],
       device='cuda:0', requires_grad=True)
torch.Size([768, 16])


In [None]:
# try inferring for a single example with the trained model
id_to_choose = 1
inputs = train_dataset[id_to_choose: id_to_choose + 1]

# Switch to eval mode explicitly so dropout is off no matter which cells ran
# before this one, then restore the previous mode.
was_training = trainer.model.training
trainer.model.eval()
try:
    output = trainer.model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=128,
        num_beams=4,
    )
finally:
    trainer.model.train(was_training)

pprint(tokenizer.decode(output[0], skip_special_tokens=True))
# Reference summary: labels with the ignore-index padding removed
reference_labels = train_dataset[id_to_choose]['labels']
pprint(tokenizer.decode(reference_labels[reference_labels != label_pad_token_id], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 512,
  "min_length": 8,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "pad_token_id": 0,
  "repetition_penalty": 3.5,
  "use_cache": false
}



('There was no statistically significant difference in blood pressure or pulse '
 'rate between the groups. Conclusions This meta- analysis suggests that there '
 'is no evidence to support the use of local anesthesia for restorative '
 'dentistry')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe for cardiovascular '
 'compromised patients')


In [None]:
# try inferring for a single example with the trained model
# (repeat of the generation above; with dropout off the output should be the same)
id_to_choose = 1
inputs = train_dataset[id_to_choose: id_to_choose + 1]

# Switch to eval mode explicitly so dropout is off no matter which cells ran
# before this one, then restore the previous mode.
was_training = trainer.model.training
trainer.model.eval()
try:
    output = trainer.model.generate(
        input_ids=inputs["input_ids"].to(device),
        attention_mask=inputs["attention_mask"].to(device),
        max_new_tokens=128,
        num_beams=4,
    )
finally:
    trainer.model.train(was_training)

pprint(tokenizer.decode(output[0], skip_special_tokens=True))
# Reference summary: labels with the ignore-index padding removed
reference_labels = train_dataset[id_to_choose]['labels']
pprint(tokenizer.decode(reference_labels[reference_labels != label_pad_token_id], skip_special_tokens=True))

Generate config GenerationConfig {
  "decoder_start_token_id": 0,
  "early_stopping": true,
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "length_penalty": 0.8,
  "max_length": 512,
  "min_length": 8,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "pad_token_id": 0,
  "repetition_penalty": 3.5,
  "use_cache": false
}



('There was no statistically significant difference in blood pressure or pulse '
 'rate between the groups. Conclusions This meta- analysis suggests that there '
 'is no evidence to support the use of local anesthesia for restorative '
 'dentistry')
('The most frequent complications in cardiovascular compromised patients after '
 'dental local anaesthesia with a vasoconstrictor agent were disclosed in ECG '
 'arrhythmias. Most of these disclosed arrhythmias were clinical ly '
 'insignificant. The use of  4 ampules of lignocaine with epinephrine 1:100000 '
 'as a dental anaesthetic seems to be relatively safe for cardiovascular '
 'compromised patients')


In [None]:
# Loss/logits of the trained model for the chosen example, using the full padded
# label sequence. Eval mode is set explicitly (instead of relying on a previous
# cell having left the model in it) and autograd is disabled, since nothing here
# is back-propagated.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        call_outputs = model(
            inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            labels=train_dataset[id_to_choose]['labels'].unsqueeze(0).to(device),
        )
finally:
    model.train(was_training)
print("With padding tokens in labels")
print(call_outputs.loss)
print(call_outputs.logits)

With padding tokens in labels
tensor(2.3746, device='cuda:0', grad_fn=<NllLossBackward0>)
tensor([[[-14.7351,   0.4726,  -1.4425,  ..., -14.4264, -14.7627, -14.7134],
         [-17.2709,  -4.1753,  -3.2428,  ..., -16.9747, -17.3637, -17.1941],
         [-17.4424,  -4.9862,  -6.3705,  ..., -17.1304, -17.4767, -17.2945],
         ...,
         [-15.1513,   0.6501,  -1.4678,  ..., -14.8386, -15.1927, -15.1244],
         [-15.1512,   0.6500,  -1.4676,  ..., -14.8385, -15.1926, -15.1242],
         [-15.1510,   0.6498,  -1.4674,  ..., -14.8383, -15.1924, -15.1241]]],
       device='cuda:0', grad_fn=<UnsafeViewBackward0>)


In [None]:
# Persist the trained model and the tokenizer side by side in one directory
# under output_dir so they can be reloaded together.
final_save_dir = "longt5-qlora-4-epochs-final"
final_save_path = os.path.join(output_dir, final_save_dir)

trainer.model.save_pretrained(final_save_path)
# Last expression: the cell displays the tokenizer files that were written
tokenizer.save_pretrained(final_save_path)

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--pszemraj--long-t5-tglobal-base-16384-book-summary/snapshots/8988ae13e60c84ba15e894a934c4364afceedab6/config.json
Model config LongT5Config {
  "architectures": [
    "LongT5ForConditionalGeneration"
  ],
  "d_ff": 2048,
  "d_kv": 64,
  "d_model": 768,
  "decoder_start_token_id": 0,
  "dense_act_fn": "gelu_new",
  "dropout_rate": 0.1,
  "early_stopping": true,
  "encoder_attention_type": "transient-global",
  "encoder_no_repeat_ngram_size": 4,
  "eos_token_id": 1,
  "feed_forward_proj": "gated-gelu",
  "global_block_size": 16,
  "initializer_factor": 1.0,
  "is_encoder_decoder": true,
  "is_gated_act": true,
  "layer_norm_epsilon": 1e-06,
  "length_penalty": 0.8,
  "local_radius": 127,
  "max_length": 512,
  "min_length": 8,
  "model_type": "longt5",
  "n_positions": 4096,
  "no_repeat_ngram_size": 3,
  "num_beams": 2,
  "num_decoder_layers": 12,
  "num_heads": 12,
  "num_layers": 12,
  "output_pas

('/drive/MyDrive/lora3/training_history/longt5-qlora-4-epochs-final/tokenizer_config.json',
 '/drive/MyDrive/lora3/training_history/longt5-qlora-4-epochs-final/special_tokens_map.json',
 '/drive/MyDrive/lora3/training_history/longt5-qlora-4-epochs-final/spiece.model',
 '/drive/MyDrive/lora3/training_history/longt5-qlora-4-epochs-final/added_tokens.json',
 '/drive/MyDrive/lora3/training_history/longt5-qlora-4-epochs-final/tokenizer.json')

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
label_pad_token_id = -100  # special label token that gets ignored in loss calculations

test_data_path = os.path.join(source_data_path, 'test_tokenized_dataset')
if os.path.exists(test_data_path):
    test_dataset = Dataset.load_from_disk(test_data_path)
else:
    test_dataset = load_dataset("allenai/mslr2022", "ms2", split="validation")  # test set does not have target summaries
    test_dataset.save_to_disk(test_data_path)

# Load Kmeans extraction
df_kmeans_extractive_test = pd.read_csv(
    "/drive/MyDrive/lora3/data/BioBERT_K_Means_extractive.csv",
    index_col=0,
    dtype={'review_id': str, 'summary': str}
)
display(df_kmeans_extractive_test.head())

# df_kmeans_extractive_test's summary gets appended as "input_text" in test_dataset, but in the same order as test_dataset
input_text_ordered = [
    df_kmeans_extractive_test[df_kmeans_extractive_test['review_id'] == rid]['summary'].tolist()[0]
    for rid in test_dataset['review_id']
]
test_dataset = test_dataset.add_column('input_text', input_text_ordered)

# rename "target" to "target_text" to match training dataset
test_dataset = test_dataset.rename_column('target', 'target_text')

# Tokenize data
def tokenize_function(examples):
    """Tokenize a batch of examples into model inputs and loss-ready labels.

    Reads the module-level ``tokenizer``, ``max_input_token_length`` and
    ``label_pad_token_id``.

    Args:
        examples: batch mapping with an ``'input_text'`` and a ``'target_text'``
            entry (as passed by ``Dataset.map(..., batched=True)``).

    Returns:
        The tokenizer output for ``'input_text'`` (padded/truncated to
        ``max_input_token_length``) with an added ``'labels'`` entry: the token
        ids of ``'target_text'`` padded/truncated to 256 tokens, where every
        padding id is replaced by ``label_pad_token_id``.
    """
    model_inputs = tokenizer(
        examples['input_text'],
        padding='max_length',
        truncation=True,
        max_length=max_input_token_length,
    )

    # `text_target=` already tokenizes the targets as labels, so the deprecated
    # `tokenizer.as_target_tokenizer()` context manager is not needed around it.
    labels = tokenizer(
        text_target=examples['target_text'],
        padding='max_length',
        truncation=True,
        max_length=256,
    )

    # Swap padding ids for the ignore index so padded positions do not count in the loss.
    model_inputs['labels'] = [
        [(token_id if token_id != tokenizer.pad_token_id else label_pad_token_id) for token_id in label_ids]
        for label_ids in labels["input_ids"]
    ]
    return model_inputs

# Tokenize the split in batches using 4 worker processes.
tokenized_test_dataset = test_dataset.map(
    tokenize_function,
    batched=True,
    num_proc=4,
)

# Return the model-facing columns as torch tensors.
tensor_columns = ['input_ids', 'attention_mask', 'labels']
tokenized_test_dataset.set_format(type='torch', columns=tensor_columns)

dataset_keys = list(tokenized_test_dataset.features)
print(f"Keys of tokenized dataset: {dataset_keys}")

Unnamed: 0,review_id,summary
0,28514886,Breast-fed infants typically have an intestina...
1,18842808,No adverse effects were observed . The effects...
2,24297836,Autonomic cardiovascular dysfunction accompani...
3,32367221,"Abstract . Pain on kneeling , KT-1000 measured..."
4,25038833,RESULTS Results of the Name-Face Association T...


Map (num_proc=4):   0%|          | 0/2021 [00:00<?, ? examples/s]



Keys of tokenized dataset: ['review_id', 'pmid', 'title', 'abstract', 'target_text', 'background', 'input_text', 'input_ids', 'attention_mask', 'labels']


In [None]:
# Load trained model!
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

final_save_dir = "longt5-qlora-4-epochs-final"
adapter_dir = os.path.join(output_dir, final_save_dir)

# The adapter config records which base checkpoint the LoRA weights belong to.
config = PeftConfig.from_pretrained(adapter_dir)

# Load the base model explicitly from that checkpoint instead of pointing
# `from_pretrained` at the adapter directory and relying on transformers to
# resolve the base model implicitly; the adapter is then attached once below.
model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(adapter_dir)

model = PeftModel.from_pretrained(model, adapter_dir).to(device)
model.eval();

config.json:   0%|          | 0.00/1.05k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

In [None]:
# Sanity check: show one LoRA weight (query projection, encoder block 11) and its shape.
lora_param_name = "encoder.block.11.layer.0.TransientGlobalSelfAttention.q.lora_B.default.weight"
lora_param = model.get_parameter(lora_param_name)
print(lora_param)
print(lora_param.shape)

Parameter containing:
tensor([[ 0.0457,  0.0409,  0.0206,  ..., -0.0350,  0.0114,  0.0443],
        [ 0.0236,  0.0163,  0.0056,  ...,  0.0046, -0.0030,  0.0189],
        [-0.0067,  0.0025,  0.0173,  ..., -0.0040, -0.0092,  0.0102],
        ...,
        [-0.0105, -0.0284,  0.0229,  ..., -0.0418, -0.0465, -0.0166],
        [-0.0003,  0.0033,  0.0088,  ...,  0.0819,  0.0046, -0.0212],
        [ 0.0027,  0.0263, -0.0178,  ..., -0.0667, -0.0055,  0.0349]],
       device='cuda:0')
torch.Size([768, 16])


In [None]:
# Smoke test: generate a summary for one test example and compare it with the reference.
id_to_choose = 100
inputs = tokenized_test_dataset[id_to_choose: id_to_choose + 1]  # slice keeps the batch dimension

generation_kwargs = dict(max_new_tokens=128, num_beams=4)
output = model.generate(
    input_ids=inputs["input_ids"].to(device),
    attention_mask=inputs["attention_mask"].to(device),
    **generation_kwargs,
)

example_background = tokenized_test_dataset["background"][id_to_choose]
example_generated = tokenizer.decode(output[0], skip_special_tokens=True)

# Drop the ignore-index positions before decoding the reference labels.
example_label_ids = tokenized_test_dataset[id_to_choose]['labels']
example_target = tokenizer.decode(
    example_label_ids[example_label_ids != label_pad_token_id],
    skip_special_tokens=True,
)

for section_title, section_text in (
    ("BACKGROUND", example_background),
    ("GENERATED", example_generated),
    ("TARGET", example_target),
):
    print(section_title)
    pprint(section_text)



BACKGROUND
('Home-based resistance exercise is commonly used for individuals who might '
 'not have access or the ability to use traditional resistance exercise .\n'
 'However , the extent to which home-based resistance exercise can improve '
 'both strength and functional ability has not been investigated in healthy '
 'older individuals using a systematic analysis .')
GENERATED
('Conclusions This systematic review provides evidence that home-based '
 'strength training is associated with improved functional independence in '
 'older adults.')
TARGET
('Overall, home-based resistance exercise can improve both strength and '
 'functional ability, but the improvements are generally small. The intensity '
 'of the exercises might not progress sufficiently enough to produce large '
 'improvements in strength as a result of less supervision or a lack of '
 'motivation to increase the intensity further')


# Generate Test Samples

In [None]:
# Decode every generated sequence in `output` back to text in one call,
# leaving out special tokens; returns one string per sequence.
tokenizer.batch_decode(output, skip_special_tokens=True)

['Conclusions This systematic review provides evidence that home-based strength training is associated with improved functional independence in older adults.']

In [None]:
from tqdm import tqdm

In [None]:
# Now generate for all test examples, save to disk for evaluation elsewhere

def generate_and_save(dataset, save_path, batch_size=8):
    """Generate a summary for every example in ``dataset`` and save the results.

    Each summary is written to ``<save_path>/<review_id>.txt``. Examples whose
    file already exists are skipped, so an interrupted run can be resumed by
    calling the function again with the same ``save_path``. Once all batches
    are processed, the per-review files are collected into
    ``<save_path>/generated_summaries.csv``.

    Uses the module-level ``model``, ``tokenizer`` and ``device``.

    Args:
        dataset: dataset with a ``'review_id'`` column whose batches expose
            ``'input_ids'`` and ``'attention_mask'`` tensors.
        save_path: directory for the per-review text files and the CSV;
            created if it does not exist.
        batch_size: maximum number of examples passed to one ``generate`` call.

    Returns:
        DataFrame with columns ``'review_id'`` and ``'summary'``, in the same
        row order as ``dataset``.
    """
    os.makedirs(save_path, exist_ok=True)

    review_ids = list(dataset['review_id'])

    def summary_file(review_id):
        # Location of the saved summary for a single review.
        return os.path.join(save_path, f"{review_id}.txt")

    # generate
    for start in tqdm(range(0, len(dataset), batch_size)):
        stop = min(start + batch_size, len(dataset))

        # Only rows without a saved summary still need to be generated.
        rows_to_generate = [
            row for row in range(start, stop)
            if not os.path.exists(summary_file(review_ids[row]))
        ]
        if not rows_to_generate:
            continue

        inputs = dataset.select(rows_to_generate)[:]

        output = model.generate(
            input_ids=inputs["input_ids"].to(device),
            attention_mask=inputs["attention_mask"].to(device),
            max_new_tokens=128,
            num_beams=4,
        )
        generated_summaries = tokenizer.batch_decode(output, skip_special_tokens=True)

        # Save individually. Pair each summary with the row it was generated
        # from: when part of a batch was skipped, pairing with the whole
        # batch's review ids would write text under the wrong review.
        for row, summary in zip(rows_to_generate, generated_summaries):
            with open(summary_file(review_ids[row]), 'w') as f:
                f.write(summary)

    # Aggregate into a CSV by reading back exactly the files that belong to this dataset.
    all_generated_summaries_sorted = []
    for review_id in review_ids:
        with open(summary_file(review_id), 'r') as f:
            all_generated_summaries_sorted.append(f.read())

    df = pd.DataFrame({
        'review_id': review_ids,
        'summary': all_generated_summaries_sorted
    })
    df.to_csv(os.path.join(save_path, 'generated_summaries.csv'), index=False)
    print(f"Saved generated summaries to {os.path.join(save_path, 'generated_summaries.csv')}")

    return df


# Generate summaries for the whole tokenized test set, two examples per batch.
generated_summaries_dir = f"{output_dir}/generated_summaries"
df_generated = generate_and_save(
    dataset=tokenized_test_dataset,
    save_path=generated_summaries_dir,
    batch_size=2,
)

100%|██████████| 1011/1011 [55:35<00:00,  3.30s/it]


Saved generated summaries to /drive/MyDrive/lora3/training_history/generated_summaries/generated_summaries.csv


In [None]:
# Check the number of generated rows/columns, then display the generated summaries.
print(df_generated.shape)
df_generated

(2021, 2)


Unnamed: 0,review_id,summary
0,28514886,Conclusions The results of this systematic rev...
1,18842808,Conclusions The results of this meta- analysis...
2,24297836,Conclusions : This meta- analysis suggests tha...
3,32367221,There was no statistically significant differe...
4,25038833,There was no evidence of an association betwee...
...,...,...
2016,19776504,"In conclusion, the results of this systematic ..."
2017,27505198,Conclusions : There is insufficient evidence t...
2018,25251296,The results of this meta- analysis suggest tha...
2019,23235652,There was no evidence of a significant effect ...


In [None]:
# Best effort: release the Colab runtime now that generation has finished.
try:
    from google.colab import runtime
    runtime.unassign()
except ImportError:
    # `google.colab.runtime` is unavailable (e.g. not running on Colab): nothing to release.
    pass
except Exception as exc:
    # Keep this best-effort, but report the failure instead of hiding it.
    print(f"Could not release the Colab runtime: {exc}")

# ROUGE Evaluation

In [None]:
# Reference summaries: the MS2 validation split, converted to a DataFrame.
validation_dataset = load_dataset(
    path="allenai/mslr2022",
    name="ms2",
    split="validation",
)
val_df = pd.DataFrame(data=validation_dataset)

Downloading data:   0%|          | 0.00/260M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14188 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1667 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2021 [00:00<?, ? examples/s]

In [None]:
!pip install evaluate
!pip install rouge_score
import evaluate

In [None]:
rouge = evaluate.load('rouge')

# Predictions and references are paired by position, so verify that both frames
# list the same reviews in the same order before scoring.
assert (
    df_generated['review_id'].astype(str).tolist() == val_df['review_id'].astype(str).tolist()
), "df_generated and val_df are not aligned on review_id"

predictions = df_generated['summary'].tolist()
references = val_df['target'].tolist()
rouge_results = rouge.compute(predictions=predictions,
                        references=references)
print(rouge_results)

{'rouge1': 0.18267268616252466, 'rouge2': 0.02901514483366999, 'rougeL': 0.13566238304056688, 'rougeLsum': 0.15032992923345617}


# BLEURT Evaluation

In [None]:
!pip install git+https://github.com/google-research/bleurt.git
from datasets import load_metric

# Load BLEURT from datasets
bleurt = load_metric('bleurt')

In [None]:
# Predictions and references are paired by position, so verify that both frames
# list the same reviews in the same order before scoring.
assert (
    df_generated['review_id'].astype(str).tolist() == val_df['review_id'].astype(str).tolist()
), "df_generated and val_df are not aligned on review_id"

predictions = df_generated['summary'].tolist()
references = val_df['target'].tolist()
# Compute BLEURT scores (one score per prediction/reference pair)
bleurt_results = bleurt.compute(predictions=predictions, references=references)

# Print a short preview instead of every score to keep the cell output readable;
# the full list stays available in bleurt_results['scores'].
bleurt_scores = bleurt_results['scores']
print(f"bleurt_results: {len(bleurt_scores)} scores, first 5: {bleurt_scores[:5]}")
print('Avg BLEURT Score:', str(sum(bleurt_results['scores'])/len(bleurt_results['scores'])))

bleurt_results: {'scores': [-0.43906593322753906, -0.7201102375984192, -0.6414153575897217, -1.0752925872802734, -0.04353736713528633, -0.7584176063537598, -0.5362364649772644, -0.4554423987865448, -1.1173832416534424, -0.21510204672813416, -0.6734787225723267, -0.756641149520874, -0.8470857739448547, -0.6999427676200867, -0.661134660243988, -0.9691223502159119, -0.41213589906692505, -0.7942270040512085, -0.14714835584163666, -0.7971200942993164, -0.6537171006202698, -0.9342362284660339, -0.9086545705795288, -0.8355856537818909, -0.2189977616071701, -0.5718160271644592, -0.1945147067308426, -1.6465190649032593, -1.0098047256469727, -0.9226223826408386, -0.8383930325508118, -0.7827581763267517, -1.0066509246826172, -0.44402265548706055, -0.9188247323036194, -1.3416494131088257, -0.9707406163215637, -0.40680429339408875, -1.013472318649292, -0.8357240557670593, -0.5488554835319519, -0.12399326264858246, -0.8670968413352966, -0.1156061440706253, -0.5830026865005493, -0.8187044262886047, -