# Setup Development Environment

In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive; later cells save the tokenized data
# and the trained LoRA files under drive/MyDrive/lora2.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Install the Hugging Face libraries used by this notebook.
# PEFT is pinned; it provides LoraConfig / get_peft_model /
# prepare_model_for_int8_training used in the fine-tuning section.
!pip install "peft==0.2.0"
# Earlier one-line install kept for reference (note that it pinned bitsandbytes==0.37.1):
#!pip install "transformers==4.27.1" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" "bitsandbytes==0.37.1" loralib --upgrade --quiet
# Additional dependencies needed for training and evaluation
# (tensorboard receives the training logs; rouge-score is presumably the
# backend of the ROUGE metric loaded later - TODO confirm).
!pip install rouge-score tensorboard py7zr
!pip install "transformers==4.27.1" "datasets==2.9.0" "accelerate==0.17.1" "evaluate==0.4.0" loralib --upgrade --quiet
# NOTE(review): bitsandbytes is upgraded to whatever release is current
# instead of being pinned, so a re-run may install a different version than
# the one the recorded outputs were produced with.
!pip install -U bitsandbytes

# Load and prepare the dataset

In [None]:
from datasets import load_dataset
import pandas as pd

# Pull the "ms2" configuration of allenai/mslr2022 from the Hugging Face Hub.
dataset2 = load_dataset("allenai/mslr2022", "ms2")

print(f"Train dataset size: {len(dataset2['train'])}")
print(f"Test dataset size: {len(dataset2['test'])}")

# The recorded run of this cell printed:
#   Train dataset size: 14188
#   Test dataset size: 1667

# Materialise every split as a pandas DataFrame so its columns can be edited
# before the splits are turned back into `Dataset` objects.
df2_train, df2_test, df2_val = (
    pd.DataFrame(dataset2[split_name])
    for split_name in ("train", "test", "validation")
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script:   0%|          | 0.00/7.09k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.76k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading and preparing dataset mslr2022/ms2 (download: 252.15 MiB, generated: 768.17 MiB, post-processed: Unknown size, total: 1020.33 MiB) to /root/.cache/huggingface/datasets/allenai___mslr2022/ms2/1.0.0/383847f6631ddefc5b6ed7df606b6f17078bdd51f642209158ed6e4bea951bbb...


Downloading data:   0%|          | 0.00/264M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14188 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1667 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2021 [00:00<?, ? examples/s]

Dataset mslr2022 downloaded and prepared to /root/.cache/huggingface/datasets/allenai___mslr2022/ms2/1.0.0/383847f6631ddefc5b6ed7df606b6f17078bdd51f642209158ed6e4bea951bbb. Subsequent calls will reuse this data.


  0%|          | 0/3 [00:00<?, ?it/s]

Train dataset size: 14188
Test dataset size: 1667


  table = cls._concat_blocks(blocks, axis=0)


In [None]:
from datasets import Dataset

def concat_list_strings(lst):
    """Join a sequence of strings into one space-separated string.

    Args:
        lst: An iterable of strings (e.g. the value stored in one cell of the
            'abstract' column), or a string that has already been joined.

    Returns:
        str: The items of ``lst`` separated by single spaces. A ``str`` input
        is returned unchanged.
    """
    # Iterating a str yields its characters, so joining it again would insert
    # a space between every character. Returning it untouched keeps the
    # function idempotent when the cell that applies it is re-run.
    if isinstance(lst, str):
        return lst
    return ' '.join(lst)

# For every split: collapse each 'abstract' entry into a single string, then
# rebuild the corresponding `Dataset` from the edited DataFrame.
for split_name, split_frame in (
    ("train", df2_train),
    ("test", df2_test),
    ("validation", df2_val),
):
    split_frame['abstract'] = split_frame['abstract'].apply(concat_list_strings)
    dataset2[split_name] = Dataset.from_pandas(split_frame)

In [None]:
# Display a single test record (index 1) as a quick look at the data.
dataset2['test'][1]

In [None]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

# Hub id of the checkpoint whose tokenizer is used for preprocessing
model_id = "google/flan-t5-xxl"

# Load the FLAN-T5-XXL tokenizer
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_id)

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

In [None]:
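# Column mapping (samsum-style name -> column used in this notebook):
#   dialogue -> abstract   (text fed to the model)
#   summary  -> target     (reference summary used as labels)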

In [None]:
from datasets import concatenate_datasets
import numpy as np
# Token-length statistics are gathered over the train and test splits together.
train_plus_test = concatenate_datasets([dataset2["train"], dataset2["test"]])


def _tokenize_text_column(column_name):
    """Tokenize ``column_name`` of train+test and drop the raw text columns."""
    return train_plus_test.map(
        lambda batch: tokenizer(batch[column_name], truncation=True),
        batched=True,
        remove_columns=["abstract", "target"],
    )


# Source side: the maximum input length is the 85th percentile of the
# tokenized abstract lengths. Longer sequences will be truncated to it and
# shorter ones padded up to it.
tokenized_inputs = _tokenize_text_column("abstract")
input_lenghts = [len(token_ids) for token_ids in tokenized_inputs["input_ids"]]
max_source_length = int(np.percentile(input_lenghts, 85))
print(f"Max source length: {max_source_length}")

# Target side: same idea, using the 90th percentile of the tokenized target
# lengths.
tokenized_targets = _tokenize_text_column("target")
target_lenghts = [len(token_ids) for token_ids in tokenized_targets["input_ids"]]
max_target_length = int(np.percentile(target_lenghts, 90))
print(f"Max target length: {max_target_length}")

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


  0%|          | 0/16 [00:00<?, ?ba/s]

Max source length: 512


  0%|          | 0/16 [00:00<?, ?ba/s]

Max target length: 164


In [None]:
def preprocess_function(sample,padding="max_length"):
    """Tokenize a batch of abstracts and targets into model inputs and labels.

    Args:
        sample: Batch mapping with an "abstract" list (source texts) and a
            "target" list (reference summaries).
        padding: Padding strategy forwarded to the tokenizer for both sides.

    Returns:
        The tokenizer output for the prefixed abstracts, with an additional
        "labels" entry holding the tokenized targets.
    """
    # T5 expects a task prefix in front of every source text.
    prefixed_sources = ["summarize: " + text for text in sample["abstract"]]

    model_inputs = tokenizer(prefixed_sources, max_length=max_source_length, padding=padding, truncation=True)

    # `text_target` makes the tokenizer encode the strings as targets.
    target_encoding = tokenizer(text_target=sample["target"], max_length=max_target_length, padding=padding, truncation=True)
    label_ids = target_encoding["input_ids"]

    # With fixed-length padding, swap every pad token id in the labels for
    # -100 so that padded positions are ignored by the loss.
    if padding == "max_length":
        pad_id = tokenizer.pad_token_id
        label_ids = [
            [-100 if token_id == pad_id else token_id for token_id in sequence]
            for sequence in label_ids
        ]

    model_inputs["labels"] = label_ids
    return model_inputs

# Raw text / id columns that are dropped once the examples are tokenized
raw_columns = ["abstract", "target", "review_id"]
tokenized_dataset = dataset2.map(preprocess_function, batched=True, remove_columns=raw_columns)
print(f"Keys of tokenized dataset: {list(tokenized_dataset['train'].features)}")

# Save the splits to disk for easy loading later; the tokenized *test* split
# is the one written to the "eval" folder.
for split_name, target_dir in (
    ("train", "drive/MyDrive/lora2/data/train"),
    ("test", "drive/MyDrive/lora2/data/eval"),
):
    tokenized_dataset[split_name].save_to_disk(target_dir)

  0%|          | 0/15 [00:00<?, ?ba/s]

  0%|          | 0/2 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

Keys of tokenized dataset: ['pmid', 'title', 'background', 'input_ids', 'attention_mask', 'labels']


Saving the dataset (0/1 shards):   0%|          | 0/14188 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/1667 [00:00<?, ? examples/s]

# Fine-Tune T5 with LoRA and bnb int-8

In [None]:
from transformers import AutoModelForSeq2SeqLM

# Hub id of the base checkpoint that gets fine-tuned
model_id = "philschmid/flan-t5-xxl-sharded-fp16"

# Load the checkpoint from the Hub with 8-bit weights and automatic device placement
model = AutoModelForSeq2SeqLM.from_pretrained(
    pretrained_model_name_or_path=model_id,
    device_map="auto",
    load_in_8bit=True,
)

config.json:   0%|          | 0.00/759 [00:00<?, ?B/s]



pytorch_model.bin.index.json:   0%|          | 0.00/50.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/12 [00:00<?, ?it/s]

pytorch_model-00001-of-00012.bin:   0%|          | 0.00/1.72G [00:00<?, ?B/s]

pytorch_model-00002-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00003-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00004-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00005-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00006-of-00012.bin:   0%|          | 0.00/1.97G [00:00<?, ?B/s]

pytorch_model-00007-of-00012.bin:   0%|          | 0.00/1.93G [00:00<?, ?B/s]

pytorch_model-00008-of-00012.bin:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

pytorch_model-00009-of-00012.bin:   0%|          | 0.00/2.00G [00:00<?, ?B/s]

pytorch_model-00010-of-00012.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

pytorch_model-00011-of-00012.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

pytorch_model-00012-of-00012.bin:   0%|          | 0.00/1.24G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
from peft import LoraConfig, get_peft_model, prepare_model_for_int8_training, TaskType

# Prepare the int-8 base model for training before the adapter is attached
model = prepare_model_for_int8_training(model)

# LoRA hyper-parameters: rank-16 adapters on the "q" and "v" modules
lora_config = LoraConfig(
    task_type=TaskType.SEQ_2_SEQ_LM,
    target_modules=["q", "v"],
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the model with the LoRA adapter and report how much of it is trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# The recorded run printed:
# trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817

trainable params: 18874368 || all params: 11154206720 || trainable%: 0.16921300163961817


In [None]:
from transformers import DataCollatorForSeq2Seq

# Label positions carrying this id are ignored in the loss (stands in for the
# tokenizer's pad token)
label_pad_token_id = -100

# Collator that assembles the tokenized examples into seq2seq batches
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    label_pad_token_id=label_pad_token_id,
    pad_to_multiple_of=8,
)

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments

# Everything produced by the training run lives under this Drive folder
output_dir="drive/MyDrive/lora2/lora-flan-t5-xxl"

# Training hyper-parameters
training_args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    # optimisation
    learning_rate=1e-3, # higher learning rate
    num_train_epochs=4,
    auto_find_batch_size=True,
    # logging: one entry every 500 steps, reported to TensorBoard
    logging_dir=f"{output_dir}/logs",
    logging_strategy="steps",
    logging_steps=500,
    report_to="tensorboard",
    # no intermediate checkpoints are written during training
    save_strategy="no",
)

# Trainer over the tokenized training split
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    data_collator=data_collator,
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!

In [None]:
# Run fine-tuning with the trainer built in the previous cell. The returned
# TrainOutput (global step, training loss, runtime metrics) is the cell output.
trainer.train()

Step,Training Loss
500,2.3051
1000,2.3081
1500,2.3026
2000,2.2542
2500,2.2142
3000,2.2251
3500,2.2261
4000,2.1233
4500,2.1277
5000,2.1004


Step,Training Loss
500,2.3051
1000,2.3081
1500,2.3026
2000,2.2542
2500,2.2142
3000,2.2251
3500,2.2261
4000,2.1233
4500,2.1277
5000,2.1004


TrainOutput(global_step=7096, training_loss=2.161510622353538, metrics={'train_runtime': 51027.9944, 'train_samples_per_second': 1.112, 'train_steps_per_second': 0.139, 'total_flos': 1.9244137368193597e+18, 'train_loss': 2.161510622353538, 'epoch': 4.0})

In [None]:
# Write the trained LoRA model and the tokenizer into the same Drive folder
peft_model_id="results"
adapter_dir = "drive/MyDrive/lora2/"+peft_model_id
trainer.model.save_pretrained(adapter_dir)
# To save the base model as well, call:
# trainer.model.base_model.save_pretrained(peft_model_id)
tokenizer.save_pretrained(adapter_dir)

('drive/MyDrive/lora2/results/tokenizer_config.json',
 'drive/MyDrive/lora2/results/special_tokens_map.json',
 'drive/MyDrive/lora2/results/spiece.model',
 'drive/MyDrive/lora2/results/added_tokens.json',
 'drive/MyDrive/lora2/results/tokenizer.json')

# Evaluate & run Inference with LoRA FLAN-T5

In [None]:
import torch
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Folder the fine-tuned LoRA model was saved to
peft_model_id = "results"
adapter_dir = "drive/MyDrive/lora2/"+peft_model_id

# The saved PEFT config records which base checkpoint the adapter belongs to
config = PeftConfig.from_pretrained(adapter_dir)
base_checkpoint = config.base_model_name_or_path

# Load the base LLM (8-bit, everything on device 0) and its tokenizer
model = AutoModelForSeq2SeqLM.from_pretrained(base_checkpoint,  load_in_8bit=True,  device_map={"":0})
tokenizer = AutoTokenizer.from_pretrained(base_checkpoint)

# Attach the LoRA weights and switch to evaluation mode
model = PeftModel.from_pretrained(model, adapter_dir, device_map={"":0})
model.eval()

print("Peft model loaded")



Loading checkpoint shards:   0%|          | 0/12 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Peft model loaded


In [None]:
# Number of examples in the test split
len(dataset2["test"])

1667

In [None]:
from datasets import load_dataset
from random import randrange

# Pick one random example from the test split for a qualitative check
sample = dataset2['test'][randrange(len(dataset2["test"]))]

# The training inputs were built as "summarize: " + abstract, so the same
# prefix has to be used at inference time for the prompt to match.
prompt = "summarize: " + sample["abstract"]
input_ids = tokenizer(prompt, return_tensors="pt", truncation=True).input_ids.cuda()

# Let the model generate up to the target length computed during
# preprocessing; a budget of 10 new tokens cuts every summary short.
outputs = model.generate(input_ids=input_ids, max_new_tokens=max_target_length, do_sample=True, top_p=0.9)
print(f"input sentence: {sample['abstract']}\n{'---'* 20}")

print(f"summary:\n{tokenizer.batch_decode(outputs.detach().cpu().numpy(), skip_special_tokens=True)[0]}")

input sentence: Please cite this paper as : Ijäs H , Vääräsmäki M , Morin‐Papunen L , Keravuo R , Ebeling T , Saarela T , Raudaskoski T. Metformin should be considered in the treatment of gestational diabetes : a prospect i ve r and omised study . BJOG 2011;118:880–885 AIM To compare treatment with metformin alone , metformin plus insulin and insulin alone in women with gestational diabetes ( GDM ) . METHOD A total of 150 gestational diabetic patients who fulfilled the eligibility criteria were included in this prospect i ve r and omized control open labeled study . A risk factor based screening was done followed by a GCT and then local GTT criteria from antenatal clinics . They were initially divided into two groups with odd numbers assigned to metformin treatment and even numbers to insulin treatment . Metformin and /or insulin treatment was given and target blood sugar levels aim ed at FBS ≤ 100 mg/dl and postpr and ial levels ≤ 126 mg/dl . Supplemental insulin was added to metformi

In [None]:
import evaluate
import numpy as np
from datasets import load_from_disk
from tqdm import tqdm

# ROUGE metric from the `evaluate` library; used further down to score the
# generated summaries against the reference summaries
metric = evaluate.load("rouge")

def evaluate_peft_model(sample,max_target_length=50):
    """Generate a summary for one tokenized example and decode its reference.

    Args:
        sample: Mapping with 1-D ``input_ids`` and ``labels`` tensors and,
            when available, an ``attention_mask`` tensor, as yielded by the
            torch-formatted evaluation dataset.
        max_target_length: Upper bound on the number of newly generated tokens.

    Returns:
        tuple: ``(prediction, reference)`` - both decoded to text with special
        tokens skipped.
    """
    generation_inputs = {"input_ids": sample["input_ids"].unsqueeze(0).cuda()}
    # Examples tokenized with padding="max_length" contain pad positions.
    # Pass the stored mask explicitly instead of relying on generate() to
    # reconstruct it from the pad token id.
    if "attention_mask" in sample:
        generation_inputs["attention_mask"] = sample["attention_mask"].unsqueeze(0).cuda()

    # generate summary (nucleus sampling, so results differ from run to run)
    outputs = model.generate(**generation_inputs, do_sample=True, top_p=0.9, max_new_tokens=max_target_length)
    prediction = tokenizer.decode(outputs[0].detach().cpu().numpy(), skip_special_tokens=True)

    # decode eval sample
    # Replace -100 in the labels as we can't decode them.
    labels = np.where(sample['labels'] != -100, sample['labels'], tokenizer.pad_token_id)
    labels = tokenizer.decode(labels, skip_special_tokens=True)

    return prediction, labels

# load test dataset from disk
# (for a quick smoke test, append `.select(range(100))` to evaluate a subset)
test_dataset = load_from_disk("drive/MyDrive/lora2/data/eval/").with_format("torch")
print(test_dataset)

# run predictions
# the recorded run over all 1667 examples took ~3h45m with a 50-token budget
predictions, references = [] , []
for sample in tqdm(test_dataset):
    # The labels were padded to the training target length during
    # preprocessing, so their length is the generation budget that matches the
    # references (the function's default of 50 is shorter than that).
    p,l = evaluate_peft_model(sample, max_target_length=len(sample["labels"]))
    predictions.append(p)
    references.append(l)

# Empty predictions can only score zero; surface them before computing ROUGE.
empty_predictions = sum(1 for p in predictions if not p.strip())
if empty_predictions:
    print(f"Warning: {empty_predictions} of {len(predictions)} predictions are empty")

# compute metric
rogue = metric.compute(predictions=predictions, references=references, use_stemmer=True)

# print results (":.2f" = two decimals; ":2f" only sets a minimum width)
print(f"rouge1: {rogue['rouge1']* 100:.2f}%")
print(f"rouge2: {rogue['rouge2']* 100:.2f}%")
print(f"rougeL: {rogue['rougeL']* 100:.2f}%")
print(f"rougeLsum: {rogue['rougeLsum']* 100:.2f}%")

# NOTE(review): the recorded run of this cell printed 0.000000% for all four
# scores. Inspect a few (prediction, reference) pairs before trusting any
# number reported here.

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Dataset({
    features: ['pmid', 'title', 'background', 'input_ids', 'attention_mask', 'labels'],
    num_rows: 1667
})


100%|██████████| 1667/1667 [3:44:37<00:00,  8.08s/it]


Rogue1: 0.000000%
rouge2: 0.000000%
rougeL: 0.000000%
rougeLsum: 0.000000%
