In [None]:
# !pip install rouge.score nltk py7zr
# !pip install datasets
# !pip install transformers[torch]
# !pip install accelerate -U
# !pip install peft
# !pip install evaluate

Collecting rouge.score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr
  Downloading py7zr-0.21.0-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.15.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (411 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.2/411.2 kB[0m [31m13.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading

In [2]:
#!pip install 'jupyter-resource-usage<1.0.0'

In [16]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import transformers
from datasets import load_dataset, load_metric, load_from_disk
from peft import get_peft_config, PeftModel, PeftConfig, get_peft_model, LoraConfig, TaskType
import evaluate

In [17]:
# Load model directly
tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-base")

In [18]:
from datasets import load_dataset

dataset = load_dataset("allenai/mslr2022", "ms2")

In [19]:
dataset['validation']['target'][0]

'Current evidence from systematic review and meta- analysis revealed that probiotics are the most promising intervention in reduction of the incidence of NEC in VLBW neonates .\nAs per the evidence , prebiotics modulate the composition of human intestine microflora to the benefit of the host by suppression of colonization of harmful microorganism and /or the stimulation of bifidobacterial growth , decreased stool viscosity , reduced gastrointestinal transit time , and better feed tolerance .'

In [20]:
X = 'abstract'
y = 'target'

max_input = 1000
max_target = 400
batch_size = 1

In [21]:
#sample the data
train_dataset = dataset['train']
validation_dataset = dataset['validation']

In [23]:
del dataset

In [24]:
def preprocess_data(data_to_process):

    # get all the abstracts
    inputs = ["".join(abst) for abst in data_to_process[X]]

    #tokenize the abstracts
    model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

    #tokenize the summaries
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(data_to_process[y], max_length=max_target, padding='max_length', truncation=True)

    #set labels
    model_inputs['labels'] = targets['input_ids']

    #return the tokenized data
    #input_ids, attention_mask and labels

    return model_inputs

In [25]:
train_sample = train_dataset.map(preprocess_data, batched = True, remove_columns=['review_id', 'pmid', 'title', 'abstract', 'target', 'background'])
validation_sample = validation_dataset.map(preprocess_data, batched = True, remove_columns=['review_id', 'pmid', 'title', 'abstract', 'target', 'background'])
#test_sample = test_dataset.map(preprocess_data, batched = True, remove_columns=['review_id', 'pmid', 'title', 'abstract', 'target', 'background'])


Map:   0%|          | 0/2021 [00:00<?, ? examples/s]

In [26]:
del train_dataset
del validation_dataset

In [27]:
# lora hyperparameters
r = 8
lora_alpha = 32
lora_dropout = 0.1

In [28]:

peft_config = LoraConfig(
    task_type=TaskType.SEQ_CLS, inference_mode=False, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout
)

In [29]:
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

trainable params: 442,368 || all params: 139,862,784 || trainable%: 0.3162871404018384


In [30]:
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)


In [51]:

#metric = load_metric('rouge')
metric = evaluate.load('rouge')
#metric = evaluate.load('accuracy')

In [None]:
def compute_accuracy(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=1)
    return metric.compute(predictions=predictions, references=labels)

In [52]:
def compute_rouge(pred):
    predictions, labels = pred

    #decode the predictions
    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    #decode labels
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    #compute results
    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)

    return res
    # #get %
    # res = {key: value.mid.fmeasure * 100 for key, value in res.items()}

    # pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    # res['gen_len'] = np.mean(pred_lens)

    # return {k: round(v, 4) for k, v in res.items()}

In [53]:
l_args = Seq2SeqTrainingArguments(
   'model_artifacts', #save directory
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    fp16=True, #available only with CUDA
    generation_max_length = 20
    )

l_trainer = Seq2SeqTrainer(
    model,
    l_args,
    train_dataset=train_sample,
    eval_dataset=validation_sample,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [54]:
l_trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum
1,0.5609,0.530282,0.161836,0.029639,0.127094,0.138448
2,0.5811,0.527982,0.159302,0.027961,0.125497,0.136135
3,0.5469,0.527284,0.161149,0.028002,0.126869,0.137488


Checkpoint destination directory model_artifacts/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Checkpoint destination directory model_artifacts/checkpoint-3500 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=10641, training_loss=0.5707087194591344, metrics={'train_runtime': 3926.3459, 'train_samples_per_second': 10.841, 'train_steps_per_second': 2.71, 'total_flos': 2.5457527037952e+16, 'train_loss': 0.5707087194591344, 'epoch': 3.0})

In [None]:
from lora_plus import LoraPlusTrainingArguments, LoraPlusTrainer

model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-base")
#model_name = model_checkpoint.split("/")[-1]
#batch_size = 128

lp_args = LoraPlusTrainingArguments(
    "finetuned-loraplus-food101",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=4e-3,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=2,
    fp16=True,
    num_train_epochs=3,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    report_to="none",
    seed=0,
    overwrite_output_dir=True,
    push_to_hub=False,
    label_names=["labels"],
    lr_scheduler_type="linear",
    loraplus_lr_embedding=1e-06,
    loraplus_lr_ratio=1.25,
)

lp_trainer = LoraPlusTrainer(
    model,
    lp_args,
    train_dataset=train_sample,
    eval_dataset=validation_sample,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
  )

lp_train_results = lp_trainer.train()