In [1]:
!pip install rouge.score nltk py7zr
!pip install datasets
!pip install transformers[torch]
!pip install accelerate -U

Collecting rouge.score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting py7zr
  Downloading py7zr-0.21.0-py3-none-any.whl (67 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.6/67.6 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
Collecting texttable (from py7zr)
  Downloading texttable-1.7.0-py2.py3-none-any.whl (10 kB)
Collecting pycryptodomex>=3.16.0 (from py7zr)
  Downloading pycryptodomex-3.20.0-cp35-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m13.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyzstd>=0.15.9 (from py7zr)
  Downloading pyzstd-0.15.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (411 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m411.2/411.2 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyppmd<1.2.0,>=1.1.0 (from py7zr)
  Downloading

In [3]:
import numpy as np
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, Seq2SeqTrainingArguments, Seq2SeqTrainer
import transformers
from datasets import load_dataset, load_metric, load_from_disk
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

import warnings


# Settings the warnings to be ignored
warnings.filterwarnings('ignore')


In [4]:
# Load model directly

tokenizer = AutoTokenizer.from_pretrained("GanjinZero/biobart-base")
model = AutoModelForSeq2SeqLM.from_pretrained("GanjinZero/biobart-base")

tokenizer_config.json:   0%|          | 0.00/1.06k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.72k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/279M [00:00<?, ?B/s]

In [5]:

dataset = load_dataset("allenai/mslr2022", "ms2")

Downloading data:   0%|          | 0.00/260M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/48.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/39.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/46.7M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/14188 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/1667 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2021 [00:00<?, ? examples/s]

In [6]:
dataset['validation']['target'][0]

'Current evidence from systematic review and meta- analysis revealed that probiotics are the most promising intervention in reduction of the incidence of NEC in VLBW neonates .\nAs per the evidence , prebiotics modulate the composition of human intestine microflora to the benefit of the host by suppression of colonization of harmful microorganism and /or the stimulation of bifidobacterial growth , decreased stool viscosity , reduced gastrointestinal transit time , and better feed tolerance .'

In [7]:
# setting X matrix and y vector for the sample
X = 'abstract'
y = 'target'

max_input = 1000
max_target = 400
batch_size = 1

In [8]:
#sample the data
train_dataset = dataset['train']
validation_dataset = dataset['validation']

In [10]:
# deleting the dataset to save memory space
del dataset

In [11]:
# prepreprossing via a function
def preprocess_data(data_to_process):

    # get all the abstracts
    inputs = ["".join(abst) for abst in data_to_process[X]]

    #tokenize the abstracts
    model_inputs = tokenizer(inputs,  max_length=max_input, padding='max_length', truncation=True)

    #tokenize the summaries
    with tokenizer.as_target_tokenizer():
        targets = tokenizer(data_to_process[y], max_length=max_target, padding='max_length', truncation=True)

    #set labels
    model_inputs['labels'] = targets['input_ids']

    #return the tokenized data
    #input_ids, attention_mask and labels

    return model_inputs

In [12]:
#preprocessing the traing and validation datasets
train_sample = train_dataset.map(preprocess_data, batched = True, remove_columns=['review_id', 'pmid', 'title', 'abstract', 'target', 'background'])
validation_sample = validation_dataset.map(preprocess_data, batched = True, remove_columns=['review_id', 'pmid', 'title', 'abstract', 'target', 'background'])


Map:   0%|          | 0/14188 [00:00<?, ? examples/s]

Map:   0%|          | 0/2021 [00:00<?, ? examples/s]

In [13]:
#deleting train_dataset and validation dataset 
# we are not using them going forward
del train_dataset
del validation_dataset

In [14]:
# creating a data collator using model and tokenizer
collator = transformers.DataCollatorForSeq2Seq(tokenizer, model=model)


In [15]:
# using Rouge as metric
metric = load_metric('rouge')


Downloading builder script:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

In [19]:
# computing rouge
def compute_rouge(pred):
    predictions, labels = pred

    #decode the predictions
    decode_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)

    #decode labels
    decode_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    #compute results
    res = metric.compute(predictions=decode_predictions, references=decode_labels, use_stemmer=True)

    #get %
    #res = {key: value.mid.fmeasure * 100 for key, value in res.items()}
    res = {key: value.mid.fmeasure for key, value in res.items()}

    #return res

    pred_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in predictions]

    res['gen_len'] = np.mean(pred_lens)

    return {k: round(v, 4) for k, v in res.items()}

In [20]:
# setting training configurations
args = Seq2SeqTrainingArguments(
   'model_artifacts', #save directory
    evaluation_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=2,
    per_device_eval_batch_size= 2,
    gradient_accumulation_steps=2,
    weight_decay=0.01,
    save_total_limit=2,
    num_train_epochs=3,
    predict_with_generate=True,
    eval_accumulation_steps=3,
    fp16=True, #available only with CUDA
    generation_max_length = 20
    )

# initializign the trainer
trainer = Seq2SeqTrainer(
    model,
    args,
    train_dataset=train_sample,
    eval_dataset=validation_sample,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=compute_rouge
)

In [21]:
# training the model
trainer.train()


Epoch,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
1,0.4698,0.497425,0.162,0.0308,0.1269,0.1371,19.9931
2,0.4825,0.49257,0.1528,0.0281,0.1204,0.1305,19.9891
3,0.437,0.493653,0.1546,0.0288,0.1223,0.1322,19.9797


Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Checkpoint destination directory model_artifacts/checkpoint-3000 already exists and is non-empty. Saving will proceed but saved results may be invalid.
Non-default generation parameters: {'early_stopping': True, 'num_beams': 4, 'no_repeat_ngram_size': 3, 'forced_eos_token_id': 2}
Checkpoint destination directory model_artifacts/checkpoint-3500 already e

TrainOutput(global_step=10641, training_loss=0.4674667653794598, metrics={'train_runtime': 5059.3427, 'train_samples_per_second': 8.413, 'train_steps_per_second': 2.103, 'total_flos': 2.534455332864e+16, 'train_loss': 0.4674667653794598, 'epoch': 3.0})