In [1]:
import os
import logging
from os.path import abspath
from pathlib import Path
from dataclasses import field
from typing import Optional
from datasets import load_dataset
import transformers
import torch
import numpy as np
from transformers import (
    T5Tokenizer,
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq,
    HfArgumentParser,
    TrainingArguments,
)
from wasabi import msg

  from .autonotebook import tqdm as notebook_tqdm


# Load Config

In [2]:
# Load the config
import yaml
with open('config/config_testing.yaml') as f:
    config = yaml.load(f, Loader=yaml.FullLoader)

In [3]:
# Pretty print the config using json as a shortcut
import json
print(json.dumps(config, sort_keys=False, indent=4))

{
    "model_name": "google/t5-v1_1-base",
    "dataset_vars": {
        "type": "csv",
        "dir": "data/cdr_seq2rel",
        "split": "train",
        "column_names": [
            "input",
            "relations"
        ]
    },
    "output_dir": "./fine_tune_results",
    "local_rank": -1,
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    "learning_rate": 0.0002,
    "max_grad_norm": 0.3,
    "weight_decay": 0.001,
    "max_seq_length": 512,
    "max_target_length": 512,
    "num_train_epochs": 0.5,
    "packing": false,
    "padding": true,
    "ignore_pad_token_for_loss": true,
    "truncation": true,
    "predict_with_generate": true,
    "gradient_checkpointing": true,
    "optim": "paged_adamw_32bit",
    "lr_scheduler_type": "constant",
    "max_steps": 12,
    "warmup_ratio": 0.03,
    "group_by_length": true,
    "save_steps": 4,
    "do_eval": true,
    "evaluation_strategy": "steps",
    "eval_steps":

# Load dataset

First we'll load the dataset. The dataset used is the [CDR dataset in seq2rel format](https://github.com/JohnGiorgi/seq2rel-ds), This dataset is slightly altered to make it easier to use with [huggingface dataset](https://huggingface.co/docs/datasets/index).

In [4]:
dataset = load_dataset(
        config['dataset_vars']['type'], 
        data_dir=config['dataset_vars']['dir'],
        column_names=config['dataset_vars']['column_names']
        )

dataset_train = dataset['train'].select(range(1,501)) # remove first row that contains column names
dataset_eval = dataset['validation'].select(range(1,501)) # remove first row that contains column names

In [5]:
# Useful function to see a random row in the dataset
import datasets
import random
import pandas as pd
from IPython.display import display, HTML

def show_random_elements(dataset, num_examples=5):
    assert num_examples <= len(dataset), "Can't pick more elements than there are in the dataset."
    picks = []
    for _ in range(num_examples):
        pick = random.randint(0, len(dataset)-1)
        while pick in picks:
            pick = random.randint(0, len(dataset)-1)
        picks.append(pick)
    
    df = pd.DataFrame(dataset[picks])
    for column, typ in dataset.features.items():
        if isinstance(typ, datasets.ClassLabel):
            df[column] = df[column].transform(lambda i: typ.names[i])
    display(HTML(df.to_html()))

In [6]:
show_random_elements(dataset_train, 1)

Unnamed: 0,input,relations
0,"Metabolic involvement in adriamycin cardiotoxicity. The cardiotoxic effects of adriamycin were studied in mammalian myocardial cells in culture as a model system. Adriamycin inhibited cell growth and the rhythmic contractions characteristic of myocardial cells in culture. A possible involvement of energy metabolism was suggested previously, and in this study the adenylate energy charge and phosphorylcreatine mole fraction were determined in the adriamycin-treated cells. The adenylate energy charge was found to be significantly decreased, while the phophorylcreatine mole fraction was unchanged. Such disparity suggests an inhibition of creatine phosphokinase. The addition of 1 mM adenosine to the myocardial cell cultures markedly increases the ATP concentration through a pathway reportedly leading to a compartmentalized ATP pool. In the adriamycin-treated cells, the addition of adenosine increased the adenylate charge and, concomitant with this inrcease, the cells' functional integrity, in terms of percentage of beating cells and rate of contractions, was maintained.",adriamycin @CHEMICAL@ cardiotoxicity ; cardiotoxic @DISEASE@ @CID@


The dataset has 2 columns, "Input" which are abstracts from various papers from pubmed and "relations" which are the relations between chemicals and diseases. The relations are in an novel format explained in the [seq2rel paper](https://aclanthology.org/2022.bionlp-1.2/). 

# Load tokenizer and model

In [7]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Mon Feb 26 23:13:00 2024       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 515.43.04    Driver Version: 515.43.04    CUDA Version: 11.7     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  NVIDIA A10          On   | 00000000:00:04.0 Off |                    0 |
|  0%   31C    P8    19W / 150W |      4MiB / 23028MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [8]:
model_name = config['model_name']
device_map = {"": 0}

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = T5ForConditionalGeneration.from_pretrained(
    model_name,
    device_map=device_map
) # we specificly use T5 for COnfitional generations because it has a language modeling head

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
generation_config.json: 100%|██████████| 147/147 [00:00<00:00, 1.29MB/s]


# Test tokenizer and model

In [9]:
# Show a single example of the dataset
single_example = dataset_train[0]
print(f"A single row in the dataset is of type {type(single_example)}")
print(single_example)

A single row in the dataset is of type <class 'dict'>
{'input': 'Naloxone reverses the antihypertensive effect of clonidine. In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M, did not influence stereoselective binding of [3H]-naloxone (8 nM), and naloxone, 10(-8) to 10(-4) M, did not influence clonidine-suppressible binding of [3H]-dihydroergocryptine (1 nM). These findings indicate that in spontaneously hypertensive rats the effects of central alpha-adrenoceptor stimulation involve activation of opiate receptors. As naloxone and clonidine do not appear to interact with the same recepto

Before going into the model the data has to be tokenized. Tokenization is a preprocess where the text is split up into tokens, these tokens are then conferted to their respective token ids using the vocab, token ids can then be converted into word vectors which are the raw input of the model. The attention mask shows which tokens should be given attention to in the attention layer in the model. A 1 means the token is given attention to in the attention layer a 0 means the token is ignored in that process. This is important when using padding for example, you don't want the padding tokens to be factored in during the attention process. 

In [10]:
column_name_input, column_name_output = config['dataset_vars']['column_names']

In [11]:
# Show what the output of the tokenizer looks like for the input task
tokenizer_output_text = tokenizer(single_example[column_name_input])
tokenizer_output_text

{'input_ids': [1823, 24938, 782, 7211, 7, 8, 1181, 13397, 49, 324, 7, 757, 1504, 13, 3, 3903, 29, 30095, 5, 86, 73, 152, 222, 88, 17, 1601, 6, 23496, 120, 6676, 324, 7, 757, 20063, 8, 6313, 16, 1717, 1666, 11, 842, 1080, 2546, 57, 6344, 162, 10529, 3, 3903, 29, 30095, 6, 305, 12, 460, 2179, 5096, 7, 87, 8711, 6, 47, 19921, 15, 26, 42, 7211, 26, 57, 3, 29, 138, 32, 9431, 6, 3, 18189, 12, 204, 5453, 87, 8711, 5, 37, 10950, 324, 7, 757, 1504, 13, 910, 5453, 87, 8711, 491, 6977, 18, 22758, 26, 32, 102, 9, 47, 92, 14610, 7211, 26, 57, 3, 29, 9, 24938, 782, 5, 1823, 24938, 782, 2238, 410, 59, 2603, 893, 1717, 1666, 42, 842, 1080, 5, 86, 2241, 13304, 7, 45, 23496, 120, 6676, 324, 7, 757, 20063, 3, 3903, 29, 30095, 6, 335, 599, 18, 13520, 12, 335, 599, 18, 9120, 283, 6, 410, 59, 2860, 16687, 7, 15, 3437, 757, 11293, 13, 784, 519, 566, 908, 18, 29, 9, 24938, 782, 13642, 3, 29, 329, 201, 11, 3, 29, 9, 24938, 782, 6, 335, 599, 18, 13520, 12, 335, 599, 18, 7256, 283, 6, 410, 59, 2860, 3, 3903, 29,

In [12]:
# Show what the output of the tokenizer looks like for the output text
with tokenizer.as_target_tokenizer():
    tokenizer_output_rel = tokenizer(single_example[column_name_output])
tokenizer_output_rel



{'input_ids': [491, 6977, 18, 22758, 26, 32, 102, 9, 3320, 13717, 329, 23936, 1741, 10950, 324, 7, 757, 3320, 308, 19056, 17892, 1741, 3320, 254, 4309, 1741, 1], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}

Why tokenize the output? During training the model tasks is given a certain input match the expected output. To match it it needs to know where it's going wrong, so for the calculation of loss it needs the input ids of the tokenized output text. The exact training algorithm is presuambly [teacher forcing](https://machinelearningmastery.com/teacher-forcing-for-recurrent-neural-networks/). (I say presuamly here because i don't really know what training algoritm the huggingface api uses, where it's documentated, defined in the code and if it differs between models. And teacher forcing was used for training the T5 model)

The T5 model has a max sequence input of 512. If an input exceed that it is truncated. This could cause the model to output the wrong relations because it is missing some information, certainly when a lot of important information output the conclusion of a paper is usally at the end of an abstract. So we'll check how many rows exceed the max sequence length:

In [13]:
tokenized_dataset = tokenizer(dataset_train[column_name_input])
count = 0
for idx, input_ids in enumerate(tokenized_dataset['input_ids']):
    if len(input_ids) > 512:
        # print(idx, len(input_ids))
        count+=1
print(f'Percentage that comes above max sequence length: {count/(idx+1)*100}')

Token indices sequence length is longer than the specified maximum sequence length for this model (619 > 512). Running this sequence through the model will result in indexing errors


Percentage that comes above max sequence length: 13.4


# Create Preprocessing function

In [14]:
ignore_pad_token_for_loss=config['ignore_pad_token_for_loss']

In [15]:
def preprocess_function(examples):
    '''
    This function takes a dataset of input and target sequences.
    meant to be used with the dataset.map() function
    '''

    column_name_input, column_name_output = config['dataset_vars']['column_names']

    # Split input and target
    inputs, targets = [], []
    for i in range(len(examples[column_name_input])):
        if examples[column_name_input][i] and examples[column_name_output][i]: # remove pairs where one is None
            inputs.append(examples[column_name_input][i])
            targets.append(examples[column_name_output][i])

    # Tokenize the input
    model_inputs = tokenizer(
        inputs, 
        max_length=config['max_seq_length'], 
        padding=config['padding'], 
        truncation=config['truncation'], 
        return_tensors='pt'
    )

    # Tokenize the target sequence
    with tokenizer.as_target_tokenizer():
        labels = tokenizer(
            targets, 
            max_length=config['max_seq_length'], 
            padding=config['padding'], 
            truncation=config['truncation'], 
            return_tensors='pt'
        )

    # Replace pad tokens with -100 so they don't contribute too the loss
    if ignore_pad_token_for_loss:
        labels["input_ids"] = [
                    [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
                ]

    # Add tokenized target text to output
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [16]:
train_dataset = dataset_train.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset"
)

eval_dataset = dataset_eval.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on train dataset"
)

Running tokenizer on train dataset: 100%|██████████| 500/500 [00:02<00:00, 199.40 examples/s]
Running tokenizer on train dataset: 100%|██████████| 500/500 [00:01<00:00, 252.47 examples/s]


In [17]:
# A look into the pre-tokenized dataset
single_example = train_dataset[0]
print(f"A single row in the dataset is of type {type(single_example)}")
print(single_example)

A single row in the dataset is of type <class 'dict'>
{'input': 'Naloxone reverses the antihypertensive effect of clonidine. In unanesthetized, spontaneously hypertensive rats the decrease in blood pressure and heart rate produced by intravenous clonidine, 5 to 20 micrograms/kg, was inhibited or reversed by nalozone, 0.2 to 2 mg/kg. The hypotensive effect of 100 mg/kg alpha-methyldopa was also partially reversed by naloxone. Naloxone alone did not affect either blood pressure or heart rate. In brain membranes from spontaneously hypertensive rats clonidine, 10(-8) to 10(-5) M, did not influence stereoselective binding of [3H]-naloxone (8 nM), and naloxone, 10(-8) to 10(-4) M, did not influence clonidine-suppressible binding of [3H]-dihydroergocryptine (1 nM). These findings indicate that in spontaneously hypertensive rats the effects of central alpha-adrenoceptor stimulation involve activation of opiate receptors. As naloxone and clonidine do not appear to interact with the same recepto

# Create evaluation strategy

In [18]:
#model.generate(single_example)

In [19]:
import evaluate

In [20]:
metric = evaluate.load("rouge")

Downloading builder script: 100%|██████████| 6.27k/6.27k [00:00<00:00, 10.7MB/s]


In [21]:
single_eval_example=[dataset_eval[0]['relations']]
print(single_eval_example)
# whats the output of metric.compute?
metric.compute(
    predictions=single_eval_example,
    references=single_eval_example,
    rouge_types=['rouge1', 'rouge2']
)


['lithium carbonate @CHEMICAL@ neurologic depression @DISEASE@ @CID@ lithium carbonate @CHEMICAL@ cyanosis @DISEASE@ @CID@ lithium carbonate @CHEMICAL@ cardiac arrhythmia @DISEASE@ @CID@']


{'rouge1': 1.0, 'rouge2': 1.0}

In [22]:
predictions = ["hello there", "general kenobi"]
references = ["hello there", "general kenobi"]
metric.compute(predictions=predictions, references=references, rouge_types=['rouge1', 'rouge2'])

{'rouge1': 1.0, 'rouge2': 1.0}

We need to define two functions:
- a postprocess function that does the post processing of a prediction
- A compute_metrics function that takes as input an prediction and output the mean score of that prediction?

the compute metrics functions alwas takes a tuple of 

In [23]:
def postprocess_text(preds, labels):
        preds = [pred.strip() for pred in preds]
        labels = [label.strip() for label in labels]

        # rougeLSum expects newline after each sentence
        # preds = ["\n".join(nltk.sent_tokenize(pred)) for pred in preds]
        # labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

        return preds, labels

In [24]:
def compute_metrics(eval_preds):
    preds, labels = eval_preds
    if isinstance(preds, tuple):
        preds = preds[0]
        
    # Replace -100s used for padding as we can't decode them
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(predictions=decoded_preds, references=decoded_labels, use_stemmer=False)
    result = {k: round(v * 100, 4) for k, v in result.items()} # rounds all metric values to 4 numvers behind the comma
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens) # mean length of the generated sequences
    return result

# Setting up trainer

In [25]:
training_arguments = Seq2SeqTrainingArguments(
        output_dir=config['output_dir'],
        per_device_train_batch_size=config['per_device_train_batch_size'],
        gradient_accumulation_steps=config['gradient_accumulation_steps'],
        optim=config['optim'],
        save_steps=config['save_steps'],
        logging_steps=config['logging_steps'],
        learning_rate=config['learning_rate'],
        fp16=config['fp16'],
        bf16=config['bf16'],
        max_grad_norm=config['max_grad_norm'],
        max_steps=config['max_steps'],
        warmup_ratio=config['warmup_ratio'],
        group_by_length=config['group_by_length'],
        lr_scheduler_type=config['lr_scheduler_type'],
        predict_with_generate=True,
        save_total_limit=2,
        save_strategy='steps',
        load_best_model_at_end=True,
        do_eval=config['do_eval'],
        evaluation_strategy=config['evaluation_strategy'],
        eval_steps=config['eval_steps']
    )

In [26]:
data_collator = DataCollatorForSeq2Seq(
        tokenizer,
        model=model,
        label_pad_token_id=-100,
        pad_to_multiple_of=8 if config['fp16'] else None,
    )

In [27]:
trainer = Seq2SeqTrainer(
        model=model,
        train_dataset=train_dataset,
        eval_dataset=eval_dataset,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        args=training_arguments,
    )

# Training

In [29]:
trainer.train()

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Rouge1,Rouge2,Rougel,Rougelsum,Gen Len
4,No log,9.560709,7.4093,1.646,7.049,7.0715,19.0




ValueError: You are trying to save a non contiguous tensor: `encoder.block.0.layer.0.SelfAttention.q.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.