In [1]:
from transformers import (
    T5Tokenizer,
    AutoTokenizer,
    T5ForConditionalGeneration,
    Seq2SeqTrainingArguments,
    Seq2SeqTrainer,
    DataCollatorForSeq2Seq
)
import torch
from datasets import load_dataset
from wasabi import msg
import random

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Read the experiment configuration (YAML) that drives every later cell
import yaml

with open('config/config_T5-L_cdr.yaml') as config_file:
    # NOTE(review): FullLoader can build more than plain scalars/lists/dicts;
    # if the config only contains plain data, yaml.safe_load would be the
    # stricter choice - confirm before switching.
    config = yaml.load(config_file, Loader=yaml.FullLoader)

In [3]:
# Load the dataset described by the `dataset_vars` section of the config
dataset = load_dataset(
    config['dataset_vars']['type'],
    data_dir=config['dataset_vars']['dir'],
    column_names=config['dataset_vars']['column_names'],
)

# Row 0 contains the column names, so start at index 1 and keep at most 500
# examples. The upper bound is clamped to the split size so a validation split
# with fewer than 501 rows no longer makes select() fail on a missing index.
validation_split = dataset['validation']
eval_dataset = validation_split.select(range(1, min(501, len(validation_split))))

In [4]:
# Gather random examples from the evaluation dataset
amount_examples_to_show = 5

# A dedicated, seeded generator keeps the selection identical across
# "Restart Kernel -> Run All" without touching the global `random` state.
example_rng = random.Random(42)

# sample() draws without replacement, so the same example is never shown twice;
# k is clamped so a dataset smaller than the requested amount does not raise.
picks = example_rng.sample(
    range(len(eval_dataset)),
    k=min(amount_examples_to_show, len(eval_dataset)),
)

random_examples = []
for pick in picks:
    row = eval_dataset[pick]  # fetch the row once instead of once per column
    random_examples.append({'Input': row['input'],
                            'Expected output': row['relations']})

In [5]:
# Load model and tokenizer
model_name = config['model_name']
device_map = {"": 0}  # reused by the later checkpoint-loading cell

# NOTE(review): trust_remote_code=True allows code shipped with the model repo
# to run locally - confirm it is actually required for this model.
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    legacy=False,
)

# We specifically use T5ForConditionalGeneration because it has a language
# modeling head
model = T5ForConditionalGeneration.from_pretrained(model_name, device_map=device_map)

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on google-t5/t5-large automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
# Model performance before training
inputs = [example["Input"] for example in random_examples]
expected_output = [example["Expected output"] for example in random_examples]

# `text` instead of `input`: the loop variable stays in the notebook namespace
# after the cell runs, and naming it `input` would shadow the builtin.
for text, expected in zip(inputs, expected_output):
    # inference - move the ids to whichever device the model was loaded on
    # instead of assuming 'cuda'
    # NOTE(review): inputs are not truncated here; the tokenizer warned about a
    # 677-token sequence vs. a 512 maximum - confirm this is intended.
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=128)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # print overview
    msg.info("Input:")
    print(text)
    msg.good("Expected output:")
    print(expected)
    msg.info("Actual output:")
    print(decoded_output, "\n\n\n")

[38;5;4mℹ Input:[0m
Nightmares and hallucinations after long-term intake of tramadol combined with antidepressants. Tramadol is a weak opioid with effects on adrenergic and serotonergic neurotransmission that is used to treat cancer pain and chronic non malignant pain. This drug was initiated in association with paroxetine and dosulepine hydrochloride in a tetraparetic patient with chronic pain. Fifty-six days after initiation of the treatment the patient presented hallucinations that only stopped after the withdrawal of psycho-active drugs and tramadol. The case report questions the long term use of pain killers combined with psycho-active drugs in chronic non malignant pain, especially if pain is under control.
[38;5;2m✔ Expected output:[0m
tramadol @CHEMICAL@ hallucinations @DISEASE@ @CID@ paroxetine @CHEMICAL@ hallucinations @DISEASE@ @CID@ dosulepine hydrochloride @CHEMICAL@ hallucinations @DISEASE@ @CID@
[38;5;4mℹ Actual output:[0m
tramadol and antidepressants.....s after l

Token indices sequence length is longer than the specified maximum sequence length for this model (677 > 512). Running this sequence through the model will result in indexing errors


[38;5;4mℹ Input:[0m
The antiarrhythmic effect and possible ionic mechanisms of pilocarpine on animal models. This study was designed to evaluate the effects of pilocarpine and explore the underlying ionic mechanism, using both aconitine-induced rat and ouabain-induced guinea pig arrhythmia models. Confocal microscopy was used to measure intracellular free-calcium concentrations ([Ca(2+)](i)) in isolated myocytes. The current data showed that pilocarpine significantly delayed onset of arrhythmias, decreased the time course of ventricular tachycardia and fibrillation, reduced arrhythmia score, and increased the survival time of arrhythmic rats and guinea pigs. [Ca(2+)](i) overload induced by aconitine or ouabain was reduced in isolated myocytes pretreated with pilocarpine. Moreover, M(3)-muscarinic acetylcholine receptor (mAChR) antagonist 4-DAMP (4-diphenylacetoxy-N-methylpiperidine-methiodide) partially abolished the beneficial effects of pilocarpine. These data suggest that pilocarp

In [7]:
# Replace `model` with the fine-tuned weights stored in the local checkpoint
# directory (local_files_only=True: nothing is fetched from the hub)
model = T5ForConditionalGeneration.from_pretrained(
    "fine_tune_results/checkpoint-1200",
    local_files_only=True,
    device_map=device_map,
)

In [8]:
# Model performance after training (same examples, fine-tuned checkpoint)
inputs = [example["Input"] for example in random_examples]
expected_output = [example["Expected output"] for example in random_examples]

# `text` instead of `input`: the loop variable stays in the notebook namespace
# after the cell runs, and naming it `input` would shadow the builtin.
for text, expected in zip(inputs, expected_output):
    # inference - move the ids to whichever device the model was loaded on
    # instead of assuming 'cuda'
    input_ids = tokenizer(text, return_tensors="pt").input_ids.to(model.device)
    output = model.generate(input_ids, max_new_tokens=128)
    decoded_output = tokenizer.decode(output[0], skip_special_tokens=True)

    # print overview
    msg.info("Input:")
    print(text)
    msg.good("Expected output:")
    print(expected)
    msg.info("Actual output:")
    print(decoded_output, "\n\n\n")

[38;5;4mℹ Input:[0m
Nightmares and hallucinations after long-term intake of tramadol combined with antidepressants. Tramadol is a weak opioid with effects on adrenergic and serotonergic neurotransmission that is used to treat cancer pain and chronic non malignant pain. This drug was initiated in association with paroxetine and dosulepine hydrochloride in a tetraparetic patient with chronic pain. Fifty-six days after initiation of the treatment the patient presented hallucinations that only stopped after the withdrawal of psycho-active drugs and tramadol. The case report questions the long term use of pain killers combined with psycho-active drugs in chronic non malignant pain, especially if pain is under control.
[38;5;2m✔ Expected output:[0m
tramadol @CHEMICAL@ hallucinations @DISEASE@ @CID@ paroxetine @CHEMICAL@ hallucinations @DISEASE@ @CID@ dosulepine hydrochloride @CHEMICAL@ hallucinations @DISEASE@ @CID@
[38;5;4mℹ Actual output:[0m
tramadol @CHEMICAL@ nightmares @DISEASE@ @

# Evaluation using scores

In [9]:
from run import *
import numpy as np
import evaluate
import re

### Setting up trainer

In [10]:
# Build the Seq2SeqTrainingArguments. Every key listed in the tuple below is
# forwarded unchanged from the YAML config under the same name; the remaining
# keyword arguments are fixed for this notebook.
training_arguments = Seq2SeqTrainingArguments(
    **{
        key: config[key]
        for key in (
            'output_dir',
            'per_device_train_batch_size',
            'gradient_accumulation_steps',
            'optim',
            'save_steps',
            'logging_steps',
            'learning_rate',
            'fp16',
            'bf16',
            'max_grad_norm',
            'max_steps',
            'warmup_ratio',
            'group_by_length',
            'lr_scheduler_type',
            'do_eval',
            'evaluation_strategy',
            'eval_steps',
        )
    },
    predict_with_generate=True,  # metrics are computed on generated sequences
    generation_max_length=152,
    save_strategy='steps',
    save_total_limit=2,
    load_best_model_at_end=True,
    remove_unused_columns=True,
)

In [11]:
# Collator used by the trainer to batch examples; label padding uses -100.
# Padding to a multiple of 8 is only requested when fp16 is enabled in the config.
data_collator = DataCollatorForSeq2Seq(
    tokenizer,
    model=model,
    pad_to_multiple_of=(8 if config['fp16'] else None),
    label_pad_token_id=-100,
)

In [12]:
def preprocess_function(examples):
    '''
    Tokenize a batch of (input text, target relations) pairs.

    Meant to be used with `dataset.map(..., batched=True)`.

    Reads these module-level names, which must be defined before the function
    is called: `dataset_vars`, `tokenizer`, `max_seq_length`, `padding`,
    `truncation` and `ignore_pad_token_for_loss`.

    Args:
        examples: mapping from column name to a list of values for the batch.

    Returns:
        The tokenizer output for the inputs, with the tokenized targets added
        under the "labels" key.
    '''
    text_column = dataset_vars['column_names'][0]
    rel_column = dataset_vars['column_names'][1]

    # Keep only the pairs where both sides are non-empty (drops None and '')
    # NOTE(review): if a pair is dropped, the returned columns are shorter than
    # the batch that was passed in; a batched map() that keeps the original
    # columns may reject that - consider filtering the dataset before mapping.
    pairs = [
        (text, relations)
        for text, relations in zip(examples[text_column], examples[rel_column])
        if text and relations
    ]
    inputs = [text for text, _ in pairs]
    targets = [relations for _, relations in pairs]

    # Tokenize the input. No `return_tensors`: plain Python lists work for any
    # padding setting (tensors require every sequence to have equal length),
    # and the data collator takes care of batching later.
    model_inputs = tokenizer(
        inputs,
        max_length=max_seq_length,
        padding=padding,
        truncation=truncation,
    )

    # Tokenize the target sequence
    labels = tokenizer(
        text_target=targets,
        max_length=max_seq_length,
        padding=padding,
        truncation=truncation,
    )

    # Replace pad tokens with -100 so they don't contribute to the loss
    if ignore_pad_token_for_loss:
        pad_id = tokenizer.pad_token_id
        labels["input_ids"] = [
            [(token if token != pad_id else -100) for token in label]
            for label in labels["input_ids"]
        ]

    # Add tokenized target text to output
    model_inputs["labels"] = labels["input_ids"]

    return model_inputs

In [13]:
def compute_metrics(eval_preds):
    '''
    Decode predictions and labels and score them.

    Relies on module-level `tokenizer`, `metric`, `postprocess_text`,
    `re_metric` and `ner_metric`.

    Args:
        eval_preds: either an object exposing `.predictions` and `.label_ids`,
            or a sequence whose first two items are (predictions, labels).

    Returns:
        dict of metric name -> value, where every score is multiplied by 100
        and rounded to 4 decimals, plus "gen_len" (mean number of non-pad
        tokens per prediction).
    '''
    # Read predictions/labels explicitly instead of `preds, labels = eval_preds`
    # so an object that yields more than two items cannot raise
    # "too many values to unpack".
    # NOTE(review): the recorded run failed with exactly that message, but the
    # traceback is not available, so this may not be the failing line.
    if hasattr(eval_preds, "predictions"):
        preds, labels = eval_preds.predictions, eval_preds.label_ids
    else:
        preds, labels = eval_preds[0], eval_preds[1]
    if isinstance(preds, tuple):
        preds = preds[0]

    # Replace -100s used for padding as we can't decode them
    preds = np.where(preds != -100, preds, tokenizer.pad_token_id)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # Some simple post-processing
    decoded_preds, decoded_labels = postprocess_text(decoded_preds, decoded_labels)

    result = metric.compute(
        predictions=decoded_preds,
        references=decoded_labels,
        rouge_types=['rouge1', 'rouge2'],
        use_stemmer=False,
    )
    result.update(re_metric(predictions=decoded_preds, references=decoded_labels))
    result.update(ner_metric(predictions=decoded_preds, references=decoded_labels, re_labels=['@CID@']))

    # Express every metric as a percentage rounded to 4 decimal places
    result = {name: round(value * 100, 4) for name, value in result.items()}

    # Mean length of the generated sequences (pad tokens excluded)
    prediction_lens = [np.count_nonzero(pred != tokenizer.pad_token_id) for pred in preds]
    result["gen_len"] = np.mean(prediction_lens)
    return result

In [14]:
# Module-level settings that preprocess_function reads when it is called
dataset_vars = config['dataset_vars']
max_seq_length = config['max_seq_length']
padding = config['padding']
truncation = config['truncation']
ignore_pad_token_for_loss = config['ignore_pad_token_for_loss']

# Tokenize the evaluation examples. The progress-bar label used to say
# "train dataset" although it is the eval dataset that is processed here.
eval_dataset = eval_dataset.map(
    preprocess_function,
    batched=True,
    desc="Running tokenizer on eval dataset"
)

In [15]:
# Load the ROUGE metric. A name assigned at the top level of a cell is already
# module-level, so compute_metrics() can read `metric` without a `global`
# statement.
metric = evaluate.load("rouge")

In [16]:
# Trainer used for evaluation only: no train_dataset is passed
trainer = Seq2SeqTrainer(
    model=model,
    args=training_arguments,
    tokenizer=tokenizer,
    data_collator=data_collator,
    eval_dataset=eval_dataset,
    compute_metrics=compute_metrics,
)

In [17]:
# Evaluate the loaded model on the eval_dataset given to the trainer; the
# cell's last expression is displayed as its output.
# NOTE(review): the recorded run of this cell ended in
# "ValueError: too many values to unpack (expected 2)"; the traceback is not
# part of the saved output, so the failing frame still has to be identified.
trainer.evaluate()

ValueError: too many values to unpack (expected 2)