In [1]:
############################################################################
##  Simple transformers Python t5 Module - Example of medical/answer using T5 encoder/decoder model   
##  https://simpletransformers.ai/docs/usage/
##
## Training data is parsed from:
## https://med-mu.com/wp-content/uploads/2018/06/medsouls.blogspot.com-1000-Questions-and-Answers-from-Kumar-_-Clark_s-Clinical-Medicine-2e-Saunders-2011.pdf
##
## Author: Chris Meaney
## Date: June 2021 
############################################################################

In [2]:
## Delete check-pointed files (else disk fills up) 
## Note: these are BASH commands (not python anymore)
!rm -rf outputs/check*

In [3]:
##########################
## Dependency modules
##########################

## Pandas for data wrangling
import pandas as pd

## matplotlib plotting
%matplotlib inline
import matplotlib.pyplot as plt
plt.style.use("ggplot")

## Numpy for numerics
import numpy as np
np.random.seed(0)

## sklearn for train/val/test split
from sklearn.model_selection import train_test_split

## Simple Transformers
!pip install --quiet tokenizers
!pip install --quiet transformers -U
!pip install --quiet simpletransformers
from scipy.special import softmax
from simpletransformers.t5 import T5Model, T5Args

## For system info
!pip install --quiet sinfo
from sinfo import sinfo

[K     |████████████████████████████████| 3.3MB 12.0MB/s 
[K     |████████████████████████████████| 2.3MB 12.3MB/s 
[K     |████████████████████████████████| 901kB 37.7MB/s 
[K     |████████████████████████████████| 225kB 11.8MB/s 
[K     |████████████████████████████████| 51kB 6.4MB/s 
[K     |████████████████████████████████| 1.8MB 20.6MB/s 
[K     |████████████████████████████████| 8.2MB 36.9MB/s 
[K     |████████████████████████████████| 235kB 42.1MB/s 
[K     |████████████████████████████████| 1.2MB 32.7MB/s 
[K     |████████████████████████████████| 122kB 42.3MB/s 
[K     |████████████████████████████████| 81kB 9.6MB/s 
[K     |████████████████████████████████| 174kB 45.7MB/s 
[K     |████████████████████████████████| 102kB 11.4MB/s 
[K     |████████████████████████████████| 133kB 44.7MB/s 
[K     |████████████████████████████████| 4.2MB 31.0MB/s 
[K     |████████████████████████████████| 112kB 45.4MB/s 
[K     |████████████████████████████████| 81kB 9.7MB/s 
[K

In [4]:
## Options for printing more rows/columns in Jupyter Notebook
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 100)

In [5]:

##########################################################
## Use pandas to import data, and store as data.frame
##########################################################

## Read in data from Google Drive account (this will force mount step, authentication step, etc.)
## https://stackoverflow.com/questions/48340341/how-to-read-csv-to-dataframe-in-google-colab

from google.colab import drive 
drive.mount('/content/gdrive', force_remount=True)

import pandas as pd 
dat = pd.read_csv('gdrive/My Drive/ColabData/KumarClark_2011_1000ClinicalQuestionsAnswered_Parsed.csv', encoding='latin1')
dat.head(n=15)

Mounted at /content/gdrive


Unnamed: 0,questions,answers
0,"QUESTION1 regarding medical ethics, if a man i...",ANSWER1 no. the doctor can only give confident...
1,QUESTION2 is it unlawful in most countries to ...,"ANSWER2 in all healthcare systems, rationing h..."
2,QUESTION3 what is meant by qalys? is there a d...,ANSWER3 qalys are quality adjusted life years....
3,QUESTION4 are âdo not resuscitateâ orders ...,ANSWER4 it is accepted in most countries that ...
4,QUESTION5 what is a living will?,ANSWER5 this is a written advanced directive m...
5,"QUESTION8 as a junior doctor, i have to attend...",ANSWER8 your patients must always be informed ...
6,QUESTION9 is the role of the advocate in a med...,"ANSWER9 both! however, an advocate represents ..."
7,QUESTION10 we are always asked by our seniors ...,ANSWER10 the law in the uk is clear: touching ...
8,"QUESTION1 as cells grow and regenerate, what m...",ANSWER1 cells are continually dying by a proce...
9,QUESTION2 i cannot find out why some of the au...,"ANSWER2 by definition, the genes responsible f..."


In [6]:
## Remove QUESTION/ANSWER labels from header of text strings
dat['questions'] = dat.questions.replace(to_replace='^QUESTION[0-9]+ ', value='', regex=True)
dat['answers'] = dat.answers.replace(to_replace='^ANSWER[0-9]+ ', value='', regex=True)
## Add prefix column
dat['prefix'] = 'ask_question'
## Clean up column names
dat.columns = ['input_text','target_text','prefix']
## Re-order column order
col_order = ['prefix','input_text','target_text']
dat = dat[col_order]
## Print first couple rows of new dataset
dat.head(n=15)

Unnamed: 0,prefix,input_text,target_text
0,ask_question,"regarding medical ethics, if a man is discover...",no. the doctor can only give confidential info...
1,ask_question,is it unlawful in most countries to limit medi...,"in all healthcare systems, rationing has becom..."
2,ask_question,what is meant by qalys? is there a difference ...,qalys are quality adjusted life years. these w...
3,ask_question,are âdo not resuscitateâ orders illegal in...,it is accepted in most countries that âfutil...
4,ask_question,what is a living will?,this is a written advanced directive made by c...
5,ask_question,"as a junior doctor, i have to attend many mult...",your patients must always be informed about th...
6,ask_question,is the role of the advocate in a medical inter...,"both! however, an advocate represents the valu..."
7,ask_question,we are always asked by our seniors to make sur...,the law in the uk is clear: touching a patient...
8,ask_question,"as cells grow and regenerate, what mechanism d...",cells are continually dying by a process of ap...
9,ask_question,i cannot find out why some of the autosomal do...,"by definition, the genes responsible for autos..."


In [7]:
## Break into train/evaluation datasets
train_data, eval_data = train_test_split(dat, test_size=0.2)
[train_data.shape, eval_data.shape]

[(688, 3), (172, 3)]

In [8]:
## T5 model configuration arguments
model_args = T5Args()

## Set T5 model arguments
model_args.reprocess_input_data=True
model_args.overwrite_output_dir=True
model_args.max_seq_length=128
model_args.eval_batch_size=16
model_args.num_train_epochs=10
model_args.save_eval_checkpoints=False
model_args.use_multiprocessing=True
model_args.silent=True
model_args.num_beams=None
model_args.do_sample=True
model_args.max_length=50
model_args.top_k=50
model_args.top_p=0.95
model_args.num_return_sequences=3

In [9]:
## Can print list of all tunable model hyper-parameters to console
help(model_args)

Help on T5Args in module simpletransformers.config.model_args object:

class T5Args(ModelArgs)
 |  T5Args(adafactor_beta1: float = None, adafactor_clip_threshold: float = 1.0, adafactor_decay_rate: float = -0.8, adafactor_eps: tuple = <factory>, adafactor_relative_step: bool = False, adafactor_scale_parameter: bool = False, adafactor_warmup_init: bool = False, adam_epsilon: float = 1e-08, best_model_dir: str = 'outputs/best_model', cache_dir: str = 'cache_dir/', config: dict = <factory>, cosine_schedule_num_cycles: float = 0.5, custom_layer_parameters: list = <factory>, custom_parameter_groups: list = <factory>, dataloader_num_workers: int = 0, do_lower_case: bool = False, dynamic_quantize: bool = False, early_stopping_consider_epochs: bool = False, early_stopping_delta: float = 0, early_stopping_metric: str = 'eval_loss', early_stopping_metric_minimize: bool = True, early_stopping_patience: int = 3, encoding: str = None, eval_batch_size: int = 8, evaluate_during_training: bool = False

In [10]:
## Create a simple transformers T5Model
model = T5Model("t5", 
                  "t5-base", 
                  args=model_args,
                  use_cuda=True)

Downloading:   0%|          | 0.00/1.20k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

In [11]:
## Print additional information about T5 model to console
help(model)

Help on T5Model in module simpletransformers.t5.t5_model object:

class T5Model(builtins.object)
 |  T5Model(model_type, model_name, args=None, tokenizer=None, use_cuda=True, cuda_device=-1, **kwargs)
 |  
 |  Methods defined here:
 |  
 |  __init__(self, model_type, model_name, args=None, tokenizer=None, use_cuda=True, cuda_device=-1, **kwargs)
 |      Initializes a T5Model model.
 |      
 |      Args:
 |          model_type: The type of model (t5, mt5)
 |          model_name: The exact architecture and trained weights to use. This may be a Hugging Face Transformers compatible pre-trained model, a community model, or the path to a directory containing model files.
 |          args (optional): Default args will be used if this parameter is not provided. If provided, it should be a dict containing the args that should be changed in the default args.
 |          use_cuda (optional): Use GPU if available. Setting to False will force model to use CPU only.
 |          cuda_device (optiona

In [12]:
# Train the simple transformers NER model
import time
t0 = time.time()
model.train_model(train_data, eval_data=eval_data)
t1 = time.time()
runtime = t1 - t0
runtime



Using Adafactor for T5


	add_(Number alpha, Tensor other)
Consider using one of the following signatures instead:
	add_(Tensor other, *, Number alpha) (Triggered internally at  /pytorch/torch/csrc/utils/python_arg_parser.cpp:1005.)
  exp_avg_sq_row.mul_(beta2t).add_(1.0 - beta2t, update.mean(dim=-1))


1854.3823835849762

In [13]:
## Evaluate the model
result = model.eval_model(eval_data)



In [14]:
## Result is evaluation metric
result

{'eval_loss': 3.9549167806451972}

In [42]:
## Make predictions with the model
## Fun question indices may include: 0, 27,
question = eval_data['input_text'].iloc[52]
answer = eval_data['target_text'].iloc[52]

In [43]:
## Print the question
question

'1. how effective is renal duplex in detecting renal artery stenosis? 2. is magnetic resonance angiography superior to renal duplex in detecting renal artery stenosis?'

In [44]:
## Print the true answer (from Kumar/Collins)
answer

'1. duplex scanning compared to arteriography is over 90% sensitive and specific. 2. yes, and this is now best practice for the diagnosis.'

In [45]:
## Make predictions given input
preds = model.predict(question)
preds



[['1. how effective is this method? it has been used in 1 trial (winning both the domestic and international standards) in 1 trial. 2. how effective is it? ratings are 3.4.',
  '1. how effective is the treatment? 1. effectiveness of ace inhibitors is assessed after x-ray. use a statin. it often helps, although sometimes not helpful. 2. ineffective: use a statin.',
  '1. you are right in saying that the placebo works and that the placebo doesnât matter. 1. how effective the placebo is shown is unclear.'],
 ['renal duplex is a complex plexus containing sodium chloride ii, potassium ii, sodium ii or calcium duplex.',
  'renal duplex consists of a renal component and a renal duplex occurs when there is renal deplexing.',
  'renal duplex is a complex renal infarction that consists of a renal component consisting of calcium channel dihydrochloride (dh), pyruvate diuretic and thiamine containing phosphat'],
 ['detection of recurrence is now routine in almost all cases of recurrence.',
  'the 

In [19]:
#############################
## Print system info
#############################
#!pip install sinfo
#import sinfo from sinfo
sinfo()

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
google              NA
matplotlib          3.2.2
numpy               1.19.5
pandas              1.1.5
scipy               1.4.1
simpletransformers  NA
sinfo               0.3.4
sklearn             0.22.2.post1
-----
IPython             5.5.0
jupyter_client      5.3.5
jupyter_core        4.7.1
notebook            5.3.1
-----
Python 3.7.10 (default, May  3 2021, 02:48:31) [GCC 7.5.0]
Linux-5.4.109+-x86_64-with-Ubuntu-18.04-bionic
2 logical CPU cores, x86_64
-----
Session information updated at 2021-06-04 1

In [20]:
###################################################################
## Save the rendered .ipynb files to HTML to share with others
###################################################################

In [21]:
#from google.colab import drive 
#drive.mount('/content/gdrive', force_remount=True)

In [22]:
%%shell
jupyter nbconvert --to html 'gdrive/My Drive/Colab Notebooks/SimpleTransformers_BERT_NER_DEID_i2b2_2014_May2021.ipynb'

[NbConvertApp] Converting notebook gdrive/My Drive/Colab Notebooks/SimpleTransformers_BERT_NER_DEID_i2b2_2014_May2021.ipynb to html
[NbConvertApp] Writing 8755801 bytes to gdrive/My Drive/Colab Notebooks/SimpleTransformers_BERT_NER_DEID_i2b2_2014_May2021.html


