In [None]:
############################################################################
##  Transformers token classification pipeline/fine-tuning for NER
##  From tutorial: https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb
##
## Modified by Author: Chris Meaney
## Date: June 2021
##
## Purpose: apply transformers NER module over i2b2 2014 DEID dataset (train/val results; with hyper-parm tuning; final eval - best model - on test)
##
############################################################################

In [None]:
## Print information about the specific NVIDIA GPU which COLAB has assigned to this session
!nvidia-smi

Tue Jul  6 01:00:31 2021       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 465.27       Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   79C    P0    32W /  70W |      0MiB / 15109MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
##########################
## Dependency modules
##########################

## For system info
!pip install sinfo
from sinfo import sinfo

## For timing
import time

## Pandas for data wrangling (import data)
import pandas as pd

## Used to display pandas data frame in a nice HTML format
from IPython.display import display, HTML

## Numpy for numerics
import random
import numpy as np
## Seed the global RNGs available at this point for reproducibility: Python's `random` and NumPy.
## NumPy's global RNG is also what scikit-learn's train_test_split draws from when it is not given a random_state.
## NOTE: PyTorch keeps its own RNG state, which these two calls do not touch.
random.seed(12345)
np.random.seed(12345)

## sklearn for eval metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
## sklearn model selection tools
from sklearn.model_selection import train_test_split

## Torch (for base NN layers/act-funs, loss, train/updates, etc.)
!pip install torch
import torch

## Transformers
! pip install transformers
import transformers
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

## Datasets for CONLL example
! pip install datasets
from datasets import load_dataset, load_metric
from datasets import ClassLabel, Sequence
from datasets import Dataset

## For sequence evaluation functions (to run against CONLL-NER format datasets)
## https://pypi.org/project/seqeval/0.0.10/
! pip install seqeval



In [None]:
## Widen the pandas display limits so more rows/columns are printed in Jupyter Notebook output
for option_name, option_value in (('display.max_rows', 500),
                                  ('display.max_columns', 500),
                                  ('display.width', 100)):
    pd.set_option(option_name, option_value)

In [None]:
##########################
## Specific transformer model/architecture
##########################

## Candidate pretrained checkpoints; exactly one `model_checkpoint` assignment should be left uncommented
# model_checkpoint = "bert-base-uncased"
# model_checkpoint = "bert-large-uncased"
# model_checkpoint = "albert-base-v2"
# model_checkpoint = "albert-xxlarge-v2"
model_checkpoint = "distilbert-base-uncased"
# model_checkpoint = "xlm-roberta-base"
# model_checkpoint = "xlm-roberta-large"


## Warning: for roberta models, the tokenizer needs to be instantiated with add_prefix_space=True
## (the tokenizer cell further down does this for the two checkpoints listed here)

# model_checkpoint = "roberta-base"
# model_checkpoint = "roberta-large"


#################
## Batch size (used for both the per-device train and eval batch size in TrainingArguments)
#################
batch_size = 16


############################
## Number of training epochs
############################
n_train_epochs = 17


###################
## Learning rate
###################
learn_rate = 1e-5


###########################
## Weight decay coefficient passed to TrainingArguments (0 disables it).
## Per the TrainingArguments documentation it is applied by the AdamW optimizer to all layers
## except bias and LayerNorm weights - i.e. not only to the final classification layer.
###########################
wt_decay = 0



In [None]:
! pip freeze > requirements.txt

In [None]:
##########################################################
## Use pandas to import data, and store as data.frame
##########################################################

## Read in data from Google Drive account (this will force mount step, authentication step, etc.)
## https://stackoverflow.com/questions/48340341/how-to-read-csv-to-dataframe-in-google-colab

from google.colab import drive

## The mount point and the CSV path are built from the same absolute root, so reading the file
## does not depend on the notebook's current working directory.
gdrive_root = '/content/gdrive'
drive.mount(gdrive_root, force_remount=True)

import pandas as pd
dat = pd.read_csv(gdrive_root + '/My Drive/ColabData/bio_df_st.csv', encoding='latin1')
dat.head(n=15)

Mounted at /content/gdrive


Unnamed: 0,doc_id,is_test,bio,tok_text,bio_r
0,220-01,False,O,Record,O
1,220-01,False,O,date:,O
2,220-01,False,B-DATE,2067-05-03,DATE
3,220-01,False,O,Narrative,O
4,220-01,False,O,History,O
5,220-01,False,B-AGE,55,AGE
6,220-01,False,O,yo,O
7,220-01,False,O,woman,O
8,220-01,False,O,who,O
9,220-01,False,O,presents,O


In [None]:
## Flag rows whose token is missing (null) or whose text is purely numeric; flagged rows are removed in the next cell.
## NOTE(review): purely numeric tokens can themselves carry PHI tags (the data preview shows B-AGE on the token 55),
## so check which `bio` tags the flagged rows have before discarding them - TODO confirm this is intended.

## Earlier single-condition versions of the flag, kept for reference:
# dat['tok_text_flag'] = dat.tok_text.isnull()
# dat.tok_text_flag.value_counts()

# dat['tok_text_flag'] = dat.tok_text.str.isnumeric()
# dat.tok_text_flag.value_counts()

dat['tok_text_flag'] = dat.tok_text.isnull() | dat.tok_text.str.isnumeric()
dat.tok_text_flag.value_counts()

False    742926
True      22591
Name: tok_text_flag, dtype: int64

In [None]:
## Drop the flagged rows from the data.frame. An explicit copy is taken so that columns added in later
## cells are written to an independent DataFrame rather than to a slice of the original
## (this avoids pandas' SettingWithCopyWarning).
dat = dat.loc[dat['tok_text_flag'] == False].copy()
dat.shape

(742926, 6)

In [None]:
## Encode each BIO tag as an integer code: `codes` holds one code per row and
## `unique` holds the tag belonging to each code (codes follow order of first appearance)
codes, unique = dat['bio'].factorize()
dat['bio_int'] = codes
dat['bio_int'].value_counts()

0     713050
1      10538
2       4453
4       3170
11      1997
3       1983
9       1608
10      1085
13       657
14       581
12       465
16       461
6        380
21       348
7        344
8        323
19       319
26       197
5        189
20       174
18       161
15       156
27        89
22        83
25        19
39        18
33        14
32        13
28        10
23         9
17         8
34         5
29         5
38         4
31         3
37         2
30         1
24         1
35         1
36         1
40         1
Name: bio_int, dtype: int64

In [None]:
## Group the rows of the dataframe by doc_id, collapsing every selected column into one list per document.
## The columns are selected with a list (double brackets): indexing a groupby with a bare tuple of
## column names was deprecated and is rejected by pandas 2.x.
list_columns = ['is_test', 'bio', 'bio_r', 'bio_int', 'tok_text']
dat_group = dat.groupby(['doc_id'], as_index=False)[list_columns].agg(lambda x: list(x))

## Print head of data
dat_group.head()

  


Unnamed: 0,doc_id,is_test,bio,bio_r,bio_int,tok_text
0,100-01,"[False, False, False, False, False, False, Fal...","[O, O, B-DATE, B-HOSPITAL, O, O, I-STREET, I-S...","[O, O, DATE, HOSPITAL, O, O, STREET, STREET, C...","[0, 0, 1, 3, 0, 0, 13, 13, 14, 0, 16, 2, 4, 0,...","[Record, date:, 2106-02-12, Campbell, Orthoped..."
1,100-02,"[False, False, False, False, False, False, Fal...","[O, O, B-DATE, B-HOSPITAL, O, O, O, B-PATIENT,...","[O, O, DATE, HOSPITAL, O, O, O, PATIENT, MEDIC...","[0, 0, 1, 3, 0, 0, 0, 11, 19, 0, 0, 1, 0, 0, 0...","[Record, date:, 2108-03-14, CAMPBELL, EMERGENC..."
2,100-04,"[False, False, False, False, False, False, Fal...","[O, O, B-DATE, O, O, O, O, O, O, B-DATE, O, B-...","[O, O, DATE, O, O, O, O, O, O, DATE, O, DATE, ...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 11,...","[Record, date:, 2111-10-10, CCU, JAR, Transfer..."
3,100-05,"[False, False, False, False, False, False, Fal...","[O, O, B-DATE, O, O, O, O, O, B-PATIENT, I-PAT...","[O, O, DATE, O, O, O, O, O, PATIENT, PATIENT, ...","[0, 0, 1, 0, 0, 0, 0, 0, 11, 10, 0, 1, 0, 0, 1...","[Record, date:, 2111-12-14, NEUROLOGY, CMF, AD..."
4,101-01,"[False, False, False, False, False, False, Fal...","[O, O, B-DATE, B-HOSPITAL, I-HOSPITAL, I-HOSPI...","[O, O, DATE, HOSPITAL, HOSPITAL, HOSPITAL, HOS...","[0, 0, 1, 3, 9, 9, 3, 0, 0, 0, 13, 13, 14, 0, ...","[Record, date:, 2079-05-12, MERCY, CARE, CENTE..."


In [None]:
## Document-level train/test flag: take the first entry of each document's per-token is_test list
dat_group['is_test_flag'] = dat_group['is_test'].map(lambda doc_flags: doc_flags[0])
dat_group['is_test_flag'].value_counts()

False    737
True     486
Name: is_test_flag, dtype: int64

In [None]:
## Partition the documents by the is_test_flag column, keeping only the columns used for modelling
model_columns = ['doc_id','bio','bio_int','tok_text']
doc_is_test = dat_group['is_test_flag']
train_dat = dat_group.loc[doc_is_test.eq(False), model_columns]
test_dat = dat_group.loc[doc_is_test.eq(True), model_columns]

[train_dat.shape, test_dat.shape]

[(737, 4), (486, 4)]

In [None]:
## Further split the training documents into two distinct chunks (i.e. train and val).
## random_state pins the shuffle, so the same documents land in each chunk on every run regardless of
## how much of NumPy's global random stream earlier cells have consumed.
## NOTE: this cell rebinds train_dat, so re-running it alone would split the already-reduced training set.
train_size = 500
test_size = train_dat.shape[0] - train_size  ## number of documents held out for validation

train_dat, val_dat = train_test_split(train_dat, train_size=train_size, test_size=test_size, random_state=12345)

[train_dat.shape, val_dat.shape, test_dat.shape]

[(500, 4), (237, 4), (486, 4)]

In [None]:
## Inspect what one of the datasets above looks like
train_dat.head(n=5)

Unnamed: 0,doc_id,bio,bio_int,tok_text
705,276-03,"[O, O, B-DATE, O, O, B-PATIENT, I-PATIENT, O, ...","[0, 0, 1, 0, 0, 11, 10, 0, 1, 0, 2, 4, 0, 0, 0...","[Record, date:, 2061-01-31, EDVISIT^, ^, GIPSO..."
931,328-01,"[O, O, O, I-DOCTOR, O, O, O, O, O, O, B-PATIEN...","[0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 11, 10, 0, 0, 0...","[Record, date:, B-DAT, Urzua, Medicine, Servic..."
848,309-02,"[O, O, B-DATE, O, B-PATIENT, I-PATIENT, O, O, ...","[0, 0, 1, 0, 11, 10, 0, 0, 0, 0, 0, 0, 0, 0, 0...","[Record, date:, 2092-12-15, NAME:, Wilson,, Ga..."
925,326-04,"[O, O, B-DATE, O, O, O, O, O, O, O, O, O, O, O...","[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[Record, date:, 2087-03-04, Renal, Fellow, ID/..."
317,177-04,"[O, O, O, I-DOCTOR, O, O, O, O, O, O, O, B-PAT...","[0, 0, 0, 4, 0, 0, 0, 0, 0, 0, 0, 11, 10, 0, 0...","[Record, date:, B-DA, Baldwin, E, Internal, Me..."


In [None]:
## Frequency of each BIO tag over the remaining token rows (the output is ordered from most to least frequent)
dat.bio.value_counts()
## Alternative ordering (alphabetical by tag name):
#dat.bio.value_counts().sort_index()

O                   713050
B-DATE               10538
B-DOCTOR              4453
I-DOCTOR              3170
B-PATIENT             1997
B-HOSPITAL            1983
I-HOSPITAL            1608
I-PATIENT             1085
I-STREET               657
B-CITY                 581
I-DATE                 465
B-STATE                461
B-PROFESSION           380
B-USERNAME             348
B-IDNUM                344
I-PROFESSION           323
B-MEDICALRECORD        319
B-PHONE                197
B-ORGANIZATION         189
B-COUNTRY              174
I-ORGANIZATION         161
I-CITY                 156
I-PHONE                 89
B-AGE                   83
I-COUNTRY               19
I-STATE                 18
I-LOCATION-OTHER        14
B-LOCATION-OTHER        13
I-IDNUM                 10
B-DEVICE                 9
B-FAX                    8
I-AGE                    5
B-EMAIL                  5
I-URL                    4
B-ZIP                    3
B-URL                    2
I-HEALTHPLAN             1
I

In [None]:
## Tag names indexed by their integer code. Re-use the `unique` array returned by pd.factorize when
## bio_int was created, so that label_list[i] is by construction the tag that was encoded as bio_int == i
## (a second, independent pass over dat.bio could drift from that encoding, e.g. if a tag were missing).
label_list = unique.tolist()
label_list

['O',
 'B-DATE',
 'B-DOCTOR',
 'B-HOSPITAL',
 'I-DOCTOR',
 'B-ORGANIZATION',
 'B-PROFESSION',
 'B-IDNUM',
 'I-PROFESSION',
 'I-HOSPITAL',
 'I-PATIENT',
 'B-PATIENT',
 'I-DATE',
 'I-STREET',
 'B-CITY',
 'I-CITY',
 'B-STATE',
 'B-FAX',
 'I-ORGANIZATION',
 'B-MEDICALRECORD',
 'B-COUNTRY',
 'B-USERNAME',
 'B-AGE',
 'B-DEVICE',
 'I-MEDICALRECORD',
 'I-COUNTRY',
 'B-PHONE',
 'I-PHONE',
 'I-IDNUM',
 'I-AGE',
 'B-BIOID',
 'B-ZIP',
 'B-LOCATION-OTHER',
 'I-LOCATION-OTHER',
 'B-EMAIL',
 'B-HEALTHPLAN',
 'I-HEALTHPLAN',
 'B-URL',
 'I-URL',
 'I-STATE',
 'I-FAX']

In [None]:
## Get number of unique BIO tags for the i2b2 DEID NER task
num_tags = len(label_list)
num_tags

41

In [None]:
###################################################
## Install the tokenizer (note it will be specific to the model define above)
###################################################

## The two RoBERTa checkpoints need add_prefix_space=True; every other checkpoint uses the default tokenizer settings
tokenizer_kwargs = {'add_prefix_space': True} if model_checkpoint in ('roberta-base', 'roberta-large') else {}
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, **tokenizer_kwargs)

In [None]:
## Fail early unless a "fast" tokenizer was loaded: the cells below call word_ids() on the encoding to
## map sub-word tokens back to their source words
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
## Tokenize one document of the training data and show the resulting sub-word tokens
example = train_dat.iloc[0]
tokenized_input = tokenizer(example['tok_text'], truncation=True, is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input['input_ids'])
print(tokens)

['[CLS]', 'record', 'date', ':', '206', '##1', '-', '01', '-', '31', 'ed', '##vis', '##it', '^', '^', 'gi', '##ps', '##on', ',', 'alexander', '^', '01', '/', '31', '/', '61', '^', 've', '##las', '##que', '##z', ',', 'walter', 'this', 'patient', 'was', 'seen', 'by', 'myself', 'and', 'dr', '.', 'hardin', 'on', '01', '/', '31', '/', '206', '##1', '.', 'i', 'confirm', 'i', 'interviewed', 'and', 'examined', 'the', 'patient', ',', 'reviewed', 'the', 'resident', "'", 's', 'documentation', ',', 'and', 'discussed', 'the', 'plan', 'of', 'care', 'with', 'the', 'patient', '.', 'history', 'of', 'present', 'illness', ':', 'this', 'is', 'a', '-', 'year', '-', 'old', 'gentleman', 'who', 'has', 'a', 'history', 'of', 'cad', 'status', 'post', 'ste', '##nt', 'in', ',', 'who', 'developed', 'acute', 'chest', 'discomfort', ',', 'pressure', 'like', ',', 'dia', '##ph', '##ores', '##is', ',', 'and', 'weakness', ',', 'while', 'he', 'was', 'working', 'as', 'a', 'chi', '##rop', '##rac', '##tor', '.', 'he', 'says',

In [None]:
## Note difference in length: number of word-level tags vs. number of sub-word token ids in the encoding
len(example['bio']), len(tokenized_input["input_ids"])

(480, 512)

In [None]:
## Word index that each sub-word token came from (None marks positions that belong to no word, i.e. special tokens).
## Consecutive positions share an index when a word was split into several pieces, which is why the
## encoding has a different length than the word-level tag list.
print(tokenized_input.word_ids())

[None, 0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 5, 5, 6, 7, 8, 8, 8, 8, 8, 9, 10, 10, 10, 10, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 19, 20, 21, 22, 22, 22, 22, 22, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 31, 32, 33, 34, 34, 34, 35, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 44, 45, 46, 47, 48, 48, 49, 50, 51, 52, 52, 52, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 62, 63, 64, 65, 66, 67, 68, 69, 69, 70, 71, 71, 72, 72, 72, 72, 72, 73, 74, 74, 75, 76, 77, 78, 79, 80, 81, 81, 81, 81, 82, 83, 84, 85, 86, 87, 88, 89, 89, 90, 91, 92, 93, 94, 95, 96, 97, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 107, 108, 109, 110, 111, 112, 112, 113, 114, 115, 116, 116, 117, 117, 117, 118, 119, 119, 119, 120, 121, 122, 123, 123, 124, 125, 126, 127, 128, 129, 130, 130, 131, 132, 133, 134, 134, 135, 136, 137, 138, 139, 140, 141, 141, 141, 142, 142, 142, 143, 144, 145, 146, 147, 147, 148, 149, 150, 151, 152, 152, 152, 152, 153, 154, 155, 156, 156, 156, 156, 156, 156, 157, 158, 159, 159, 160, 16

In [None]:
## Align the word-level labels to the sub-word encoding: every sub-word token takes the label of the word
## it came from; positions without a word id (None) get the placeholder -100.
## The printed lengths confirm that labels and input ids now line up one-to-one.
word_ids = tokenized_input.word_ids()
aligned_labels = [-100 if i is None else example['bio'][i] for i in word_ids]
print(len(aligned_labels), len(tokenized_input["input_ids"]))

512 512


In [None]:
## Flag read by tokenize_and_align_labels: True -> every sub-word piece of a word carries the word's label;
## False -> only the first piece is labelled and the remaining pieces get -100
label_all_tokens = True

In [None]:
##################################################################
## Function to apply the transformers tokenizer to each sequence; then re-align labels to match the newly encoded (new-length) sequence
##################################################################

## Note: if any of the token elements are 'None' or str.isnumeric=True then I think this will fail?
## Note: I handled this above by deleting these problematic tokens. That said, I could have handled by assigning to new string?? 's' + 'old_token'
def tokenize_and_align_labels(tokens, tags):
    """Tokenize pre-split documents and attach one label per sub-word token.

    Relies on two notebook-level names: `tokenizer` (called with truncation enabled and
    is_split_into_words=True) and the boolean `label_all_tokens`.

    Parameters
    ----------
    tokens : list of documents, each a list of word strings.
    tags : list of documents, each a list of integer labels (one per word, parallel to `tokens`).

    Returns
    -------
    The object returned by the tokenizer, with an added "labels" entry holding one list per
    document. Within each list:
      * positions whose word id is None (special tokens) are labelled -100;
      * the first sub-word of a word gets that word's label;
      * later sub-words of the same word get the word's label when `label_all_tokens`
        is true, otherwise -100.

    NOTE(review): truncation=True means words beyond the tokenizer's length limit get no
    encoded position at all, so their labels are silently dropped - confirm this is acceptable.
    """
    tokenized_inputs = tokenizer(tokens, truncation=True, is_split_into_words=True)

    def _labels_for_document(doc_index, doc_tags):
        # Walk the sub-word positions of one document, remembering the previous word id so that
        # continuation pieces of the same word can be recognised.
        aligned = []
        last_word_id = None
        for word_id in tokenized_inputs.word_ids(batch_index=doc_index):
            if word_id is None:
                # No source word: use -100 so the position is ignored downstream.
                aligned.append(-100)
            elif word_id == last_word_id and not label_all_tokens:
                # Continuation piece, and only first pieces are to be labelled.
                aligned.append(-100)
            else:
                aligned.append(doc_tags[word_id])
            last_word_id = word_id
        return aligned

    tokenized_inputs["labels"] = [
        _labels_for_document(doc_index, doc_tags) for doc_index, doc_tags in enumerate(tags)
    ]
    return tokenized_inputs

In [None]:
###########################################################################################
## Example of applying the tokenize/align function over first 5 docs in training dataset
###########################################################################################
first_five_docs = train_dat.iloc[0:5]
ex = tokenize_and_align_labels(tokens=first_five_docs['tok_text'].to_list(), tags=first_five_docs['bio_int'].to_list())
ex

{'input_ids': [[101, 2501, 3058, 1024, 18744, 2487, 1011, 5890, 1011, 2861, 3968, 11365, 4183, 1034, 1034, 21025, 4523, 2239, 1010, 3656, 1034, 5890, 1013, 2861, 1013, 6079, 1034, 2310, 8523, 4226, 2480, 1010, 4787, 2023, 5776, 2001, 2464, 2011, 2870, 1998, 2852, 1012, 27522, 2006, 5890, 1013, 2861, 1013, 18744, 2487, 1012, 1045, 12210, 1045, 10263, 1998, 8920, 1996, 5776, 1010, 8182, 1996, 6319, 1005, 1055, 12653, 1010, 1998, 6936, 1996, 2933, 1997, 2729, 2007, 1996, 5776, 1012, 2381, 1997, 2556, 7355, 1024, 2023, 2003, 1037, 1011, 2095, 1011, 2214, 10170, 2040, 2038, 1037, 2381, 1997, 28353, 3570, 2695, 26261, 3372, 1999, 1010, 2040, 2764, 11325, 3108, 17964, 1010, 3778, 2066, 1010, 22939, 8458, 16610, 2483, 1010, 1998, 11251, 1010, 2096, 2002, 2001, 2551, 2004, 1037, 9610, 18981, 22648, 4263, 1012, 2002, 2758, 2348, 2023, 2001, 3243, 3278, 1010, 2009, 2001, 2025, 2066, 2010, 3188, 2771, 17964, 1012, 2002, 23439, 2151, 8249, 2000, 2010, 2067, 2030, 2010, 2849, 1012, 2002, 23439, 2151

In [None]:
type(ex)

transformers.tokenization_utils_base.BatchEncoding

In [None]:
## There will now be three elements in the "list": 1) input_ids (for the tokens), 2) attention_mask (for the attn mask), 3) labels (for i2b2 bio labels)
len(ex)

3

In [None]:
## Note: Outer list of length 5; since we took first 5 rows/docs in this example
[len(ex['input_ids']), len(ex['attention_mask']), len(ex['labels'])]

[5, 5, 5]

In [None]:
## Check the length of each of the inner lists: input_ids, attention_mask and labels must agree per document.
## The tokenizer call uses truncation only (no padding is requested), so a length of 512 for every
## document indicates that each of these documents was cut off at the 512-token limit.
[[len(x) for x in ex['input_ids']], [len(x) for x in ex['attention_mask']], [len(x) for x in ex['labels']]]

[[512, 512, 512, 512, 512],
 [512, 512, 512, 512, 512],
 [512, 512, 512, 512, 512]]

In [None]:
##################################################################################
## Batch encode tokens/attention-mask/labels for train val and test datasets
##################################################################################

## Encode each split with the same tokenize/align routine
## Training data
train_encode = tokenize_and_align_labels(tokens=train_dat['tok_text'].to_list(), tags=train_dat['bio_int'].to_list())

## Validation data
val_encode = tokenize_and_align_labels(tokens=val_dat['tok_text'].to_list(), tags=val_dat['bio_int'].to_list())

## Test data
test_encode = tokenize_and_align_labels(tokens=test_dat['tok_text'].to_list(), tags=test_dat['bio_int'].to_list())

## Check number of documents per encoded field: one row per field, one column per split (train, val, test)
encoded_fields = ('input_ids', 'attention_mask', 'labels')
[[len(encoding[field]) for encoding in (train_encode, val_encode, test_encode)] for field in encoded_fields]

[[500, 237, 486], [500, 237, 486], [500, 237, 486]]

In [None]:
## Per-document sequence length of each encoded field in the training split
len_train_encode_input_ids = list(map(len, train_encode['input_ids']))
len_train_encode_attention_mask = list(map(len, train_encode['attention_mask']))
len_train_encode_labels = list(map(len, train_encode['labels']))

## Side-by-side table of the three lengths (they should agree row by row)
len_columns = {
    'len_input_ids': len_train_encode_input_ids,
    'len_attention_mask': len_train_encode_attention_mask,
    'len_labels': len_train_encode_labels,
}
len_data = pd.DataFrame(len_columns)

len_data.head(n=5)

Unnamed: 0,len_input_ids,len_attention_mask,len_labels
0,512,512,512
1,512,512,512
2,512,512,512
3,512,512,512
4,512,512,512


In [None]:
## Create one dataframe per split with the columns input_ids, attention_mask and labels
def _encoding_to_frame(encoding):
    """Collect the three encoded fields of a split into a DataFrame with one row per document."""
    return pd.DataFrame({field: encoding[field] for field in ('input_ids', 'attention_mask', 'labels')})


train_df = _encoding_to_frame(train_encode)
val_df = _encoding_to_frame(val_encode)
test_df = _encoding_to_frame(test_encode)

## Convert each of the above objects into a HuggingFace Dataset (note: based on Apache Arrow dataset)
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(split_df) for split_df in (train_df, val_df, test_df)
)

In [None]:
###############################################################
## Instantiate a transformers Token Classification of NER model
##
## Model type should be one of BigBirdConfig, ConvBertConfig, LayoutLMConfig, DistilBertConfig, CamembertConfig, FlaubertConfig, XLMConfig, XLMRobertaConfig, LongformerConfig, RobertaConfig, SqueezeBertConfig, BertConfig, MegatronBertConfig, MobileBertConfig, XLNetConfig, AlbertConfig, ElectraConfig, FunnelConfig, MPNetConfig, DebertaConfig, DebertaV2Config, IBertConfig.
###############################################################
## Seed the Python, NumPy and PyTorch RNGs before building the model: the token-classification head is not
## part of the pretrained checkpoint and is newly (randomly) initialised, so without a torch seed every
## run would start fine-tuning from a different classifier.
transformers.set_seed(12345)

## id2label / label2id store the tag names in the model config, so the integer classes can be decoded by name
model = AutoModelForTokenClassification.from_pretrained(model_checkpoint,
                                                        num_labels=num_tags,
                                                        id2label=dict(enumerate(label_list)),
                                                        label2id={tag: idx for idx, tag in enumerate(label_list)})

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN t

In [None]:
## Print the class documentation of the instantiated model (verbose output; only useful while exploring the API)
help(model)

Help on DistilBertForTokenClassification in module transformers.models.distilbert.modeling_distilbert object:

class DistilBertForTokenClassification(DistilBertPreTrainedModel)
 |  DistilBertForTokenClassification(config)
 |  
 |  DistilBert Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g.
 |  for Named-Entity-Recognition (NER) tasks.
 |  
 |  
 |  This model inherits from :class:`~transformers.PreTrainedModel`. Check the superclass documentation for the generic
 |  methods the library implements for all its model (such as downloading or saving, resizing the input embeddings,
 |  pruning heads etc.)
 |  
 |  This model is also a PyTorch `torch.nn.Module <https://pytorch.org/docs/stable/nn.html#torch.nn.Module>`__
 |  subclass. Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to
 |  general usage and behavior.
 |  
 |  Parameters:
 |      config (:class:`~transformers.DistilBertConfig`)

In [None]:
##############################################################
## Hyper-parameters from NER model
##############################################################
args = TrainingArguments(
    output_dir='i2b2_output',           # directory for checkpoints and other training outputs
    evaluation_strategy="epoch",        # run validation at the end of every epoch
    # Also log the training loss once per epoch. With the default step-based logging interval this
    # short run reports "No log" for the training loss in the per-epoch results table.
    logging_strategy="epoch",
    learning_rate=learn_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=n_train_epochs,
    weight_decay=wt_decay,
)

In [None]:
## Data collator for token classification: assembles individual encoded examples into batches for the Trainer
## NOTE(review): presumably pads inputs and labels to a common length within each batch - confirm in the library docs
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
## Sequence evaluation metric (from CONLL - used to eval NER, etc. type tasks); used by compute_metrics and the evaluation cells below
## NOTE(review): newer `datasets` releases deprecate load_metric in favour of the separate `evaluate` package -
## confirm against the installed version
metric = load_metric("seqeval")

In [None]:
###############################################
## Function to compute evaluation metrics on train/val/test samples
###############################################

def compute_metrics(p):
    """Score one evaluation pass with the notebook-level seqeval `metric`.

    Parameters
    ----------
    p : pair that unpacks into (predictions, labels). `predictions` holds per-class scores and is
        reduced with argmax over axis 2; `labels` holds the integer gold labels, with -100 at
        positions that must be ignored.

    Returns
    -------
    dict with the keys "precision", "recall", "f1" and "accuracy", taken from the metric's
    overall_* results.
    """
    class_scores, gold_ids = p
    predicted_ids = np.argmax(class_scores, axis=2)

    # Decode ids to tag names via label_list, skipping every position whose gold label is -100
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        scored_positions = [(pred, gold) for pred, gold in zip(predicted_row, gold_row) if gold != -100]
        true_predictions.append([label_list[pred] for pred, _ in scored_positions])
        true_labels.append([label_list[gold] for _, gold in scored_positions])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    reported = ("precision", "recall", "f1", "accuracy")
    return {name: results["overall_" + name] for name in reported}

In [None]:
####################################################################
## Specify a training function; this will train/fine-tune NER model; and print metrics on train/val sets
####################################################################
## Bundle the model, hyper-parameters, data splits, collator and metric function into one Trainer
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
##############################################################
## Train the model - print per-epoch training/val metrics to console
##############################################################
## Time the training run. perf_counter() is a monotonic clock, so t1 - t0 is the elapsed time in seconds
## even if the system wall clock is adjusted while training is in progress.
t0 = time.perf_counter()
trainer.train()
t1 = time.perf_counter()


***** Running training *****
  Num examples = 500
  Num Epochs = 17
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 544


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.422094,0.0,0.0,0.0,0.930674
2,No log,0.248078,0.714845,0.508209,0.594071,0.955479
3,No log,0.181397,0.780822,0.534741,0.634766,0.960061
4,No log,0.149126,0.776485,0.557608,0.649091,0.961965
5,No log,0.124653,0.655328,0.680592,0.667721,0.970595
6,No log,0.110807,0.653238,0.72017,0.685073,0.973162
7,No log,0.101132,0.666354,0.728672,0.696121,0.97435
8,No log,0.096547,0.661493,0.749634,0.70281,0.974979
9,No log,0.090097,0.682759,0.754617,0.716892,0.976417
10,No log,0.086386,0.692084,0.767663,0.727917,0.977219


***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 237
  Batch size = 16
***** Running Evaluation *****
  Num examples = 23

In [None]:
## Print training time
t1-t0

564.8424315452576

In [None]:
## Evaluate a trained model
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 237
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


{'epoch': 17.0,
 'eval_accuracy': 0.9804053297503079,
 'eval_f1': 0.7650987836602687,
 'eval_loss': 0.07543304562568665,
 'eval_precision': 0.7351709228482638,
 'eval_recall': 0.7975666959835825,
 'eval_runtime': 6.5884,
 'eval_samples_per_second': 35.972,
 'eval_steps_per_second': 2.277}

In [None]:
#############################################
## Evaluate model on validation set - per tag analysis and overall analysis
#############################################
## Predict on the validation split and reduce the per-class scores to the highest-scoring class id
predictions, labels = trainer.predict(val_dataset)[:2]
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag names, dropping every position whose gold label is the ignore index -100
true_predictions, true_labels = [], []
for predicted_row, gold_row in zip(predictions, labels):
    scored_positions = [(p, l) for (p, l) in zip(predicted_row, gold_row) if l != -100]
    true_predictions.append([label_list[p] for (p, _gold) in scored_positions])
    true_labels.append([label_list[l] for (_pred, l) in scored_positions])

## Per-tag and overall results of the trained model on the validation set (similar in spirit to sklearn.metrics.classification_report)
results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 237
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


{'AGE': {'f1': 0.0, 'number': 12, 'precision': 0.0, 'recall': 0.0},
 'CITY': {'f1': 0.17045454545454544,
  'number': 132,
  'precision': 0.3409090909090909,
  'recall': 0.11363636363636363},
 'COUNTRY': {'f1': 0.0, 'number': 9, 'precision': 0.0, 'recall': 0.0},
 'DATE': {'f1': 0.9660262119862578,
  'number': 3911,
  'precision': 0.961499493414387,
  'recall': 0.9705957555612376},
 'DOCTOR': {'f1': 0.4556354916067147,
  'number': 765,
  'precision': 0.42081949058693247,
  'recall': 0.49673202614379086},
 'FAX': {'f1': 0.0, 'number': 8, 'precision': 0.0, 'recall': 0.0},
 'HOSPITAL': {'f1': 0.569377990430622,
  'number': 512,
  'precision': 0.4811320754716981,
  'recall': 0.697265625},
 'IDNUM': {'f1': 0.6,
  'number': 43,
  'precision': 0.6486486486486487,
  'recall': 0.5581395348837209},
 'LOCATION-OTHER': {'f1': 0.0, 'number': 1, 'precision': 0.0, 'recall': 0.0},
 'MEDICALRECORD': {'f1': 0.889453621346887,
  'number': 369,
  'precision': 0.8373205741626795,
  'recall': 0.94850948509485

In [None]:
#############################################
## Evaluate model on test set - per tag analysis and overall analysis
#############################################
## Predict on the held-out test split and reduce the per-class scores to the highest-scoring class id
predictions, labels = trainer.predict(test_dataset)[:2]
predictions = np.argmax(predictions, axis=2)

# Decode ids to tag names, dropping every position whose gold label is the ignore index -100
true_predictions, true_labels = [], []
for predicted_row, gold_row in zip(predictions, labels):
    scored_positions = [(p, l) for (p, l) in zip(predicted_row, gold_row) if l != -100]
    true_predictions.append([label_list[p] for (p, _gold) in scored_positions])
    true_labels.append([label_list[l] for (_pred, l) in scored_positions])

## Per-tag and overall results of the trained model on the test set (similar in spirit to sklearn.metrics.classification_report)
results = metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 486
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


{'AGE': {'f1': 0.0, 'number': 39, 'precision': 0.0, 'recall': 0.0},
 'CITY': {'f1': 0.2719546742209632,
  'number': 262,
  'precision': 0.5274725274725275,
  'recall': 0.183206106870229},
 'COUNTRY': {'f1': 0.0, 'number': 87, 'precision': 0.0, 'recall': 0.0},
 'DATE': {'f1': 0.9747453438611295,
  'number': 8240,
  'precision': 0.9682672733804335,
  'recall': 0.9813106796116505},
 'DEVICE': {'f1': 0.0, 'number': 4, 'precision': 0.0, 'recall': 0.0},
 'DOCTOR': {'f1': 0.4608089260808926,
  'number': 1578,
  'precision': 0.4115595416043846,
  'recall': 0.523447401774398},
 'HOSPITAL': {'f1': 0.602110022607385,
  'number': 1046,
  'precision': 0.4968905472636816,
  'recall': 0.7638623326959847},
 'IDNUM': {'f1': 0.5058823529411764,
  'number': 221,
  'precision': 0.7226890756302521,
  'recall': 0.3891402714932127},
 'LOCATION-OTHER': {'f1': 0.0, 'number': 4, 'precision': 0.0, 'recall': 0.0},
 'MEDICALRECORD': {'f1': 0.8630234208658623,
  'number': 640,
  'precision': 0.7906371911573472,
  '

In [None]:
## See what bio_tags map to what integers (bio_int)
#bio_ct = pd.DataFrame(pd.crosstab(dat.bio,dat.bio_int)).to_dict()
#bio_ct

In [None]:
###########################################
## Encode an ad-hoc example string and apply trainer.predict() to see whether the model captures the PHI that needs to be de-identified (DEID)
###########################################
my_string = "Date: June 2020: Patient - christopher meaney - a biostatistician at UT presented to Dr. K. Tu with back pain from sedentary lifestyle and RSI."

## Tokenize string (plain split on single spaces, so punctuation stays attached to its word)
my_tokens = my_string.split(' ')

## Get associated tags (labels for string): one BIO tag per whitespace token, in token order
my_tags = ['O','B-DATE','I-DATE','O','O',"B-PATIENT","I-PATIENT",'O','O','B-PROFESSION','O','B-ORGANIZATION','O','O','O','B-DOCTOR',"I-DOCTOR",'O','O','O','O','O','O','O','O']
## Integer codes for the tags above (these are what gets passed on to the tokenizer/alignment step)
## NOTE(review): hard-coded by hand; assumed to match the tag <-> integer mapping used for training - confirm against label_list
my_tags_int = [0,1,12,0,0,11,10,0,0,6,0,5,0,0,0,2,4,0,0,0,0,0,0,0,0]

## Check that token/tag lengths are the same (fail fast instead of silently mis-aligning labels)
assert len(my_tokens) == len(my_tags) == len(my_tags_int), (
    "token/tag length mismatch: "
    + str([len(my_tokens), len(my_tags), len(my_tags_int)])
)

## Wide form: a single row whose cells hold the whole token / tag / tag-id lists
my_string_df = pd.DataFrame({'tokens': [my_tokens],
                             'bio': [my_tags],
                             'bio_int': [my_tags_int]})

## Long form: one row per token (convenient for eyeballing token/tag pairs)
my_string_df_long = pd.DataFrame({'tokens': my_tokens,
                                  'bio': my_tags,
                                  'bio_int': my_tags_int})

my_string_df

Unnamed: 0,tokens,bio,bio_int
0,"[Date:, June, 2020:, Patient, -, christopher, ...","[O, B-DATE, I-DATE, O, O, B-PATIENT, I-PATIENT...","[0, 1, 12, 0, 0, 11, 10, 0, 0, 6, 0, 5, 0, 0, ..."


In [None]:
## Pass the dataframe
my_string_encode = tokenize_and_align_labels(
    tokens=my_string_df['tokens'].tolist(),
    tags=my_string_df['bio_int'].tolist(),
)

## Collect the three encoded fields into a pandas dataframe (one column per field)
my_string_encoded_df = pd.DataFrame(
    {field: my_string_encode[field] for field in ('input_ids', 'attention_mask', 'labels')}
)
my_string_encoded_df

Unnamed: 0,input_ids,attention_mask,labels
0,"[101, 3058, 1024, 2238, 12609, 1024, 5776, 101...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...","[-100, 0, 0, 1, 12, 12, 0, 0, 11, 10, 10, 0, 0..."


In [None]:
## Convert pandas dataFrame into HuggingFace Dataset (an Apache Arrow dataset)
## Wrap the encoded frame (input_ids / attention_mask / labels columns) in a Dataset,
## which is the structure handed to trainer.predict() in the next cell
my_string_dataset = Dataset.from_pandas(my_string_encoded_df)
#my_string_dataset[0]

In [None]:
## Feed formatted string (in Dataset structure); to fine-tuned Transformer model (and obtain predictions)
## NOTE: this re-binds predictions / labels / true_predictions / true_labels, replacing the
## test-set values computed in the evaluation cell above
predictions, labels, _ = trainer.predict(my_string_dataset)
## Reduce the scores along axis 2 to one integer per position (used below as an index into label_list)
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for l in label if l != -100]
    for label in labels
]

## Put into dataframe
## The token column is filtered with the same "label != -100" mask as the label/prediction
## columns, so all three columns always have equal length. (A fixed [1:-1] slice only lines up
## when the first and last positions are the sole ignored ones.)
pd.DataFrame({'tokens': [tok for (tok, l) in zip(my_string_encode.tokens(), labels[0]) if l != -100],
              'true_labels': true_labels[0],
              'true_preds': true_predictions[0]})

***** Running Prediction *****
  Num examples = 1
  Batch size = 16


  _warn_prf(average, modifier, msg_start, len(result))


Unnamed: 0,tokens,true_labels,true_preds
0,date,O,O
1,:,O,O
2,june,B-DATE,B-DATE
3,2020,I-DATE,B-DATE
4,:,I-DATE,O
5,patient,O,O
6,-,O,O
7,christopher,B-PATIENT,B-DOCTOR
8,mean,I-PATIENT,I-DOCTOR
9,##ey,I-PATIENT,I-DOCTOR


In [None]:
## Decode the string from the list of tokenized integer indices
# tokenizer.decode(my_string_dataset['input_ids'][0])

In [None]:
## How you can grab the word-piece (sub-word) tokens for the sentence, excluding the first and last (special) tokens
# len(my_string_encode.tokens()[1:-1])

In [None]:
# len(true_labels[0])

In [None]:
############################
## Save final model and tokenizer to disk
############################
## Single output folder shared by the model files and the tokenizer files
final_model_dir = 'gdrive/My Drive/Colab Notebooks/my_albert_model'
model.save_pretrained(final_model_dir)
## Kept as the last expression so its return value (the tuple of written tokenizer file paths) is shown as the cell output
tokenizer.save_pretrained(final_model_dir)

Configuration saved in gdrive/My Drive/Colab Notebooks/my_albert_model/config.json
Model weights saved in gdrive/My Drive/Colab Notebooks/my_albert_model/pytorch_model.bin
tokenizer config file saved in gdrive/My Drive/Colab Notebooks/my_albert_model/tokenizer_config.json
Special tokens file saved in gdrive/My Drive/Colab Notebooks/my_albert_model/special_tokens_map.json


('gdrive/My Drive/Colab Notebooks/my_albert_model/tokenizer_config.json',
 'gdrive/My Drive/Colab Notebooks/my_albert_model/special_tokens_map.json',
 'gdrive/My Drive/Colab Notebooks/my_albert_model/vocab.txt',
 'gdrive/My Drive/Colab Notebooks/my_albert_model/added_tokens.json',
 'gdrive/My Drive/Colab Notebooks/my_albert_model/tokenizer.json')

In [None]:
#############################
## Print system info
#############################
## sinfo is installed and imported in the dependency cell at the top of the notebook;
## uncomment the two lines below only when running this cell on its own
#!pip install sinfo
#from sinfo import sinfo
## Print the versions of the imported packages plus Python / OS details for this session
sinfo()

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
datasets            1.9.0
datasets_modules    NA
google              NA
numpy               1.19.5
pandas              1.1.5
sinfo               0.3.4
sklearn             0.22.2.post1
torch               1.9.0+cu102
transformers        4.8.2
-----
IPython             5.5.0
jupyter_client      5.3.5
jupyter_core        4.7.1
notebook            5.3.1
-----
Python 3.7.10 (default, May  3 2021, 02:48:31) [GCC 7.5.0]
Linux-5.4.104+-x86_64-with-Ubuntu-18.04-bionic
2 logical CPU cores, x86_64
-----
Session inf

In [None]:
###################################################################
## Save the rendered .ipynb files to HTML to share with others
###################################################################

In [None]:
#from google.colab import drive
#drive.mount('/content/gdrive', force_remount=True)

In [None]:
## Pause for 15 seconds before the HTML export in the next cell
## NOTE(review): presumably gives Colab/Drive time to finish writing the latest notebook save before nbconvert reads it - confirm
time.sleep(15)

In [None]:
%%shell
# Convert the saved notebook to HTML; the .html file is written next to the .ipynb
jupyter nbconvert --to html 'gdrive/My Drive/Colab Notebooks/Transformers_NER_FineTune_i2b2_2014_DEID.ipynb'

[NbConvertApp] Converting notebook gdrive/My Drive/Colab Notebooks/Transformers_NER_FineTune_i2b2_2014_DEID.ipynb to html
[NbConvertApp] Writing 568808 bytes to gdrive/My Drive/Colab Notebooks/Transformers_NER_FineTune_i2b2_2014_DEID.html


