In [None]:
############################################################################
##  Transformers token classification pipeline/fine-tuning for NER
##  From tutorial: https://github.com/huggingface/notebooks/blob/master/examples/token_classification.ipynb
##
## Modified by Author: Chris Meaney
## Date: June 2021
##
## Purpose: apply transformers NER module over i2b2 2014 DEID dataset (train/val results; with hyper-parm tuning; final eval - best model - on test)
##
############################################################################

In [None]:
## Print information about the specific NVIDIA GPU which COLAB has assigned to this session
!nvidia-smi 

Thu Feb 24 15:11:03 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla K80           Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P8    28W / 149W |      0MiB / 11441MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [None]:
##########################
## Dependency modules
##########################

## For system info
!pip install sinfo
from sinfo import sinfo

## For os tasks
import os
import shutil

## For timing
import time

## Pandas for data wrangling (import data)
import pandas as pd

## Used to display pandas data frame in a nice HTML format
from IPython.display import display, HTML

## Numpy for numerics
import random
import numpy as np
## Do I set seed for reproducibility? - How will this work on PyTorch, Transformers, etc. (i.e. is there a global seed; or is this np.random.seed sufficient)
np.random.seed(12345)

## For pickling numpy arrays
import pickle as pkl

## sklearn for eval metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
## sklearn model selection tools
from sklearn.model_selection import train_test_split

## Torch (for base NN layers/act-funs, loss, train/updates, etc.)
!pip install torch 
import torch

## Transformers 
! pip install transformers 
import transformers
from transformers import DataCollatorForTokenClassification
from transformers import AutoTokenizer
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

## Datasets for CONLL example
! pip install datasets 
from datasets import load_dataset, load_metric
from datasets import ClassLabel, Sequence
from datasets import Dataset

## For sequence evaluation functions (to run against CONLL-NER format datasets)
## https://pypi.org/project/seqeval/0.0.10/
! pip install seqeval

Collecting sinfo
  Downloading sinfo-0.3.4.tar.gz (24 kB)
Collecting stdlib_list
  Downloading stdlib_list-0.8.0-py3-none-any.whl (63 kB)
[K     |████████████████████████████████| 63 kB 1.2 MB/s 
[?25hBuilding wheels for collected packages: sinfo
  Building wheel for sinfo (setup.py) ... [?25l[?25hdone
  Created wheel for sinfo: filename=sinfo-0.3.4-py3-none-any.whl size=7899 sha256=c2e79dd2a9adef7ecbabae682c47f690ea7fe99262968e487b19271b59a13c5b
  Stored in directory: /root/.cache/pip/wheels/68/ca/56/344d532fe53e855ccd6549795d370588ab8123907eecf4cf30
Successfully built sinfo
Installing collected packages: stdlib-list, sinfo
Successfully installed sinfo-0.3.4 stdlib-list-0.8.0
Collecting transformers
  Downloading transformers-4.16.2-py3-none-any.whl (3.5 MB)
[K     |████████████████████████████████| 3.5 MB 5.3 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K   

In [None]:
## Widen the pandas display limits so large frames are not elided in notebook output
for _option_name, _option_value in (('display.max_rows', 500),
                                    ('display.max_columns', 500),
                                    ('display.width', 100)):
    pd.set_option(_option_name, _option_value)

In [None]:
################################################
## Connect to Google Colab
################################################


## Read in data from Google Drive account (this will force mount step, authentication step, etc.)
## https://stackoverflow.com/questions/48340341/how-to-read-csv-to-dataframe-in-google-colab

from google.colab import drive 
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [None]:
##########################
## Specific transformer model/architecture
##
## Exactly one `model_checkpoint` assignment is active; the commented-out lines are the
## alternative checkpoints. The name is used to build the local model directory, the
## output directory name, and to decide how the tokenizer is instantiated.
##########################

# model_checkpoint = "bert-base-uncased"
# model_checkpoint = "bert-large-uncased"
# model_checkpoint = "albert-base-v2"
# model_checkpoint = "albert-xxlarge-v2"
# model_checkpoint = "distilbert-base-uncased"
# model_checkpoint = "xlm-roberta-base"
# model_checkpoint = "xlm-roberta-large"

## Warning: for roberta models; need to instantiate tokenizer with add_prefix_space=True

# model_checkpoint = "roberta-base"
# model_checkpoint = "distilroberta-base"
model_checkpoint = "roberta-large"

#################
## Batch size (used as both the per-device train and eval batch size in TrainingArguments)
#################
batch_size = 1

############################
## Number training epochs
############################
n_train_epochs = 5

###################
## Learning Rate
###################
learn_rate = 2e-5 

###########################
## Weight decay (passed to TrainingArguments.weight_decay)
## NOTE(review): per the Transformers documentation the Trainer's AdamW applies it to all
## layers except bias and LayerNorm weights — confirm for the installed version.
###########################
wt_decay = 0.01

In [None]:
## Run identifier (checkpoint name + key hyper-parameters); used as the name of the results directory
model_prefix = f"model={model_checkpoint}_numepochs={n_train_epochs}_learnrate={learn_rate}_wtdecay={wt_decay}"
# model_prefix

In [None]:
##########################################################
## Paths to model/output dir
##
## Both paths are relative, so they resolve against the current working directory.
## NOTE(review): the Drive is mounted at '/content/gdrive', so this presumably relies on the
## working directory being '/content' — confirm.
##########################################################
# Directory the pretrained tokenizer/model are loaded from (one sub-directory per checkpoint name)
model_path = "gdrive/My Drive/Colab Notebooks/transformer_model_dir/" + model_checkpoint + "/"

# Directory that receives every artefact of this run (one per hyper-parameter combination)
output_path = "gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/" + model_prefix + "/"

# WARNING: results of an earlier run with the same checkpoint/hyper-parameters are deleted
# here so that the directory always starts empty.
if os.path.exists(output_path):
    shutil.rmtree(output_path)

os.makedirs(output_path)

In [None]:
##########################################################
## Use pandas to import data, and store as data.frame
##
## One row per token. Columns read by the later cells: doc_id, is_test, bio, bio_r, tok_text.
##########################################################
dat = pd.read_csv('gdrive/My Drive/ColabData/bio_df_st.csv', encoding='latin1')
# dat.head(n=15)

In [None]:
## Flag the rows whose tok_text is missing or consists only of numeric characters.
## The flag is True for rows to delete and False for rows to keep.
## NOTE(review): purely numeric tokens can themselves be PHI (e.g. years, ages, ID numbers) —
## confirm that removing them from the corpus is intended.

# dat['tok_text_flag'] = dat.tok_text.isnull() 
# dat.tok_text_flag.value_counts()

# dat['tok_text_flag'] = dat.tok_text.str.isnumeric() 
# dat.tok_text_flag.value_counts()

dat['tok_text_flag'] = dat.tok_text.isnull() | dat.tok_text.str.isnumeric() 
# dat.tok_text_flag.value_counts()

In [None]:
## Drop the flagged rows (missing or purely numeric tokens) from the data.frame.
## .copy() makes `dat` an independent frame instead of a filtered slice of the original, so
## adding columns to it afterwards (e.g. dat['bio_int']) does not raise SettingWithCopyWarning.
dat = dat[dat['tok_text_flag']==False].copy()
# dat.shape

In [None]:
## Map the bio tags to integer indices.
## pd.factorize returns one integer code per row plus the distinct tags (`unique`); per the
## pandas documentation the codes follow order of first appearance.
codes, unique = pd.factorize(dat['bio'])
dat['bio_int'] = codes
# dat.bio_int.value_counts()

In [None]:
## Group the rows of the dataframe by doc_id: one row per document, with each selected column
## collapsed into a list holding that document's per-token values.
## The columns are selected with a list (double brackets); selecting GroupBy columns with a
## bare tuple of names is deprecated and raises an error in pandas 2.x.
dat_group = dat.groupby(['doc_id'], as_index=False)[['is_test', 'bio', 'bio_r', 'bio_int', 'tok_text']].agg(list)

## Print head of data
# dat_group.head()

  


In [None]:
## Document-level train/test flag: the first element of each document's is_test list
## (presumably every token row of a document carries the same value — confirm)
dat_group['is_test_flag'] = dat_group['is_test'].map(lambda doc_flags: doc_flags[0])
# dat_group.is_test_flag.value_counts()

In [None]:
## Split the per-document frame into train and test parts using the flag
_split_columns = ['doc_id', 'bio', 'bio_int', 'tok_text']
_is_test_doc = dat_group['is_test_flag']

train_dat = dat_group.loc[_is_test_doc == False, _split_columns]
test_dat = dat_group.loc[_is_test_doc == True, _split_columns]

# [train_dat.shape, test_dat.shape]

In [None]:
## Further sample the training dataset into two distinct chunks (i.e. train and val):
## `train_size` documents stay in train, all remaining documents become the validation set
## (`test_size` is therefore the validation-set size, not the size of test_dat).
## NOTE: this cell overwrites train_dat, so it must be run exactly once per session.
train_size = 500
test_size = train_dat.shape[0] - train_size

## An explicit random_state pins the split itself; without it the split depends on the current
## state of NumPy's global RNG and changes when cells are re-run or re-ordered.
train_dat, val_dat = train_test_split(train_dat, train_size=train_size, test_size=test_size, random_state=12345)

# [train_dat.shape, val_dat.shape, test_dat.shape]

In [None]:
## Inspect what one of the datasets above looks like
# train_dat.head(n=5)

In [None]:
## Get list/set of unique tags
# dat.bio.value_counts()
#dat.bio.value_counts().sort_index()

In [None]:
## Get names of IDs: the distinct BIO tags of `dat`, as a plain Python list.
## label_list[k] is used throughout to turn integer code k back into its tag name.
## NOTE(review): this relies on unique() and the earlier pd.factorize() call ordering the tags
## identically (order of first appearance) — a missing `bio` value would break that; confirm.
label_list = dat.bio.unique().tolist()
# label_list

In [None]:
## Get number of unique BIO tags for the i2b2 DEID NER task
## (passed as num_labels when the token-classification model is instantiated)
num_tags = len(label_list)
# num_tags

In [None]:
#pd.DataFrame(pd.crosstab(dat.bio,dat.bio_int))

In [None]:
## Cross-tabulate tag name (rows) against integer code (columns)
bio_ct = pd.crosstab(dat.bio, dat.bio_int)

## Long (i, j, x) representation of the table, walked column by column:
## i = tag name, j = integer code, x = number of rows with that (tag, code) pair
_n_tag_rows, _n_code_cols = bio_ct.shape
bio_ct_vals = bio_ct.to_numpy().ravel(order='F')
bio_ct_j = np.repeat(bio_ct.columns.to_numpy(), _n_tag_rows)
bio_ct_i = list(bio_ct.index) * _n_code_cols
# [len(bio_ct_i), len(bio_ct_j), len(bio_ct_vals)]

## Assemble the long table and keep only the pairs that actually occur
bio_ct_long = pd.DataFrame({'i': bio_ct_i, 'j': bio_ct_j, 'x': bio_ct_vals})
bio_ct_long = bio_ct_long.loc[bio_ct_long['x'] != 0]
# bio_ct_long

## Persist the tag <-> code map next to the other run outputs for later use
bio_ct_long_fpath = output_path + "bio_ct_long.csv"
bio_ct_long.to_csv(path_or_buf=bio_ct_long_fpath, index=False)

In [None]:
###################################################
## Install the tokenizer (note it will be specific to the model defined above)
##
## The three RoBERTa-family checkpoints listed here are given add_prefix_space=True, which
## their tokenizers need when fed text that is already split into words.
## NOTE(review): 'distilroberta-base' is loaded by checkpoint name while every other
## checkpoint is loaded from the local model_path — confirm this difference is intended.
###################################################

_prefix_space_checkpoints = ('roberta-base', 'roberta-large', 'distilroberta-base')

_tokenizer_source = model_checkpoint if model_checkpoint == 'distilroberta-base' else model_path
_tokenizer_kwargs = {'add_prefix_space': True} if model_checkpoint in _prefix_space_checkpoints else {}

tokenizer = AutoTokenizer.from_pretrained(_tokenizer_source, **_tokenizer_kwargs)

In [None]:
## Assertion/check against the particular tokenizer installed: it must be a "fast" tokenizer.
## (The label-alignment function calls word_ids() on the encoded batch; presumably that is
## only available from fast tokenizers — confirm.)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

In [None]:
##################################################################
## Function to apply transformers tokenizer to sequence; then re-align labels to match newly encoded (new-length) sequence
##################################################################
label_all_tokens = True

## Note: if any of the token elements are 'None' or str.isnumeric=True then I think this will fail?
## Note: I handled this above by deleting these problematic tokens. That said, I could have handled by assigning to new string?? 's' + 'old_token'
def tokenize_and_align_labels(tokens, tags):
    """Tokenize pre-split documents and expand their word-level labels to sub-token level.

    Uses the module-level `tokenizer` and the module-level `label_all_tokens` flag.

    Parameters
    ----------
    tokens : list of list of str
        One list of word tokens per document.
    tags : list of list of int
        Integer label per word, parallel to `tokens`.

    Returns
    -------
    The tokenizer output with an extra "labels" entry holding one label list per document,
    aligned with the sub-tokens. Positions without a word (special tokens) are labelled -100;
    continuation sub-tokens of a word get the word's label when `label_all_tokens` is true
    and -100 otherwise.
    """
    encoded = tokenizer(tokens, truncation=True, is_split_into_words=True)

    def _align_document(doc_index, word_labels):
        # Walk the sub-tokens of one document, remembering the word the previous one belonged to.
        aligned = []
        last_word = None
        for word in encoded.word_ids(batch_index=doc_index):
            if word is None:
                # Special token: -100 so it is excluded from the loss and the metrics.
                aligned.append(-100)
            elif word == last_word and not label_all_tokens:
                # Continuation piece of an already-labelled word, and only first pieces are labelled.
                aligned.append(-100)
            else:
                aligned.append(word_labels[word])
            last_word = word
        return aligned

    encoded["labels"] = [_align_document(doc_index, word_labels)
                         for doc_index, word_labels in enumerate(tags)]
    return encoded

In [None]:
##################################################################################
## Batch encode tokens/attention-mask/labels for train val and test datasets
##################################################################################

## The same encoding call is applied to each split, in train -> val -> test order
train_encode, val_encode, test_encode = [
    tokenize_and_align_labels(tokens=split_dat.tok_text.to_list(), tags=split_dat.bio_int.to_list())
    for split_dat in (train_dat, val_dat, test_dat)
]

## Check attributes/shape of train/val/test encoded datasets
# [[len(train_encode['input_ids']), len(val_encode['input_ids']), len(test_encode['input_ids'])],
# [len(train_encode['attention_mask']), len(val_encode['attention_mask']), len(test_encode['attention_mask'])],
# [len(train_encode['labels']), len(val_encode['labels']), len(test_encode['labels'])]]

In [None]:
## Entries of the tokenizer output that are carried into each split's data frame (one row per document)
_encoded_columns = ('input_ids', 'attention_mask', 'labels')

train_df = pd.DataFrame({column: train_encode[column] for column in _encoded_columns})
val_df = pd.DataFrame({column: val_encode[column] for column in _encoded_columns})
test_df = pd.DataFrame({column: test_encode[column] for column in _encoded_columns})

#train_df.head(n=5)
#val_df.head(n=5)
#test_df.head(n=5)

## Wrap each frame as a HuggingFace Dataset (note: based on Apache Arrow dataset)
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

In [None]:
###############################################################
## Instantiate a transformers Token Classification of NER model
##
## Model type should be one of BigBirdConfig, ConvBertConfig, LayoutLMConfig, DistilBertConfig, CamembertConfig, FlaubertConfig, XLMConfig, XLMRobertaConfig, LongformerConfig, RobertaConfig, SqueezeBertConfig, BertConfig, MegatronBertConfig, MobileBertConfig, XLNetConfig, AlbertConfig, ElectraConfig, FunnelConfig, MPNetConfig, DebertaConfig, DebertaV2Config, IBertConfig.
##
## The pretrained encoder is loaded from the local model_path and a classification head with
## num_tags outputs is added. id2label / label2id store the code <-> BIO tag map (label_list
## order) in the model config, so every saved checkpoint reports real tag names instead of
## generic LABEL_<n> placeholders.
###############################################################
model = AutoModelForTokenClassification.from_pretrained(model_path,
                                                        num_labels=num_tags,
                                                        id2label={code: tag for code, tag in enumerate(label_list)},
                                                        label2id={tag: code for code, tag in enumerate(label_list)})

In [None]:
# help(model)

In [None]:
print(model)

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
            

In [None]:
## Count total number of model parameters (sum of element counts over every parameter tensor)
total_params = sum(p.numel() for p in model.parameters())
total_params

354352169

In [None]:
## Count total number of "trainable" model parameters (only tensors with requires_grad set)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
trainable_params

354352169

In [None]:
## Print table of number of parameters for each layer/module
from prettytable import PrettyTable

def count_parameters(model):
    """Print a table of the trainable parameter tensors of `model` and return their total size.

    Each named parameter with requires_grad set contributes one "Modules"/"Parameters" row
    (name, element count); parameters that do not require gradients are left out. The table
    and a total line are printed, and the total element count is returned.
    """
    table = PrettyTable(["Modules", "Parameters"])
    trainable_sizes = [(name, tensor.numel())
                       for name, tensor in model.named_parameters()
                       if tensor.requires_grad]
    for name, size in trainable_sizes:
        table.add_row([name, size])
    n_trainable = sum(size for _, size in trainable_sizes)
    print(table)
    print(f"Total Trainable Params: {n_trainable}")
    return n_trainable
    
count_parameters(model)

+------------------------------------------------------------+------------+
|                          Modules                           | Parameters |
+------------------------------------------------------------+------------+
|         roberta.embeddings.word_embeddings.weight          |  51471360  |
|       roberta.embeddings.position_embeddings.weight        |   526336   |
|      roberta.embeddings.token_type_embeddings.weight       |    1024    |
|            roberta.embeddings.LayerNorm.weight             |    1024    |
|             roberta.embeddings.LayerNorm.bias              |    1024    |
|    roberta.encoder.layer.0.attention.self.query.weight     |  1048576   |
|     roberta.encoder.layer.0.attention.self.query.bias      |    1024    |
|     roberta.encoder.layer.0.attention.self.key.weight      |  1048576   |
|      roberta.encoder.layer.0.attention.self.key.bias       |    1024    |
|    roberta.encoder.layer.0.attention.self.value.weight     |  1048576   |
|     robert

354352169

In [None]:
##############################################################
## Hyper-parameters from NER model
##
## All tunable values come from the configuration cell (learn_rate, batch_size,
## n_train_epochs, wt_decay). The validation set is evaluated once per epoch.
## NOTE: output_dir is a relative directory in the session's working directory (the training
## log shows checkpoints under i2b2_output/checkpoint-*); it is not the Drive `output_path`.
##############################################################
args = TrainingArguments(
    output_dir='i2b2_output',
    evaluation_strategy="epoch",
    learning_rate=learn_rate,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=n_train_epochs,
    weight_decay=wt_decay
)

In [None]:
## Data collator: builds each batch for the Trainer from the encoded examples
## (presumably pads input_ids and labels to a common length within the batch — confirm)
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
## Sequence evaluation metric (from CONLL - used to eval NER, etc. type tasks)
## Its compute() call is fed lists of tag-name sequences (predictions vs. references).
metric = load_metric("seqeval")

Downloading:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

In [None]:
###############################################
## Function to compute evaluation metrics on train/val/test samples
###############################################

def compute_metrics(p):
    """Turn the Trainer's (scores, label ids) pair into overall seqeval scores.

    `p` unpacks into per-token class scores and the reference label ids. The highest-scoring
    class is taken at every position, positions whose reference label is -100 are dropped,
    and the remaining ids are mapped to tag names through the module-level `label_list`
    before being scored with the module-level `metric`.

    Returns a dict with the keys "precision", "recall", "f1" and "accuracy".
    """
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        # Keep only the positions that carry a real label (-100 marks ignored positions)
        kept_pairs = [(pred_id, gold_id)
                      for pred_id, gold_id in zip(predicted_row, gold_row)
                      if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept_pairs])
        true_labels.append([label_list[gold_id] for _, gold_id in kept_pairs])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    overall_keys = {"precision": "overall_precision",
                    "recall": "overall_recall",
                    "f1": "overall_f1",
                    "accuracy": "overall_accuracy"}
    return {short_name: results[long_name] for short_name, long_name in overall_keys.items()}

In [None]:
####################################################################
## Build the Trainer: fine-tunes the NER model on the training set and reports
## compute_metrics on the validation set whenever evaluation runs
####################################################################
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)


In [None]:
##############################################################
## Train the model - print per-epoch training/val metrics to console
## t0/t1 are wall-clock timestamps (seconds) taken just before and after training
##############################################################
t0 = time.time()
trainer.train()
t1 = time.time()

***** Running training *****
  Num examples = 500
  Num Epochs = 5
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 1
  Gradient Accumulation steps = 1
  Total optimization steps = 2500


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.1084,0.033407,0.910498,0.922327,0.916374,0.992366
2,0.0239,0.025169,0.953558,0.948171,0.950857,0.995118
3,0.0139,0.02185,0.957387,0.958198,0.957792,0.995885
4,0.0086,0.020882,0.95739,0.961446,0.959414,0.996058
5,0.006,0.020149,0.965977,0.962293,0.964132,0.996472


Saving model checkpoint to i2b2_output/checkpoint-500
Configuration saved in i2b2_output/checkpoint-500/config.json
Model weights saved in i2b2_output/checkpoint-500/pytorch_model.bin
tokenizer config file saved in i2b2_output/checkpoint-500/tokenizer_config.json
Special tokens file saved in i2b2_output/checkpoint-500/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 237
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to i2b2_output/checkpoint-1000
Configuration saved in i2b2_output/checkpoint-1000/config.json
Model weights saved in i2b2_output/checkpoint-1000/pytorch_model.bin
tokenizer config file saved in i2b2_output/checkpoint-1000/tokenizer_config.json
Special tokens file saved in i2b2_output/checkpoint-1000/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 237
  Batch size = 1
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to i2b2_output/checkpoint-1500
Conf

In [None]:
## Training time: elapsed wall-clock seconds between t0 and t1 (stored only; nothing is printed here)
train_time = t1-t0

In [None]:
#help(trainer)

In [None]:
## Evaluate the trained model on the Trainer's eval_dataset (the validation set) and hold the
## returned metrics as a pandas Series (despite the `_df` suffix).
## NOTE: `trainer_evalaute_metrics` is misspelled; the spelling is left unchanged.
trainer_evalaute_metrics = trainer.evaluate()
trainer_evaluate_metrics_df = pd.Series(trainer_evalaute_metrics)
# A bare expression in the middle of a cell is not displayed (only a cell's last expression is)
trainer_evaluate_metrics_df

## Write these validation metrics to disk (metric name kept as the index column)
trainer_df_fpath = output_path + "trainer_df.csv"

trainer_evaluate_metrics_df.to_csv(path_or_buf=trainer_df_fpath, index=True)

***** Running Evaluation *****
  Num examples = 237
  Batch size = 1


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
#############################################
## Evaluate model on training set - per tag analysis and overall analysis
#############################################
predictions_train, labels_train, _ = trainer.predict(train_dataset)
predictions_train_max = np.argmax(predictions_train, axis=2)

## Map ids to tag names, skipping every position whose reference label is -100 (ignored index)
true_predictions_train = []
true_labels_train = []
for predicted_row, gold_row in zip(predictions_train_max, labels_train):
    kept_pairs = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
    true_predictions_train.append([label_list[pred_id] for pred_id, _ in kept_pairs])
    true_labels_train.append([label_list[gold_id] for _, gold_id in kept_pairs])

## seqeval results on the training set: one row per metric entry returned by the scorer
## (per-tag results - this is like sklearn.metrics.classification_report)
train_metrics = metric.compute(predictions=true_predictions_train, references=true_labels_train)
train_metrics_df = pd.DataFrame(train_metrics).T
train_metrics_df['dataset'] = 'train'
train_metrics_df['phi_type'] = train_metrics_df.index
#train_metrics_df

***** Running Prediction *****
  Num examples = 500
  Batch size = 1


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Train input-ids decoded back to sub-word token strings. Only positions whose label is not
## -100 are kept - the same filter that is applied to the predictions - so tokens,
## predictions and true labels always have matching lengths (also when only the first
## sub-token of each word is labelled).
train_tokens = [
    [token for token, label_id in zip(tokenizer.convert_ids_to_tokens(ids), label_ids) if label_id != -100]
    for ids, label_ids in zip(train_df.input_ids, train_df.labels)
]

## Flatten lists
train_tokens_long = [item for sublist in train_tokens for item in sublist]
true_predictions_train_long = [item for sublist in true_predictions_train for item in sublist]
true_labels_train_long = [item for sublist in true_labels_train for item in sublist]
#[len(train_tokens_long), len(true_predictions_train_long), len(true_labels_train_long)]

## Get docs_ids: repeat each document id once per kept token of that document
## (train_df rows are in the same order as train_dat rows)
len_docs = [len(doc_tokens) for doc_tokens in train_tokens]
doc_ids = train_dat.doc_id.repeat(len_docs)
# len(doc_ids)

## Put flat lists into pandas dataFrame (one row per kept token)
train_preds_out = pd.DataFrame({'doc_id': doc_ids,
                                  'tokens': train_tokens_long,
                                  'pred': true_predictions_train_long,
                                  'true_label': true_labels_train_long,
                                  })

## Denote that these are training data
train_preds_out['dataset'] = 'train'
# train_preds_out.shape

In [None]:
#########################################
## Save the raw array of training-set scores/logits returned by trainer.predict().
## Axes: documents (500 = train_size), token positions, BIO tags (the argmax over axis 2
## picks the predicted tag).
## NOTE(review): the original comment gave 512 positions and 41 tags — confirm against
## predictions_train.shape.
#########################################
fpath_prediction_train_arr = output_path + "predictions_train_arr.npy"

np.save(fpath_prediction_train_arr, predictions_train)

In [None]:
#############################################
## Evaluate model on validation set - per tag analysis and overall analysis
#############################################
predictions_val, labels_val, _ = trainer.predict(val_dataset)
predictions_val_max = np.argmax(predictions_val, axis=2)

## Map ids to tag names, skipping every position whose reference label is -100 (ignored index)
true_predictions_val = []
true_labels_val = []
for predicted_row, gold_row in zip(predictions_val_max, labels_val):
    kept_pairs = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
    true_predictions_val.append([label_list[pred_id] for pred_id, _ in kept_pairs])
    true_labels_val.append([label_list[gold_id] for _, gold_id in kept_pairs])

## seqeval results on the validation set: one row per metric entry returned by the scorer
## (per-tag results - this is like sklearn.metrics.classification_report)
val_metrics = metric.compute(predictions=true_predictions_val, references=true_labels_val)
val_metrics_df = pd.DataFrame(val_metrics).T
val_metrics_df['dataset'] = 'val'
val_metrics_df['phi_type'] = val_metrics_df.index
#val_metrics_df

***** Running Prediction *****
  Num examples = 237
  Batch size = 1


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Validation input-ids decoded back to sub-word token strings. Only positions whose label is
## not -100 are kept - the same filter that is applied to the predictions - so tokens,
## predictions and true labels always have matching lengths (also when only the first
## sub-token of each word is labelled).
val_tokens = [
    [token for token, label_id in zip(tokenizer.convert_ids_to_tokens(ids), label_ids) if label_id != -100]
    for ids, label_ids in zip(val_df.input_ids, val_df.labels)
]

## Flatten lists
val_tokens_long = [item for sublist in val_tokens for item in sublist]
true_predictions_val_long = [item for sublist in true_predictions_val for item in sublist]
true_labels_val_long = [item for sublist in true_labels_val for item in sublist]
#[len(val_tokens_long), len(true_predictions_val_long), len(true_labels_val_long)]

## Get docs_ids: repeat each document id once per kept token of that document
## (val_df rows are in the same order as val_dat rows)
len_docs = [len(doc_tokens) for doc_tokens in val_tokens]
doc_ids = val_dat.doc_id.repeat(len_docs)
# len(doc_ids)

## Put flat lists into pandas dataFrame (one row per kept token)
val_preds_out = pd.DataFrame({'doc_id': doc_ids,
                                  'tokens': val_tokens_long,
                                  'pred': true_predictions_val_long,
                                  'true_label': true_labels_val_long,
                                  })

## Denote that these are validation data
val_preds_out['dataset'] = 'val'
## val_preds_out.shape

In [None]:
#########################################
## Save the raw array of validation-set scores/logits returned by trainer.predict().
## Axes: documents (237 in the recorded run), token positions, BIO tags (the argmax over
## axis 2 picks the predicted tag).
## NOTE(review): confirm the exact dimensions against predictions_val.shape.
#########################################
fpath_prediction_val_arr = output_path + "predictions_val_arr.npy"

np.save(fpath_prediction_val_arr, predictions_val)

In [None]:
#############################################
## Evaluate model on test set - per tag analysis and overall analysis
#############################################
predictions_test, labels_test, _ = trainer.predict(test_dataset)
predictions_test_max = np.argmax(predictions_test, axis=2)

## Map ids to tag names, skipping every position whose reference label is -100 (ignored index)
true_predictions_test = []
true_labels_test = []
for predicted_row, gold_row in zip(predictions_test_max, labels_test):
    kept_pairs = [(pred_id, gold_id) for pred_id, gold_id in zip(predicted_row, gold_row) if gold_id != -100]
    true_predictions_test.append([label_list[pred_id] for pred_id, _ in kept_pairs])
    true_labels_test.append([label_list[gold_id] for _, gold_id in kept_pairs])

## seqeval results on the test set: one row per metric entry returned by the scorer
## (per-tag results - this is like sklearn.metrics.classification_report)
test_metrics = metric.compute(predictions=true_predictions_test, references=true_labels_test)
test_metrics_df = pd.DataFrame(test_metrics).T
test_metrics_df['dataset'] = 'test'
test_metrics_df['phi_type'] = test_metrics_df.index
#test_metrics_df

***** Running Prediction *****
  Num examples = 486
  Batch size = 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
## Test input-ids decoded back to sub-word token strings. Only positions whose label is not
## -100 are kept - the same filter that is applied to the predictions - so tokens,
## predictions and true labels always have matching lengths (also when only the first
## sub-token of each word is labelled).
test_tokens = [
    [token for token, label_id in zip(tokenizer.convert_ids_to_tokens(ids), label_ids) if label_id != -100]
    for ids, label_ids in zip(test_df.input_ids, test_df.labels)
]

## Flatten lists
test_tokens_long = [item for sublist in test_tokens for item in sublist]
true_predictions_test_long = [item for sublist in true_predictions_test for item in sublist]
true_labels_test_long = [item for sublist in true_labels_test for item in sublist]
#[len(test_tokens_long), len(true_predictions_test_long), len(true_labels_test_long)]

## Get docs_ids: repeat each document id once per kept token of that document
## (test_df rows are in the same order as test_dat rows)
len_docs = [len(doc_tokens) for doc_tokens in test_tokens]
doc_ids = test_dat.doc_id.repeat(len_docs)
# len(doc_ids)

## Put flat lists into pandas dataFrame (one row per kept token)
test_preds_out = pd.DataFrame({'doc_id': doc_ids,
                                  'tokens': test_tokens_long,
                                  'pred': true_predictions_test_long,
                                  'true_label': true_labels_test_long,
                                  })

## Denote that these are test data
test_preds_out['dataset'] = 'test'
# test_preds_out.shape

In [None]:
#########################################
## Save the raw array of test-set scores/logits returned by trainer.predict().
## Axes: documents (486 in the recorded run), token positions, BIO tags (the argmax over
## axis 2 picks the predicted tag).
## NOTE(review): confirm the exact dimensions against predictions_test.shape.
#########################################
fpath_prediction_test_arr = output_path + "predictions_test_arr.npy"

np.save(fpath_prediction_test_arr, predictions_test)

In [None]:
##############################################################################
## Combine train/val/test evaluation metrics dataframe and export to disk
##############################################################################
## Stack the three per-split tables and stamp every row with the run's hyper-parameters
metrics_df = pd.concat([train_metrics_df, val_metrics_df, test_metrics_df]).assign(
    model_name=model_checkpoint,
    num_epochs=n_train_epochs,
    learning_rate=learn_rate,
    weight_decay=wt_decay,
)

## Write train/val/test metrics dataFrame to disk (the index is not written; phi_type carries it)
metrics_df_fpath = output_path + "overall_train_val_test_metrics_df.csv"
metrics_df.to_csv(path_or_buf=metrics_df_fpath, index=False)

In [None]:
metrics_df

Unnamed: 0,precision,recall,f1,number,dataset,phi_type,model_name,num_epochs,learning_rate,weight_decay
AGE,1.0,0.981818,0.990826,55.0,train,AGE,roberta-large,5,2e-05,0.01
CITY,0.996743,0.993506,0.995122,308.0,train,CITY,roberta-large,5,2e-05,0.01
COUNTRY,1.0,0.942857,0.970588,35.0,train,COUNTRY,roberta-large,5,2e-05,0.01
DATE,0.998712,0.997428,0.998069,8552.0,train,DATE,roberta-large,5,2e-05,0.01
DOCTOR,0.954726,0.951495,0.953107,1773.0,train,DOCTOR,roberta-large,5,2e-05,0.01
EMAIL,1.0,0.888889,0.941176,9.0,train,EMAIL,roberta-large,5,2e-05,0.01
FAX,1.0,1.0,1.0,28.0,train,FAX,roberta-large,5,2e-05,0.01
HEALTHPLAN,0.0,0.0,0.0,1.0,train,HEALTHPLAN,roberta-large,5,2e-05,0.01
HOSPITAL,0.99771,0.996949,0.997329,1311.0,train,HOSPITAL,roberta-large,5,2e-05,0.01
IDNUM,1.0,1.0,1.0,167.0,train,IDNUM,roberta-large,5,2e-05,0.01


In [None]:
################################################################################
## Combine train/val/test predictions dataframe and export to disk
################################################################################
## Stack the three per-split token tables and stamp every row with the run's hyper-parameters
preds_out_df = pd.concat([train_preds_out, val_preds_out, test_preds_out]).assign(
    model_name=model_checkpoint,
    num_epochs=n_train_epochs,
    learning_rate=learn_rate,
    weight_decay=wt_decay,
)

## Write train/val/test predictions dataFrame to disk
preds_df_fpath = output_path + "overall_train_val_test_preds_df.csv"
preds_out_df.to_csv(path_or_buf=preds_df_fpath, index=False)


In [None]:
############################
## Save final model and tokenizer to disk
## Both are written to the same directory (output_path + checkpoint name), so the two path
## variables below hold identical values.
############################
model_out_path = output_path + model_checkpoint
tokenizer_out_path = output_path + model_checkpoint 

model.save_pretrained(model_out_path)
tokenizer.save_pretrained(tokenizer_out_path)

Configuration saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/config.json
Model weights saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/pytorch_model.bin
tokenizer config file saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/tokenizer_config.json
Special tokens file saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/special_tokens_map.json


('gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/tokenizer_config.json',
 'gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/special_tokens_map.json',
 'gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/vocab.json',
 'gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/merges.txt',
 'gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/added_tokens.json',
 'gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/roberta-large/tokenizer.json')

In [None]:
####################################
## Save the final trainer file to disk
## Per the save log this writes the model config/weights and the tokenizer files into a
## separate 'trainer_model' directory under output_path.
####################################
trainer_out_path = output_path + 'trainer_model'

trainer.save_model(trainer_out_path)

Saving model checkpoint to gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/trainer_model
Configuration saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/trainer_model/config.json
Model weights saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/trainer_model/pytorch_model.bin
tokenizer config file saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/trainer_model/tokenizer_config.json
Special tokens file saved in gdrive/My Drive/Colab Notebooks/transformer_model_output_dir/model=roberta-large_numepochs=5_learnrate=2e-05_wtdecay=0.01/trainer_model/special_tokens_map.json


In [None]:
## See what bio_tags map to what integers (bio_int)
#bio_ct = pd.DataFrame(pd.crosstab(dat.bio,dat.bio_int)).to_dict()
#bio_ct

In [None]:
###########################################
## Encode a random string and apply model.predict() method to see if it captures PHI needed to be DEID
###########################################
my_string = "Date: June 2020: Patient - christopher meaney - a biostatistician at UT presented to Dr. J. Smith with back pain from sedentary lifestyle and RSI."

## Tokenize string (plain whitespace split, so punctuation stays attached to its word)
my_tokens = my_string.split(' ')

## Hand-assigned gold tags for each whitespace token: string form and integer form
## NOTE(review): the integer codes are assumed to be the positions of the matching tags in label_list -- confirm against the label encoding used for training
my_tags = ['O','B-DATE','I-DATE','O','O',"B-PATIENT","I-PATIENT",'O','O','B-PROFESSION','O','B-ORGANIZATION','O','O','O','B-DOCTOR',"I-DOCTOR",'O','O','O','O','O','O','O','O']
my_tags_int = [0,1,12,0,0,11,10,0,0,6,0,5,0,0,0,2,4,0,0,0,0,0,0,0,0]

## Check that token/tag length are the same (fail fast instead of silently mis-aligning tokens and tags)
assert len(my_tokens) == len(my_tags) == len(my_tags_int), \
    "my_tokens, my_tags and my_tags_int must all have the same length"

## Single-row frame: each cell holds the full list, i.e. a batch containing one document
my_string_df = pd.DataFrame({'tokens': [my_tokens],
                             'bio': [my_tags],
                             'bio_int': [my_tags_int]})

## Long frame: one row per token; convenient for eyeballing the token/tag alignment
my_string_df_long = pd.DataFrame({'tokens': my_tokens,
                                  'bio': my_tags,
                                  'bio_int': my_tags_int})


## Pass the one-document batch (list of token lists / list of integer-tag lists) to the tokenizer helper
my_string_encode = tokenize_and_align_labels(tokens=my_string_df.tokens.to_list(), tags=my_string_df.bio_int.to_list())

## Convert tokenized input into pandas dataframe
my_string_encoded_df = pd.DataFrame({'input_ids': my_string_encode['input_ids'],
                                     'attention_mask': my_string_encode['attention_mask'],
                                     'labels': my_string_encode['labels']})


## Convert pandas dataFrame into HuggingFace Dataset (an Apache Arrow dataset)
my_string_dataset = Dataset.from_pandas(my_string_encoded_df)

## Feed formatted string (in Dataset structure); to fine-tuned Transformer model (and obtain predictions)
## Prediction is run ONCE; the argmax over the last axis picks the highest-scoring class id per sub-word token.
predictions, labels, _ = trainer.predict(my_string_dataset)
predictions_max = np.argmax(predictions, axis=2)
## Both names are kept so that anything referring to either one keeps working
predictions = predictions_max

# Remove ignored index (positions whose label is -100, e.g. special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

## Keep exactly the sub-word tokens whose label is not -100, using the same mask as above,
## so the three columns below always have matching lengths
true_tokens = [tok for (tok, l) in zip(my_string_encode.tokens(), labels[0]) if l != -100]

## Put into dataframe (one row per retained sub-word token of the single example)
pd.DataFrame({'tokens': true_tokens,
              'true_labels': true_labels[0],
              'true_preds': true_predictions[0]})

***** Running Prediction *****
  Num examples = 1
  Batch size = 1


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
***** Running Prediction *****
  Num examples = 1
  Batch size = 1


Unnamed: 0,tokens,true_labels,true_preds
0,ĠDate,O,O
1,:,O,O
2,ĠJune,B-DATE,B-DATE
3,Ġ2020,I-DATE,B-DATE
4,:,I-DATE,O
5,ĠPatient,O,O
6,Ġ-,O,O
7,Ġchrist,B-PATIENT,B-PATIENT
8,opher,B-PATIENT,B-PATIENT
9,Ġme,I-PATIENT,I-PATIENT


In [None]:
#############################
## Print system info
#############################
## Prints the versions of the packages used in this session, followed by the
## IPython/Jupyter, Python and OS details, so the run can be reproduced.
## `sinfo` is installed and imported in the dependency cell at the top of the
## notebook; un-comment the two lines below only to run this cell on its own.
#!pip install sinfo
#from sinfo import sinfo
sinfo()

The `sinfo` package has changed name and is now called `session_info` to become more discoverable and self-explanatory. The `sinfo` PyPI package will be kept around to avoid breaking old installs and you can downgrade to 0.3.2 if you want to use it without seeing this message. For the latest features and bug fixes, please install `session_info` instead. The usage and defaults also changed slightly, so please review the latest README at https://gitlab.com/joelostblom/session_info.
-----
datasets            1.18.3
datasets_modules    NA
google              NA
numpy               1.21.5
pandas              1.3.5
prettytable         3.1.1
sinfo               0.3.4
sklearn             1.0.2
torch               1.10.0+cu111
transformers        4.16.2
-----
IPython             5.5.0
jupyter_client      5.3.5
jupyter_core        4.9.2
notebook            5.3.1
-----
Python 3.7.12 (default, Jan 15 2022, 18:48:18) [GCC 7.5.0]
Linux-5.4.144+-x86_64-with-Ubuntu-18.04-bionic
2 logical CPU cores, x8