In [1]:
import pandas as pd
import numpy as np
from transformers import TrainingArguments, Trainer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
import evaluate

In [3]:
from pynvml import *


def print_gpu_utilization():
    nvmlInit()
    handle = nvmlDeviceGetHandleByIndex(0)
    info = nvmlDeviceGetMemoryInfo(handle)
    print(f"GPU memory occupied: {info.used//1024**2} MB.")


def print_summary(result):
    print(f"Time: {result.metrics['train_runtime']:.2f}")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}")
    print_gpu_utilization()

In [4]:
file_path_1 = './i2b2_parsing/parsed_gold_set_1.jsonl'
test_path = './i2b2_parsing/parsed_test.jsonl'
data = pd.read_json(file_path_1, lines=True)
test_data = pd.read_json(test_path, lines=True)

In [5]:
# data

In [6]:
texts = data['text'].tolist() # type: ignore
test_texts = test_data['text'].tolist() # type: ignore
texts[0]

'\n\n\nRecord date: 2080-11-30\n\n\n\nReason for Visit\n\nOwen is a 63 y/o male here for evaluation of treatment. Doin relatively well. \n\n\n\nProblems\n\n\n\n      OA\n\n\n\n      LLE-PARTIALLY SEVERED-MULT. SURGERIES\n\n\n\n      IRRIDECTOMY\n\n\n\n      SKIN ULCER-DR Esposito\n\n\n\n      PAST SMOKER \n\n\n\n      HTN\n\n\n\n\n\nMedications\n\n\n\n      ASA       PO \n\n\n\n      Vitamin E        PO QD : 400 IU \n\n\n\n      ATENOLOL   25 MG PO QD\n\n\n\n      Lipitor (ATORVASTATIN)    10MG,  1 Tablet(s)  PO QD\n\n\n\n\n\nAllergies\n\n\n\n      NKDA    - NONE\n\n\n\n\n\nNarrative History\n\nTakes meds. No SEs. Denies vision change, headache, chest pain, SOB, light head, palptations. Denies loss of balance, strength or sensation. \n\n Pulm- no cough.  Occ wheeze. No SOB.\n\nGI- no nausea, vomitting, dyspepsia, reflux, abdo pain, diarrhea, constipation, melena, BRBPR.\n\nGU- asymp\n\nLocomotor- pain left knee/leg. \n\nSees Dr Esposito for chr ulcer. Has surgical boot on now. \n\nExer

In [7]:
spans = data['spans'].tolist() # type: ignore
test_spans = test_data['spans'].tolist() # type: ignore
spans[0]

[{'id': 'P0',
  'start': '16',
  'end': '26',
  'text': '2080-11-30',
  'TYPE': 'DATE',
  'comment': '',
  'label': 'DATE'},
 {'id': 'P1',
  'start': '48',
  'end': '52',
  'text': 'Owen',
  'TYPE': 'PATIENT',
  'comment': '',
  'label': 'PATIENT'},
 {'id': 'P2',
  'start': '58',
  'end': '60',
  'text': '63',
  'TYPE': 'AGE',
  'comment': '',
  'label': 'AGE'},
 {'id': 'P3',
  'start': '242',
  'end': '250',
  'text': 'Esposito',
  'TYPE': 'DOCTOR',
  'comment': '',
  'label': 'STAFF'},
 {'id': 'P4',
  'start': '854',
  'end': '862',
  'text': 'Esposito',
  'TYPE': 'DOCTOR',
  'comment': '',
  'label': 'STAFF'},
 {'id': 'P5',
  'start': '1664',
  'end': '1683',
  'text': 'William Seth Potter',
  'TYPE': 'DOCTOR',
  'comment': '',
  'label': 'STAFF'}]

In [8]:
model_nm = 'microsoft/deberta-v3-small'

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, convert_slow_tokenizer
tokz = AutoTokenizer.from_pretrained(model_nm, use_fast=True)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [10]:
def tok_function(x): return tokz(x['text'])

In [11]:
from datasets import Dataset
ds = Dataset.from_pandas(data)
test_ds = Dataset.from_pandas(test_data)

In [12]:
tok_ds = ds.map(tok_function, batched=True)
test_tok_ds = test_ds.map(tok_function, batched=True)
tok_ds

                                                              

Dataset({
    features: ['text', 'spans', 'meta', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 790
})

In [13]:
# tok_ds['text'][0]
# tok_ds['input_ids'][0]


In [14]:
# data['spans']

In [15]:
# input_ids = tok_ds['input_ids'][0]
# tokens = tokz.convert_ids_to_tokens(input_ids)
# token_positions = tokz(tok_ds['text'][0], return_offsets_mapping=True)['offset_mapping']

# for token, input_id, token_pos in zip(tokens, input_ids, token_positions):
#     print(f'{token}: {input_id}: {token_pos}')

In [16]:
label_map = {
    "Padding": -100,
    "O": 0,
    "B-DATE": 1,
    "I-DATE": 2,
    "B-PATIENT": 3,
    "I-PATIENT": 4,
    "B-AGE": 5,
    "I-AGE": 6,
    "B-STAFF": 7,
    "I-STAFF": 8,
    "B-PHONE": 9,
    "I-PHONE": 10,
    "B-EMAIL": 11,
    "I-EMAIL": 12,
    "B-ID": 13,
    "I-ID": 14,
    "B-HOSP": 15,
    "I-HOSP": 16,
    "B-PATORG": 17,
    "I-PATORG": 18,
    "B-LOC": 19,
    "I-LOC": 20,
    "B-OTHERPHI": 21,
    "I-OTHERPHI": 22,
}

In [17]:
def pre_process_data(text, spans, input_ids):
    tokens = tokz.convert_ids_to_tokens(input_ids)
    labels = ['O'] * len(input_ids)
    token_positions = tokz(text, return_offsets_mapping=True)['offset_mapping']


    for span in spans:
        start, end, label = span['start'], span['end'], span['label']

        token_start, token_end = None, None

        for idx, (char_start, char_end) in enumerate(token_positions):
            if tokens[idx].startswith('▁'):
                char_start += 1
            # print(tokens[idx], char_start, char_end, start, end)
            if char_start == int(start):
                token_start = idx
            if char_end == int(end):
                token_end = idx
                break
        
        if token_start is not None and token_end is not None:
            # print(token_start, token_end, label)
            labels[token_start] = f'B-{label}'
            for idx in range(token_start + 1, token_end + 1):
                labels[idx] = f'I-{label}'

    # input_ids = tokz.convert_tokens_to_ids(tokens)
    label_ids = [label_map[label] for label in labels]

    return label_ids

In [18]:
tok_ds_processed = tok_ds.add_column('labels', [pre_process_data(text, spans, input_ids) for text, spans, input_ids in zip(tok_ds['text'], tok_ds['spans'], tok_ds['input_ids'])])

In [19]:
def find_max_len(x):
    return max([len(input_ids) for input_ids in x['input_ids']])

max_len = find_max_len(tok_ds_processed)
max_len

4995

In [20]:
def pad_data(sequences, max_length, padding_value):
    padded_sequences = []
    for seq in sequences:
        if len(seq) > max_length:
            padded_seq = seq[:max_length]
        else:
            padded_seq = seq + [padding_value] * (max_length - len(seq))
        padded_sequences.append(padded_seq)
    return padded_sequences


In [21]:
input_ids = pad_data(tok_ds_processed['input_ids'], 1024, 0)
labels = pad_data(tok_ds_processed['labels'], 1024, -100)
attention_mask = pad_data(tok_ds_processed['attention_mask'], 1024, 0)

In [22]:
tok_ds_processed = tok_ds_processed.remove_columns(['input_ids', 'labels', 'attention_mask', 'text', 'spans', 'meta'])
tok_ds_processed = tok_ds_processed.add_column('input_ids', input_ids)
tok_ds_processed = tok_ds_processed.add_column('labels', labels)
tok_ds_processed = tok_ds_processed.add_column('attention_mask', attention_mask)

In [23]:
tok_ds_processed = tok_ds_processed.remove_columns(['token_type_ids'])


In [48]:
test_tok_ds_processed = test_tok_ds.add_column('labels', [pre_process_data(text, spans, input_ids) for text, spans, input_ids in zip(test_tok_ds['text'], test_tok_ds['spans'], test_tok_ds['input_ids'])])

In [49]:
test_input_ids = pad_data(test_tok_ds_processed['input_ids'], 1024, 0)
test_labels = pad_data(test_tok_ds_processed['labels'], 1024, -100)
test_attention_mask = pad_data(test_tok_ds_processed['attention_mask'], 1024, 0)

In [53]:
test_tok_ds_processed = test_tok_ds_processed.remove_columns(['input_ids', 'labels', 'attention_mask', 'text', 'spans', 'meta'])
test_tok_ds_processed = test_tok_ds_processed.add_column('input_ids', test_input_ids)
test_tok_ds_processed = test_tok_ds_processed.add_column('labels', test_labels)
test_tok_ds_processed = test_tok_ds_processed.add_column('attention_mask', test_attention_mask)
test_tok_ds_processed = test_tok_ds_processed.remove_columns(['token_type_ids'])

In [25]:
# # Print out mapping of the tokens to the labels
# tokens = tokz.convert_ids_to_tokens(tok_ds_processed['input_ids'][1])
# labels = [list(label_map.keys())[list(label_map.values()).index(label_id)] for label_id in tok_ds_processed['labels'][1]]

# for token, label in zip(tokens, labels):
#     print(f'{token} - {label}')

In [26]:
## Create test and validation sets
## test_tok_ds_processed is the test set
tok_dds = tok_ds_processed.train_test_split(test_size=0.25, seed=42)
tok_dds

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 592
    })
    test: Dataset({
        features: ['input_ids', 'labels', 'attention_mask'],
        num_rows: 198
    })
})

In [27]:
print_gpu_utilization()

GPU memory occupied: 258 MB.


### Training the model

In [28]:
bs = 1
epochs = 4

In [29]:
lr = 8e-5

In [30]:
args = TrainingArguments('outputs', learning_rate=lr, warmup_ratio=0.1, lr_scheduler_type='cosine',
    evaluation_strategy="epoch", per_device_train_batch_size=bs, per_device_eval_batch_size=bs*2,
    num_train_epochs=epochs, weight_decay=0.01, report_to='none', gradient_accumulation_steps=4, optim="adafactor", gradient_checkpointing=True)

In [31]:
from transformers import AutoModelForTokenClassification, AutoConfig


In [32]:
config = AutoConfig.from_pretrained(model_nm, num_labels=len(label_map))


In [33]:
metric = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return metric.compute(predictions=predictions, references=labels)

In [34]:
model = AutoModelForTokenClassification.from_pretrained(model_nm, config=config)

Some weights of the model checkpoint at microsoft/deberta-v3-small were not used when initializing DebertaV2ForTokenClassification: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.dense.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.bias', 'mask_predictions.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.bias']
- This IS expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a B

In [35]:
print_gpu_utilization()

GPU memory occupied: 261 MB.


In [36]:
trainer = Trainer(model, args, train_dataset=tok_dds['train'], eval_dataset=tok_dds['test'], tokenizer=tokz)

In [37]:
# import torch
# torch.backends.cudnn.benchmark=True
# import os
# os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:16 "

In [38]:
result = trainer.train()

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,0.015754
2,No log,0.009772
3,No log,0.007673
4,0.098700,0.007552


In [39]:
print_summary(result)

Time: 440.05
Samples/second: 5.38
GPU memory occupied: 3484 MB.


In [40]:
trainer.save_model('/storage/models/phi/')

In [55]:
prediction_result = trainer.predict(test_tok_ds_processed)

In [63]:
# # Print out mapping of the tokens to the labels
tokens = tokz.convert_ids_to_tokens(test_tok_ds_processed['input_ids'][0])
labels = [list(label_map.keys())[list(label_map.values()).index(label_id)] for label_id in prediction_result.label_ids[0]]

for token, label in zip(tokens, labels):
    print(f'{token} - {label}')

[CLS] - O
▁Record - O
▁date - O
: - O
▁20 - B-DATE
90 - I-DATE
- - I-DATE
07 - I-DATE
- - I-DATE
16 - I-DATE
▁NAME - O
: - O
▁Curtis - B-PATIENT
, - I-PATIENT
▁Om - I-PATIENT
▁M - O
RN - O
: - O
▁768 - B-ID
294 - I-ID
1 - I-ID
▁He - O
▁is - O
▁feeling - O
▁great - O
. - O
▁He - O
▁is - O
▁all - O
▁done - O
▁with - O
▁his - O
▁radiation - O
▁to - O
▁the - O
▁left - O
▁a - O
x - O
illa - O
▁for - O
▁metastatic - O
▁squamous - O
▁cell - O
▁cancer - O
. - O
▁He - O
▁is - O
▁following - O
▁closely - O
▁with - O
▁the - O
▁radiation - O
▁oncologist - O
▁and - O
▁the - O
▁medical - O
▁oncologist - O
. - O
▁He - O
▁is - O
▁seeing - O
▁them - O
▁both - O
▁later - O
▁this - O
▁month - O
. - O
▁He - O
▁has - O
▁had - O
▁no - O
▁problems - O
▁with - O
▁chest - O
▁pains - O
▁or - O
▁shortness - O
▁of - O
▁breath - O
. - O
▁All - O
▁in - O
▁all - O
, - O
▁things - O
▁are - O
▁going - O
▁well - O
. - O
▁PHYSICAL - O
▁EXAM - O
: - O
▁On - O
▁exam - O
, - O
▁no - O
▁acute - O
▁distress - O
. - O
▁Lung -