## NBME Competition Notebook

### Notebook Features
- HuggingFace API, PyTorch
- Sequence Classification
- Binary Token Classification for character multi-span

## 0. Load Dependencies
Load `datasets` library offline. The method refer to:
- https://www.kaggle.com/code/samuelepino/pip-downloading-packages-to-your-local-machine/notebook?scriptVersionId=29576961

In [1]:
!ls ../input/nbme-pre-trained-models/datasets

PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl
aiohttp-3.8.1-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl
aiosignal-1.2.0-py3-none-any.whl
async_timeout-4.0.2-py3-none-any.whl
asynctest-0.13.0-py3-none-any.whl
attrs-21.4.0-py2.py3-none-any.whl
certifi-2021.10.8-py2.py3-none-any.whl
charset_normalizer-2.0.12-py3-none-any.whl
datasets-2.1.0-py3-none-any.whl
dill-0.3.4-py2.py3-none-any.whl
filelock-3.6.0-py3-none-any.whl
frozenlist-1.3.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl
fsspec-2022.3.0-py3-none-any.whl
huggingface_hub-0.5.1-py3-none-any.whl
idna-3.3-py3-none-any.whl
importlib_metadata-4.11.3-py3-none-any.whl
multidict-6.0.2-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
multiprocess-0.70.12.2-py37-none-any.whl
numpy-1.21.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
packa

In [2]:
!pip install datasets --no-index --find-links=file:///kaggle/input/nbme-pre-trained-models/datasets

Looking in links: file:///kaggle/input/nbme-pre-trained-models/datasets
Processing /kaggle/input/nbme-pre-trained-models/datasets/datasets-2.1.0-py3-none-any.whl
Processing /kaggle/input/nbme-pre-trained-models/datasets/responses-0.18.0-py3-none-any.whl
Processing /kaggle/input/nbme-pre-trained-models/datasets/xxhash-3.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl
Installing collected packages: xxhash, responses, datasets
Successfully installed datasets-2.1.0 responses-0.18.0 xxhash-3.0.0
[0m

In [3]:
import re
import numpy as np 
import pandas as pd
import tensorflow as tf

import os
import torch
from torch import nn

from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoModelForTokenClassification
from transformers import DataCollatorWithPadding, DataCollatorForTokenClassification
from transformers import TrainingArguments, Trainer

path = '../input/nbme-score-clinical-patient-notes'

train = pd.read_csv(path + '/train.csv')
features = pd.read_csv(path + '/features.csv')
pns = pd.read_csv(path + '/patient_notes.csv')

test = pd.read_csv(path + '/test.csv')
submission = pd.read_csv(path + '/sample_submission.csv')

print(len(train), train.columns)
print(len(features), features.columns)
print(len(pns), pns.columns)

print(len(test), test.columns)
print(len(submission), submission.columns)

print()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print("Current Device: ", device)
if torch.cuda.is_available():
    print("Number of CUDA device: ", torch.cuda.device_count())
    print("Device name: ", torch.cuda.get_device_name(0))

14300 Index(['id', 'case_num', 'pn_num', 'feature_num', 'annotation', 'location'], dtype='object')
143 Index(['feature_num', 'case_num', 'feature_text'], dtype='object')
42146 Index(['pn_num', 'case_num', 'pn_history'], dtype='object')
5 Index(['id', 'case_num', 'pn_num', 'feature_num'], dtype='object')
5 Index(['id', 'location'], dtype='object')

Current Device:  cuda:0
Number of CUDA device:  1
Device name:  Tesla P100-PCIE-16GB


In [4]:
len_feature_text = features["feature_text"].apply(lambda x: len(x.strip()))
print(f"Character Length of feature text: MAX - {max(len_feature_text)}, MIN - {min(len_feature_text)}")

len_pn_history = pns["pn_history"].apply(lambda x: len(x))
print(f"Character Length of patient note: MAX - {max(len_pn_history)}, MIN - {min(len_pn_history)}")

max_len_annotation = train["annotation"].apply(lambda x: max(list(map(len, x.split(';')))))
min_len_annotation = train["annotation"].apply(lambda x: min(list(map(len, x.split(';')))))
print(f"Character Length of annotation: MAX - {max(max_len_annotation)}, MIN - {min(min_len_annotation)}")

Character Length of feature text: MAX - 68, MIN - 3
Character Length of patient note: MAX - 950, MIN - 30
Character Length of annotation: MAX - 298, MIN - 2


## 1. Prepare Datasets

In [5]:
tokenizer = AutoTokenizer.from_pretrained("../input/nbme-pre-trained-models/tokenizer")

def tokenize(text):
    """Tokenize a sequence."""
    return tokenizer.tokenize(text, add_special_tokens=False)

example_text = pns.iloc[0].at['pn_history']
print("Example Tokens: \n", tokenize(example_text))

Example Tokens: 
 ['17', '-', 'year', '-', 'old', 'male', ',', 'has', 'come', 'to', 'the', 'student', 'health', 'clinic', 'complaining', 'of', 'heart', 'pounding', '.', 'mr', '.', 'cleveland', "'", 's', 'mother', 'has', 'given', 'verbal', 'consent', 'for', 'a', 'history', ',', 'physical', 'examination', ',', 'and', 'treatment', '-', 'began', '2', '-', '3', 'months', 'ago', ',', 'sudden', ',', 'intermittent', 'for', '2', 'days', '(', 'lasting', '3', '-', '4', 'min', ')', ',', 'worse', '##ning', ',', 'non', '-', 'all', '##ev', '/', 'ag', '##gra', '##v', '-', 'associated', 'with', 'di', '##sp', '##nea', 'on', 'ex', '##ers', '##ion', 'and', 'rest', ',', 'stressed', 'out', 'about', 'school', '-', 'reports', 'fe', 'feels', 'like', 'his', 'heart', 'is', 'jumping', 'out', 'of', 'his', 'chest', '-', 'ro', '##s', ':', 'denies', 'chest', 'pain', ',', 'd', '##ya', '##ph', '##ores', '##is', ',', 'w', '##t', 'loss', ',', 'chill', '##s', ',', 'fever', ',', 'nausea', ',', 'vomiting', ',', 'pedal', 'ed

### 1-1. Prepare sequence training dataset.

In [6]:
seq_train = pns[['pn_history', 'case_num']]
seq_train.rename(columns = {'pn_history':'sequence', 'case_num':'labels'}, inplace = True)
seq_train['sequence'] = seq_train['sequence'].apply(lambda text: tokenize(text))

token_len = seq_train['sequence'].apply(lambda x: len(x))
print(f'Token length: MAX - {max(token_len)}, MIN - {min(token_len)}')

num_labels = pns["case_num"].nunique()
print(f"Number of Cases: {num_labels}")

seq_train_shuffled = seq_train.sample(frac=1).reset_index(drop=True)
seq_valid = seq_train_shuffled.loc[: len(seq_train) * 0.01]
seq_train = seq_train_shuffled.loc[len(seq_train) * 0.01:]

print(f'Length of Train data: {len(seq_train)}')
print(f'Length of Validation data: {len(seq_valid)}')

display(seq_train)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


Token length: MAX - 312, MIN - 11
Number of Cases: 10
Length of Train data: 41724
Length of Validation data: 422


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,sequence,labels
422,"[mr, ., h, ##nas, ##on, is, a, 35, year, old, ...",3
423,"[pt, is, a, 26, yo, f, with, a, cc, of, pal, #...",5
424,"[26, -, year, -, old, female, ,, has, come, to...",5
425,"[periods, have, been, unpredictable, over, las...",2
426,"[ed, ##ie, w, ##hel, ##an, is, a, 26, yo, f, w...",5
...,...,...
42141,"[patient, is, a, 44, year, old, woman, who, is...",2
42142,"[hp, ##i, :, 20, yo, f, c, /, o, headache, fro...",9
42143,"[20, yo, f, c, /, o, of, ha, since, yesterday,...",9
42144,"[17, y, /, o, m, complain, ##s, of, sudden, on...",6


### 1-2. Prepare token classification training dataset.
First, define functions to encode from character indices to token labels and decode the other way round.

In [7]:
def idx2token_label(text, tokens, location):
    """Converts character indices('location') to token labels."""
    
    token_len = len(tokens)
    token_label = [0] * (token_len)
    
    pat = re.compile('\d+ \d+')
    indices = pat.findall(location)
    if not indices:
        return token_label
    indices = list(map(lambda s: s.split(), indices))
    indices.append(['0', '0'])

    s, e = list(map(int, indices.pop(0)))
    char_idx, token_idx = 0, 0
    text_len = len(text)
    while char_idx < text_len and token_idx < token_len and indices:
        if s <= char_idx < e:
            while token_idx < token_len and s <= char_idx < e:
#                 print(tokens[token_idx], char_idx)
                token_label[token_idx] = 1
                char_idx += len(re.sub('#', '', tokens[token_idx], flags=re.MULTILINE))
                while char_idx < text_len and text[char_idx] in " \t\n\r\f\v":
                    char_idx += 1
                token_idx += 1
            s, e = list(map(int, indices.pop(0)))
        else:
            char_idx += len(re.sub('#', '', tokens[token_idx], flags=re.MULTILINE))
            while char_idx < text_len and text[char_idx] in " \t\n\r\f\v":
                char_idx += 1
            token_idx += 1
    return token_label


def token_label2idx(text, tokens, token_label):
    """Converts token labels back to character indices."""
    
    char_indices = []

    token_len = len(tokens)
    text_len = len(text)
    char_idx, token_idx = 0, 0
    
    while char_idx < text_len and token_idx < token_len:
        if token_label[token_idx] == 1:
            s = char_idx
            while token_idx < token_len and token_label[token_idx] == 1:
                flag = False
                char_idx += len(re.sub('#', '', tokens[token_idx], flags=re.MULTILINE))
                while char_idx < text_len and text[char_idx] in " \t\n\r\f\v":
                    char_idx += 1
                    flag = True
                token_idx += 1
            e = char_idx - 1 if flag else char_idx
            char_indices.append(' '.join((str(s), str(e))))
        else:
            char_idx += len(re.sub('#', '', tokens[token_idx], flags=re.MULTILINE))
            while char_idx < text_len and text[char_idx] in " \t\n\r\f\v":
                char_idx += 1
            token_idx += 1
    
    return ';'.join(char_indices)


tmp = train.merge(pns, on='pn_num')
for idx in range(50, 70):
    print(tmp.iloc[idx].at['annotation'])
    example_text = tmp.iloc[idx].at['pn_history']
    example_idx = tmp.iloc[idx].at['location']
    print(f'Example Location: {example_idx}')
    ex_token_label = idx2token_label(example_text, tokenize(example_text), example_idx)
    print(f'Example Token Label: {ex_token_label}')
    print(f'Example Decoding: {token_label2idx(example_text, tokenize(example_text), ex_token_label)}\n')
del tmp

['17 yo']
Example Location: ['0 5']
Example Token Label: [1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
Example Decoding: 0 5

['M']
Example Location: ['6 7']
Example Token Label: [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

Prepare train dataset in DataFrame form.

In [8]:
def get_tokens_and_labels(df):
    """Get input and output of token classification model."""
    df = df.merge(pns, on='pn_num')
    df = df.merge(features, on='feature_num')
    df.rename(columns = {'feature_text':'features', 'pn_history':'text'}, inplace = True)
    df['tokens'] = df['features'].apply(lambda s: [s]) + df['text'].apply(lambda s: tokenize(s))
    if 'location' in df:
        inputs = df[['text', 'tokens', 'location']]
        df['tags'] = inputs.apply(lambda sr: [0] + idx2token_label(sr[0], sr[1], sr[2]), axis=1)
        return df[['id', 'features', 'text', 'tokens', 'tags', 'location']]
    return df[['id', 'features', 'text', 'tokens']]

tag_train = get_tokens_and_labels(train)
tag_test = get_tokens_and_labels(test)

tag_train_shuffled = tag_train.sample(frac=1).reset_index(drop=True)
tag_valid = tag_train_shuffled.loc[: len(tag_train) * 0.03]
tag_train = tag_train_shuffled.loc[len(tag_train) * 0.03:]

print(f'Length of Train data: {len(tag_train)}')
print(f'Length of Validation data: {len(tag_valid)}')
print(f'Length of Test data: {len(tag_test)}')
display(tag_train)

Length of Train data: 13871
Length of Validation data: 430
Length of Test data: 5


Unnamed: 0,id,features,text,tokens,tags,location
429,94754_909,viral-symptoms-OR-rhinorrhea-OR-scratchy-throat,"HPI:20yo c/o headach which started 1 day ago, ...",[viral-symptoms-OR-rhinorrhea-OR-scratchy-thro...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[]
430,92194_908,Nausea,"20-year-old female, has come to the doctor's o...","[Nausea, 20, -, year, -, old, female, ,, has, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['283 290']
431,00810_012,Male,Dillon is a 17 yo M with no PMH c/o heart poun...,"[Male, dillon, is, a, 17, yo, m, with, no, pm,...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, ...",['18 19']
432,00810_001,Family-history-of-thyroid-disorder,Dillon is a 17 yo M with no PMH c/o heart poun...,"[Family-history-of-thyroid-disorder, dillon, i...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['543 565']
433,40923_401,anxious-OR-nervous,45 yo F. CC: nervousness x 3 weeks. Increased ...,"[anxious-OR-nervous, 45, yo, f, ., cc, :, nerv...","[0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['13 24', '454 465']"
...,...,...,...,...,...,...
14295,21905_206,Recent-nausea-vomiting-OR-Recent-flulike-symptoms,44 yr old f w/ 3 yrs irregular menstrual bleed...,[Recent-nausea-vomiting-OR-Recent-flulike-symp...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[]
14296,01512_004,Lightheaded,17 yo male presents for 3-4 months of intermit...,"[Lightheaded, 17, yo, male, presents, for, 3, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[]
14297,80493_811,Diminished-energy-OR-feeling-drained,67 yo female presents with new-onset trouble s...,"[Diminished-energy-OR-feeling-drained, 67, yo,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","['184 197', '202 212', '303 313']"
14298,31800_302,Darker-bowel-movements,HPI: Mr. Hamilton is a 35 year old male who pr...,"[Darker-bowel-movements, hp, ##i, :, mr, ., ha...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",['289 300']


## 2. Task 1 - Sequence Classification

In [9]:
# Encode pandas DataFrame into HuggingFace Dataset object.
train_seq_dataset = Dataset.from_pandas(seq_train)
valid_seq_dataset = Dataset.from_pandas(seq_valid)

In [10]:
# Observe an example from train data
example_input = train_seq_dataset['sequence'][0]
example_tokenized = tokenizer(example_input, is_split_into_words=True)

print("Example input: \n", example_input)
print("Example tokenized : \n", example_tokenized.input_ids)

Example input: 
 ['mr', '.', 'h', '##nas', '##on', 'is', 'a', '35', 'year', 'old', 'male', 'who', 'presents', 'with', '2', 'months', 'of', 'pain', 'rated', 'at', '5', '/', '10', 'and', 'localized', 'to', 'the', 'mid', '-', 'abdomen', '.', 'pain', 'is', 'burning', 'in', 'nature', 'and', 'not', 'made', 'worse', ',', 'or', 'better', 'with', 'eating', '.', 'additionally', ',', 'patient', 'end', '##ors', '##es', 'a', 'sensation', 'of', 'na', '##ue', '##a', '.', 'nausea', 'and', 'pain', 'used', 'to', 'present', 'once', 'per', 'week', ',', 'but', 'is', 'now', 'present', '2', '-', '3', '##x', 'per', 'day', '.', 'he', 'initially', 'used', 'tu', '##ms', ',', 'which', 'provided', 'some', 'relief', ',', 'but', 'is', 'no', 'long', 'working', '.', 'pain', 'has', 'worsened', 'over', '2', 'months', '.', 'additionally', ',', 'pain', 'awake', '##ns', 'patient', 'from', 'sleep', '.', 'ro', '##s', ':', 'stool', 'appears', 'dark', ',', 'but', 'no', 'frank', 'blood', '.', 'no', 'other', 'positive', 'ro', '#

In [11]:
# Tokenize Dataset for training
def prepare_class_features(examples):
    tokenized_data = tokenizer(examples["sequence"],
                               truncation=True,
                               is_split_into_words=True)
    tokenized_data['labels'] = examples['labels']
    return tokenized_data

tokenized_seq_train = train_seq_dataset.map(prepare_class_features, 
                                            batched=True, 
                                            remove_columns=train_seq_dataset.column_names)
tokenized_seq_valid = valid_seq_dataset.map(prepare_class_features, 
                                            batched=True, 
                                            remove_columns=valid_seq_dataset.column_names)

  0%|          | 0/42 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [12]:
# Load Pretrained Model to current device
seq_model = AutoModelForSequenceClassification.from_pretrained("../input/nbme-pre-trained-models/seq_model", 
                                                              num_labels=num_labels).to(device)
# Print model structure
# seq_model

In [13]:
batch_size = 16
epochs = 5

args = TrainingArguments(
    "./train/nbme-case",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to="none",
    fp16=True, # half precision
)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

trainer = Trainer(
    seq_model,
    args,
    train_dataset=tokenized_seq_train,
    eval_dataset=tokenized_seq_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

Using amp half precision backend


In [14]:
def print_summary(result):
    print(f"Total Time: {result.metrics['train_runtime']:.2f}s")
    print(f"Samples/second: {result.metrics['train_samples_per_second']:.2f}s")

In [15]:
result = trainer.train()
print_summary(result)

***** Running training *****
  Num examples = 41724
  Num Epochs = 5
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 13040


Epoch,Training Loss,Validation Loss
1,0.0021,0.000207
2,0.0017,3.5e-05
3,0.0,1.2e-05
4,0.0,4e-06
5,0.0,3e-06


  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 422
  Batch size = 16
Saving model checkpoint to ./train/nbme-case/checkpoint-2608
Configuration saved in ./train/nbme-case/checkpoint-2608/config.json
Model weights saved in ./train/nbme-case/checkpoint-2608/pytorch_model.bin
tokenizer config file saved in ./train/nbme-case/checkpoint-2608/tokenizer_config.json
Special tokens file saved in ./train/nbme-case/checkpoint-2608/special_tokens_map.json
  args.max_grad_norm,
***** Running Evaluation *****
  Num examples = 422
  Batch size = 16
Saving model checkpoint to ./train/nbme-case/checkpoint-5216
Configuration saved in ./train/nbme-case/checkpoint-5216/config.json
Model weights saved in ./train/nbme-case/checkpoint-5216/pytorch_model.bin
tokenizer config file saved in ./train/nbme-case/checkpoint-5216/tokenizer_config.json
Special tokens file saved in ./train/nbme-case/checkpoint-5216/special_tokens_map.json
  args.max_grad_norm,
***** Running Evaluation *****
  Num

Total Time: 4186.56s
Samples/second: 49.83s


### 3. Task 2 - Token Tagging 

In [16]:
# Encode pandas DataFrame into HuggingFace Dataset object.
train_tag_dataset = Dataset.from_pandas(tag_train)
valid_tag_dataset = Dataset.from_pandas(tag_valid)
test_dataset = Dataset.from_pandas(tag_test)

In [17]:
# Observe an example from train dataset.
example_input = train_tag_dataset['tokens'][0]
example_tokenized = tokenizer(example_input, is_split_into_words=True)

print("Example text: \n", train_tag_dataset['text'][0])
print("\nExample input: \n", example_input)
print("\nExample tokenized : \n", example_tokenized.input_ids)
print("\nExample label: \n", train_tag_dataset['tags'][0])

Example text: 
 HPI:20yo c/o headach which started 1 day ago, its all over the head, constant and progressively worsen, associated nausea, no vomitting. family hx of migraine. patient says these is the worse headach in her life.hx of OCP used. she is on Ibuprofen.
ROS:as above
ALLERGY:
PMH:
PSH:
FH/SH:
SH:


Example input: 
 ['viral-symptoms-OR-rhinorrhea-OR-scratchy-throat', 'hp', '##i', ':', '20', '##yo', 'c', '/', 'o', 'head', '##ach', 'which', 'started', '1', 'day', 'ago', ',', 'its', 'all', 'over', 'the', 'head', ',', 'constant', 'and', 'progressively', 'worse', '##n', ',', 'associated', 'nausea', ',', 'no', 'vomit', '##ting', '.', 'family', 'h', '##x', 'of', 'mig', '##raine', '.', 'patient', 'says', 'these', 'is', 'the', 'worse', 'head', '##ach', 'in', 'her', 'life', '.', 'h', '##x', 'of', 'o', '##cp', 'used', '.', 'she', 'is', 'on', 'ib', '##up', '##ro', '##fen', '.', 'ro', '##s', ':', 'as', 'above', 'all', '##ergy', ':', 'pm', '##h', ':', 'ps', '##h', ':', 'f', '##h', '/

In [18]:
# Prepare dataset for training.
# For labels, convert it to a tokenized label form using word_idx of BatchEncode object.
def prepare_tag_features(examples):
    tokenized_inputs = tokenizer(
        examples["tokens"], 
        truncation=True, 
        padding="max_length",
        is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx: 
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

tokenized_tag_train = train_tag_dataset.map(prepare_tag_features, 
                                            batched=True, 
                                            remove_columns=train_tag_dataset.column_names)
tokenized_tag_valid = valid_tag_dataset.map(prepare_tag_features, 
                                            batched=True, 
                                            remove_columns=valid_tag_dataset.column_names)

  0%|          | 0/14 [00:00<?, ?ba/s]

  0%|          | 0/1 [00:00<?, ?ba/s]

In [19]:
# Load PreTrained Model Structure.
token_model = AutoModelForTokenClassification.from_pretrained("../input/nbme-pre-trained-models/token_model", 
                                                              num_labels=2).to(device)

loading configuration file ../input/nbme-pre-trained-models/token_model/config.json
Model config DistilBertConfig {
  "_name_or_path": "../input/nbme-pre-trained-models/token_model",
  "activation": "gelu",
  "architectures": [
    "DistilBertForTokenClassification"
  ],
  "attention_dropout": 0.1,
  "dim": 768,
  "dropout": 0.1,
  "hidden_dim": 3072,
  "initializer_range": 0.02,
  "max_position_embeddings": 512,
  "model_type": "distilbert",
  "n_heads": 12,
  "n_layers": 6,
  "pad_token_id": 0,
  "qa_dropout": 0.1,
  "seq_classif_dropout": 0.2,
  "sinusoidal_pos_embds": false,
  "tie_weights_": true,
  "torch_dtype": "float32",
  "transformers_version": "4.18.0",
  "vocab_size": 30522
}

loading weights file ../input/nbme-pre-trained-models/token_model/pytorch_model.bin
All model checkpoint weights were used when initializing DistilBertForTokenClassification.

All the weights of DistilBertForTokenClassification were initialized from the model checkpoint at ../input/nbme-pre-trained-m

In [20]:
# Load previous sequence model weight into token model.
state_dict = seq_model.state_dict()
state_dict.pop('classifier.weight')
state_dict.pop('classifier.bias')

token_model.load_state_dict(state_dict, strict=False)

_IncompatibleKeys(missing_keys=['classifier.weight', 'classifier.bias'], unexpected_keys=['pre_classifier.weight', 'pre_classifier.bias'])

In [21]:
class NbmeTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.get("labels").to(device)
        outputs = model(**inputs)
        logits = outputs.get("logits").to(device)
        loss_fct = nn.CrossEntropyLoss().to(device)
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss

In [22]:
batch_size = 16
epochs = 7

args = TrainingArguments(
    "./nbme-tag",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    save_strategy="epoch",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    num_train_epochs=epochs,
    weight_decay=0.01,
    report_to="none",
    fp16=True # half precision
)

data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

trainer = NbmeTrainer(
    token_model,
    args,
    train_dataset=tokenized_tag_train,
    eval_dataset=tokenized_tag_valid,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

PyTorch: setting up devices
Using amp half precision backend


In [23]:
result = trainer.train()
print_summary(result)

***** Running training *****
  Num examples = 13871
  Num Epochs = 7
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6069


Epoch,Training Loss,Validation Loss
1,0.0583,0.032282
2,0.0298,0.026508
3,0.0237,0.025019
4,0.0206,0.022459
5,0.0172,0.021874
6,0.0151,0.022123
7,0.014,0.022956


***** Running Evaluation *****
  Num examples = 430
  Batch size = 16
Saving model checkpoint to ./nbme-tag/checkpoint-867
Configuration saved in ./nbme-tag/checkpoint-867/config.json
Model weights saved in ./nbme-tag/checkpoint-867/pytorch_model.bin
tokenizer config file saved in ./nbme-tag/checkpoint-867/tokenizer_config.json
Special tokens file saved in ./nbme-tag/checkpoint-867/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 430
  Batch size = 16
Saving model checkpoint to ./nbme-tag/checkpoint-1734
Configuration saved in ./nbme-tag/checkpoint-1734/config.json
Model weights saved in ./nbme-tag/checkpoint-1734/pytorch_model.bin
tokenizer config file saved in ./nbme-tag/checkpoint-1734/tokenizer_config.json
Special tokens file saved in ./nbme-tag/checkpoint-1734/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 430
  Batch size = 16
Saving model checkpoint to ./nbme-tag/checkpoint-2601
Configuration saved in ./nbme-tag/checkpoint-2601/con

Total Time: 2959.00s
Samples/second: 32.81s


## 4. Evaluate Samples

In [24]:
# Get prediction values from the last model.
eval_outputs = trainer.predict(tokenized_tag_valid)
eval_pred = eval_outputs.predictions

***** Running Prediction *****
  Num examples = 430
  Batch size = 16


In [25]:
# Print index many examples from the evaluation data.
index = 50

for idx in range(index):
    eval_text = valid_tag_dataset['text'][idx]
    eval_tokens = valid_tag_dataset['tokens'][idx]
    eval_labels = np.argmax(eval_pred[idx], axis=-1)

    eval_indices = token_label2idx(eval_text, eval_tokens, eval_labels)
    result = eval_indices if eval_indices else "0 0"
    eval_true = valid_tag_dataset['location'][idx]

    print(f"Prediction: {result} / True value: {eval_true}")

Prediction: 0 0 / True value: []
Prediction: 16 20 / True value: ['5 9']
Prediction: 0 0 / True value: []
Prediction: 217 220;223 302 / True value: ['169 216']
Prediction: 99 122 / True value: ['51 73']
Prediction: 101 122 / True value: ['72 89']
Prediction: 0 0 / True value: []
Prediction: 209 217 / True value: ['134 148']
Prediction: 0 0 / True value: []
Prediction: 519 521;625 647 / True value: ['351 355;362 374;378 383;424 452;453 459']
Prediction: 301 304 / True value: ['220 223']
Prediction: 10 12 / True value: ['0 6']
Prediction: 343 345 / True value: ['221 227']
Prediction: 335 348 / True value: ['187 190;220 248']
Prediction: 0 0 / True value: []
Prediction: 0 0 / True value: ['2 7']
Prediction: 397 411 / True value: ['286 300']
Prediction: 0 0 / True value: []
Prediction: 879 904 / True value: ['664 670']
Prediction: 0 0 / True value: ['588 614']
Prediction: 0 0 / True value: []
Prediction: 0 0 / True value: []
Prediction: 348 365 / True value: ['294 310']
Prediction: 505 508

## 5. Predict on Test Dataset

In [26]:
# Prepare dataset for prediction.
def prepare_test_features(examples):
    return tokenizer(
        examples["tokens"], 
        truncation=True, 
        padding="max_length",
        is_split_into_words=True)

tokenized_test_dataset = test_dataset.map(prepare_test_features, 
                                          batched=True, 
                                          remove_columns=test_dataset.column_names)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [27]:
test_output = trainer.predict(tokenized_test_dataset)
test_pred = test_output.predictions

***** Running Prediction *****
  Num examples = 5
  Batch size = 16


In [28]:
predictions = []

for idx in range(len(test_pred)):
    test_text = test_dataset['text'][idx]
    test_tokens = test_dataset['tokens'][idx]
    test_labels = np.argmax(test_pred[idx], axis=-1)
    
    test_indices = token_label2idx(test_text, test_tokens, test_labels)
    predictions.append(test_indices)

In [29]:
submission['location'] = predictions
submission.to_csv('submission.csv', index=False)
display(submission)

Unnamed: 0,id,location
0,00016_000,
1,00016_001,
2,00016_002,260 267
3,00016_003,120 145
4,00016_004,286 290;296 307
