## BertForMaskedLM
This notebook trains a BERT masked-language model (`BertForMaskedLM`) to predict a patient's future diagnoses from their medical history.

In [1]:
!pip install -U huggingface_hub --quiet

In [2]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget; the Hub uploads later in
# this notebook (push_to_hub) are made under the account logged in here.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
!pip install -U accelerate transformers datasets evaluate --quiet

In [4]:
from tokenizers import Tokenizer
from transformers import BertTokenizerFast, BertConfig, BertForMaskedLM
import torch

In [5]:
# Load the word-level tokenizer saved in 'word_level_tokenizer_6.json' and wrap
# it in the BERT fast-tokenizer interface expected by the collator and Trainer.
tokenizer_ = Tokenizer.from_file('word_level_tokenizer_6.json')
tokenizer = BertTokenizerFast(tokenizer_object=tokenizer_)

# Size the embedding table with len(tokenizer) instead of tokenizer.vocab_size:
# vocab_size does not count tokens added on top of the base vocabulary, and a
# token id outside the embedding table would fail at the first forward pass.
config = BertConfig(
    vocab_size=len(tokenizer),
    # keep the embedding padding index aligned with the tokenizer's [PAD] id
    pad_token_id=tokenizer.pad_token_id,
)
model = BertForMaskedLM(config)  # model built from the config only (no pretrained weights are loaded)

In [6]:
from datasets import load_dataset

data = load_dataset('text', data_files='input_6.txt')

In [7]:
data = data['train'].train_test_split(test_size=0.2, seed=43)

In [8]:
# Display the train/test splits and their row counts
data

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 5165
    })
    test: Dataset({
        features: ['text'],
        num_rows: 1292
    })
})

In [9]:
# Inspect one raw training line: numeric codes with [SEP] tokens between groups of codes
data['train'][0]['text']

'128613002 [SEP] 87433001 [SEP] 195662009 [SEP] 312608009 [SEP] 10509002 [SEP] 10509002 [SEP] 444814009 271737000'

In [10]:
# Tokenize both splits.
# Sequences are truncated to the model's position-embedding limit: an input
# longer than config.max_position_embeddings cannot be embedded by the model.
# The limit is stored as a plain int so the mapped function does not capture
# the model object.
MAX_SEQ_LEN = config.max_position_embeddings


def tokenize_batch(batch):
    """Tokenize a batch of text lines, truncating each to MAX_SEQ_LEN tokens."""
    return tokenizer(batch['text'], truncation=True, max_length=MAX_SEQ_LEN)


tokenized_data = data.map(
    tokenize_batch,
    batched=True,
    num_proc=4,
    remove_columns=data['train'].column_names,  # drop the raw 'text' column
)

In [11]:
# Inspect the token ids produced for the first training example
tokenized_data['train'][0]['input_ids']

[2, 45, 3, 50, 3, 6, 3, 19, 3, 7, 3, 7, 3, 5, 9, 3]

### Computing class weights (optional — useful when the dataset is imbalanced)

In [14]:
# diagnosis_samples = load_dataset('text', data_files='input_for_tokenizer_6.txt')
# diagnosis_samples = diagnosis_samples['train'].train_test_split(test_size=0.2, seed=43)

In [13]:
# diagnosis_codes = diagnosis_samples.map(
#     lambda example: {'ids': tokenizer(example['text'])['input_ids']},
#     batched=True,
#     num_proc=4,
#     remove_columns=diagnosis_samples['train'].column_names
# )

In [15]:
# diagnosis_codes['train'][0]['ids']

In [16]:
# import numpy as np
# from sklearn.utils.class_weight import compute_class_weight

# train_labels = np.concatenate(diagnosis_codes['train']['ids'])

# weights = np.zeros(model.config.vocab_size)
# weights[[np.unique(train_labels)]] = compute_class_weight('balanced',
#                                                         classes=np.unique(train_labels),
#                                                         y=np.array(train_labels))

# len(weights)

In [17]:
# np.unique(train_labels)

In [18]:
# weights

### Getting distribution of diagnoses in our dataset


In [19]:
# vals, counts = np.unique(train_labels, return_counts=True)

In [20]:
# import matplotlib.pyplot as plt

# plt.rcParams['figure.figsize'] = (12,10)
# plt.bar(vals[(vals!=3)&(vals!=2)], counts[(vals!=3)&(vals!=2)])
# plt.xticks(np.arange(0, 100, step=5))
# plt.xlabel('Код диагноза из словаря токенайзера')
# plt.ylabel('Число повторений в датасете')
# plt.savefig('targets_dist_in_train.png')

# plt.show()

In [21]:
# counts[(vals==5)|(vals==6)|(vals==7)] / sum(counts[(vals!=3)&(vals!=2)])

### Training

In [23]:
from transformers import DataCollatorForLanguageModeling

# The collator builds masked-language-modelling batches: tokens are selected
# for prediction with probability 0.15 (mlm_probability), so the model is
# trained to recover the selected tokens from their context.
# NOTE(review): the selection appears to be random on every call (the baseline
# accuracy in the training log changes between evaluations), so evaluation
# metrics are not exactly reproducible from run to run.

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [24]:
from torch import nn
from transformers import TrainingArguments, Trainer


# Optional Trainer with a class-weighted loss. It needs the `weights` array
# from the (commented-out) "Computing weights" section, which has one entry
# per vocabulary id.
# class CustomTrainer(Trainer):
#     def compute_loss(self, model, inputs, return_outputs=False):
#         labels = inputs.pop("labels")
#         # forward pass
#         outputs = model(**inputs)
#         logits = outputs.get("logits")
#         # the logits are flattened to (N, vocab_size) so that they match the
#         # per-vocabulary-id weights (config.num_labels is not the vocabulary size)
#         vocab_size = self.model.config.vocab_size
#         # compute custom loss (we have labels with weights)
#         loss_fct = nn.CrossEntropyLoss(weight=torch.tensor(weights, device=model.device, dtype=torch.float))
#         loss = loss_fct(logits.view(-1, vocab_size), labels.view(-1))
#         return (loss, outputs) if return_outputs else loss

In [26]:
import evaluate
import numpy as np
from scipy.special import softmax

# Load the metric objects once; calling evaluate.load() inside compute_metrics
# would repeat the load on every evaluation step.
_ACCURACY_METRIC = evaluate.load('accuracy')
_F1_METRIC = evaluate.load('f1')

# Token ids used by the two baselines below.
FIRST_DIAGNOSIS_ID = 5          # lowest id drawn by the random baseline
MOST_FREQUENT_DIAGNOSIS_ID = 5  # id predicted for every position by the majority baseline


def compute_metrics(eval_preds):
    """Compute accuracy, weighted F1 and two baseline accuracies on masked tokens.

    Args:
        eval_preds: pair ``(logits, labels)`` as passed by the Trainer. Only
            positions whose label is not -100 are scored.

    Returns:
        dict with the keys ``'accuracy'``, ``'f1'``,
        ``'RandomBaseline Accuracy'`` (a uniformly random id is predicted for
        every scored position) and ``'Baseline Accuracy'`` (the id
        MOST_FREQUENT_DIAGNOSIS_ID is predicted for every scored position).
    """
    logits, labels = eval_preds

    # softmax is monotonic, so the argmax of the raw logits is the same prediction
    predictions = np.argmax(logits, axis=-1)

    # build the mask once and reuse it for every metric
    scored = labels != -100
    y_pred = predictions[scored]
    y_true = labels[scored]
    n_scored = len(y_true)

    # Draw random ids from FIRST_DIAGNOSIS_ID up to the last vocabulary id.
    # The upper bound comes from the logits' last dimension (randint's `high`
    # is exclusive), so the final id is no longer left out.
    vocab_size = logits.shape[-1]
    random_pred = np.random.randint(FIRST_DIAGNOSIS_ID, high=vocab_size, size=n_scored)
    majority_pred = np.array([MOST_FREQUENT_DIAGNOSIS_ID] * n_scored)

    return {
        **_ACCURACY_METRIC.compute(predictions=y_pred, references=y_true),
        **_F1_METRIC.compute(predictions=y_pred, references=y_true, average='weighted'),
        # the accuracy of the model that produces a random diagnosis every time
        'RandomBaseline Accuracy': _ACCURACY_METRIC.compute(
            predictions=random_pred, references=y_true)['accuracy'],
        # the accuracy of the model that produces the most popular diagnosis for all patients
        'Baseline Accuracy': _ACCURACY_METRIC.compute(
            predictions=majority_pred, references=y_true)['accuracy'],
    }

In [27]:
batch_size = 32

training_args = TrainingArguments(
    output_dir='concepts-finetuned-bert',
    overwrite_output_dir=True,
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy` — confirm against the installed version.
    evaluation_strategy='steps',
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    learning_rate=5e-5,
    weight_decay=0.01,
    num_train_epochs=20,
    logging_strategy='steps',
    logging_steps=400,
    # Mixed precision needs a CUDA device; enable it only when one is present
    # so the notebook also runs on a CPU-only machine.
    fp16=torch.cuda.is_available(),
    eval_steps=400,  # evaluate at the same interval as logging
    remove_unused_columns=False,
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_data['train'],
    eval_dataset=tokenized_data['test'],
    data_collator=data_collator,  # applies the random masking per batch
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,Accuracy,F1,Randombaseline accuracy,Baseline accuracy
400,3.064,2.674225,0.338095,0.267885,0.014286,0.15
800,2.4288,2.313627,0.402778,0.346803,0.009838,0.148727
1200,2.2341,2.121341,0.43778,0.392809,0.005045,0.145179
1600,2.0898,1.996113,0.460202,0.414149,0.012332,0.142377
2000,1.9711,1.936542,0.484447,0.438685,0.009217,0.139401
2400,1.9357,1.850466,0.491525,0.450351,0.005085,0.146893
2800,1.8361,1.904641,0.49749,0.453483,0.008924,0.14947
3200,1.8127,1.879042,0.489784,0.440053,0.008173,0.156451


TrainOutput(global_step=3240, training_loss=2.1670511763772846, metrics={'train_runtime': 402.7161, 'train_samples_per_second': 256.508, 'train_steps_per_second': 8.045, 'total_flos': 1728761298228432.0, 'train_loss': 2.1670511763772846, 'epoch': 20.0})

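At the last evaluation (step 3200) the model reaches an accuracy of 0.49 and a weighted F1 of 0.44 on masked tokens, compared with about 0.16 for always predicting the most frequent diagnosis and about 0.01 for random guessing.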

In [28]:
import math

# Run a final evaluation and report perplexity, i.e. exp(evaluation loss)
eval_results = trainer.evaluate()
print(f"Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

Perplexity: 6.11


In [29]:
# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

# Move the model to the chosen device (the cell output is the module repr)
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(123, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_aff

### Inference

In [44]:
# Take one held-out patient history and replace its last code with [MASK],
# so that the model has to predict the final diagnosis.
text = ' '.join(data['test'][-13]['text'].split(' ')[:-1]) + ' [MASK]'
print(text)

# Encode the text and move the tensors to the same device as the model
inputs = tokenizer(text, return_tensors='pt').to(device)

# Put the model in evaluation mode explicitly so dropout is disabled no matter
# which cells were executed before this one.
model.eval()
with torch.no_grad():
    logits = model(**inputs).logits

195662009 [SEP] 10509002 [SEP] 444814009 10509002 [SEP] [MASK]


In [45]:
# Find where [MASK] sits in the (single) encoded sequence
is_mask_position = inputs.input_ids[0] == tokenizer.mask_token_id
mask_token_index = is_mask_position.nonzero(as_tuple=True)[0]

# Best-scoring vocabulary id at the masked position, decoded back to its code
masked_position_scores = logits[0, mask_token_index]
predicted_token_id = masked_position_scores.argmax(dim=-1)
tokenizer.decode(predicted_token_id)

'444814009'

In [48]:
# Scores over the whole vocabulary at the masked position
mask_token_logits = logits[0, mask_token_index, :]

# Ids of the three best-scoring candidates for the first masked position
top_3 = torch.topk(mask_token_logits, k=3, dim=1)
top_3_tokens = top_3.indices[0].tolist()

# Show the input with [MASK] filled in by each of the three candidates
for token in top_3_tokens:
    candidate = tokenizer.decode([token])
    print(text.replace(tokenizer.mask_token, candidate))

print('True sequence:\n', data['test'][-13]['text'])

195662009 [SEP] 10509002 [SEP] 444814009 10509002 [SEP] 444814009
195662009 [SEP] 10509002 [SEP] 444814009 10509002 [SEP] 195662009
195662009 [SEP] 10509002 [SEP] 444814009 10509002 [SEP] 10509002
True sequence:
 195662009 [SEP] 10509002 [SEP] 444814009 10509002 [SEP] 10509002


In [49]:
# Upload the tokenizer files to the 'concepts-finetuned-bert' Hub repository
tokenizer.push_to_hub('concepts-finetuned-bert')
# Upload the trained model through the Trainer (the commit in the output has the message 'End of training')
trainer.push_to_hub()

model.safetensors:   0%|          | 0.00/345M [00:00<?, ?B/s]

events.out.tfevents.1704575057.172524b07238.21368.0:   0%|          | 0.00/9.82k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1704575527.172524b07238.21368.1:   0%|          | 0.00/585 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/person1of2interest/concepts-finetuned-bert/commit/bae40068d2bd2f3c4392569d5d25c85ed692e446', commit_message='End of training', commit_description='', oid='bae40068d2bd2f3c4392569d5d25c85ed692e446', pr_url=None, pr_revision=None, pr_num=None)

### Using model from Hugging Face

In [None]:
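# import os
# from transformers import AutoModelForMaskedLM

# checkpoint = 'person1of2interest/concepts-finetuned-bert'
# # Never hard-code an access token in a notebook: read it from the environment.
# # The token that used to be written here must be treated as leaked and revoked.
# model = AutoModelForMaskedLM.from_pretrained(checkpoint,
#                                              token=os.environ['HF_TOKEN'],
#                                              ignore_mismatched_sizes=True)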

In [None]:
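# import os
# from transformers import AutoTokenizer

# # Read the access token from the environment instead of hard-coding it;
# # the token that used to be written here must be treated as leaked and revoked.
# tokenizer = AutoTokenizer.from_pretrained(checkpoint, token=os.environ['HF_TOKEN'])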