In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import re
from tqdm.notebook import tqdm
from collections import Counter
import nltk
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os

for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/nbme-preprocessed-sentences/patient_notes_sentences.csv
/kaggle/input/nbme-preprocessed-sentences/data_train_sentence.csv
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/config.json
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/README.md
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/tokenizer.json
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/tokenizer_config.json
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/sentence_bert_config.json
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/pytorch_model.bin
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/config_sentence_transformers.json
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bio_ClinicalBERT/modules.json
/kaggle/input/nbme-preprocessed-sentences/models/emilyalsentzer_Bi

# MLM Pre-training
Credit to : [https://github.com/UKPLab/sentence-transformers/blob/master/examples/unsupervised_learning/MLM/train_mlm.py](http://)

In [2]:
from transformers import AutoModelForMaskedLM, AutoTokenizer
from transformers import DataCollatorForLanguageModeling, DataCollatorForWholeWordMask
from transformers import Trainer, TrainingArguments
import sys
import gzip
from datetime import datetime

# model_name = 'emilyalsentzer/Bio_ClinicalBERT'
# model_name = '../input/nbme-mlm-pretrained/MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT-2022-04-10_06-44-23'
model_name = '../input/nbme-mlm-pretrained/MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round2-2022-04-11_04-12-44'
model_alias = 'emilyalsentzer_Bio_ClinicalBERT_round3'

per_device_train_batch_size = 64

os.environ["WANDB_DISABLED"] = "true"
save_steps = 1000               #Save model every 1k steps
num_train_epochs = 7            #Number of epochs
use_fp16 = True                #Set to True, if your GPU supports FP16 operations
max_length = 100                #Max length for a text input
do_whole_word_mask = True       #If set to true, whole words are masked
mlm_prob = 0.15                 #Probability that a word is replaced by a [MASK] token

# Load the model
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

In [3]:
! mkdir MLMPredtrained

In [4]:
# make a copy
! cp -r $model_name "./MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round2-2022-04-11_04-12-44"

In [5]:
# expanding the vocab
sentence = pd.read_csv('/kaggle/input/nbme-preprocessed-sentences/patient_notes_sentences.csv')
sentence.head()

tqdm().pandas()
sentence['tokens'] = sentence.sentence.progress_apply(
    lambda sent: re.split('\W', sent)
)

0it [00:00, ?it/s]

  0%|          | 0/606664 [00:00<?, ?it/s]

In [6]:
# vocab in the notes
allText = " ".join(sentence.sentence)
allTextTokens = re.split('\W', allText)
allTextTokensCounter = Counter(allTextTokens)

In [7]:
# add the top 3000 tokens to the vocabs
top3k = list(token for token, freq in allTextTokensCounter.most_common(3000))
display('first element:', top3k[0])
del top3k[0]
# top3k

'first element:'

''

In [8]:
newTokensAdded = tokenizer.add_tokens(top3k)
display(newTokensAdded)
model.resize_token_embeddings(len(tokenizer))

0

Embedding(30358, 768, padding_idx=0)

In [9]:
onlywords = re.compile('^[^\W\s\d]+$')

exclusiveWords = {
    'and', 'or'
}

def isValidGram(grams):
    return all([all(re.match(onlywords, g) for g in grams),
                all(word not in exclusiveWords for words in grams)
               ])

In [10]:
'''
allTextSplitBySpace = re.split('\s', allText)
bigrm = nltk.bigrams(allTextSplitBySpace)
bigrm = filter(isValidGram, bigrm)
bigrmCtr = Counter(bigrm)
'''

"\nallTextSplitBySpace = re.split('\\s', allText)\nbigrm = nltk.bigrams(allTextSplitBySpace)\nbigrm = filter(isValidGram, bigrm)\nbigrmCtr = Counter(bigrm)\n"

In [11]:
# topBgrm = 
# bigrmCtr.most_common(1000)[-50:]
# bigrmCtr

In [12]:
# topBgrm
# for i in range(10):
#     print(topBgrm[i][0], isValidGram(topBgrm[i][0]))

In [13]:
# bigrmCtr

In [14]:
# vocab in the labels
data_train = pd.read_csv('/kaggle/input/nbme-preprocessed-sentences/data_train_sentence.csv')
featureTokens = data_train.feature_num.apply(str).unique().tolist()

newTokensAdded = tokenizer.add_tokens(featureTokens)
display(newTokensAdded)
model.resize_token_embeddings(len(tokenizer))

0

Embedding(30358, 768, padding_idx=0)

In [15]:
output_dir = "MLMPredtrained/{}-{}".format(model_alias.replace("/", "_"),  datetime.now().strftime("%Y-%m-%d_%H-%M-%S"))
print("Save checkpoints to:", output_dir)

##### Load our training datasets
train_path = '/kaggle/input/nbme-preprocessed-sentences/patient_notes_sentences.csv'
train_sentences = pd.read_csv(train_path).sentence.tolist()
print("Train sentences:", len(train_sentences))

dev_sentences = []
print("Dev sentences:", len(dev_sentences))

#A dataset wrapper, that tokenizes our data on-the-fly
class TokenizedSentencesDataset:
    def __init__(self, sentences, tokenizer, max_length, cache_tokenization=False):
        self.tokenizer = tokenizer
        self.sentences = sentences
        self.max_length = max_length
        self.cache_tokenization = cache_tokenization

    def __getitem__(self, item):
        if not self.cache_tokenization:
            return self.tokenizer(self.sentences[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)

        if isinstance(self.sentences[item], str):
            self.sentences[item] = self.tokenizer(self.sentences[item], add_special_tokens=True, truncation=True, max_length=self.max_length, return_special_tokens_mask=True)
        return self.sentences[item]

    def __len__(self):
        return len(self.sentences)

train_dataset = TokenizedSentencesDataset(train_sentences, tokenizer, max_length)
dev_dataset = TokenizedSentencesDataset(dev_sentences, tokenizer, max_length, cache_tokenization=True) if len(dev_sentences) > 0 else None


##### Training arguments
if do_whole_word_mask:
    data_collator = DataCollatorForWholeWordMask(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)
else:
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=True, mlm_probability=mlm_prob)

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    evaluation_strategy="steps" if dev_dataset is not None else "no",
    per_device_train_batch_size=per_device_train_batch_size,
    eval_steps=save_steps,
    save_steps=save_steps,
    logging_steps=save_steps,
    save_total_limit=1,
    prediction_loss_only=True,
    fp16=use_fp16
)


trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset
)

print("Save tokenizer to:", output_dir)
tokenizer.save_pretrained(output_dir)

print('training begins')
trainer.train()

print("Save model to:", output_dir)
model.save_pretrained(output_dir)

print("Training done")

Save checkpoints to: MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35


Using the `WAND_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).


Train sentences: 606664
Dev sentences: 0


Using amp half precision backend
tokenizer config file saved in MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/tokenizer_config.json
Special tokens file saved in MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/special_tokens_map.json
***** Running training *****
  Num examples = 606664
  Num Epochs = 7
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 66360


Save tokenizer to: MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35
training begins


Step,Training Loss
1000,0.7429
2000,0.7242
3000,0.7115
4000,0.6977
5000,0.6791
6000,0.6767
7000,0.6734
8000,0.6627
9000,0.6629
10000,0.6446


  args.max_grad_norm,
Saving model checkpoint to MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/checkpoint-1000
Configuration saved in MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/checkpoint-1000/config.json
Model weights saved in MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/checkpoint-1000/pytorch_model.bin
Saving model checkpoint to MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/checkpoint-2000
Configuration saved in MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/checkpoint-2000/config.json
Model weights saved in MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/checkpoint-2000/pytorch_model.bin
Deleting older checkpoint [MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/checkpoint-1000] due to args.save_total_limit
Saving model checkpoint to MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022

Save model to: MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35


Model weights saved in MLMPredtrained/emilyalsentzer_Bio_ClinicalBERT_round3-2022-04-30_09-10-35/pytorch_model.bin


Training done
