# GatorTronS

## Train the model

In [1]:
import logging
logging.disable(logging.INFO) # disable INFO and DEBUG logging everywhere

# Run the project's training script inside this kernel, starting from the
# 'UFNLP/gatortrons' checkpoint with the 'ncbi_disease' data and the
# hyperparameters listed below.
# NOTE(review): --new_model_dir is presumably where the script saves the trained
# model (the script itself is not shown here) -- confirm in train_model.py.
# NOTE(review): all paths are absolute SageMaker paths, so this cell only runs
# on a machine with the same directory layout.
# The trailing backslashes continue the single %run line, so comments cannot be
# placed between the arguments.
%run /home/ec2-user/SageMaker/LLM-NER-clinical-text/src/models/train_model.py \
--model_name 'UFNLP/gatortrons' \
--data_dir 'ncbi_disease' \
--batch_size 24 \
--num_train_epochs 5 \
--learning_rate 5e-5 \
--weight_decay 0.01 \
--new_model_dir "/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons/" \
--path_umls_semtype '/home/ec2-user/SageMaker/LLM-NER-clinical-text/data/public/MedMentions/SemGroups_2018.txt'


Loading and preprocessing the dataset ...
ncbi_disease
The device to run the model: cuda
Load the pretrained model ...


  return self.fget.__get__(instance, owner)()
Some weights of MegatronBertForTokenClassification were not initialized from the model checkpoint at UFNLP/gatortrons and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 354.221059millions parameters.


Epoch,Training Loss,Validation Loss,F1
1,0.0688,0.039396,0.871549
2,0.0204,0.047854,0.865149
3,0.0078,0.053422,0.875276
4,0.0034,0.061332,0.876097
5,0.0013,0.073064,0.88038




## Evaluate on test data


In [1]:
# Import only the class this cell uses, so it is clear where the name comes
# from and the kernel namespace is not filled by a star import.
from src.models.model import ModelLoader

# Load the fine-tuned model from the directory the training run was pointed at.
model_loader = ModelLoader('/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons/')

# Evaluate the model on the requested metrics.
# NOTE(review): path_umls_semtype is a relative path, so the result depends on
# the notebook's working directory.
scores = model_loader.evaluate_model(
    dataset_name='ncbi_disease',
    path_umls_semtype='../data/public/MedMentions/SemGroups_2018.txt',
    metric_names=['f1', 'precision', 'recall', 'matthews_correlation'],
)

# Bare last expression so the notebook displays the returned scores.
scores

Loading and preprocessing the dataset ...
ncbi_disease


Map:   0%|          | 0/941 [00:00<?, ? examples/s]

{'f1': 0.9230959441861525,
 'precision': 0.8998375309216448,
 'recall': 0.948772382840148,
 'matthews_correlation': 0.8978492834665438}

## Fix the id2label in config file

In [1]:
import json
# Explicit import of the one class used here (instead of `import *`).
from src.data.data_loader import DatasetLoader
# HfApi is not used in this cell; it is imported for the upload cell that follows.
from huggingface_hub import HfApi

# Load the config file. The context manager closes the read handle before the
# same file is reopened for writing further down.
with open('../models/ncbi-disease/gatortrons/config.json') as f:
    config = json.load(f)

# Load the dataset with label mapping
dataset_loader = DatasetLoader(
    dataset_name='ncbi_disease',
    model_name='/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons/',
    path_umls_semtype='../data/public/MedMentions/SemGroups_2018.txt',
)
dataset, classmap, umls_label_code, _ = dataset_loader.load_dataset()

# Add the label to the config: id2label maps each class index to its label
# string, label2id is the inverse mapping.
id_to_label = {ind: classmap.int2str(ind) for ind in range(len(classmap.names))}
config['id2label'] = id_to_label
config['label2id'] = {label: ind for ind, label in id_to_label.items()}

# Save the config (json.dump writes the integer id2label keys as JSON strings).
with open('../models/ncbi-disease/gatortrons/config.json', 'w') as f:
    json.dump(config, f)


Loading and preprocessing the dataset ...
ncbi_disease


In [3]:
# Upload the rewritten config.json to the model repository on the Hub.
# The arguments are collected first so the upload call itself stays short.
upload_kwargs = dict(
    path_or_fileobj="../models/ncbi-disease/gatortrons/config.json",
    path_in_repo="config.json",
    repo_id="longluu/Clinical-NER-NCBI-Disease-GatorTronS",
    repo_type="model",
    commit_message="fix the label2id in config",
)

api = HfApi()
# Last expression of the cell, so the returned commit info is displayed.
api.upload_file(**upload_kwargs)

CommitInfo(commit_url='https://huggingface.co/longluu/Clinical-NER-NCBI-Disease-GatorTronS/commit/f87294d962ac365cc8d9c3fb4b7884d44a6bcbb3', commit_message='fix the label2id in config', commit_description='', oid='f87294d962ac365cc8d9c3fb4b7884d44a6bcbb3', pr_url=None, pr_revision=None, pr_num=None)

## Push the model to the hub

In [4]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably required so the push cells that follow are
# authenticated -- nothing is hardcoded here, the token is entered in the widget.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer

# Local checkpoint directory and Hub destination shared by the model and the tokenizer.
model_dir = '/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons/'
hub_repo_id = "longluu/Clinical-NER-NCBI-Disease-GatorTronS"
# The commit message repeats the hyperparameter flags passed to the training run.
commit_message = '--batch_size 24 --num_train_epochs 5 --learning_rate 5e-5 --weight_decay 0.01'

# Load the tokenizer and the token-classification model from the local directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
NER_model = AutoModelForTokenClassification.from_pretrained(model_dir)

# Push the model first, then the tokenizer, to the same Hub repository.
NER_model.push_to_hub(hub_repo_id, commit_message=commit_message)
tokenizer.push_to_hub(hub_repo_id, commit_message=commit_message)

README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/longluu/Clinical-NER-NCBI-Disease-GatorTronS/commit/83f96f0b04875ccf18c2a5a80064218eeb61570a', commit_message='--batch_size 24 --num_train_epochs 5 --learning_rate 5e-5 --weight_decay 0.01', commit_description='', oid='83f96f0b04875ccf18c2a5a80064218eeb61570a', pr_url=None, pr_revision=None, pr_num=None)

## Train the model starting from the MedMentions checkpoint

In [1]:
import logging
logging.disable(logging.INFO) # disable INFO and DEBUG logging everywhere

# Same training script and hyperparameters as the first training cell, but
# starting from the 'longluu/Clinical-NER-MedMentions-GatorTronS' checkpoint and
# pointing --new_model_dir at a separate 'gatortrons-medmentions' directory.
# NOTE(review): in the recorded run this cell stops with a RuntimeError while
# loading the checkpoint: the checkpoint's classifier has 43 outputs while the
# model being built has 3. The error text suggests passing
# `ignore_mismatched_sizes=True` to `from_pretrained`; that call is presumably
# inside train_model.py, which is not shown here.
# The trailing backslashes continue the single %run line, so comments cannot be
# placed between the arguments.
%run /home/ec2-user/SageMaker/LLM-NER-clinical-text/src/models/train_model.py \
--model_name 'longluu/Clinical-NER-MedMentions-GatorTronS' \
--data_dir 'ncbi_disease' \
--batch_size 24 \
--num_train_epochs 5 \
--learning_rate 5e-5 \
--weight_decay 0.01 \
--new_model_dir "/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons-medmentions/" \
--path_umls_semtype '/home/ec2-user/SageMaker/LLM-NER-clinical-text/data/public/MedMentions/SemGroups_2018.txt'


tokenizer_config.json:   0%|          | 0.00/1.27k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/379k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

Loading and preprocessing the dataset ...
ncbi_disease


Map:   0%|          | 0/5433 [00:00<?, ? examples/s]

Map:   0%|          | 0/924 [00:00<?, ? examples/s]

Map:   0%|          | 0/941 [00:00<?, ? examples/s]

The device to run the model: cuda
Load the pretrained model ...


config.json:   0%|          | 0.00/3.41k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

RuntimeError: Error(s) in loading state_dict for MegatronBertForTokenClassification:
	size mismatch for classifier.weight: copying a param with shape torch.Size([43, 1024]) from checkpoint, the shape in current model is torch.Size([3, 1024]).
	size mismatch for classifier.bias: copying a param with shape torch.Size([43]) from checkpoint, the shape in current model is torch.Size([3]).
	You may consider adding `ignore_mismatched_sizes=True` in the model `from_pretrained` method.

## Evaluate on test data


In [1]:
# Import only the class this cell uses, so it is clear where the name comes
# from and the kernel namespace is not filled by a star import.
from src.models.model import ModelLoader

# Load the model.
# NOTE(review): this loads the 'gatortrons' directory, not the
# 'gatortrons-medmentions' directory targeted by the training cell in this
# section -- confirm which model is meant to be evaluated here.
model_loader = ModelLoader('/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons/')

# Evaluate the model on the requested metrics.
# NOTE(review): path_umls_semtype is a relative path, so the result depends on
# the notebook's working directory.
scores = model_loader.evaluate_model(
    dataset_name='ncbi_disease',
    path_umls_semtype='../data/public/MedMentions/SemGroups_2018.txt',
    metric_names=['f1', 'precision', 'recall', 'matthews_correlation'],
)

# Bare last expression so the notebook displays the returned scores.
scores

Loading and preprocessing the dataset ...
ncbi_disease


Map:   0%|          | 0/941 [00:00<?, ? examples/s]

{'f1': 0.9230959441861525,
 'precision': 0.8998375309216448,
 'recall': 0.948772382840148,
 'matthews_correlation': 0.8978492834665438}

## Fix the id2label in config file

In [3]:
import json
# Explicit import of the one class used here (instead of `import *`).
from src.data.data_loader import DatasetLoader
# HfApi is not used in this cell; the import is kept so the name stays available in the kernel.
from huggingface_hub import HfApi

# Load the config file. The context manager closes the read handle before the
# same file is reopened for writing further down.
with open('../models/ncbi-disease/gatortrons/config.json') as f:
    config = json.load(f)

# Load the dataset with label mapping
dataset_loader = DatasetLoader(
    dataset_name='ncbi_disease',
    model_name='/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons/',
    path_umls_semtype='../data/public/MedMentions/SemGroups_2018.txt',
)
dataset, classmap, umls_label_code, _ = dataset_loader.load_dataset()

# Add the label to the config: id2label maps each class index to its label
# string, label2id is the inverse mapping.
id_to_label = {ind: classmap.int2str(ind) for ind in range(len(classmap.names))}
config['id2label'] = id_to_label
config['label2id'] = {label: ind for ind, label in id_to_label.items()}

# Save the config (json.dump writes the integer id2label keys as JSON strings).
with open('../models/ncbi-disease/gatortrons/config.json', 'w') as f:
    json.dump(config, f)


Loading and preprocessing the dataset ...
ncbi_disease


## Push the model to the hub

In [4]:
from huggingface_hub import notebook_login

# Show the interactive Hugging Face login widget in the notebook.
# NOTE(review): presumably required so the push cell that follows is
# authenticated -- nothing is hardcoded here, the token is entered in the widget.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer

# Local checkpoint directory and Hub destination shared by the model and the tokenizer.
model_dir = '/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/ncbi-disease/gatortrons/'
hub_repo_id = "longluu/Clinical-NER-NCBI-Disease-GatorTronS"
# The commit message repeats the hyperparameter flags passed to the training run.
commit_message = '--batch_size 24 --num_train_epochs 5 --learning_rate 5e-5 --weight_decay 0.01'

# Load the tokenizer and the token-classification model from the local directory.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
NER_model = AutoModelForTokenClassification.from_pretrained(model_dir)

# Push the model first, then the tokenizer, to the same Hub repository.
NER_model.push_to_hub(hub_repo_id, commit_message=commit_message)
tokenizer.push_to_hub(hub_repo_id, commit_message=commit_message)

README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/longluu/Clinical-NER-NCBI-Disease-GatorTronS/commit/83f96f0b04875ccf18c2a5a80064218eeb61570a', commit_message='--batch_size 24 --num_train_epochs 5 --learning_rate 5e-5 --weight_decay 0.01', commit_description='', oid='83f96f0b04875ccf18c2a5a80064218eeb61570a', pr_url=None, pr_revision=None, pr_num=None)