# GatorTronS

## Train the model

In [1]:
import logging
logging.disable(logging.INFO) # disable INFO and DEBUG logging everywhere

%run /home/ec2-user/SageMaker/LLM-NER-clinical-text/src/models/train_model.py \
--model_name 'UFNLP/gatortrons' \
--data_dir '/home/ec2-user/SageMaker/LLM-NER-clinical-text/data/public/MedMentions/preprocessed-data/' \
--batch_size 4 \
--num_train_epochs 5 \
--learning_rate 5e-5 \
--weight_decay 0.01 \
--new_model_dir "/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/" \
--path_umls_semtype '/home/ec2-user/SageMaker/LLM-NER-clinical-text/data/public/MedMentions/SemGroups_2018.txt'


  from .autonotebook import tqdm as notebook_tqdm


Loading and preprocessing the dataset ...
/home/ec2-user/SageMaker/LLM-NER-clinical-text/data/public/MedMentions/preprocessed-data/
The device to run the model: cuda
Load the pretrained model ...


  return self.fget.__get__(instance, owner)()
Some weights of MegatronBertForTokenClassification were not initialized from the model checkpoint at UFNLP/gatortrons and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


The model has 354.262059millions parameters.


***** Running training *****
  Num examples = 2,635
  Num Epochs = 5
  Instantaneous batch size per device = 4
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 1
  Total optimization steps = 3,295
  Number of trainable parameters = 354,262,059


Epoch,Training Loss,Validation Loss,F1
1,0.5241,0.416685,0.603839
2,0.3327,0.401952,0.62728
3,0.2288,0.429635,0.633531
4,0.1512,0.470514,0.633126
5,0.1038,0.514751,0.633849


***** Running Evaluation *****
  Num examples = 878
  Batch size = 4
Saving model checkpoint to /home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/tmp-checkpoint-659
Configuration saved in /home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/tmp-checkpoint-659/config.json
Model weights saved in /home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/tmp-checkpoint-659/model.safetensors
tokenizer config file saved in /home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/tmp-checkpoint-659/tokenizer_config.json
Special tokens file saved in /home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/tmp-checkpoint-659/special_tokens_map.json
***** Running Evaluation *****
  Num examples = 878
  Batch size = 4
Saving model checkpoint to /home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/tmp-checkpoint-1318
Configuration saved in /home/ec2-user/SageMaker/L

## Evaluate on test data


In [37]:
from transformers import AutoModelForTokenClassification, AutoConfig, AutoTokenizer, DataCollatorForTokenClassification
from src.data.data_loader import *
from datasets import load_metric
from torch.nn.functional import cross_entropy
import logging
import torch
logging.disable(logging.INFO) # disable INFO and DEBUG logging everywhere

# Load the fine-tuned model
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
tokenizer = AutoTokenizer.from_pretrained('/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/')
NER_model = AutoModelForTokenClassification.from_pretrained('/home/ec2-user/SageMaker/LLM-NER-clinical-text/models/medmentions/gatortrons/').to(device)

# Load the data
dataset_loader = DatasetLoader(dataset_name='../data/public/MedMentions/preprocessed-data/', path_umls_semtype='../data/public/MedMentions/SemGroups_2018.txt', model_name='UFNLP/gatortrons')
data_medmentions, classmap, umls_label_code, tokenizer = dataset_loader.load_dataset()
data_medmentions = data_medmentions.remove_columns(['Full Text', 'Entity Codes', 'tokens', 'ner_tags', 'token_labels'])

# Create a collator
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
        
# Make predictions on test data
def forward_pass_with_label(batch):
    # Convert dict of lists to list of dicts suitable for data collator
    features = [dict(zip(batch, t)) for t in zip(*batch.values())]
    
    # Pad inputs and labels and put all tensors on device
    batch = data_collator(features)
    input_ids = batch["input_ids"].to(device)
    attention_mask = batch["attention_mask"].to(device)
    labels = batch["labels"].to(device)
    with torch.no_grad():
        # Pass data through model  
        output = NER_model(input_ids, attention_mask)
        
        # Logit.size: [batch_size, sequence_length, classes]
        # Predict class with largest logit value on classes axis
        predicted_label = torch.argmax(output.logits, axis=-1).cpu().numpy()

    return {"predicted_label": predicted_label}

prediction_results = data_medmentions['validation'].map(forward_pass_with_label, batched=True, batch_size=1)

# Compute the metrics
metric = load_metric('f1')
for (prediction, label) in zip(prediction_results['predicted_label'], prediction_results['labels']):
    metric.add_batch(predictions=prediction, references=label)
    
metric.compute(average='macro')

Loading and preprocessing the dataset ...
../data/public/MedMentions/preprocessed-data/


Map: 100%|██████████| 878/878 [00:59<00:00, 14.75 examples/s]
You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


{'f1': 0.5524024274855981}

In [43]:
prediction_results

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'labels', 'predicted_label'],
    num_rows: 878
})

In [None]:
from huggingface_hub import notebook_login

notebook_login()


In [None]:
# Push the model to hub
NER_model.push_to_hub("longluu/Clinical-NER-MedMentions-GatorTronS", commit_message='--batch_size 4 --num_train_epochs 5 --learning_rate 5e-5 --weight_decay 0.01')
tokenizer.push_to_hub("longluu/Clinical-NER-MedMentions-GatorTronS", commit_message='--batch_size 4 --num_train_epochs 5 --learning_rate 5e-5 --weight_decay 0.01')

README.md: 100%|██████████| 21.0/21.0 [00:00<00:00, 3.65kB/s]
model.safetensors:  38%|███▊      | 544M/1.42G [00:12<00:21, 40.1MB/s] 