# GatorTronS

## Train the model

In [None]:
# Fine-tuning cell for the GatorTronS checkpoint on the COVID-QA split.
import logging
# logging.disable(level) suppresses every record at that level and below for the
# whole process — presumably to keep the training cell's output short.
logging.disable(logging.INFO) # disable INFO and DEBUG logging everywhere

# Run the question-answering script inside this kernel with IPython's %run magic.
# Arguments passed (their exact semantics are defined by run_qa.py, not shown here):
#   - starting checkpoint 'UFNLP/gatortrons', dataset 'longluu/covid-qa-split'
#   - both training and evaluation requested (--do_train / --do_eval)
#   - batch size 4 per device, learning rate 3e-5, 2 epochs
#   - sequence length 512, doc stride 250, max answer length 200
#   - results written to --output_dir, with --overwrite_output_dir set
#   - --save_strategy "epoch"
# NOTE(review): the script path and output directory are absolute paths under
# /home/ec2-user/SageMaker, so this cell only runs on a machine with that layout.
%run /home/ec2-user/SageMaker/Medical-QA-extractive/src/models/run_qa.py \
    --model_name_or_path 'UFNLP/gatortrons' \
    --dataset_name 'longluu/covid-qa-split' \
    --do_train \
    --do_eval\
    --per_device_train_batch_size 4 \
    --learning_rate 3e-5 \
    --num_train_epochs 2 \
    --max_seq_length 512 \
    --doc_stride 250 \
    --max_answer_length 200 \
    --output_dir "/home/ec2-user/SageMaker/Medical-QA-extractive/models/COVID-QA/gatortrons/" \
    --overwrite_output_dir \
    --save_strategy "epoch"

## Evaluate the model

In [10]:
# Import the evaluator by name instead of with a wildcard, so it is clear where
# ModelEvaluator comes from and no unrelated names leak into the kernel namespace.
from src.models.evaluate_model import ModelEvaluator

# Score the fine-tuned GatorTronS model from the Hub on the COVID-QA split.
# max_length / doc_stride carry the same values (512 / 250) that the training cell
# passes as --max_seq_length / --doc_stride.
model_evaluator = ModelEvaluator(
    model_name='longluu/Medical-QA-gatortrons-COVID-QA',
    dataset_name='longluu/covid-qa-split',
    max_length=512,
    doc_stride=250,
)
score = model_evaluator.evaluate_model()

# Display the resulting metrics (a dict with 'exact_match' and 'f1' keys)
score

The device to run the model: cuda
Load the pretrained model ...


tokenizer_config.json:   0%|          | 0.00/1.46k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/379k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.17M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/680 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

The model has 354.220034millions parameters.
Loading and preprocessing the dataset ...
longluu/covid-qa-split


  0%|          | 0/202 [00:00<?, ?it/s]

{'exact_match': 37.12871287128713, 'f1': 64.92639883412856}

## Push model to hub

In [4]:
# Authenticate this kernel with the Hugging Face Hub before pushing anything.
from huggingface_hub import notebook_login

# Renders an interactive login widget in the cell output; the access token is
# entered by hand there, so no credential is written into the notebook source.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
from transformers import AutoModelForQuestionAnswering, AutoConfig, AutoTokenizer

# Local directory that holds the fine-tuned GatorTronS model and tokenizer files
checkpoint_dir = '../models/COVID-QA/gatortrons/'
# Target repository on the Hugging Face Hub
hub_repo = "longluu/Medical-QA-gatortrons-COVID-QA"
# The commit message records the hyper-parameters used for this training run
hparam_message = (
    '--per_device_train_batch_size 4 --learning_rate 3e-5 --num_train_epochs 2 '
    '--max_seq_length 512 --doc_stride 250 --max_answer_length 200'
)

# Load the tokenizer and the question-answering model from the local directory
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_dir)

# Push the model first, then the tokenizer; the tokenizer's CommitInfo is the
# last expression and therefore what the cell displays.
model.push_to_hub(hub_repo, commit_message=hparam_message)
tokenizer.push_to_hub(hub_repo, commit_message=hparam_message)

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/16.9k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/longluu/Medical-QA-gatortrons-COVID-QA/commit/73870b2816184e6a6dea7e81694e3f70efba7f81', commit_message='--per_device_train_batch_size 4 --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 512 --doc_stride 250 --max_answer_length 200', commit_description='', oid='73870b2816184e6a6dea7e81694e3f70efba7f81', pr_url=None, pr_revision=None, pr_num=None)

## Push training results to hub

In [7]:
from huggingface_hub import HfApi
api = HfApi()

# Where the training run left its result files, and where they go on the Hub
results_dir = "../models/COVID-QA/gatortrons/"
hub_repo = "longluu/Medical-QA-gatortrons-COVID-QA"
# The commit message records the hyper-parameters used for this training run
hparam_message = (
    '--per_device_train_batch_size 4 --learning_rate 3e-5 --num_train_epochs 2 '
    '--max_seq_length 512 --doc_stride 250 --max_answer_length 200'
)

# Eval results, all results, trainer state — uploaded in this order, each under
# the same file name in the repository root.
result_files = ("eval_results.json", "all_results.json", "trainer_state.json")
upload_commits = [
    api.upload_file(
        path_or_fileobj=results_dir + file_name,
        path_in_repo=file_name,
        repo_id=hub_repo,
        repo_type="model",
        commit_message=hparam_message,
    )
    for file_name in result_files
]

# Show the return value of the final upload (the trainer state), as before
upload_commits[-1]

CommitInfo(commit_url='https://huggingface.co/longluu/Medical-QA-gatortrons-COVID-QA/commit/9619f4855b463d5259c217caa6e10aa9f4467de2', commit_message='--per_device_train_batch_size 4 --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 512 --doc_stride 250 --max_answer_length 200', commit_description='', oid='9619f4855b463d5259c217caa6e10aa9f4467de2', pr_url=None, pr_revision=None, pr_num=None)

# DeBERTa trained on the MRQA dataset

## Train the model

In [None]:
# Fine-tuning cell for the DeBERTa MRQA checkpoint on the COVID-QA split.
import logging
# logging.disable(level) suppresses every record at that level and below for the
# whole process — presumably to keep the training cell's output short.
logging.disable(logging.INFO) # disable INFO and DEBUG logging everywhere

# Run the question-answering script inside this kernel with IPython's %run magic.
# Arguments passed (their exact semantics are defined by run_qa.py, not shown here):
#   - starting checkpoint 'VMware/deberta-v3-large-mrqa', dataset 'longluu/covid-qa-split'
#   - both training and evaluation requested (--do_train / --do_eval)
#   - batch size 2 per device, learning rate 3e-5, 3 epochs
#   - sequence length 512, doc stride 250, max answer length 200
#   - --save_strategy "epoch"
# Unlike the GatorTronS training cell, --overwrite_output_dir is not passed here.
# NOTE(review): confirm how run_qa.py behaves when the output directory already
# exists. The script path and output directory are absolute paths under
# /home/ec2-user/SageMaker, so this cell only runs on a machine with that layout.
%run /home/ec2-user/SageMaker/Medical-QA-extractive/src/models/run_qa.py \
    --model_name_or_path 'VMware/deberta-v3-large-mrqa' \
    --dataset_name 'longluu/covid-qa-split' \
    --do_train \
    --do_eval\
    --per_device_train_batch_size 2 \
    --learning_rate 3e-5 \
    --num_train_epochs 3 \
    --max_seq_length 512 \
    --doc_stride 250 \
    --max_answer_length 200 \
    --output_dir "/home/ec2-user/SageMaker/Medical-QA-extractive/models/COVID-QA/deberta/" \
    --save_strategy "epoch"

## Evaluate the model

In [23]:
# Import the evaluator by name instead of with a wildcard, so it is clear where
# ModelEvaluator comes from and no unrelated names leak into the kernel namespace.
from src.models.evaluate_model import ModelEvaluator

# First evaluate the model before fine-tuning: this scores the original
# 'VMware/deberta-v3-large-mrqa' checkpoint on the COVID-QA split.
model_evaluator = ModelEvaluator(
    model_name='VMware/deberta-v3-large-mrqa',
    dataset_name='longluu/covid-qa-split',
    max_length=512,
    doc_stride=250,
)
score = model_evaluator.evaluate_model()

# Display the resulting metrics (a dict with 'exact_match' and 'f1' keys)
score

The device to run the model: cuda
Load the pretrained model ...
The model has 434.01421millions parameters.
Loading and preprocessing the dataset ...
longluu/covid-qa-split


Map:   0%|          | 0/1817 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

Map:   0%|          | 0/202 [00:00<?, ? examples/s]

  0%|          | 0/202 [00:00<?, ?it/s]

{'exact_match': 31.683168316831683, 'f1': 60.80738674733328}

In [None]:
import json
import os

# Imported explicitly because the results-table cell that consumes these lists
# uses `pd`; it should not depend on a wildcard import to provide it.
import pandas as pd

from src.models.evaluate_model import ModelEvaluator


def _checkpoint_step(path):
    """Return the numeric suffix of a ``checkpoint-<N>`` directory name, or -1 if it has none."""
    suffix = os.path.basename(os.path.normpath(path)).rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Get all checkpoint folder names.
# os.listdir() returns entries in arbitrary order, and the results table labels
# rows as epoch 1, 2, 3, ... by position in dir_list — so sort by the numeric
# suffix (a plain string sort would put 'checkpoint-1000' before 'checkpoint-500').
root_dir = "../models/COVID-QA/deberta/"
dir_list = sorted(
    (
        os.path.join(root_dir, item)
        for item in os.listdir(root_dir)
        if os.path.isdir(os.path.join(root_dir, item)) and 'checkpoint-' in item
    ),
    key=_checkpoint_step,
)

# Get the score for each checkpoint; the three lists stay aligned with dir_list
eval_exact_match = []
eval_f1 = []
train_loss = []
for dir_path in dir_list:
    # Get the evaluation scores
    model_evaluator = ModelEvaluator(
        model_name=dir_path,
        dataset_name='longluu/covid-qa-split',
        max_length=512,
        doc_stride=250,
    )
    score = model_evaluator.evaluate_model()
    eval_exact_match.append(score['exact_match'])
    eval_f1.append(score['f1'])

    # Get the training loss: the 'loss' of the last entry logged in this
    # checkpoint's trainer_state.json. The context manager closes the file,
    # which the bare open() call used before never did.
    with open(os.path.join(dir_path, 'trainer_state.json')) as f:
        data = json.load(f)
    train_loss.append(data['log_history'][-1]['loss'])

In [18]:
# Import pandas here so this cell does not rely on `pd` leaking in from another cell
import pandas as pd

# Put results in dataframe: one row per checkpoint directory, in dir_list order.
# Rows are numbered from 1 and reported as the epoch.
epoch = list(range(1, len(dir_list) + 1))
df_result = pd.DataFrame(
    {
        'epoch': epoch,
        'train_loss': train_loss,
        'eval_exact_match': eval_exact_match,
        'eval_f1': eval_f1,
    }
)
df_result

Unnamed: 0,epoch,train_loss,eval_exact_match,eval_f1
0,1,0.3149,33.168317,58.727166
1,2,0.1673,34.653465,58.858354
2,3,0.0657,34.158416,59.128222


## Push model to hub

In [19]:
# Authenticate this kernel with the Hugging Face Hub before pushing anything.
from huggingface_hub import notebook_login

# Renders an interactive login widget in the cell output; the access token is
# entered by hand there, so no credential is written into the notebook source.
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [20]:
from transformers import AutoModelForQuestionAnswering, AutoConfig, AutoTokenizer

# Local directory that holds the DeBERTa model and tokenizer files to publish
checkpoint_dir = '../models/COVID-QA/deberta/result-epoch-2'
# Target repository on the Hugging Face Hub
hub_repo = "longluu/Medical-QA-deberta-MRQA-COVID-QA"
# The commit message records the hyper-parameters of the published run.
# NOTE(review): it says --num_train_epochs 2 while the training cell passes
# --num_train_epochs 3; presumably because the epoch-2 result is the one being
# published — confirm.
hparam_message = (
    '--per_device_train_batch_size 2 --learning_rate 3e-5 --num_train_epochs 2 '
    '--max_seq_length 512 --doc_stride 250 --max_answer_length 200'
)

# Load the tokenizer and the question-answering model from the local directory
tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint_dir)

# Push the model first, then the tokenizer; the tokenizer's CommitInfo is the
# last expression and therefore what the cell displays.
model.push_to_hub(hub_repo, commit_message=hparam_message)
tokenizer.push_to_hub(hub_repo, commit_message=hparam_message)

README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/longluu/Medical-QA-deberta-MRQA-COVID-QA/commit/c83f39bff0d11547e7b1be7060f2772069b14126', commit_message='--per_device_train_batch_size 2 --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 512 --doc_stride 250 --max_answer_length 200', commit_description='', oid='c83f39bff0d11547e7b1be7060f2772069b14126', pr_url=None, pr_revision=None, pr_num=None)

## Push training results to hub

In [22]:
from huggingface_hub import HfApi
api = HfApi()

# Where the published run's result files live, and where they go on the Hub
results_dir = "../models/COVID-QA/deberta/result-epoch-2/"
hub_repo = "longluu/Medical-QA-deberta-MRQA-COVID-QA"
# The commit message records the hyper-parameters of the published run
hparam_message = (
    '--per_device_train_batch_size 2 --learning_rate 3e-5 --num_train_epochs 2 '
    '--max_seq_length 512 --doc_stride 250 --max_answer_length 200'
)

# Eval results, all results, trainer state — uploaded in this order, each under
# the same file name in the repository root.
result_files = ("eval_results.json", "all_results.json", "trainer_state.json")
upload_commits = [
    api.upload_file(
        path_or_fileobj=results_dir + file_name,
        path_in_repo=file_name,
        repo_id=hub_repo,
        repo_type="model",
        commit_message=hparam_message,
    )
    for file_name in result_files
]

# Show the return value of the final upload (the trainer state), as before
upload_commits[-1]

CommitInfo(commit_url='https://huggingface.co/longluu/Medical-QA-deberta-MRQA-COVID-QA/commit/005a529598bf7409b80e76a11dc356ceb855da87', commit_message='--per_device_train_batch_size 2 --learning_rate 3e-5 --num_train_epochs 2 --max_seq_length 512 --doc_stride 250 --max_answer_length 200', commit_description='', oid='005a529598bf7409b80e76a11dc356ceb855da87', pr_url=None, pr_revision=None, pr_num=None)