In [1]:
# !pip install transformers[torch]
# !pip install accelerate==0.23.0
# !pip install datasets
# !pip install wandb
# !pip install sacremoses
# !pip install optuna
# !pip install "ray[tune]"
# !pip install ipywidgets==7.1.1

In [1]:
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, DataCollatorWithPadding, AutoModelForSequenceClassification, TrainingArguments, Trainer, AutoConfig
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import precision_score
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
import wandb

In [3]:
import random

In [4]:
# import os
# os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [5]:
from huggingface_hub import notebook_login
# Open the Hugging Face login widget. An authenticated session is needed later
# because the training arguments in this notebook set push_to_hub=True.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [6]:
# import torch
# torch.cuda.empty_cache()

In [7]:
# Start the Weights & Biases run that this notebook logs its metrics to.
wandb.init(
    # set the wandb project where this run will be logged
    project="nhs-classification",

    # track hyperparameters and run metadata
    # NOTE: this dict is descriptive metadata only; the values that actually
    # drive training are set in the TrainingArguments cell, so keep the two in
    # sync. learning_rate previously read 6.926e-6 here while training used 3e-5.
    config={
    "learning_rate": 3e-5,
    "architecture": "transformer",
    "dataset": "binary2-nhs-abstract-only",
    "epochs": 6,
    "weight_decay": 1,
    }
)

[34m[1mwandb[0m: Currently logged in as: [33mminghui-ao[0m ([33mbinary_transformer[0m). Use [1m`wandb login --relogin`[0m to force relogin


In [3]:
# Load the labelled training spreadsheet; the ABSTRACT and binary2_label
# columns are the ones used further down.
# NOTE(review): this path starts with "../content/" while the test set is later
# read from "content/" -- confirm both resolve from the same working directory.
df = pd.read_excel("../content/final_trainset.xlsx")
# df = pd.read_excel("/kaggle/input/testing/final_trainset.xlsx")

In [4]:
# Peek at the first two rows to check that the expected columns were loaded.
df.head(2)

Unnamed: 0,PMID,Labeling_State,Correct_Label,binary_label,binary2_label,Agreement,Explanation,TITLE,ABSTRACT,vector,curated,y
0,27504812,Gold Standard,2,1,1,0.9,,The Natural History of Primary Sclerosing Chol...,Data regarding pediatric primary sclerosing ch...,"[0.0, 0.35, 0.78, 12.03]",2,2
1,33564419,Labeled,-1,0,0,0.545,,Recurrent acute interstitial nephritis: what l...,Acute interstitial nephritis (AIN) is an emerg...,"[2.95, 0.0, 2.03, 0.54]",-1,-1


In [9]:
# Medical/biology oriented models
# tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract')
# Active tokenizer. The same checkpoint name is repeated in the model-loading
# cell; change both together when trying one of the alternatives listed here.
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.2')
# tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-base-cased-v1.1')
# tokenizer = AutoTokenizer.from_pretrained('bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12')
# tokenizer = AutoTokenizer.from_pretrained('bionlp/bluebert_pubmed_uncased_L-24_H-1024_A-16')
# tokenizer = AutoTokenizer.from_pretrained('microsoft/biogpt')

# General models
# tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')
# tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
# tokenizer = AutoTokenizer.from_pretrained('roberta-base')
# tokenizer = AutoTokenizer.from_pretrained('roberta-large')

In [10]:
# Random train/validation split over the abstracts and their binary labels.
# The columns are renamed to "text" / "label", the names the tokenization
# function and the Trainer read.
abs_labels = df[["ABSTRACT", "binary2_label"]].rename(columns={"ABSTRACT": "text", "binary2_label": "label"}).copy()
ds = Dataset.from_pandas(abs_labels)
# Fixed seed: without it the 80/20 shuffle differs on every kernel restart, so
# validation numbers from separate runs are not comparable.
ds = ds.train_test_split(test_size=0.2, shuffle=True, seed=42)

In [11]:
def preprocess(e):
    """Tokenize the ``"text"`` field of a batch with the notebook's tokenizer.

    Inputs longer than 256 tokens are truncated. No padding is requested here.

    Args:
        e: Mapping with a ``"text"`` entry (one string or a list of strings
            when called through ``map(..., batched=True)``).

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    abstracts = e["text"]
    encoded = tokenizer(abstracts, truncation=True, max_length=256)
    return encoded

In [12]:
# Tokenize both splits in batches, then display the resulting DatasetDict.
tokenized_ds = ds.map(preprocess, batched=True)
tokenized_ds

Map:   0%|          | 0/6337 [00:00<?, ? examples/s]

Map:   0%|          | 0/1585 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 6337
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1585
    })
})

In [13]:
# Name the two halves of the split: "train" is fitted on, "test" is the
# evaluation set handed to the Trainer.
train_ds, eval_ds = tokenized_ds["train"], tokenized_ds["test"]

In [14]:
# Show the training split's features and row count.
train_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 6337
})

In [15]:
# Show the evaluation split's features and row count.
eval_ds

Dataset({
    features: ['text', 'label', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1585
})

In [16]:
# Collator built from the tokenizer. preprocess() does not pad, so padding is
# left to this object when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# Display the collator's configuration (tokenizer, padding settings).
data_collator

DataCollatorWithPadding(tokenizer=BertTokenizerFast(name_or_path='dmis-lab/biobert-base-cased-v1.2', vocab_size=28996, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of

In [18]:
def compute_metrics(eval_pred):
    """Compute weighted classification metrics for one evaluation pass.

    Adapted from the Hugging Face docs.

    Args:
        eval_pred: Object exposing ``label_ids`` (true class ids) and
            ``predictions`` (per-class scores); the arg-max over axis 1 is
            taken as the predicted class.

    Returns:
        dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 use ``average='weighted'``.
    """
    labels = eval_pred.label_ids
    preds = eval_pred.predictions.argmax(axis=1)

    accuracy = accuracy_score(labels, preds)
    precision = precision_score(labels, preds, average='weighted')
    recall = recall_score(labels, preds, average='weighted')
    f1 = f1_score(labels, preds, average='weighted')

    # Mirror the metrics to W&B only while a run is active. This notebook calls
    # wandb.finish() right after training and then evaluates again on the test
    # set; logging without a live run must not break that evaluation.
    if wandb.run is not None:
        wandb.log({"acc": accuracy, "precision": precision, "recall": recall, "f1": f1})

    return {
        'accuracy': accuracy,
        'precision': precision,
        'recall': recall,
        'f1': f1
    }

In [19]:
# Class descriptions, keyed by class id in listing order (0, then 1).
id2label = dict(enumerate((
    "the paper is not a primary experimental study in rare disease or the study is not directly investigating the natural history of a disease",
    "its primary contribution centers on observing the time course of a rare disease",
)))
# Reverse lookup: description -> class id.
label2id = dict(zip(id2label.values(), id2label.keys()))

In [20]:
# Load the BioBERT checkpoint with a two-label sequence-classification head and
# attach the id <-> description mappings defined above.
model = AutoModelForSequenceClassification.from_pretrained(
    "dmis-lab/biobert-base-cased-v1.2",
    label2id=label2id,
    id2label=id2label,
    num_labels=2,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [21]:
# !pip install accelerate -U

In [32]:
# normal training args with hyperparameters
training_args = TrainingArguments(
    output_dir="NHS-dmis-hypop",
    learning_rate=3e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=6,
    weight_decay=1,
    evaluation_strategy="epoch",
    logging_steps=1,
    save_strategy="epoch",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    load_best_model_at_end=True,
    push_to_hub=True,
    hub_model_id="NIHNCATS/NHS-dmis-hypop", 
    hub_private_repo=True,
)

In [23]:
# training_args for hyperparameter optimization
# training_args = TrainingArguments(
#     output_dir="NHS-dmis-hpop",
#     evaluation_strategy="epoch", 
#     save_strategy="epoch",
#     save_total_limit=1,
#     disable_tqdm=False,
#     weight_decay=1,
#     per_device_train_batch_size=16,
#     per_device_eval_batch_size=16,
#     metric_for_best_model="eval_loss",
#     greater_is_better=False,
#     load_best_model_at_end = True,
# )

In [24]:
# from transformers import EarlyStoppingCallback
# # trainer for hp optimization
# def model_init():
#     return AutoModelForSequenceClassification.from_pretrained(
#         'dmis-lab/biobert-base-cased-v1.2', 
#         num_labels=2, id2label=id2label, label2id=label2id, return_dict=True)
# trainer = Trainer(
#     args=training_args,
#     tokenizer=tokenizer,
#     train_dataset=train_ds,
#     eval_dataset=eval_ds,
#     model_init=model_init,
#     compute_metrics=compute_metrics,
#     data_collator=data_collator,
#     callbacks=[EarlyStoppingCallback(2)],
# )

In [25]:
# def optuna_hp_space(trial):
#     return {
#         "learning_rate": trial.suggest_float("learning_rate", 1e-6, 1e-4, log=True),
# #         "weight_decay": trial.suggest_float("weight_decay", 0, 1),
#         "num_train_epochs": trial.suggest_int("num_train_epochs", 3, 6)
#     }

In [26]:
# trainer.hyperparameter_search(
#     direction="maximize", 
#     backend="optuna", 
#     hp_space=optuna_hp_space,
#     n_trials=10 # number of trials
# )

BestRun(run_id='1', objective=3.2861456636636093, hyperparameters={'learning_rate': 6.926443850533433e-06, 'num_train_epochs': 6}, run_summary=None)

In [33]:
from transformers import EarlyStoppingCallback

# Assemble the fine-tuning Trainer from the pieces built in earlier cells.
# Early stopping is configured with a patience of 2.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=eval_ds,
    compute_metrics=compute_metrics,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)],
)

In [34]:
# Run fine-tuning with the Trainer configured above.
trainer.train()
# Close the W&B run started at the top of the notebook.
# NOTE(review): later cells still call wandb.log -- confirm a run is active
# when they execute on a fresh kernel.
wandb.finish()

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.0999,0.407163,0.825868,0.827172,0.825868,0.826183
2,0.028,0.41549,0.825237,0.825567,0.825237,0.825362
3,0.2024,0.477789,0.815773,0.8156,0.815773,0.815665


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
acc,██▁
eval/accuracy,██▁
eval/f1,█▇▁
eval/loss,▁▂█
eval/precision,█▇▁
eval/recall,██▁
eval/runtime,▄█▁
eval/samples_per_second,▅▁█
eval/steps_per_second,▅▁█
f1,█▇▁

0,1
acc,0.81577
eval/accuracy,0.81577
eval/f1,0.81567
eval/loss,0.47779
eval/precision,0.8156
eval/recall,0.81577
eval/runtime,1480.1148
eval/samples_per_second,1.071
eval/steps_per_second,0.068
f1,0.81567


In [39]:
# Upload the trained model to the Hub; the string argument is used as the
# commit message.
trainer.push_to_hub("NHS-dmis-hypop")

model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

events.out.tfevents.1712845913.NCATS-2265430-P.12060.0:   0%|          | 0.00/258k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

training_args.bin:   0%|          | 0.00/4.92k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NIHNCATS/NHS-dmis-hypop/commit/2a6672b950eacea975adf4d94973f760051bfd22', commit_message='NHS-dmis-hypop', commit_description='', oid='2a6672b950eacea975adf4d94973f760051bfd22', pr_url=None, pr_revision=None, pr_num=None)

In [40]:
# Evaluate the fine-tuned model on the separate test spreadsheet.

# Load the test set and give it the same column renaming and tokenization as
# the training data.
# NOTE(review): this path starts with "content/" while the training set is read
# from "../content/" -- confirm both resolve from the same working directory.
df_test = pd.read_excel("content/final_testset.xlsx")
test_labels = df_test[["ABSTRACT", "binary2_label"]].rename(columns={"ABSTRACT": "text", "binary2_label": "label"}).copy()
test_dataset = Dataset.from_pandas(test_labels)
tokenized_test_ds = test_dataset.map(preprocess, batched=True)

# The training cell ends with wandb.finish(), so on a fresh top-to-bottom run
# there is no active W&B run at this point. Start one before evaluating:
# both compute_metrics and the wandb.log call below write to W&B.
if wandb.run is None:
    wandb.init(project="nhs-classification", job_type="test-evaluation")

# Evaluate the model on the new test dataset
test_results = trainer.evaluate(tokenized_test_ds)
print("Test Results:", test_results)

# Log the results
wandb.log(test_results)

# Finish wandb session
wandb.finish()

# Push the trained model to Hugging Face Hub
trainer.push_to_hub("NHS-dmis-hypop")


Map:   0%|          | 0/416 [00:00<?, ? examples/s]

Test Results: {'eval_loss': 0.4118824005126953, 'eval_accuracy': 0.8173076923076923, 'eval_precision': 0.8188430938430938, 'eval_recall': 0.8173076923076923, 'eval_f1': 0.8177715636896312, 'eval_runtime': 336.0728, 'eval_samples_per_second': 1.238, 'eval_steps_per_second': 0.077, 'epoch': 3.0}


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
acc,▁
epoch,▁
eval/accuracy,▁
eval/f1,▁
eval/loss,▁
eval/precision,▁
eval/recall,▁
eval/runtime,▁
eval/samples_per_second,▁
eval/steps_per_second,▁

0,1
acc,0.81731
epoch,3.0
eval/accuracy,0.81731
eval/f1,0.81777
eval/loss,0.41188
eval/precision,0.81884
eval/recall,0.81731
eval/runtime,336.0728
eval/samples_per_second,1.238
eval/steps_per_second,0.077


events.out.tfevents.1712918663.NCATS-2265430-P.12060.1:   0%|          | 0.00/560 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/NIHNCATS/NHS-dmis-hypop/commit/f9a7e8ac9fa13c04b9c9742d4cd4a9331f516b74', commit_message='NHS-dmis-hypop', commit_description='', oid='f9a7e8ac9fa13c04b9c9742d4cd4a9331f516b74', pr_url=None, pr_revision=None, pr_num=None)

In [41]:
# Save the model for the API

In [42]:
# Save the trained model and its tokenizer into the same local directory so
# both can later be loaded from one path.
model_path = "saved_model/my_bert_model"
trainer.model.save_pretrained(model_path)
trainer.tokenizer.save_pretrained(model_path)


('saved_model/my_bert_model\\tokenizer_config.json',
 'saved_model/my_bert_model\\special_tokens_map.json',
 'saved_model/my_bert_model\\vocab.txt',
 'saved_model/my_bert_model\\added_tokens.json',
 'saved_model/my_bert_model\\tokenizer.json')