In [1]:
import numpy as np
import pandas as pd

#### Read the Data

In [2]:
!rm negative.*
!rm positive.*

!wget 'https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/negative.tsv'

rm: cannot remove 'negative.*': No such file or directory
rm: cannot remove 'positive.*': No such file or directory
--2025-05-07 04:35:53--  https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/negative.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 370178 (362K) [text/plain]
Saving to: ‘negative.tsv’


2025-05-07 04:35:53 (12.6 MB/s) - ‘negative.tsv’ saved [370178/370178]



In [3]:
!wget 'https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/positive.tsv'

--2025-05-07 04:35:56--  https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/positive.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 686469 (670K) [text/plain]
Saving to: ‘positive.tsv’


2025-05-07 04:35:56 (19.9 MB/s) - ‘positive.tsv’ saved [686469/686469]



In [4]:
negative_abstract = pd.read_csv('negative.tsv', sep='\t')
positive_abstract = pd.read_csv('positive.tsv', sep='\t')

In [5]:
negative_abstract = negative_abstract.dropna()
positive_abstract = positive_abstract.dropna()

In [6]:
negative_abstract.columns

Index(['PMID', 'Text'], dtype='object')

In [7]:
negative_abstract.shape

(225, 2)

In [8]:
positive_abstract.shape

(446, 2)

In [9]:
negative_abstract['category'] = 0
positive_abstract['category'] = 1

all_abstract = pd.concat([negative_abstract, positive_abstract])

print(all_abstract.shape)

(671, 3)


In [10]:
all_abstract.head()

Unnamed: 0,PMID,Text,category
0,37939877,Sickle cell disease and pregnancy. Pregnancy i...,0
1,28506281,A telephonic mindfulness-based intervention fo...,0
2,36541848,Delayed haemolytic transfusion reaction in pae...,0
3,37855052,Subcutaneous injection of IHP-102 prevents lun...,0
4,36450880,Bone loss is ameliorated by fecal microbiota t...,0


## Fine - Tuning PubMedBert Model

In [11]:
# !pip install torch transformers datasets scikit-learn

In [12]:
# Load PubMedBert

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [13]:
# Prepare Dataset
texts = all_abstract['Text'].tolist()
labels = all_abstract['category'].tolist()

In [14]:
# Tokenize the dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)


In [15]:
# Create a Dataset Class
import torch

class PubMedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

dataset = PubMedDataset(encodings, labels)


In [16]:
# Split the Dataset
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

train_dataset = PubMedDataset(train_encodings, train_labels)
val_dataset = PubMedDataset(val_encodings, val_labels)


In [17]:
# Set Up DataLoader
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [18]:
# from transformers import AdamW, get_scheduler
from torch.optim import AdamW
from transformers import get_scheduler
from tqdm.auto import tqdm

In [19]:
# Train the Model

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|          | 0/201 [00:00<?, ?it/s]

In [20]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

# Evaluate the Model
model.eval()
all_predictions = []
all_labels = []

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')  # 'weighted' for multi-class support
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')

# Print the results
print(f'Validation Accuracy: {accuracy}')
print(f'Validation F1 Score: {f1}')
print(f'Validation Precision: {precision}')
print(f'Validation Recall: {recall}')


Validation Accuracy: 0.9111111111111111
Validation F1 Score: 0.9105606111672846
Validation Precision: 0.9104625362689878
Validation Recall: 0.9111111111111111


In [None]:
# Save the Fine-Tuned Model

import os
# Get the current working directory
save_directory = os.getcwd()

# Specify the directory for saving your model within the current working directory
save_directory = os.path.join(save_directory, 'finetuned_pubmedbert_1')

# Create the directory if it doesn't exist
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

# Save the Fine-Tuned Model and Tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


Model saved to /content/drive/MyDrive/Colab Notebooks/lbl/Automaxo/finetuned_pubmedbert


In [None]:
model.save_pretrained("./finetuned_pubmedbert")
tokenizer.save_pretrained("./finetuned_pubmedbert")

('./finetuned_pubmedbert/tokenizer_config.json',
 './finetuned_pubmedbert/special_tokens_map.json',
 './finetuned_pubmedbert/vocab.txt',
 './finetuned_pubmedbert/added_tokens.json',
 './finetuned_pubmedbert/tokenizer.json')