In [None]:
# !pip install torch transformers datasets scikit-learn

In [1]:
import numpy as np
import pandas as pd
import torch
import os


from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.model_selection import train_test_split
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm
from torch.utils.data import DataLoader
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

#### Read the Data

In [3]:
negative_abstract = pd.read_csv('negative.tsv', sep='\t')
positive_abstract = pd.read_csv('positive.tsv', sep='\t')

In [4]:
negative_abstract = negative_abstract.dropna()
positive_abstract = positive_abstract.dropna()

In [8]:
negative_abstract['category'] = 0
positive_abstract['category'] = 1

all_abstract = pd.concat([negative_abstract, positive_abstract])

print(all_abstract.shape)

(671, 3)


In [9]:
all_abstract.head()

Unnamed: 0,PMID,Text,category
0,37939877,Sickle cell disease and pregnancy. Pregnancy i...,0
1,28506281,A telephonic mindfulness-based intervention fo...,0
2,36541848,Delayed haemolytic transfusion reaction in pae...,0
3,37855052,Subcutaneous injection of IHP-102 prevents lun...,0
4,36450880,Bone loss is ameliorated by fecal microbiota t...,0


## Fine - Tuning PubMedBert Model

In [13]:
# Load PubMedBert
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# Prepare Dataset
texts = all_abstract['Text'].tolist()
labels = all_abstract['category'].tolist()

In [16]:
# Tokenize the dataset

tokenizer = AutoTokenizer.from_pretrained(model_name)

encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)


In [17]:
# Create a Dataset Class
class PubMedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

dataset = PubMedDataset(encodings, labels)


In [18]:
# Split the Dataset
train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

train_dataset = PubMedDataset(train_encodings, train_labels)
val_dataset = PubMedDataset(val_encodings, val_labels)


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [19]:
# Set Up DataLoader
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [20]:
# Train the Model
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)


  0%|          | 0/201 [00:00<?, ?it/s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
100%|██████████| 201/201 [1:15:39<00:00, 21.14s/it]

In [22]:
# Evaluate the Model
model.eval()
all_predictions = []
all_labels = []

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

# Calculate metrics
accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions, average='weighted')  # 'weighted' for multi-class support
precision = precision_score(all_labels, all_predictions, average='weighted')
recall = recall_score(all_labels, all_predictions, average='weighted')

# Print the results
print(f'Validation Accuracy: {accuracy}')
print(f'Validation F1 Score: {f1}')
print(f'Validation Precision: {precision}')
print(f'Validation Recall: {recall}')


Validation Accuracy: 0.8888888888888888
Validation F1 Score: 0.8871224165341813
Validation Precision: 0.8881578947368421
Validation Recall: 0.8888888888888888


In [22]:
"""
Validation Accuracy: 0.8888888888888888
Validation F1 Score: 0.8871224165341813
Validation Precision: 0.8881578947368421
Validation Recall: 0.8888888888888888

"""

'\nValidation Accuracy: 0.8740740740740741\nValidation F1 Score: 0.9050279329608939\n\n'

In [24]:
# Ensure all tensors are contiguous
def make_contiguous(model):
    for param in model.parameters():
        param.data = param.data.contiguous()

# Mount Google Drive
# If your drive is already mounted, you can skip this line
# drive.mount('/content/drive')

# Specify the path in your Google Drive where you want to save the model
save_directory = 'finetuned_pubmedbert'

# Create the directory if it doesn't exist
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

# Ensure that all the model's tensors are contiguous
make_contiguous(model)

# Save the Fine-Tuned Model and Tokenizer to the specified Google Drive folder
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)

print(f'Model saved to {save_directory}')


Model saved to finetuned_pubmedbert


In [25]:
model.save_pretrained("./finetuned_pubmedbert")
tokenizer.save_pretrained("./finetuned_pubmedbert")


('./finetuned_pubmedbert/tokenizer_config.json',
 './finetuned_pubmedbert/special_tokens_map.json',
 './finetuned_pubmedbert/vocab.txt',
 './finetuned_pubmedbert/added_tokens.json',
 './finetuned_pubmedbert/tokenizer.json')