In [1]:
import numpy as np
import pandas as pd

#### Read the Data

In [2]:
!rm selected_negative.*
!rm positive.*

!wget 'https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/selected_negative_jhc.tsv'

rm: cannot remove 'selected_negative.*': No such file or directory
rm: cannot remove 'positive.*': No such file or directory
--2024-10-17 04:40:07--  https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/selected_negative_jhc.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 370178 (362K) [text/plain]
Saving to: ‘selected_negative_jhc.tsv’


2024-10-17 04:40:07 (73.0 MB/s) - ‘selected_negative_jhc.tsv’ saved [370178/370178]



In [3]:
!wget 'https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/positive.tsv'

--2024-10-17 04:40:07--  https://raw.githubusercontent.com/monarch-initiative/automaxo/ml_optimisation/ml_optimisation/positive.tsv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 686469 (670K) [text/plain]
Saving to: ‘positive.tsv’


2024-10-17 04:40:08 (12.9 MB/s) - ‘positive.tsv’ saved [686469/686469]



In [4]:
negative_abstract = pd.read_csv('selected_negative_jhc.tsv', sep='\t')
positive_abstract = pd.read_csv('positive.tsv', sep='\t')

In [5]:
negative_abstract = negative_abstract.dropna()
positive_abstract = positive_abstract.dropna()

In [6]:
negative_abstract.columns

Index(['PMID', 'Text'], dtype='object')

In [7]:
negative_abstract.shape

(225, 2)

In [8]:
positive_abstract.shape

(446, 2)

In [9]:
negative_abstract['category'] = 0
positive_abstract['category'] = 1

all_abstract = pd.concat([negative_abstract, positive_abstract])

print(all_abstract.shape)

(671, 3)


In [10]:
all_abstract.head()

Unnamed: 0,PMID,Text,category
0,37939877,Sickle cell disease and pregnancy. Pregnancy i...,0
1,28506281,A telephonic mindfulness-based intervention fo...,0
2,36541848,Delayed haemolytic transfusion reaction in pae...,0
3,37855052,Subcutaneous injection of IHP-102 prevents lun...,0
4,36450880,Bone loss is ameliorated by fecal microbiota t...,0


## Fine - Tuning PubMedBert Model

In [11]:
# !pip install torch transformers datasets scikit-learn

In [12]:
# Load PubMedBert

from transformers import AutoModelForSequenceClassification, AutoTokenizer

model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]



In [13]:
# Prepare Dataset
texts = all_abstract['Text'].tolist()
labels = all_abstract['category'].tolist()

In [14]:
# Tokenize the dataset
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(model_name)

encodings = tokenizer(texts, truncation=True, padding=True, max_length=512)


In [15]:
# Create a Dataset Class
import torch

class PubMedDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

dataset = PubMedDataset(encodings, labels)


In [16]:
# Split the Dataset
from sklearn.model_selection import train_test_split

train_texts, val_texts, train_labels, val_labels = train_test_split(texts, labels, test_size=0.2)
train_encodings = tokenizer(train_texts, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_texts, truncation=True, padding=True, max_length=512)

train_dataset = PubMedDataset(train_encodings, train_labels)
val_dataset = PubMedDataset(val_encodings, val_labels)


In [17]:
# Set Up DataLoader
from torch.utils.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8, shuffle=False)


In [18]:
# Train the Model
from transformers import AdamW, get_scheduler
from tqdm.auto import tqdm

optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 3
num_training_steps = num_epochs * len(train_loader)
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

progress_bar = tqdm(range(num_training_steps))

model.train()
for epoch in range(num_epochs):
    for batch in train_loader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar.update(1)




  0%|          | 0/201 [00:00<?, ?it/s]

In [19]:
# Evaluate the Model
from sklearn.metrics import accuracy_score, f1_score

model.eval()
all_predictions = []
all_labels = []

for batch in val_loader:
    batch = {k: v.to(device) for k, v in batch.items()}
    with torch.no_grad():
        outputs = model(**batch)
        predictions = outputs.logits.argmax(dim=-1)
        all_predictions.extend(predictions.cpu().numpy())
        all_labels.extend(batch['labels'].cpu().numpy())

accuracy = accuracy_score(all_labels, all_predictions)
f1 = f1_score(all_labels, all_predictions)

print(f'Validation Accuracy: {accuracy}')
print(f'Validation F1 Score: {f1}')


Validation Accuracy: 0.8666666666666667
Validation F1 Score: 0.8977272727272727


In [22]:
"""
Validation Accuracy: 0.8740740740740741
Validation F1 Score: 0.9050279329608939

"""

'\nValidation Accuracy: 0.8740740740740741\nValidation F1 Score: 0.9050279329608939\n\n'

In [23]:
# Save the Fine-Tuned Model

import os
# Get the current working directory
save_directory = os.getcwd()

# Specify the directory for saving your model within the current working directory
save_directory = os.path.join(save_directory, 'finetuned_pubmedbert')

# Create the directory if it doesn't exist
if not os.path.exists(save_directory):
    os.makedirs(save_directory)

# Save the Fine-Tuned Model and Tokenizer
model.save_pretrained(save_directory)
tokenizer.save_pretrained(save_directory)


ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

In [24]:
model.save_pretrained("./finetuned_pubmedbert")
tokenizer.save_pretrained("./finetuned_pubmedbert")


ValueError: You are trying to save a non contiguous tensor: `bert.encoder.layer.0.attention.self.query.weight` which is not allowed. It either means you are trying to save tensors which are reference of each other in which case it's recommended to save only the full tensors, and reslice at load time, or simply call `.contiguous()` on your tensor to pack it before saving.

In [25]:
# Load the Model and Tokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch

# Define the directory where the model and tokenizer are saved
save_directory = './finetuned_pubmedbert'

# Load the model and tokenizer from the directory
model = AutoModelForSequenceClassification.from_pretrained(save_directory)
tokenizer = AutoTokenizer.from_pretrained(save_directory)

# Ensure the model is on the correct device (CPU or GPU)
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)


OSError: Error no file named pytorch_model.bin, model.safetensors, tf_model.h5, model.ckpt.index or flax_model.msgpack found in directory ./finetuned_pubmedbert.

In [26]:
# Define the classification Function
def classify_abstract(text):
    # Tokenize the input text
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512)

    # Move the inputs to the same device as the model (CPU or GPU)
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Put the model in evaluation mode
    model.eval()

    # Disable gradient calculation for inference
    with torch.no_grad():
        # Get the model's predictions
        outputs = model(**inputs)
        # Get the predicted label (0 or 1)
        prediction = outputs.logits.argmax(dim=-1).item()

    # Return the result based on the prediction
    return "Relevant" if prediction == 1 else "Not Relevant"



In [27]:
# Classify a New Abstract
text_1 = 'Sickle cell disease and pregnancy. Pregnancy is a particularly risky period in the life of patients with sickle cell disease (SCD). Physiological changes during pregnancy increase the risk of vaso-occlusive crises (VOC), acute chest syndrome, venous thromboembolic events, and infections. This concerns haemoglobin (Hb) S/C and S/beta+-thalassaemia patients as much than S/S or S/beta0-thalassaemia patients. SCD also increases the risk of obstetrical complications, such as preeclampsia, in utero foetal death, preterm delivery mostly induced, and intrauterine growth restriction. Thus, pregnancy should be planned and closely monitored by a multidisciplinary team involving obstetricians and sickle cell disease specialists. Before pregnancy, the parents should also be informed about the risk of transmission of this autosomal recessive disease, and the father should therefore be prescribed haemoglobin electrophoresis. Treatments have to be revised when planning pregnancy: hydroxyurea (HU) should be stopped as soon as pregnancy is suspected or confirmed. Preventive blood transfusion is not systematic, but is recommended in the case of a pre-existing transfusion program prior to pregnancy, severe pre-existing organ damage, severe obstetric history, and severe or repeated crises during follow-up, especially in patients taking HU before. Despite the risks of prematurity, systematic administration of corticosteroids for foetal lung maturation is not recommended due to the risk of maternal vaso-occlusive event. Although more frequent, due to obstetrical and maternal complications, caesarean section is not systematic, in the absence of maternal contraindications. It is advisable not to exceed the term of 39 weeks of amenorrhoea. Post-partum follow-up is recommended, particularly because of the risk of thromboembolism.'
text_2 = 'A telephonic mindfulness-based intervention for persons with sickle cell disease: study protocol for a randomized controlled trial. BACKGROUND: One of the most difficult symptoms for persons with sickle cell disease (SCD) to manage is chronic pain. Chronic pain impacts approximately one-third of persons with SCD and is associated with increased pain intensity, pain behavior, and frequency and duration of hospital visits. A promising category of nonpharmacological interventions for managing both physical and affective components of pain are mindfulness-based interventions (MBIs). METHODS/DESIGN: The primary aim of this study is to conduct a randomized controlled study to evaluate the acceptability and feasibility, as well as to determine the preliminary efficacy, of a telephonic MBI for adults with SCD who have chronic pain. We will enroll 60 adult patients with SCD and chronic pain at an outpatient comprehensive SCD center in the southeastern United States. Patients will be randomized to either an MBI or a wait-listed control group. The MBI group will complete a six-session (60 minutes), telephonically delivered, group-based MBI program. The feasibility, acceptability, and efficacy of the MBI regarding pain catastrophizing will be assessed by administering questionnaires at baseline and weeks 1, 3, and 6. In addition, ten randomly selected MBI participants will complete semistructured interviews to help determine intervention acceptability. DISCUSSION: In this study protocol, we report detailed methods of the randomized controlled trial. Findings of this study will be useful to determine the acceptability, feasibility, and efficacy of an MBI for persons with SCD and chronic pain. TRIAL REGISTRATION: ClinicalTrials.gov identifier: NCT02394587 . Registered on 9 February 2015.'
text_3 = 'Treatment of chronic chest wall pain in a patient with Loeys-Dietz syndrome using spinal cord stimulation. OBJECTIVE: Spinal cord stimulation (SCS) has been used with increased frequency for the treatment of intractable pain including chronic chest pain. METHODS: The patient with a history of Loeys-Dietz syndrome who underwent repair of pectus excavatum with subsequent chronic chest wall pain underwent an uneventful SCS trial with percutaneous epidural placement of two temporary eight-electrode leads placed at the level of T3-T4-T5. RESULTS: After experiencing excellent pain relief over the next two days, the patient was implanted with permanent leads and rechargeable generator four weeks later and reported sustained pain relief at 12-month follow-up visit. CONCLUSION: SCS provides an effective, alternative treatment option for select patients with Loeys-Dietz syndrome who underwent repair of pectus excavatum with subsequent chronic chest wall pain who have failed conservative treatment. SCS may provide pain relief with advantages over conservative treatments and more invasive techniques.'

In [29]:
text_4 = "Background: Marfan's Syndrome (MFS) is a disorder of connective tissue, mainly involving the cardiovascular, musculoskeletal, and ocular systems. The most severe problems include aortic root dilatation and dissection. Anesthetic management is vital for the improvement on perioperative morbidity. Case report: 61-year-old male with MFS, presenting mainly with pectus carinatum, scoliosis, ectopia lens, previous spontaneous pneumothorax and aortal aneurysm and dissection submitted to thoracoabdominal aortic prosthesis placement. Underwent routine laparoscopic cholecystectomy due to lithiasis. Important findings on preoperative examination were thoracolumbar kyphoscoliosis, metallic murmur on cardiac exam. Chest radiograph revealed Cobb angle of 70°. Echocardiogram showed evidence of aortic mechanical prosthesis with no deficits. Discussion: Preoperative evaluation should focus on cardiopulmonary abnormalities. The anesthesiologist should be prepared for a potentially difficult intubation. Proper positioning and limb support prior to induction is crucial in order to avoid joint injuries. Consider antibiotic prophylaxis for subacute bacterial endocarditis. The patient should be carefully positioned to avoid joint injuries. Intraoperatively cardiovascular monitoring is mandatory: avoid maneuvers that can lead to tachycardia or hypertension, control airway pressure to prevent pneumothorax and maintain an adequate volemia to decrease chances of prolapse, especially if considering laparoscopic surgery. No single intraoperative anesthetic agent or technique has demonstrated superiority. Adequate postoperative pain management is vitally important to avoid the detrimental effects of hypertension and tachycardia."

In [30]:
text_5 = "Marfan syndrome is a rare multisystem disease of the connective tissue, which affects multiple organ systems. advances in healthcare have doubled the life-expectancy of patients over the past three decades. to date, there is no comprehensive review that consolidates economic considerations and care for marfan patients. Areas covered: Present research suggests that there may be a link between treatment pattern, disease progression and economic costs of Marfan syndrome. It indicates that an early detection of the disease and preventive interventions achieve a dual aim. From a patient perspective, it may reduce the amount of emergency surgery or intervention, and inpatient stays. In addition, it slows disease progression, lowers lifestyle restrictions, reduces psychological stress, and improves health-related quality of life. Expert commentary: Early detection and preventive measures are likely to achieve a dual aim by simultaneously containing costs and reducing the number and length of inpatient stays."

In [31]:
text_6 = "Marfan syndrome (MFS) is a complex connective tissue disease that is primarily characterized by cardiovascular, ocular and skeletal systems disorders. Despite its rarity, MFS severely impacts the quality of life of the patients. It has been shown that molecular genetic factors serve critical roles in the pathogenesis of MFS. FBN1 is associated with MFS and the other genes such as FBN2, transforming growth factor beta (TGF-β) receptors (TGFBR1 and TGFBR2), latent TGF-β-binding protein 2 (LTBP2) and SKI, amongst others also have their associated syndromes, however high overlap may exist between these syndromes and MFS. Abnormalities in the TGF-β signaling pathway also contribute to the development of aneurysms in patients with MFS, although the detailed molecular mechanism remains unclear. Mutant FBN1 protein may cause unstableness in elastic structures, thereby perturbing the TGF-β signaling pathway, which regulates several processes in cells. Additionally, DNA methylation of FBN1 and histone acetylation in an MFS mouse model demonstrated that epigenetic factors play a regulatory role in MFS. The purpose of the present review is to provide an up-to-date understanding of MFS-related genes and relevant assessment technologies, with the aim of laying a foundation for the early diagnosis, consultation and treatment of MFS."

In [32]:
text_7 = "Yeast and cancer cells – common principles in lipid metabolism One of the paradigms in cancer pathogenesis is the requirement of a cell to undergo transformation from respiration to aerobic glycolysis – the Warburg effect – to become malignant. The demands of a rapidly proliferating cell for carbon metabolites for the synthesis of biomass, energy and redox equivalents, are fundamentally different from the requirements of a differentiated, quiescent cell, but it remains open whether this metabolic switch is a cause or a consequence of malignant transformation. One of the major requirements is the synthesis of lipids for membrane formation to allow for cell proliferation, cell cycle progression and cytokinesis. Enzymes involved in lipid metabolism were indeed found to play a major role in cancer cell proliferation, and most of these enzymes are conserved in the yeast, Saccharomyces cerevisiae. Most notably, cancer cell physiology and metabolic fluxes are very similar to those in the fermenting and rapidly proliferating yeast. Both types of cells display highly active pathways for the synthesis of fatty acids and their incorporation into complex lipids, and imbalances in synthesis or turnover of lipids affect growth and viability of both yeast and cancer cells. Thus, understanding lipid metabolism in S. cerevisiae during cell cycle progression and cell proliferation may complement recent efforts to understand the importance and fundamental regulatory mechanisms of these pathways in cancer."

In [33]:
text_8 = "Dan Tracey Interview with Dan Tracey, who uses the fruit fly model to study circuits and genes that function in nociception at Indiana University."

In [35]:
classify_abstract(text_7)


'Not Relevant'