# BioBERT testing


In [27]:
import numpy as np
import pandas as pd
import re
import nltk
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
import torch
import string

# Ensure you have downloaded the required NLTK resources
# nltk.download('stopwords')
# nltk.download('punkt')

# Define text cleaning function
def clean_text(text):
    stemmer = nltk.SnowballStemmer("english")
    text = str(text).lower()
    text = re.sub('\[.*?\]', '', text)
    text = re.sub('https?://\S+|www\.\S+', '', text)
    text = re.sub('<.*?>+', '', text)
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)
    text = re.sub('\n', '', text)
    text = re.sub('\w*\d\w*', '', text)
    text = [word for word in text.split(' ')]
    text = " ".join(text)
    text = [stemmer.stem(word) for word in text.split(' ')]
    text = " ".join(text)
    return text

# Load the pre-trained BERT model and tokenizer
model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract"
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define a function to classify input text
def classify_text(text):
    # Clean the input text
    cleaned_text = clean_text(text)
    
    # Tokenize the cleaned text
    encodings = tokenizer(cleaned_text, padding="max_length", truncation=True, return_tensors="pt")
    
    # Make prediction
    with torch.no_grad():
        outputs = model(**encodings)
    logits = outputs.logits
    prediction = torch.argmax(logits, dim=1).item()
    
    # Map prediction to label
    label = "Positive" if prediction == 1 else "Negative"
    return label


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [28]:
# Example usage
input_text = "it is unknown how long the virus may have been circulating, this may in part be due to the lack of early clinical recognition of an infection with which South Africa previously gained little experience during the ongoing global outbreak, potential pauci-symptomatic manifestation of the disease, or delays in care-seeking behaviour due to limited access to care or fear of stigma."
result = classify_text(input_text)
print(f"The sentiment of the input sentence is: {result}")


Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


The sentiment of the input sentence is: Negative
The sentiment of the input sentence is: Negative


In [23]:
sample_text = "The International Health Regulations (IHR) National Focal Point (NFP) of the Republic of South Africa notified WHO of 20 confirmed mpox cases between 8 May and 2 July 2024, including three deaths (case fatality ratio (CFR) of 15%)"
result = classify_text(sample_text)
print(f"The sentiment of the input sentence is: {result}")

The sentiment of the input sentence is: Negative
The sentiment of the input sentence is: Negative


'O' (Outside): This label indicates that the token does not belong to any entity or specific relation of interest. It is outside of any target category.

'B-Cause' (Beginning of Cause): This label marks the beginning of a span that represents a causal factor. It is the first token of a cause-related phrase.

'I-Cause' (Inside of Cause): This label is used for tokens that are inside a span that represents a causal factor. It follows the 'B-Cause' label and continues the cause-related phrase.

'B-Effect' (Beginning of Effect): This label marks the beginning of a span that represents an effect. It is the first token of an effect-related phrase.

'I-Effect' (Inside of Effect): This label is used for tokens that are inside a span that represents an effect. It follows the 'B-Effect' label and continues the effect-related phrase.

In [10]:
texts = [
    "The lack of early clinical recognition of an infection leads to community transmission of mpox.",
    "It is raining outside.",
    "Delays in care-seeking behaviour due to limited access to care or fear of stigma.",
    "The power went out because of the storm."
]

# Labels must align with tokens after tokenization. Here is a simplified example:
labels = [
    ['O', 'O', 'O', 'O', 'O', 'B-Cause', 'I-Cause', 'I-Cause', 'I-Cause', 'I-Cause', 'O', 'O', 'B-Effect', 'I-Effect', 'I-Effect', 'I-Effect', 'O', 'O', 'B-Effect', 'I-Effect', 'O'],
    ['O', 'O', 'O'],
    ['O', 'O', 'O', 'O', 'O', 'O', 'B-Cause', 'I-Cause', 'I-Cause', 'O', 'B-Effect', 'I-Effect', 'O'],
    ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-Cause', 'O', 'O', 'O', 'B-Effect', 'O']
]

class CausalRelationDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        labels = self.labels[idx]
        encoding = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        input_ids = encoding['input_ids'].flatten()
        attention_mask = encoding['attention_mask'].flatten()
        
        # Convert labels to IDs, padding or truncating as necessary
        label_ids = [self.label_to_id(label) for label in labels]
        label_ids += [self.label_to_id('O')] * (self.max_length - len(label_ids))
        label_ids = label_ids[:self.max_length]

        return {
            'input_ids': input_ids,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label_ids, dtype=torch.long)
        }

    def label_to_id(self, label):
        label_map = {'O': 0, 'B-Cause': 1, 'I-Cause': 2, 'B-Effect': 3, 'I-Effect': 4}
        return label_map[label]

dataset = CausalRelationDataset(texts, labels, tokenizer)

In [None]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=2,
    per_device_eval_batch_size=2,
    logging_dir='./logs',
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    eval_dataset=dataset,
)

trainer.train()


In [12]:
def predict_causal_relations(text, model, tokenizer):
    encoding = tokenizer.encode_plus(
        text,
        add_special_tokens=True,
        max_length=128,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt',
    )
    input_ids = encoding['input_ids']
    attention_mask = encoding['attention_mask']

    model.eval()  # Set the model to evaluation mode
    with torch.no_grad():  # Disable gradient calculation
        outputs = model(input_ids, attention_mask=attention_mask)
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=2).flatten().tolist()

    tokens = tokenizer.convert_ids_to_tokens(input_ids.flatten().tolist())
    prediction_labels = [id_to_label(pred) for pred in predictions]

    return list(zip(tokens, prediction_labels))

def id_to_label(label_id):
    label_map = {0: 'O', 1: 'B-Cause', 2: 'I-Cause', 3: 'B-Effect', 4: 'I-Effect'}
    return label_map[label_id]


In [None]:
# Example prediction
sample_text = "The sudden appearance of unlinked cases of mpox in South Africa without a history of international travel, the high HIV prevalence among confirmed cases, and the high case fatality ratio suggest that community transmission is underway, and the cases detected to date represent a small proportion of all mpox cases that might be occurring in the community; it is unknown how long the virus may have been circulating. This may in part be due to the lack of early clinical recognition of an infection with which South Africa previously gained little experience during the ongoing global outbreak, potential pauci-symptomatic manifestation of the disease, or delays in care-seeking behaviour due to limited access to care or fear of stigma."
predictions = predict_causal_relations(sample_text, model, tokenizer)

# Print all tokens and their corresponding labels
for token, label in predictions:
    print(f'{token} - {label}')

# Print only tokens with labels other than 'O'
print("\nFiltered Predictions (labels other than 'O'):")
for token, label in predictions:
    if label != 'O':
        print(f'{token} - {label}')