In [None]:
# @title Default title text
!pip install transformers datasets torch

Collecting datasets
  Downloading datasets-2.20.0-py3-none-any.whl (547 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m547.8/547.8 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-16.1.0-cp310-cp310-manylinux_2_28_x86_64.whl (40.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.8/40.8 MB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.1 MB/s[0m eta [36m0:00:00[0m
Collecting requests (from transformers)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [3]:
from huggingface_hub import notebook_login

# Authenticate interactively: the token is typed into the login widget so it is
# not saved in the notebook source.
# NOTE(review): an access token used to be pasted here as a comment. Treat it as
# leaked and revoke it in the Hugging Face account settings.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Dataset Loading

In [None]:
from datasets import load_dataset

# Fetch the three PubMedQA configurations (artificial, labelled, unlabelled),
# in that order, from the same Hub repository.
ds_artificial, ds_labelled, ds_unlabelled = (
    load_dataset("qiaojin/PubMedQA", subset_name)
    for subset_name in ("pqa_artificial", "pqa_labeled", "pqa_unlabeled")
)

Downloading data:   0%|          | 0.00/233M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/211269 [00:00<?, ? examples/s]

Downloading data:   0%|          | 0.00/66.0M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/61249 [00:00<?, ? examples/s]

In [None]:
# Show the splits, columns and row counts of each PubMedQA configuration.
for heading, pubmedqa_subset in (
    ("ARTIFICIAL DATASET: \n", ds_artificial),
    ("LABELLED DATASET: \n", ds_labelled),
    ("UNLABELLED DATASET: \n", ds_unlabelled),
):
    print(heading, pubmedqa_subset)

ARTIFICIAL DATASET: 
 DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 211269
    })
})
LABELLED DATASET: 
 DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer', 'final_decision'],
        num_rows: 1000
    })
})
UNLABELLED DATASET: 
 DatasetDict({
    train: Dataset({
        features: ['pubid', 'question', 'context', 'long_answer'],
        num_rows: 61249
    })
})


In [None]:
import pandas as pd

# `ds` is never defined in this notebook, so the cell failed on a fresh kernel.
# The labelled subset is the one with 1000 rows and 5 columns, matching the
# shape this cell reports.
df = pd.DataFrame(ds_labelled["train"])
df.shape

(1000, 5)

In [None]:
# Write the DataFrame built above to train.csv in the working directory.
df.to_csv(path_or_buf="train.csv")

In [None]:
# Print every field of one record; change `i` to inspect a different row.
i = 0
labelled_columns = (
    ("PUBID: ", 'pubid'),
    ("QUESTION: ", 'question'),
    ("CONTEXT: ", 'context'),
    ("LONG_ANSWER: ", 'long_answer'),
    ("FINAL_DECISION: ", 'final_decision'),
)
for label, column in labelled_columns:
    print(label, df[column][i])

PUBID:  21645374
QUESTION:  Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?
CONTEXT:  {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.', 'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), cells in early 

# Model Loading and Benchmarking

In [4]:
# Install necessary libraries
!pip install transformers datasets torch

# Import necessary libraries
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset
import torch
from torch.nn import DataParallel
from tqdm.auto import tqdm  # Import tqdm for the progress bar

# Load the PubMedQA dataset
dataset = load_dataset("qiaojin/PubMedQA", "pqa_labeled")

# Load the tokenizer and model
model_name = "gpt2"  # You can change this to any other GPT model
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Check if multiple GPUs are available and wrap the model
if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = DataParallel(model)

model.to('cuda')  # Move the model to GPU for faster computation
tokenizer.pad_token_id = tokenizer.eos_token_id



In [7]:
def _build_prompt(item):
    """Format one PubMedQA record as the yes/no/maybe prompt fed to the model."""
    context = item['context']
    # The 'context' field is a dict holding a 'contexts' list of passages; join
    # them so the prompt contains plain text rather than the dict's Python repr.
    if isinstance(context, dict) and 'contexts' in context:
        context = " ".join(context['contexts'])
    return f"Question: {item['question']} Context: {context} Is the answer 'yes', 'no', or 'maybe'?"


def _extract_decision(generated_text):
    """Return the first standalone 'yes'/'no'/'maybe' in the text, or None if absent."""
    import re  # local import so this cell does not depend on another cell's imports

    match = re.search(r"\b(yes|no|maybe)\b", generated_text.lower())
    return match.group(1) if match else None


def evaluate_model(dataset, model, tokenizer, device='cuda', max_new_tokens=50):
    """Compute the accuracy of a causal LM on PubMedQA yes/no/maybe decisions.

    For every record a prompt is built from its question and context, the model
    greedily generates a continuation, and the first 'yes'/'no'/'maybe' found in
    that continuation (the prompt itself is excluded) is compared with the
    record's 'final_decision'.

    Args:
        dataset: Sized iterable of records with 'question', 'context' and
            'final_decision' keys.
        model: Causal LM exposing generate(); a DataParallel wrapper is
            unwrapped through its `.module` attribute.
        tokenizer: Tokenizer matching the model.
        device: Device the tokenized inputs are moved to; must match the model's.
        max_new_tokens: Maximum number of tokens generated per prompt.

    Returns:
        Fraction of records answered correctly, as a float in [0, 1]. Records
        whose generation raises RuntimeError are reported and count as wrong.
        Returns 0.0 for an empty dataset.
    """
    # DataParallel only proxies forward(); generate() lives on the wrapped module.
    generator = getattr(model, 'module', model)
    generator.eval()

    total_predictions = len(dataset)
    if total_predictions == 0:
        return 0.0

    # Reserve room for the generated tokens inside the model's context window;
    # a prompt that fills the whole window makes generation index past the
    # position embeddings. Tokenizers without a real limit report a huge
    # sentinel value, in which case the tokenizer's default truncation is kept.
    context_window = getattr(tokenizer, 'model_max_length', None)
    if isinstance(context_window, int) and max_new_tokens < context_window < 1_000_000:
        max_prompt_tokens = context_window - max_new_tokens
    else:
        max_prompt_tokens = None

    correct_predictions = 0
    progress_bar = tqdm(total=total_predictions, desc='Evaluating', leave=True)
    try:
        for item in dataset:
            inputs = tokenizer(
                _build_prompt(item),
                return_tensors="pt",
                truncation=True,
                max_length=max_prompt_tokens,
            ).to(device)
            prompt_length = inputs['input_ids'].shape[1]

            try:
                outputs = generator.generate(
                    **inputs,  # passes attention_mask along with input_ids
                    max_new_tokens=max_new_tokens,
                    pad_token_id=tokenizer.eos_token_id,
                    num_return_sequences=1,
                    do_sample=False,  # greedy decoding: deterministic, temperature not needed
                )
            except RuntimeError as e:
                # Best effort: report the failure and count the record as incorrect.
                print(f"RuntimeError: {e}")
            else:
                # Decode only the newly generated tokens; the returned sequence
                # starts with the prompt, which would otherwise never equal a
                # bare 'yes'/'no'/'maybe'.
                completion = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
                expected_answer = item['final_decision'].strip().lower()
                if _extract_decision(completion) == expected_answer:
                    correct_predictions += 1

            progress_bar.update(1)  # advance for every record, including failed ones
    finally:
        progress_bar.close()

    return correct_predictions / total_predictions


In [None]:
# Benchmark the loaded model on the labelled PubMedQA split and report the score.
accuracy = evaluate_model(dataset=dataset['train'], model=model, tokenizer=tokenizer)
print(f"Accuracy: {accuracy * 100:.2f}%")

Evaluating:   0%|          | 0/1000 [00:00<?, ?it/s]

