In [None]:
from transformers import AutoTokenizer, AutoModelForQuestionAnswering, AutoModelForSequenceClassification
import torch
import json
from tqdm import tqdm
import numpy as np
import datasets

In [None]:
# PubMedQA is addressed as (repository id, configuration name). The
# configuration must be passed as the second argument rather than appended
# to the repository path.
datasets.load_dataset("qiaojin/PubMedQA", "pqa_labeled")

In [4]:

# Directory holding the fine-tuned checkpoint; both the weights and the
# tokenizer files are read from it.
model_path = "trained_model/"
model = AutoModelForQuestionAnswering.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)


In [34]:
def predict_single_example(model, tokenizer, question, context, max_length=512, label_map=None):
    """
    Run inference on a single question/context pair with a sequence classifier.

    Args:
        model: The trained classification model; its output must expose ``.logits``
        tokenizer: The tokenizer matching the model
        question (str): The question text
        context (str): The context text
        max_length (int): Maximum sequence length (longer inputs are truncated)
        label_map (dict | None): Maps class index -> label name. Defaults to
            ``{0: 'yes', 1: 'no', 2: 'maybe'}``. Class indices that are not in
            the map are reported under ``str(index)``.

    Returns:
        dict: ``prediction`` (label of the arg-max class), ``probabilities``
        (label -> probability formatted with three decimals, one entry per
        class the model actually outputs) and ``input`` (the question and
        context that were passed in).
    """
    if label_map is None:
        label_map = {0: 'yes', 1: 'no', 2: 'maybe'}

    # Prepare the input
    inputs = tokenizer(
        question,
        context,
        truncation=True,
        max_length=max_length,
        padding='max_length',
        return_tensors='pt'
    )

    # Put the inputs on whatever device the model's parameters live on
    device = next(model.parameters()).device
    inputs = {k: v.to(device) for k, v in inputs.items()}

    # Run inference in eval mode so dropout etc. cannot perturb the result,
    # then restore the mode the caller left the model in.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            outputs = model(**inputs)
            logits = outputs.logits
            probs = torch.nn.functional.softmax(logits, dim=-1)
    finally:
        model.train(was_training)

    # Get predictions (single example, so the batch dimension has size 1)
    predicted_class = torch.argmax(probs, dim=-1).item()
    probabilities = probs[0].tolist()

    def label_for(class_idx):
        # Fall back to the raw index when the map does not name this class
        return label_map.get(class_idx, str(class_idx))

    # Build the probability table from the classes the model really has,
    # instead of assuming exactly three outputs.
    result = {
        'prediction': label_for(predicted_class),
        'probabilities': {
            label_for(class_idx): f"{prob:.3f}"
            for class_idx, prob in enumerate(probabilities)
        },
        'input': {
            'question': question,
            'context': context
        }
    }

    return result

In [59]:
def convert_pubmed_to_ynm(dataset):
    """
    Reshape every split of a PubMedQA-style dataset for yes/no/maybe training.

    Each example is mapped so that it carries ``question``, ``context`` (the
    entries of ``context['contexts']`` joined with single spaces), ``label``
    (taken from ``final_decision``), ``pubid`` and ``original_answer`` (taken
    from ``long_answer``). The label distribution of every split is printed
    as a sanity check.

    Args:
        dataset: Mapping of split name -> dataset exposing ``map``.

    Returns:
        datasets.DatasetDict: The converted splits under their original names.
    """
    def to_ynm_row(row):
        passages = row['context']['contexts']
        return dict(
            question=row['question'],
            context=' '.join(passages),
            label=row['final_decision'],
            pubid=row['pubid'],
            original_answer=row['long_answer'],
        )

    per_split = {}
    for split, split_data in dataset.items():
        mapped = split_data.map(to_ynm_row)
        per_split[split] = mapped

        # Tally the labels so the class balance can be eyeballed
        split_labels = mapped['label']
        tally = {}
        for value in split_labels:
            if value in tally:
                tally[value] += 1
            else:
                tally[value] = 1

        print(f"\nLabel distribution in {split} split:")
        for value, seen in tally.items():
            print(f"{value}: {seen} ({seen/len(split_labels)*100:.2f}%)")

    return datasets.DatasetDict(per_split)

def convert_contrast_to_ynm(json_data, skip_first=1):
    """
    Flatten contrast-set JSON (original plus perturbed questions) into yes/no rows.

    Every item contributes one row for its original question followed by one
    row per perturbed question; all of them share the item's paragraph as
    context. An answer is labelled 'yes' when it reads TRUE (case-insensitive,
    JSON booleans accepted) and 'no' otherwise.

    Args:
        json_data (dict): JSON data with a 'data' key containing a list of
            items. Each item provides 'question', 'paragraph', 'answer' and
            'perturbed_questions' (a list of dicts with 'perturbed_q' and
            'answer').
        skip_first (int): Number of leading flattened rows to drop. The
            default of 1 keeps the historical behaviour of this function,
            which discarded the very first row (the first item's original
            question, but not its perturbations). Pass 0 to keep every row.

    Returns:
        dict: ``{'train': datasets.Dataset}`` with the columns 'question',
        'context' and 'label'.

    Raises:
        ValueError: If ``skip_first`` is negative.
    """
    if skip_first < 0:
        raise ValueError("skip_first must be >= 0")

    def to_label(answer):
        # str() lets JSON booleans (true/false) behave like "TRUE"/"FALSE"
        return 'yes' if str(answer).upper() == 'TRUE' else 'no'

    rows = []
    for item in json_data['data']:
        paragraph = item['paragraph']

        # Original question first ...
        rows.append((item['question'], paragraph, to_label(item['answer'])))

        # ... then every perturbed variant against the same paragraph
        for perturbed in item['perturbed_questions']:
            rows.append((perturbed['perturbed_q'], paragraph, to_label(perturbed['answer'])))

    kept = rows[skip_first:]

    return {
        'train': datasets.Dataset.from_dict({
            'question': [question for question, _, _ in kept],
            'context': [context for _, context, _ in kept],
            'label': [label for _, _, label in kept]
        })
    }

In [69]:
# Read the perturbed-question JSON and flatten it into yes/no examples.
# The encoding is pinned to UTF-8 so the read does not depend on the
# platform's default locale encoding.
with open('boolq_perturbed.json', 'r', encoding='utf-8') as f:
    json_data = json.load(f)
dataset = convert_contrast_to_ynm(json_data)
# Alternative source (PubMedQA), kept for switching datasets by hand:
# dataset = datasets.load_dataset("qiaojin/PubMedQA", "pqa_labeled")
# dataset = convert_pubmed_to_ynm(dataset)
dataset

{'train': Dataset({
     features: ['question', 'context', 'label'],
     num_rows: 410
 })}

In [70]:
# Tally the train split by label; anything that is not 'yes' lands in 'no'
label_totals = {'yes': 0, 'no': 0}
for ex in dataset['train']:
    bucket = 'yes' if ex['label'] == 'yes' else 'no'
    label_totals[bucket] += 1
yes, no = label_totals['yes'], label_totals['no']

print(yes, no)

207 203


In [41]:
# Swap in the three-way sequence classifier and the tokenizer saved with it
tokenizer = AutoTokenizer.from_pretrained('trainedconpm')
model = AutoModelForSequenceClassification.from_pretrained('trainedconpm', num_labels=3)

In [42]:
# Spot-check the classifier on one example from the train split
example = dataset['train'][3]
predict_single_example(model, tokenizer, example['question'], example['context'], max_length=512)

{'prediction': 'yes',
 'probabilities': {'yes': '0.573', 'no': '0.333', 'maybe': '0.094'},
 'input': {'question': 'Are the long-term results of the transanal pull-through equal to those of the transabdominal pull-through?',
  'context': 'contexts labels meshes reasoning_required_pred reasoning_free_pred'}}

In [20]:
# Evaluation driver: point the helper at a checkpoint and a data file, then
# report how many result records came back.
# NOTE(review): `evaluate_model` is not defined in the cells visible here; it
# has to exist in the kernel before this cell runs. The recorded run of this
# cell stopped with "TypeError: list indices must be integers or slices, not
# str" — confirm the helper matches the layout of `data_file`.
model_path = "trained_model/"  # Replace with your model path
data_file = '../contrast-sets/BoolQ/boolq_perturbed.json'   # Replace with your data file path
batch_size = 8  # Adjust based on your GPU memory

# Run evaluation
print("Starting evaluation...")
results = evaluate_model(model_path, data_file, batch_size)

# Print some statistics
print(f"\nEvaluation completed on {len(results)} examples")

Starting evaluation...


TypeError: list indices must be integers or slices, not str

In [19]:
# Display the evaluation results held in `results`
results

[{'id': '3d8155d6-a740-47c6-8835-7c89c708fdb4_1',
  'question': 'How many priests influenced Mem de Sa?',
  'context': "Urged by influential Jesuit priests who had come to Brazil with Mem de Sá, named José de Anchieta and Manuel da Nóbrega, and who had played a big role in pacifying the Tamoios, Mem de Sá ordered his nephew, Estácio de Sá to assemble a new attack force. Estácio de Sá founded the city of Rio de Janeiro on March 1, 1565, and fought the Frenchmen for two more years. Helped by a military reinforcement sent by his uncle, on January 20, 1567, he imposed final defeat on the French forces and decisively expelled them from Brazil, but died a month later from wounds inflicted in the battle. Coligny's and Villegaignon's dream had lasted a mere 12 years. Largely in response to the two attempts of France to conquer territory in Brazil , between 1612 and 1615, the Portuguese crown decided to expand its colonization efforts in Brazil. Prior to this in 1560, Mem de Sá, the new Governo

In [62]:
import datasets
# Load the SQuAD dataset
squad = datasets.load_dataset('squad')

# Load the "pqa_labeled" configuration of PubMedQA
pubmed = datasets.load_dataset("qiaojin/PubMedQA", "pqa_labeled")
# HotpotQA ("fullwiki" configuration) is currently disabled
# hotpot = datasets.load_dataset('hotpot_qa', 'fullwiki')

In [63]:
# Two-class lookup from class index to label name
label_map = dict(enumerate(('yes', 'no')))

# List every class index next to its label (no metrics are computed here)
for class_idx in label_map:
    class_name = label_map[class_idx]
    print(class_idx, class_name)

0 yes
1 no


In [16]:
# Convert the loaded PubMedQA splits to the yes/no/maybe layout
# (this call also prints the label distribution of each split)
ds = convert_pubmed_to_ynm(pubmed)

In [18]:
# Inspect the first raw PubMedQA training example
pubmed['train'][0]

{'pubid': 21645374,
 'question': 'Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?',
 'context': {'contexts': ['Programmed cell death (PCD) is the regulated death of cells within an organism. The lace plant (Aponogeton madagascariensis) produces perforations in its leaves through PCD. The leaves of the plant consist of a latticework of longitudinal and transverse veins enclosing areoles. PCD occurs in the cells at the center of these areoles and progresses outwards, stopping approximately five cells from the vasculature. The role of mitochondria during PCD has been recognized in animals; however, it has been less studied during PCD in plants.',
   'The following paper elucidates the role of mitochondrial dynamics during developmentally regulated PCD in vivo in A. madagascariensis. A single areole within a window stage leaf (PCD is occurring) was divided into three areas based on the progression of PCD; cells that will not undergo PCD (NPCD), ce

In [7]:
# Show the splits and features of the loaded SQuAD dataset
squad

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [9]:
from typing import Dict, Any
from datasets import DatasetDict, Dataset
import copy

def convert_ropes_to_squad(ropes_dataset: DatasetDict) -> DatasetDict:
    """
    Re-express every split of a ROPES-format dataset in the SQuAD layout.

    Args:
        ropes_dataset (DatasetDict): Splits whose examples provide
            - id: unique identifier
            - background: background passage
            - situation: situation passage
            - question: question text
            - answers: dict holding a 'text' list

    Returns:
        DatasetDict: The same splits, with each example carrying
            - id: copied from the input
            - title: always the empty string
            - context: stripped background and situation joined by a blank line
            - question: copied from the input
            - answers: {'text': ..., 'answer_start': ...} with one start
              offset per answer text
    """

    def to_squad_record(record: Dict[str, Any]) -> Dict[str, Any]:
        """Build the SQuAD-style record for one ROPES example."""
        passage = "\n\n".join(
            (record['background'].strip(), record['situation'].strip())
        )
        answer_texts = record['answers']['text']

        # str.find yields -1 when the answer is absent from the passage;
        # those misses are recorded as offset 0.
        answer_offsets = [max(passage.find(text), 0) for text in answer_texts]

        return {
            'id': record['id'],
            'title': '',
            'context': passage,
            'question': record['question'],
            'answers': {
                'text': answer_texts,
                'answer_start': answer_offsets,
            },
        }

    converted = DatasetDict()
    for split_name, split in ropes_dataset.items():
        # Drop the original columns so only the SQuAD fields remain
        converted[split_name] = split.map(
            to_squad_record,
            remove_columns=split.column_names,
            desc=f"Converting {split_name} split to SQuAD format",
        )

    return converted


In [10]:
# Convert the ROPES splits to the SQuAD layout.
# NOTE(review): `ropes` is not assigned in the cells visible here —
# presumably loaded elsewhere; confirm it exists on a fresh kernel.
ldf = convert_ropes_to_squad(ropes)

Converting train split to SQuAD format: 100%|██████████| 10924/10924 [00:01<00:00, 10627.63 examples/s]
Converting test split to SQuAD format: 100%|██████████| 1710/1710 [00:00<00:00, 10641.41 examples/s]
Converting validation split to SQuAD format: 100%|██████████| 1688/1688 [00:00<00:00, 10721.35 examples/s]


In [11]:
# Show the splits and features of the converted dataset
ldf

DatasetDict({
    train: Dataset({
        features: ['id', 'question', 'answers', 'title', 'context'],
        num_rows: 10924
    })
    test: Dataset({
        features: ['id', 'question', 'answers', 'title', 'context'],
        num_rows: 1710
    })
    validation: Dataset({
        features: ['id', 'question', 'answers', 'title', 'context'],
        num_rows: 1688
    })
})