In [None]:
# Import necessary libraries
from openprompt.plms import T5TokenizerWrapper
from datasets import load_from_disk
from openprompt.pipeline_base import PromptDataLoader
from transformers import T5ForConditionalGeneration, T5Tokenizer
from openprompt.prompts import ManualTemplate, MixedTemplate
from openprompt import PromptForClassification
from openprompt.data_utils import FewShotSampler
from random import shuffle
from transformers import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
import torch
from openprompt.prompts import ManualVerbalizer
from openprompt.data_utils import InputExample
from tqdm import tqdm
import json

# Path to the dataset and loading the raw dataset
dataset_path = "path/to/your/data/set"
raw_dataset = load_from_disk(dataset_path)

# Initialize the dataset dictionary to store the validation set
dataset = {'validation': []}

# Limit the validation dataset to at most 1000 examples for testing. The cap is
# clamped to the split size so that smaller splits are not indexed out of range.
MAX_VALIDATION_EXAMPLES = 1000
num_validation = min(MAX_VALIDATION_EXAMPLES, len(raw_dataset['validation']))
raw_dataset['validation'] = raw_dataset['validation'].select(range(num_validation))

# Process each example in the validation set
for idx, data in enumerate(raw_dataset['validation']):
    # Extract necessary fields
    question = data["inputs_pretokenized"]  # The question and context
    # Keep the choices as a list. Joining them into one comma-separated string
    # (and splitting it again later) corrupts any choice that contains a comma.
    choices = [choice.strip() for choice in data["answer_choices"]]
    correct_answer = data["targets_pretokenized"].strip()  # Correct answer text

    # Identify the correct answer's index within the choices
    try:
        correct_index = choices.index(correct_answer)
    except ValueError:
        # Skip examples where the correct answer is missing
        print(f"Correct answer not found in choices for index {idx}")
        continue

    # Create an InputExample for OpenPrompt and add it to the validation set.
    # guid is the position in the (truncated) raw validation split.
    dataset['validation'].append(
        InputExample(
            text_a=question,
            guid=idx,
            label=correct_index,
            meta={"choices": choices},
        )
    )

# Display the first validation example and its type (if any example survived)
if dataset['validation']:
    print(dataset['validation'][0])
    print(type(dataset['validation'][0]))
else:
    print("No usable validation examples were found.")

# Load the pre-trained T5 model and tokenizer
t5_path = "/path/to/t5-base"
model = T5ForConditionalGeneration.from_pretrained(t5_path)
tokenizer = T5Tokenizer.from_pretrained(t5_path)

# Setup a ManualTemplate for processing the input data
template = ManualTemplate(
    tokenizer=tokenizer,
    text='{"placeholder":"text_a"}{"mask"}',
)

# Logging setup
log_file = "results.json"
results = []


def format_labels(choices):
    """Return the whitespace-stripped label words for one example.

    Args:
        choices: A list of answer-choice strings. A single comma-joined string
            is still accepted for backward compatibility; it is split on ",",
            which is lossy when a choice itself contains a comma.

    Returns:
        A list of stripped choice strings, in the original order.
    """
    if isinstance(choices, str):
        choices = choices.split(",")
    return [choice.strip() for choice in choices]


def create_dynamic_verbalizer(choices, tokenizer):
    """Build a ManualVerbalizer with one class per answer choice.

    Args:
        choices: Answer choices in any form accepted by ``format_labels``.
        tokenizer: Tokenizer handed to the verbalizer.

    Returns:
        A ManualVerbalizer whose class ``i`` has the single label word
        ``format_labels(choices)[i]``.
    """
    formatted_labels = format_labels(choices)
    return ManualVerbalizer(
        tokenizer=tokenizer,
        num_classes=len(formatted_labels),
        label_words=[[label] for label in formatted_labels],
    )


def evaluate_single_example(prompt_model, dataloader, tokenizer=None):
    """Score the first batch of ``dataloader`` and return the prediction.

    The batch is expected to hold exactly one example, because the argmax is
    converted to a Python int with ``.item()``.

    Args:
        prompt_model: The PromptForClassification model to run.
        dataloader: Dataloader yielding the single example to evaluate.
        tokenizer: Unused; kept so existing three-argument calls keep working.

    Returns:
        A ``(logits, predicted_class)`` tuple: the tensor returned by the model
        and the index of its highest-scoring class.

    Raises:
        ValueError: If the dataloader yields no batch.
    """
    prompt_model.eval()  # Set the model to evaluation mode
    with torch.no_grad():
        for inputs in dataloader:
            logits = prompt_model(inputs)
            predicted_class = int(torch.argmax(logits, dim=-1).item())
            return logits, predicted_class
    raise ValueError("dataloader produced no batches")


# Iterate through the validation dataset for evaluation
num_examples = len(dataset['validation'])
for position, data in enumerate(dataset['validation'], start=1):
    # Get the choices and format labels
    choices = data.meta["choices"]
    formatted_labels = format_labels(choices)

    # Create a verbalizer for the current example
    verbalizer = create_dynamic_verbalizer(formatted_labels, tokenizer)

    # Initialize the PromptForClassification model
    prompt_model = PromptForClassification(
        plm=model,
        template=template,
        verbalizer=verbalizer,
        freeze_plm=True,  # Freezes the pre-trained language model
    )

    # Prepare the dataloader for the validation example
    validation_dataloader = PromptDataLoader(
        dataset=[data],
        template=template,
        tokenizer=tokenizer,
        tokenizer_wrapper_class=T5TokenizerWrapper,
        decoder_max_length=50,
        max_seq_length=480,
        batch_size=1,
        shuffle=False,
        teacher_forcing=False,
        predict_eos_token=False,
        truncate_method="tail",
    )

    # Evaluate the example and store the results
    logits, predicted_class = evaluate_single_example(
        prompt_model, validation_dataloader, tokenizer
    )
    # The predicted class is an index into this example's choices, so the
    # predicted text is that choice (it is not a vocabulary id to decode).
    generated_text = formatted_labels[predicted_class]
    label = data.label
    correct = predicted_class == label
    results.append({
        "index": data.guid,  # index of this example in the raw validation split
        "generated_text": generated_text,
        "predicted_class": predicted_class,
        "true_class": label,
        "correct": correct,
        # assumes logits has a leading batch dimension of size 1 — TODO confirm
        "logits": logits[0].tolist(),
    })

    # Log the result of the current example
    print(f"Example {position}/{num_examples} (index {data.guid}): "
          f"Predicted {predicted_class} ('{generated_text}'), True {label} - "
          f"{'Correct' if correct else 'Incorrect'}")

# Compute the overall accuracy. When nothing was evaluated (for example, every
# example was skipped) report NaN instead of dividing by zero.
if results:
    accuracy = sum(r["correct"] for r in results) / len(results)
    print(f"Validation Accuracy: {accuracy:.4f}")
else:
    accuracy = float("nan")
    print("Validation Accuracy: n/a (no examples were evaluated)")

# Save the results to a JSON file
with open(log_file, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=4)


# Multiple Choice QA with OpenPrompt Framework

This project demonstrates how to evaluate multiple-choice QA tasks with the OpenPrompt framework. The key challenge is that every input comes with its own set of answer choices, so the target labels differ from example to example, whereas OpenPrompt’s training setup expects a single fixed label set. This README provides a detailed explanation of the methodology and rationale behind the implementation.

---

## Problem Statement

OpenPrompt is a flexible framework for prompt-based learning with large language models (LLMs). However, **multiple-choice QA tasks** pose a unique challenge:
- Each input requires a dynamically generated **target label** based on the possible answer choices.
- OpenPrompt is **not directly trainable** for such tasks because it assumes a fixed verbalizer and static labels shared by all examples, which does not hold in this case.

To overcome this, we developed an **evaluation-only approach** that dynamically creates verbalizers and dataloaders for each example, enabling accurate assessment of the LLM's performance.

---

## Solution Overview

The implementation focuses on **evaluating multiple-choice QA tasks** without training, leveraging the flexibility of OpenPrompt while adhering to its limitations.

### Key Steps:
1. **Dynamic Target Labeling**:
   - For each example, extract the question, possible answer choices, and the correct answer.
   - Dynamically create a verbalizer that maps each choice to a unique label.

2. **Evaluation Without Training**:
   - Use OpenPrompt’s `PromptForClassification` to run the evaluation in an **inference-only mode**.
   - Dynamically generate a dataloader for each example, ensuring compatibility with the framework.

3. **Granular Example Evaluation**:
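   - Process each example individually: build its verbalizer with `create_dynamic_verbalizer`, wrap the example in a single-item dataloader, and score it with the modular `evaluate_single_example` function.
   - `evaluate_single_example` runs the model in evaluation mode and takes the highest-scoring label as the prediction for the given input.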

---

## Project Features

- **Dynamic Verbalizer Creation**:
  Each example has a unique verbalizer that maps answer choices to label words, enabling multiple-choice QA evaluation.

- **Example-Level Evaluation**:
  The implementation avoids training limitations by processing and evaluating each example independently.

- **Accurate Logging and Debugging**:
  - Logs logits for each label to facilitate detailed analysis of the model’s predictions.
  - Records the predicted class, generated text, and whether the prediction is correct for each example.

- **Overall Metrics**:
  Computes overall accuracy based on the results of individual example evaluations.

---

## Implementation Details

### 1. Dataset Preparation
- Extract and preprocess the dataset for multiple-choice QA.
- Dynamically adjust the dataset to match OpenPrompt’s required format.

### 2. Template Definition
- Define a manual template to structure the input text for the LLM:
  ```text
  {"placeholder":"text_a"}{"mask"}
  ```
- This template places a mask slot directly after the question text; the model’s prediction at that mask position is what the verbalizer scores.

### 3. Verbalizer Creation
- A dynamic verbalizer maps each answer choice to its own class, using the choice text as that class’s label word.
- Example:
  ```python
  ManualVerbalizer(
    tokenizer=tokenizer,
    num_classes=len(formatted_labels),
    label_words=[[label] for label in formatted_labels]
  )
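  ```

### 4. Evaluation Function
- `evaluate_single_example`: A reusable function that encapsulates the logic for evaluating one input at a time.
- It handles the following:
  - Running the prompt model in evaluation mode with gradients disabled.
  - Logits evaluation for each label.
  - Interpreting the model’s predictions (the highest-scoring label is the predicted class).
- The dynamic verbalizer and the single-example dataloader are created in the surrounding loop before the function is called.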

## Limitations of OpenPrompt Framework

The following challenges highlight the necessity of this approach:

1. **Dynamic Target Labels**:
   - Multiple-choice QA requires new target labels for each input, which OpenPrompt does not inherently support during training.

2. **Non-Trainable Nature**:
   - OpenPrompt’s standard training setup shares one verbalizer across the whole dataset, so tasks like multiple-choice QA, where the label set changes with every input, are incompatible with that training paradigm as-is.

3. **Framework Constraints**:
   - Fixed verbalizers and dataloaders do not accommodate the variability required by multiple-choice QA tasks.

---

## How This Implementation Overcomes Limitations

This project circumvents the above limitations by:

- Dynamically creating verbalizers and dataloaders for each input example.
- Using OpenPrompt in **inference-only mode**, focusing solely on evaluation without requiring training.
- Adopting a modular, example-level evaluation approach that aligns with the framework’s strengths while bypassing its constraints.
