In [None]:
# Import necessary libraries
from openprompt.plms import LMTokenizerWrapper
from datasets import load_from_disk
from openprompt.pipeline_base import PromptDataLoader
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from openprompt.prompts import PrefixTuningTemplate
from openprompt import PromptForClassification
from openprompt.data_utils import FewShotSampler
from openprompt.prompts import ManualVerbalizer
from random import shuffle
import torch
from transformers import AdamW
from transformers.optimization import get_linear_schedule_with_warmup
from openprompt.data_utils import InputExample
import json

# Load the pre-split dataset from disk
dataset_path = "/path/to/your/data/set"
raw_dataset = load_from_disk(dataset_path)

# Map textual labels to numeric class ids
label_map = {"positive": 0, "negative": 1}

# Upper bound on the number of examples taken from each split
max_examples_per_split = 1000

# Build, per split, a list of OpenPrompt InputExample objects
dataset = {}
for split in ['train', 'validation']:
    split_data = raw_dataset[split]
    if split == 'train':
        # Only the training split is shuffled (fixed seed) before subsetting
        split_data = split_data.shuffle(seed=42)
    # Cap at the split size so a split smaller than the limit does not make
    # select() fail on out-of-range indices
    subset_size = min(max_examples_per_split, len(split_data))
    split_data = split_data.select(range(subset_size))
    raw_dataset[split] = split_data

    dataset[split] = []
    skipped = 0
    for idx, data in enumerate(split_data):
        label_text = data["targets_pretokenized"].strip().lower()  # Normalised label text
        label_numeric = label_map.get(label_text)
        if label_numeric is None:
            # A target outside label_map has no valid class id; dropping the
            # example keeps a bogus label (previously -1) away from the
            # few-shot sampler and the cross-entropy loss used for training
            skipped += 1
            continue
        # guid keeps the example's index within the selected subset
        input_example = InputExample(text_a=data['inputs_pretokenized'], guid=idx, label=label_numeric)
        dataset[split].append(input_example)

    if skipped:
        print(f"Skipped {skipped} example(s) in split '{split}' with labels outside {sorted(label_map)}")
    if not dataset[split]:
        raise ValueError(f"No usable examples found in split '{split}'")

# Sanity check: show the first converted training example and its type
print(dataset['train'][0])
print(type(dataset['train'][0]))

# Hyperparameter grid explored by the search
learning_rates = [0.0005, 0.001, 0.005]  # Candidate optimizer learning rates
num_soft_tokens = [10, 50, 100]  # Candidate numbers of soft (virtual) prefix tokens
warmup_steps = [10, 20, 25]  # Candidate warm-up step counts for the LR scheduler

# One result record per configuration is collected here and written to this JSON file
results = []
log_file = "prefix_tuning_results_gpt2.json"

# Draw the few-shot training set: 30 examples per label, fixed seed
sampler = FewShotSampler(num_examples_per_label=30)
fewshot_data = sampler(dataset['train'], seed=42)

# Load GPT-2 and its tokenizer from the local checkpoint directory
gpt_path = "/path/to/gpt2"
model = GPT2LMHeadModel.from_pretrained(gpt_path)
tokenizer = GPT2Tokenizer.from_pretrained(gpt_path)
# Reuse the end-of-sequence token as the padding token
tokenizer.pad_token = tokenizer.eos_token

# Hyperparameter search: train and evaluate one prefix-tuned GPT-2 per configuration
from itertools import product

from tqdm import tqdm

# Settings shared by every configuration
num_epochs = 10
gradient_accumulation_steps = 1
loss_func = torch.nn.CrossEntropyLoss()


def evaluate(prompt_model, dataloader):
    """Return the classification accuracy of ``prompt_model`` over ``dataloader``.

    Puts the model in evaluation mode (it is not switched back to training
    mode afterwards) and runs it without gradient tracking.

    Args:
        prompt_model: Callable model that maps a batch to class logits.
        dataloader: Iterable of batches; each batch exposes ``batch['label']``.

    Returns:
        Fraction of examples whose arg-max prediction equals the label.

    Raises:
        ZeroDivisionError: If ``dataloader`` yields no examples.
    """
    prompt_model.eval()
    total, correct = 0, 0

    with torch.no_grad():
        for inputs in dataloader:
            logits = prompt_model(inputs)
            preds = torch.argmax(logits, dim=-1)  # Predicted class per example
            labels = inputs['label']
            total += len(labels)
            correct += (preds == labels).sum().item()

    return correct / total


def _make_dataloader(examples, template, tokenizer, shuffle_batches):
    """Wrap ``examples`` in a PromptDataLoader with the settings shared by training and validation."""
    return PromptDataLoader(
        dataset=examples,
        template=template,
        tokenizer=tokenizer,
        tokenizer_wrapper_class=LMTokenizerWrapper,
        max_seq_length=480,
        decoder_max_length=3,
        batch_size=5,
        shuffle=shuffle_batches,
        teacher_forcing=False,
        predict_eos_token=False,
        truncate_method="tail"
    )


# product() iterates in the same order as nested loops over
# learning rate (outermost), soft-token count, and warm-up steps (innermost)
for lr, tokens, warmup in product(learning_rates, num_soft_tokens, warmup_steps):
    print(f"Testing: LR={lr}, Soft Tokens={tokens}, Warm-Up Steps={warmup}")

    # Reload model and tokenizer so every configuration starts from the same checkpoint
    model = GPT2LMHeadModel.from_pretrained(gpt_path)
    tokenizer = GPT2Tokenizer.from_pretrained(gpt_path)
    tokenizer.pad_token = tokenizer.eos_token

    # Prefix-tuning template with `tokens` virtual tokens
    template = PrefixTuningTemplate(
        model=model,
        tokenizer=tokenizer,
        text='{"placeholder":"text_a"} {"mask"}',
        num_token=tokens,
    )

    # Manual verbalizer: label words for class 0 and class 1
    verbalizer = ManualVerbalizer(
        tokenizer=tokenizer,
        num_classes=2,  # Binary classification
        label_words=[["positive", "good", "excellent", "wonderful"], ["negative", "bad", "horrible", "terrible"]],
        classes=[0, 1]
    )

    # Prompt model built with freeze_plm=True
    prompt_model = PromptForClassification(
        plm=model,
        template=template,
        verbalizer=verbalizer,
        freeze_plm=True
    )

    train_dataloader = _make_dataloader(fewshot_data, template, tokenizer, shuffle_batches=True)
    validation_dataloader = _make_dataloader(dataset["validation"], template, tokenizer, shuffle_batches=False)

    # Optimize only the template parameters, excluding any named 'raw_embedding'
    optimizer_grouped_parameters = [{'params': [p for name, p in template.named_parameters() if 'raw_embedding' not in name]}]
    optimizer = AdamW(optimizer_grouped_parameters, lr=lr)

    # The scheduler must be told how many optimizer steps will actually run;
    # a hard-coded value larger than that means the linear decay never completes
    steps_per_epoch = len(train_dataloader) // gradient_accumulation_steps
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup,
        num_training_steps=num_epochs * steps_per_epoch,
    )

    # Set model to training mode
    prompt_model.train()

    # Training loop
    for epoch in range(num_epochs):
        print(f"Epoch {epoch + 1}/{num_epochs}")
        total_loss = 0  # Summed batch loss for the current epoch only
        pbar = tqdm(train_dataloader, desc="Training")

        for step, inputs in enumerate(pbar):
            logits = prompt_model(inputs)  # Class logits for the batch
            labels = inputs['label']  # Ground-truth labels

            loss = loss_func(logits, labels)
            loss.backward()

            # Step the optimizer and scheduler every `gradient_accumulation_steps` batches
            if (step + 1) % gradient_accumulation_steps == 0:
                optimizer.step()
                scheduler.step()
                optimizer.zero_grad()

            total_loss += loss.item()
            pbar.set_postfix({"loss": total_loss / (step + 1)})

    # Validate once, after the final epoch of this configuration
    val_accuracy = evaluate(prompt_model, validation_dataloader)
    print(f"Validation Accuracy after Epoch {epoch + 1}: {val_accuracy:.4f}")

    # Log results. total_loss is reset at the start of every epoch, so the mean
    # batch loss of the final epoch is total_loss divided by the batches per epoch
    result = {
        "learning_rate": lr,
        "num_soft_tokens": tokens,
        "warmup_steps": warmup,
        "final_loss": total_loss / len(train_dataloader),
        "accuracy": val_accuracy
    }
    results.append(result)

    # Rewrite the full results file after every configuration so partial
    # results survive an interrupted run
    with open(log_file, "w") as f:
        json.dump(results, f, indent=4)

print("Tuning complete. Results saved to", log_file)


# Overview of Prefix Tuning for Sentiment Classification with GPT-2

This code implements **prefix tuning** for sentiment classification using the OpenPrompt framework and a pre-trained GPT-2 model. It explores the use of soft prompts to fine-tune GPT-2 for a binary classification task in a **few-shot learning setting**. The script includes hyperparameter tuning to optimize the learning process and evaluates the model's performance across different configurations.

---

## Key Features

### 1. **Dataset Preparation**
- Loads the dataset from disk and preprocesses it into the OpenPrompt-compatible `InputExample` format.
- Maps sentiment labels (`positive`, `negative`) to numeric values for binary classification.
- Applies a **few-shot sampling** strategy to simulate low-resource scenarios by selecting 30 examples per label.

### 2. **Prefix Tuning**
- Uses the `PrefixTuningTemplate` to introduce trainable **soft tokens** (a learned prefix prepended to the input, with no corresponding vocabulary words), enabling the model to adapt to the sentiment classification task without modifying the base GPT-2 model weights.
- The number of soft tokens is adjustable and tested as part of the hyperparameter tuning.

### 3. **Manual Verbalizer**
- Maps the language model's scores for the label words onto the two sentiment classes (`positive`, `negative`).
- Uses a manually defined set of label words to guide the language model's output during classification.

### 4. **Training Process**
- Trains only the prefix-tuning parameters while keeping the GPT-2 model frozen, significantly reducing computational overhead.
- Tracks training loss for each batch and adjusts learning rates using a linear scheduler with warm-up steps.

### 5. **Hyperparameter Tuning**
- Explores various combinations of:
  - **Learning rates** (`0.0005`, `0.001`, `0.005`).
  - **Number of soft tokens** (`10`, `50`, `100`).
  - **Warm-up steps** (`10`, `20`, `25`).
- Automates hyperparameter experimentation to identify the best-performing configuration.

### 6. **Evaluation**
- Validates the model's performance on a separate validation set once per hyperparameter configuration, after the final training epoch.
- Computes accuracy by comparing the model's predictions against ground-truth labels.

### 7. **Result Logging**
- Logs training loss, validation accuracy, and hyperparameter settings.
- Saves results to a JSON file for further analysis.

---

## Known Limitation of GPT-2

### **Verbose or Irrelevant Outputs**
GPT-2 often generates verbose or irrelevant outputs instead of concise, task-relevant answers. This behavior arises from its general-purpose training and lack of fine-tuning for the specific task of sentiment classification. As a result, additional fine-tuning or prompt engineering is required to make GPT-2 outputs more aligned with task objectives.

---

## Workflow

### 1. **Dataset Loading and Preprocessing**
- The dataset is loaded from disk with its existing `train` and `validation` splits, and 1,000 examples are selected from each (the training split is shuffled first).
- Each data point is converted into an `InputExample` format compatible with OpenPrompt.

### 2. **Few-Shot Sampling**
- A small subset of the training data is sampled to simulate a low-resource environment, with balanced examples for each label.

### 3. **Prefix Tuning Template**
- Initializes a soft prompt using the `PrefixTuningTemplate`.
- Virtual tokens are prepended to the input to steer the pre-trained GPT-2 model toward the classification task.

### 4. **Training**
- The prefix-tuning parameters are optimized using the AdamW optimizer.
- A linear learning rate scheduler with warm-up steps ensures a stable optimization process.

### 5. **Evaluation**
- After the final epoch of each configuration, the model is evaluated on the validation set.
- Accuracy metrics are calculated and logged for each hyperparameter configuration.

### 6. **Hyperparameter Search**
- The script iterates over different combinations of learning rates, soft token counts, and warm-up steps to find the optimal setup.
- Each configuration is tested independently, and the results are logged for analysis.

---

## Applications

- **Few-Shot Learning**: Demonstrates how prefix tuning can effectively adapt pre-trained language models for classification tasks in low-resource settings.
- **Sentiment Analysis**: Applies prefix tuning to a binary sentiment classification task (`positive` vs. `negative`).
- **Prompt-Based Learning**: Highlights the flexibility of soft prompts for adapting language models to downstream tasks without full fine-tuning.

---

## Benefits of Prefix Tuning

1. **Parameter Efficiency**:
   - Only optimizes a small number of parameters (soft prompts), reducing computational costs and memory requirements.

2. **Low-Resource Adaptation**:
   - Achieves strong performance with minimal labeled data, making it suitable for few-shot learning.

3. **Modularity**:
   - Easily integrates with pre-trained language models like GPT-2.
   - Allows experimentation with different templates, verbalizers, and hyperparameter settings.

---

## Conclusion

This implementation provides a practical demonstration of prefix tuning for sentiment classification using GPT-2. By combining soft prompts, manual verbalizers, and hyperparameter tuning, the script highlights the flexibility and efficiency of prompt-based learning for binary classification tasks in NLP. However, the inherent limitation of GPT-2's verbosity or irrelevance in output generation underscores the need for fine-tuning to align the model better with task-specific requirements.
