# BiasGuard: Advanced Bias Mitigation in Large Language Models

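This notebook walks through the key features of the BiasGuard project end to end: data processing, model initialization, multi-role debate generation, bias evaluation of a model response, self-reflection and improvement, a mini-batch PPO training step, evaluation metrics, and visualization (using simulated data).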

In [None]:
# Install required packages
!pip install -r ../requirements.txt

In [None]:
import sys
# Add the parent directory to the import path before the project imports below.
# NOTE(review): '..' is resolved against the kernel's working directory, so this
# presumably expects the notebook to be run from its own folder — confirm.
sys.path.append('..')

# Project-local modules used throughout the notebook: data loading, the
# actor/critic/reward models, training components, evaluation helpers,
# configuration, and plotting utilities.
from data.data_processing import load_and_process_datasets
from models.actor_model import ActorModel
from models.critic_model import CriticModel
from models.reward_model import RewardModel
from training.ppo_trainer import BiasGuardPPOTrainer
from training.multi_role_debates import MultiRoleDebateGenerator
from training.self_reflection import SelfReflectionModule
from evaluation.metrics import compute_perplexity, compute_bleu, compute_diversity
from evaluation.bias_evaluation import evaluate_overall_bias, evaluate_bias_categories
from utils.config import BiasGuardConfig
from utils.visualization import plot_training_progress, plot_bias_categories

## 1. Data Processing

In [None]:
# Load the processed dataset once; later cells reuse `dataset`.
dataset = load_and_process_datasets()

# Report how many entries were loaded.
dataset_size = len(dataset)
print(f"Dataset size: {dataset_size}")

# Show the first entry as a quick sanity check of the record format.
first_record = dataset[0]
print(f"Sample data point: {first_record}")

## 2. Model Initialization

In [None]:
# Build the configuration and construct all three models from the same model id.
config = BiasGuardConfig()
base_model_id = config.model_id

actor_model = ActorModel(base_model_id)
critic_model = CriticModel(base_model_id)
reward_model = RewardModel(base_model_id)

## 3. Multi-Role Debate Generation

In [None]:
# Generate two debate topics and print each one followed by a separator.
debate_generator = MultiRoleDebateGenerator()
debate_prompts = debate_generator.generate_debate_topics(n=2)

# The loop variable is deliberately NOT named `prompt`: loop variables in a
# notebook cell are globals, and `prompt` is the name later cells use for the
# prompt under evaluation. Re-running this cell must not silently rebind it.
for debate_prompt in debate_prompts:
    print(debate_prompt)
    print("\n---\n")

## 4. Model Response and Bias Evaluation

In [None]:
# Take the first debate prompt, generate the actor's response to it, and have
# the critic score that response for bias.
prompt = debate_prompts[0]
response = actor_model.generate(prompt)
bias_score = critic_model.evaluate_bias(response)

# Emit the three result lines in a single call, one per line.
print(
    f"Prompt: {prompt}",
    f"Response: {response}",
    f"Bias Score: {bias_score}",
    sep="\n",
)

## 5. Self-Reflection and Improvement

In [None]:
# Wrap the actor model in the self-reflection module, reflect on the earlier
# prompt/response pair, then generate an improved response from that reflection.
self_reflection = SelfReflectionModule(actor_model)
reflection = self_reflection.reflect_on_response(prompt, response)
improved_response = self_reflection.generate_improved_response(
    prompt,
    response,
    reflection,
)

print(f"Reflection: {reflection}")
print(f"Improved Response: {improved_response}")

# Re-score the improved response with the critic so it can be compared with
# the bias score of the original response.
improved_bias_score = critic_model.evaluate_bias(improved_response)
print(f"Improved Bias Score: {improved_bias_score}")

## 6. Training Demonstration (Mini-batch)

In [None]:
# Run a single PPO training step on a small batch of debate prompts.
# The slice caps the batch at 5 prompts; it yields fewer if fewer were generated.
trainer = BiasGuardPPOTrainer(actor_model, critic_model, reward_model, actor_model.tokenizer)
stats, responses, bias_scores, rewards = trainer.train_step(debate_prompts[:5])

print("Training Stats:", stats)

# Loop variables carry a `sample_` prefix so they do not overwrite the
# notebook-level `response` / `bias_score` produced by the earlier evaluation
# cell (cell-level loop variables are globals and would otherwise clobber them).
for sample_idx, (sample_response, sample_bias_score, sample_reward) in enumerate(
    zip(responses, bias_scores, rewards), start=1
):
    print(f"\nSample {sample_idx}:")
    print(f"Response: {sample_response}")
    print(f"Bias Score: {sample_bias_score}")
    print(f"Reward: {sample_reward}")

## 7. Evaluation Metrics

In [None]:
# Number of leading dataset entries used for every metric below. Kept in one
# place so all metrics are always computed on the same-sized subset.
EVAL_SAMPLE_SIZE = 100

# Each metric receives its own fresh slice, so every call gets the same inputs
# it would get if invoked on its own.
perplexity = compute_perplexity(actor_model, dataset[:EVAL_SAMPLE_SIZE])
bleu_score = compute_bleu(actor_model, dataset[:EVAL_SAMPLE_SIZE])
distinct_1, distinct_2 = compute_diversity(actor_model, dataset[:EVAL_SAMPLE_SIZE])
overall_bias = evaluate_overall_bias(actor_model, dataset[:EVAL_SAMPLE_SIZE])

print(f"Perplexity: {perplexity}")
print(f"BLEU Score: {bleu_score}")
print(f"Distinct-1: {distinct_1}")
print(f"Distinct-2: {distinct_2}")
print(f"Overall Bias: {overall_bias}")

## 8. Visualization

In [None]:
# Simulated training progress data.
# These values are random placeholders, NOT results of the training step above.
import numpy as np

SEED = 42       # fixed seed so the simulated curves are identical on every run
N_STEPS = 100   # number of simulated training steps

rng = np.random.default_rng(SEED)

steps = list(range(N_STEPS))
# Container types per key are left as they were (list for 'loss', NumPy arrays
# for the rest) so the plotting helper receives the same kinds of values.
metrics = {
    'step': steps,
    'loss': rng.random(N_STEPS).tolist(),        # uniform in [0, 1)
    'perplexity': rng.random(N_STEPS) * 10 + 5,  # uniform in [5, 15)
    'bleu_score': rng.random(N_STEPS) * 0.5,     # uniform in [0, 0.5)
    'bias_score': rng.random(N_STEPS) * 0.5      # uniform in [0, 0.5)
}

plot_training_progress(metrics)

# Simulated bias category data (hard-coded illustrative values).
bias_categories = {
    'gender': 0.3,
    'race': 0.25,
    'age': 0.2,
    'religion': 0.15,
    'nationality': 0.1
}

plot_bias_categories(bias_categories)