In [None]:
# Clone the repository (force clean clone if it exists)
!rm -rf fairness-prms
!git clone https://github.com/minhtran1015/fairness-prms

# Verify the fix is present
!grep -n "trust_remote_code" fairness-prms/fairness-prms/src/sal/utils/data.py || echo "⚠️ Fix not found!"

# Apply runtime patch to fix Transformers ALL_PARALLEL_STYLES issue
print("\n🔧 Applying Transformers patch...")
patch_file = 'fairness-prms/fairness-prms/src/sal/models/prm/transformers_patch.py'
patch_code = '''"""Patch for Transformers ALL_PARALLEL_STYLES issue"""
import transformers.modeling_utils

# Fix the NoneType ALL_PARALLEL_STYLES issue
# Set to the standard list of supported parallel styles
if not hasattr(transformers.modeling_utils, 'ALL_PARALLEL_STYLES') or transformers.modeling_utils.ALL_PARALLEL_STYLES is None:
    transformers.modeling_utils.ALL_PARALLEL_STYLES = ['colwise', 'rowwise', 'layerwise']
'''

with open(patch_file, 'w') as f:
    f.write(patch_code)

print("✅ Transformers patch created")
print("   Supported parallel styles: colwise, rowwise, layerwise")
print("   This will be imported before model loading")

In [None]:
# EARLY CHECK (CHECK 1): verify that the cloned data loader contains the
# trust_remote_code=True fix. The cell stops via sys.exit(1) when either the
# fix or the file itself is missing.
import sys

print("=" * 70)
print("CHECK 1: Verifying code fix")
print("=" * 70)

data_file = '/kaggle/working/fairness-prms/fairness-prms/src/sal/utils/data.py'

try:
    # Read as UTF-8 explicitly so the check does not depend on the locale default
    with open(data_file, 'r', encoding='utf-8') as f:
        content = f.read()

    if 'trust_remote_code=True' in content:
        print("✅ Code includes trust_remote_code=True")
        # The dependency-install cell is cell 9: the execution-order list says so
        # and the installer itself ends with "Next: Run cell 10".
        print("\n⚠️  Note: You still need to install datasets==2.14.0 (see cell 9)")
    else:
        print("❌ CRITICAL ERROR: Code does NOT include trust_remote_code=True")
        print("   Fix: Update GitHub repo with the fix, then re-run cell 1")
        sys.exit(1)

except FileNotFoundError:
    # The clone from cell 1 is missing (or was made somewhere other than /kaggle/working)
    print("❌ ERROR: Could not find data.py file!")
    print("   Make sure you ran cell 1 to clone the repository.")
    sys.exit(1)

print("=" * 70)

In [None]:
# CHECK 2: make sure the Transformers patch module exists and is imported by bias_detection.py
import sys

banner = "=" * 70
print(banner)
print("CHECK 2: Verifying Transformers patch")
print(banner)

patch_file = '/kaggle/working/fairness-prms/fairness-prms/src/sal/models/prm/transformers_patch.py'
bias_detection_file = '/kaggle/working/fairness-prms/fairness-prms/src/sal/models/prm/bias_detection.py'


def _slurp(path):
    """Return the full text of the file at ``path`` (raises FileNotFoundError if absent)."""
    with open(path, 'r') as handle:
        return handle.read()


try:
    # Step 1: the patch module must be on disk and mention the patched attribute
    patch_content = _slurp(patch_file)
    if 'ALL_PARALLEL_STYLES' not in patch_content:
        print("❌ CRITICAL ERROR: Patch file exists but doesn't contain fix")
        sys.exit(1)
    print("✅ Transformers patch file created successfully")

    # Step 2: bias_detection.py must import that module
    bias_detection_content = _slurp(bias_detection_file)
    if 'from . import transformers_patch' not in bias_detection_content:
        print("❌ CRITICAL ERROR: bias_detection.py does NOT import patch")
        print("   Solution: Update GitHub repo with latest changes")
        sys.exit(1)
    print("✅ bias_detection.py imports the patch")
    print("\n🎉 ALL PATCHES VERIFIED - Safe to continue!")

except FileNotFoundError as e:
    # Either file being absent means cell 1 did not complete
    print(f"❌ ERROR: Could not find required file: {e}")
    print("   Make sure you ran cell 1 to clone and patch the repository.")
    sys.exit(1)

print(banner)

# Fairness PRMs Kaggle Run

This notebook runs a fairness evaluation using Process Reward Models (PRMs) on dual T4 GPUs.

## ⚠️ Critical Requirements:

The evaluation requires THREE critical fixes:
1. **Code fix**: `trust_remote_code=True` in data.py (in GitHub repo)
2. **Library fix**: `datasets==2.14.0` (newer versions removed `trust_remote_code` support)
3. **Transformers patch**: Fix for `ALL_PARALLEL_STYLES` NoneType error (created in cell 1)

## 📋 Correct Execution Order:

1. **Cell 1**: Clone repository + create Transformers patch
2. **Cell 2**: ✅ CHECK 1 - Verify trust_remote_code fix
3. **Cell 3**: ✅ CHECK 2 - Verify Transformers patch
4. **Cells 5-8**: Setup (list files, change dir, install tree)
5. **Cell 9**: 🔧 Install dependencies including `datasets==2.14.0`
6. **Cell 10**: ✅ CHECK 3 - Verify datasets version (MUST be 2.14.0)
7. **Cells 12-15**: Run evaluation and inspect results

⚠️ **Important**: Run cells IN ORDER! All checks must pass before proceeding.

## 🚀 Key Features:
- Dual T4 GPU support with tensor parallelism
- Fairness-aware Best-of-N sampling
- Process Reward Model (PRM) for bias detection
- Optimized for Kaggle environment
- Comprehensive error checking and validation

In [None]:
# List the Kaggle working directory; the verification cells expect the fairness-prms clone here
!ls /kaggle/working/

In [None]:
# Move into the inner project directory so later relative paths (scripts/, recipes/, '.') resolve
%cd /kaggle/working/fairness-prms/fairness-prms

In [None]:
# Quietly refresh the apt index and install the `tree` utility used by the next cell
!apt-get -qq update && apt-get -qq install -y tree

In [None]:
# Print the directory layout starting from the current directory
!tree

In [None]:
# Install dependencies with specific versions.
# CRITICAL: Use datasets 2.14.0 which still supports trust_remote_code.
# The order of the pip commands below is deliberate: datasets is pinned first,
# then re-pinned at the end in case an intermediate install changed it.

print("🔧 Installing dependencies...")
print("=" * 70)

# First, uninstall the current datasets to force downgrade
# (stderr is discarded and `|| true` keeps the cell going if it was not installed)
!pip uninstall -y datasets 2>/dev/null || true

# Install specific versions
!pip install --quiet 'protobuf<4'
!pip install --upgrade pip --quiet

# Install datasets 2.14.0 FIRST before other packages
!pip install --quiet 'datasets==2.14.0'

# Then install other dependencies (including latex2sympy2 and word2number for math utils)
!pip install --quiet 'protobuf<4' huggingface_hub regex sympy peft numpy latex2sympy2 word2number

# Install vLLM 0.6.3 (works well with T4 GPUs - compute capability 7.5)
!pip install --quiet 'vllm==0.6.3'

# Install project in editable mode (this might try to upgrade datasets, so we pin it)
# --no-deps installs only the project itself, not its declared dependencies
!pip install -e '.[dev]' --quiet --no-deps
!pip install --quiet 'datasets==2.14.0'  # Ensure it stays at 2.14.0

print("=" * 70)
print("✅ Installation complete!")
print("   Next: Run cell 10 to verify datasets version (CHECK 3)")

In [None]:
# CRITICAL CHECK (CHECK 3): verify the datasets version after installation.
# Any 2.x release (major < 3) is accepted; otherwise the cell stops via sys.exit(1).
import sys

print("=" * 70)
print("CHECK 3: Verifying datasets library version")
print("=" * 70)

try:
    import datasets
    version = datasets.__version__
    # Only the major component decides the outcome, so parse just that one;
    # this avoids failing on versions whose minor part is not a plain integer.
    major = int(version.split('.')[0])

    print(f"Current datasets version: {version}")

    if major < 3:  # versions 2.x support trust_remote_code
        print(f"✅ datasets version {version} supports trust_remote_code")
        print("\n🎉 ALL CHECKS PASSED - Safe to continue!")
    else:
        print(f"❌ datasets version {version} does NOT support trust_remote_code")
        print(f"\n   Current version: {version}")
        print(f"   Required: <3.0.0 (recommended: 2.14.0)")
        print("\n   SOLUTION: The installation may have failed to downgrade.")
        # The install cell is cell 9 (it ends with "Next: Run cell 10", i.e. this check)
        print("   Try running cell 9 again, or restart the kernel and run from cell 1.")
        sys.exit(1)
except Exception as e:
    # Covers a missing/broken datasets install or an unparsable version string.
    # SystemExit raised above is not an Exception subclass, so it is not caught here.
    print(f"❌ ERROR checking datasets version: {e}")
    sys.exit(1)

print("=" * 70)

## Dataset Loading Fix

⚠️ **Root Cause**: The BBQ dataset uses a custom loading script (`bbq.py`), which requires `trust_remote_code=True`. However, newer versions of the `datasets` library (3.0+) have **removed** support for `trust_remote_code` entirely.

✅ **Solution**: Install `datasets==2.14.0` which still supports `trust_remote_code=True`. This is handled in the installation cell above.

## Dual T4 GPU Setup

**Configuration:** Using 2x T4 GPUs (compute capability 7.5).
T4 GPUs are well-supported by vLLM and perfect for this task!

In [None]:
# Report the CUDA / PyTorch environment, then describe every visible GPU
import torch

print("CUDA available:", torch.cuda.is_available())
gpu_count = torch.cuda.device_count()
print("Number of GPUs:", gpu_count)
print("CUDA version:", torch.version.cuda)
print("PyTorch version:", torch.__version__)

for gpu_index in range(gpu_count):
    print(f"\nGPU {gpu_index}: {torch.cuda.get_device_name(gpu_index)}")
    print(f"  Compute Capability: {torch.cuda.get_device_capability(gpu_index)}")
    # total_memory is divided by 1024**3 to express it in GB for display
    total_gb = torch.cuda.get_device_properties(gpu_index).total_memory / 1024**3
    print(f"  Memory: {total_gb:.2f} GB")

### Configuration for Dual T4 GPUs

With 2x T4 GPUs, we can:
- Use `tensor_parallel_size=2` to split the model across both GPUs
- Handle more samples (50 instead of 20-40)
- Use a higher `gpu_memory_utilization=0.85`
- Run with full `n=8` for Best-of-N sampling
- Use `max_tokens=1024` for complete reasoning

Each T4 has ~15GB VRAM, giving us ~30GB total for the model and inference.

In [None]:
import os
from kaggle_secrets import UserSecretsClient
from huggingface_hub import login

# Read the Hugging Face token stored as a Kaggle secret
user_secrets = UserSecretsClient()
secret_value_0 = user_secrets.get_secret("HUGGING_FACE_HUB_TOKEN")

# 1. Export the token so that processes started from this notebook
#    (such as the `!python` evaluation run) inherit it.
os.environ["HUGGING_FACE_HUB_TOKEN"] = secret_value_0

# 2. Log in with the token passed explicitly. A bare `login()` does not read
#    the environment variable; in a notebook it opens an interactive prompt
#    instead, so the success message below could print without a real login.
login(token=secret_value_0)

print("Successfully logged in to Hugging Face! ✅")

In [None]:
# Set environment variables for Transformers to avoid parallel processing issues.
# Values placed in os.environ are inherited by processes started from this notebook.
import os
os.environ['TRANSFORMERS_VERBOSITY'] = 'error'
os.environ['TOKENIZERS_PARALLELISM'] = 'false'

# Patch Transformers in this kernel to fix the NoneType ALL_PARALLEL_STYLES issue.
# NOTE: this only affects the notebook process. The evaluation launched with
# `!python` runs in a separate process and relies on the transformers_patch.py
# module written in cell 1.
import transformers.modeling_utils
if not hasattr(transformers.modeling_utils, 'ALL_PARALLEL_STYLES') or transformers.modeling_utils.ALL_PARALLEL_STYLES is None:
    # Use the same style list as the patch module from cell 1. An empty list
    # would also avoid the TypeError, but no style would ever be a member of it.
    transformers.modeling_utils.ALL_PARALLEL_STYLES = ['colwise', 'rowwise', 'layerwise']
    print("✅ Patched Transformers ALL_PARALLEL_STYLES")

print("✅ Environment variables configured for Transformers")

In [None]:
# Run the evaluation script with the fairness_example recipe, overriding settings for dual T4 GPUs:
# - model.tensor_parallel_size=2 to use both T4 GPUs
# - model.gpu_memory_utilization=0.85 (each T4 has ~15GB memory)
# - dataset.num_samples=50, search.n=8, search.max_tokens=1024, search.temperature=0.7
# - output.push_to_hub=false so results are not pushed to the Hub
# Paths are relative, so this must run from the project directory set by the earlier %cd cell.
!python scripts/test_time_compute.py recipes/fairness_example.yaml \
  --output.push_to_hub=false \
  --dataset.num_samples=50 \
  --search.n=8 \
  --search.max_tokens=1024 \
  --search.temperature=0.7 \
  --model.tensor_parallel_size=2 \
  --model.gpu_memory_utilization=0.85

In [None]:
# Preview the first three records of the first bon_completions.jsonl found under data/
import glob
import json
import os

paths = glob.glob('data/**/bon_completions.jsonl', recursive=True)
print('Found output files:', paths)

if paths:
    # Fields worth showing; everything else in a record is left out of the preview
    wanted_fields = ['problem', 'pred', 'scores', 'completions']
    with open(paths[0]) as jsonl_file:
        # zip with range(3) stops after at most three lines
        for _, raw_line in zip(range(3), jsonl_file):
            record = json.loads(raw_line)
            preview = {
                field: value
                for field, value in record.items()
                if field in wanted_fields
            }
            # Cap each printed preview at 800 characters
            rendered = json.dumps(preview, indent=2)
            print(rendered[:800])