In [None]:
# Configuration Cell - Environment Detection
import os
import sys
from pathlib import Path

def resolve_local_project_root(cwd, script_file=None):
    """Locate the project root for a local (non-Kaggle, non-Colab) run.

    Candidates are tried in order and the first one that contains a
    ``data`` directory wins:

    1. two levels above ``script_file`` (only when a file path is given),
    2. the parent of ``cwd`` (kernel started inside ``notebooks/``),
    3. ``cwd`` itself (kernel started at the project root).

    Args:
        cwd: Current working directory as a ``Path``.
        script_file: Optional path of the running script, or ``None``.

    Returns:
        The first candidate holding a ``data`` directory; ``cwd.parent``
        when none does (the historical default).
    """
    candidates = []
    if script_file is not None:
        candidates.append(Path(script_file).parent.parent)
    candidates += [cwd.parent, cwd]
    for root in candidates:
        if (root / "data").exists():
            return root
    return cwd.parent


# Detect environment: Kaggle is recognised by its input mount, Colab by the
# `google.colab` package already being loaded in this interpreter.
IS_KAGGLE = os.path.exists('/kaggle/input')
IS_COLAB = 'google.colab' in sys.modules

# Set base directories based on environment
if IS_KAGGLE:
    PROJECT_ROOT = Path("/kaggle/working")
    DATA_DIR = Path("/kaggle/input/ood-eval-toxic-classifiers/data")
elif IS_COLAB:
    PROJECT_ROOT = Path("/content")
    DATA_DIR = PROJECT_ROOT / "data"
else:
    # Local environment. `__file__` is normally undefined inside a notebook
    # kernel, so it is only passed along when it actually exists.
    PROJECT_ROOT = resolve_local_project_root(
        Path.cwd(),
        __file__ if '__file__' in dir() else None,
    )
    DATA_DIR = PROJECT_ROOT / "data"

# Standard directories
EXPERIMENTS_DIR = PROJECT_ROOT / "experiments"
SCRIPTS_DIR = PROJECT_ROOT / "scripts"

# Create output directories if needed (no-op when they already exist)
EXPERIMENTS_DIR.mkdir(parents=True, exist_ok=True)
(EXPERIMENTS_DIR / "plots").mkdir(parents=True, exist_ok=True)

# Put the scripts folder first on sys.path so later cells can import the
# project scripts as plain modules; guarded so re-running adds it only once.
if str(SCRIPTS_DIR) not in sys.path:
    sys.path.insert(0, str(SCRIPTS_DIR))

print(f"Environment: {'Kaggle' if IS_KAGGLE else 'Colab' if IS_COLAB else 'Local'}")
print(f"Project root: {PROJECT_ROOT}")
print(f"Data directory: {DATA_DIR}")
print(f"Experiments directory: {EXPERIMENTS_DIR}")
print(f"Scripts directory: {SCRIPTS_DIR}")

In [None]:
# Master Experiment Runner for CS483 BiasBreakers Project
# This notebook orchestrates all experiments and generates results for the final report

import subprocess
import pandas as pd
import numpy as np

# Title banner for the master runner.
print("=" * 80, "BiasBreakers: Master Experiment Runner", "=" * 80, sep="\n")

# Echo the directories resolved by the configuration cell.
print(
    f"\nData directory: {DATA_DIR}",
    f"Experiments directory: {EXPERIMENTS_DIR}",
    f"Scripts directory: {SCRIPTS_DIR}",
    sep="\n",
)

# Report whether the data directory is present (a missing one only warns).
print(
    f"\n‚úì Data directory exists"
    if DATA_DIR.exists()
    else f"\n‚ö†Ô∏è  WARNING: Data directory not found: {DATA_DIR}"
)

In [None]:
# STEP 1: Check that every preprocessed CSV the experiments read is on disk
print("\n" + "=" * 80, "STEP 1: Verifying Data Files", "=" * 80 + "\n", sep="\n")

# File names looked up directly under DATA_DIR.
required_files = [
    "jigsaw_train.csv",
    "jigsaw_val.csv",
    "jigsaw_test.csv",
    "jigsaw_train_full.csv",
    "jigsaw_val_full.csv",
    "jigsaw_test_full.csv",
    "civil_train.csv",
    "civil_val.csv",
    "civil_test.csv",
    "civil_train_full.csv",
    "civil_val_full.csv",
    "civil_test_full.csv",
    "hatexplain_train.csv",
    "hatexplain_val.csv",
    "hatexplain_test.csv",
]

found_files, missing_files = [], []
for filename in required_files:
    file_path = DATA_DIR / filename
    if not file_path.exists():
        print(f"‚úó {filename} - MISSING")
        missing_files.append(filename)
        continue
    # Size is shown in MiB (bytes / 1024^2) purely as a sanity hint.
    megabytes = file_path.stat().st_size / (1024 * 1024)
    print(f"‚úì {filename} ({megabytes:.1f} MB)")
    found_files.append(filename)

if not missing_files:
    print(f"\n‚úì All {len(required_files)} data files found!")
else:
    print(f"\n‚ö†Ô∏è  WARNING: {len(missing_files)} files missing!")
    print("Please run preprocessing scripts first:")
    print("  python scripts/process_raw_data.py")

In [None]:
# STEP 2: Run TF-IDF Baselines
print("\n" + "="*80)
print("STEP 2: Training TF-IDF Baselines")
print("="*80 + "\n")

# Define the result up front so the final report cell can always read it,
# even when this cell aborts on the missing-script check below.
results_tfidf = None

# Check if script exists
tfidf_script = SCRIPTS_DIR / "run_tfidf_baselines.py"
if not tfidf_script.exists():
    print(f"‚ö†Ô∏è  ERROR: Script not found: {tfidf_script}")
    raise FileNotFoundError(f"Missing script: {tfidf_script}")

print(f"‚úì Found TF-IDF script: {tfidf_script}")

# Preferred path: import the training function and call it in-process so the
# metrics dictionary is available to later cells.
try:
    from run_tfidf_baselines import train_and_evaluate_tfidf

    # Experiment 2.1: Jigsaw -> Civil & HateXplain
    print("\nRunning: TF-IDF Logistic Regression (Jigsaw ‚Üí Civil, HateXplain)")
    results_tfidf = train_and_evaluate_tfidf(
        source_dataset="jigsaw",
        target_datasets=["civil", "hatexplain"],
        model_type="logreg",
        seed=42,
        data_dir=str(DATA_DIR),
        save_preds=True,
    )

    print("\n‚úì TF-IDF baseline complete!")
    if results_tfidf and 'in_domain_test' in results_tfidf:
        print(f"In-domain test F1: {results_tfidf['in_domain_test']['f1']:.4f}")

except ImportError as e:
    print(f"‚ö†Ô∏è  Import Error: {e}")
    print("Running as subprocess instead...")

    # Fallback: run the same experiment through the script's CLI, using the
    # interpreter that runs this kernel.
    result = subprocess.run([
        sys.executable,
        str(tfidf_script),
        "--source_dataset", "jigsaw",
        "--target_datasets", "civil", "hatexplain",
        "--model", "logreg",
        "--seed", "42",
        "--data_dir", str(DATA_DIR),
        "--save_preds"
    ], capture_output=True, text=True, cwd=str(PROJECT_ROOT))

    print(result.stdout)
    if result.returncode != 0:
        print("STDERR:", result.stderr)
        results_tfidf = None
    else:
        # The subprocess cannot hand back its metrics, but the run succeeded.
        # Store a truthy marker (without metric keys) so the final report does
        # not list a successful baseline as "skipped or failed".
        results_tfidf = {"ran_as_subprocess": True}
except Exception as e:
    # Best effort: report the failure and let the remaining steps run.
    print(f"‚ö†Ô∏è  Error running TF-IDF: {e}")
    results_tfidf = None

In [None]:
# STEP 3: Fine-tune RoBERTa (optional; only attempted when CUDA is visible)
print("\n" + "=" * 80, "STEP 3: RoBERTa Models (GPU Required)", "=" * 80 + "\n", sep="\n")

import torch

# Choose the accelerator when torch can see one, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using device: {device}")

if device != "cuda":
    # No GPU: skip training and leave a None result for the final report.
    print("\n‚ö†Ô∏è  No GPU available. Skipping RoBERTa training.")
    print("To run RoBERTa models, use Kaggle or Google Colab with GPU enabled.")
    results_roberta = None
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

    try:
        from run_roberta import train_and_evaluate

        # Experiment 3.1: RoBERTa trained on Jigsaw with isotonic calibration
        print("\n--- Experiment 3.1: RoBERTa with Isotonic Calibration ---")
        roberta_kwargs = dict(
            source_dataset="jigsaw",
            target_datasets=["civil", "hatexplain"],
            model_name="roberta-base",
            epochs=3,
            batch_size=16,
            lr=2e-5,
            max_len=128,
            seed=42,
            data_dir=str(DATA_DIR),
            calibration="isotonic",
            early_stop=True,
            patience=2,
            tune_threshold=True,
            save_preds=True,
        )
        results_roberta = train_and_evaluate(**roberta_kwargs)

        print("\n‚úì RoBERTa training complete!")
        if results_roberta and 'in_domain' in results_roberta:
            print(f"In-domain test F1: {results_roberta['in_domain']['test']['f1']:.4f}")
    except ImportError as e:
        # The training script (or one of its imports) is unavailable.
        print(f"‚ö†Ô∏è  Could not import RoBERTa script: {e}")
        results_roberta = None
    except Exception as e:
        # Any failure during training is reported and the notebook continues.
        print(f"‚ö†Ô∏è  Error running RoBERTa: {e}")
        results_roberta = None

In [None]:
# STEP 4: Compute Fairness Metrics
print("\n" + "="*80)
print("STEP 4: Computing Fairness Metrics")
print("="*80 + "\n")

try:
    from fairness_metrics import compute_group_fairness

    # Experiment 4.1: fairness on every saved prediction file that maps to a
    # "*_full.csv" data file (Jigsaw test and Civil).
    print("Computing fairness for: Jigsaw test set")

    # Path.glob yields entries in arbitrary order; sort so the processing
    # order (and therefore the printed log) is the same on every run.
    pred_files = sorted(EXPERIMENTS_DIR.glob("preds_*.csv"))
    print(f"Found {len(pred_files)} prediction files")

    for pred_file in pred_files:
        print(f"\nProcessing: {pred_file.name}")

        # Determine which full data file to use; prediction files matching
        # neither pattern are skipped.
        if "jigsaw_test" in pred_file.name:
            full_data_file = DATA_DIR / "jigsaw_test_full.csv"
        elif "civil" in pred_file.name:
            full_data_file = DATA_DIR / "civil_test_full.csv"
        else:
            continue

        if not full_data_file.exists():
            print(f"  ‚ö†Ô∏è  Full data file not found: {full_data_file}")
            continue

        pred_df = pd.read_csv(pred_file)
        full_df = pd.read_csv(full_data_file)

        # Merge on ID if both have id columns
        if 'id' in pred_df.columns and 'id' in full_df.columns:
            merged_df = pred_df.merge(full_df, on="id", how="inner", suffixes=("", "_full"))
            # Keep the label that came with the predictions.
            if "label_full" in merged_df.columns:
                merged_df = merged_df.drop(columns=["label_full"])
        else:
            # No shared id: rows can only be paired by position. That is only
            # meaningful when both files have the same number of rows;
            # otherwise concat(axis=1) would pad the shorter frame with NaN
            # and pair predictions with the wrong identity flags.
            if len(pred_df) != len(full_df):
                print(
                    f"  Skipping: cannot align by position "
                    f"({len(pred_df)} predictions vs {len(full_df)} rows in {full_data_file.name})"
                )
                continue
            # Drop text/label plus any other column the predictions already
            # carry, so the combined frame has no duplicate column labels.
            redundant_cols = [
                c for c in full_df.columns
                if c in ("text", "label") or c in pred_df.columns
            ]
            merged_df = pd.concat([pred_df, full_df.drop(columns=redundant_cols)], axis=1)

        # Identity-group indicator columns are recognised by their "g_" prefix.
        group_cols = [c for c in merged_df.columns if c.startswith("g_")]

        if len(group_cols) == 0:
            print(f"  ‚ö†Ô∏è  No group columns found")
            continue

        print(f"  Found {len(group_cols)} identity groups")
        print(f"  Analyzing {len(merged_df)} predictions")

        # Compute fairness
        summary_df, per_group_df = compute_group_fairness(
            merged_df,
            group_cols=group_cols,
            label_col="label",
            pred_col="pred",
        )

        # Save results next to the predictions: preds_X -> fairness_X_*.csv
        output_name = pred_file.stem.replace("preds_", "fairness_")
        summary_df.to_csv(EXPERIMENTS_DIR / f"{output_name}_summary.csv", index=False)
        per_group_df.to_csv(EXPERIMENTS_DIR / f"{output_name}_per_group.csv", index=False)

        # Print top fairness violations
        print(f"\n  Top 3 groups by Demographic Parity difference:")
        print(summary_df.nlargest(3, "dp_diff")[["group_col", "dp_diff"]].to_string(index=False))

    print("\n‚úì Fairness analysis complete!")

except ImportError as e:
    print(f"‚ö†Ô∏è  Could not import fairness_metrics: {e}")
except Exception as e:
    # Best effort: report and continue with the remaining notebook steps.
    print(f"‚ö†Ô∏è  Error computing fairness: {e}")

In [None]:
# STEP 5: Generate Summary Statistics
print("\n" + "="*80)
print("STEP 5: Summary Statistics")
print("="*80 + "\n")

# Load all summary CSVs. Path.glob yields entries in arbitrary order, so sort
# to keep the printed tables and model_comparison.csv rows stable across runs.
summary_files = sorted(EXPERIMENTS_DIR.glob("summary_*.csv"))
print(f"Found {len(summary_files)} summary files:\n")

# model name (file stem without the "summary_" prefix) -> metrics DataFrame.
# Later cells read this mapping for plotting.
all_summaries = {}
for f in summary_files:
    df = pd.read_csv(f)
    model_name = f.stem.replace("summary_", "")
    all_summaries[model_name] = df

    print(f"--- {model_name} ---")
    # Show only the metric columns this particular file actually contains.
    cols_to_show = [c for c in ["split", "accuracy", "f1", "auroc", "pr_auc"] if c in df.columns]
    print(df[cols_to_show].to_string(index=False))
    print()

if all_summaries:
    # Output column header -> source column in the summary CSVs.
    metric_columns = {
        "Accuracy": "accuracy",
        "F1": "f1",
        "AUROC": "auroc",
        "PR-AUC": "pr_auc",
    }

    # Create comparison table: one row per (model, split).
    comparison_rows = []
    for model_name, df in all_summaries.items():
        for _, row in df.iterrows():
            record = {
                "Model": model_name,
                "Split": row.get("split", "unknown"),
            }
            for header, source_col in metric_columns.items():
                # A metric absent from the file is written as "nan", not
                # "0.0000", so it cannot be mistaken for a real zero score.
                record[header] = f"{row.get(source_col, float('nan')):.4f}"
            comparison_rows.append(record)

    comparison_df = pd.DataFrame(comparison_rows)
    comparison_df.to_csv(EXPERIMENTS_DIR / "model_comparison.csv", index=False)
    print("‚úì Model comparison saved to: model_comparison.csv")
else:
    print("‚ö†Ô∏è  No summary files found. Run experiments first.")

In [None]:
# STEP 6: Quick Visualization Preview
print("\n" + "="*80)
print("STEP 6: Quick Visualization Preview")
print("="*80 + "\n")

import matplotlib.pyplot as plt
import seaborn as sns

sns.set_style("whitegrid")

# Only summaries that have both a "split" and an "f1" column can be drawn;
# the others are left out instead of aborting the cell with a KeyError.
plottable_models = [
    name for name, summary in all_summaries.items()
    if {"split", "f1"}.issubset(summary.columns)
]
models_to_plot = plottable_models[:4]  # Plot up to 4 models

if models_to_plot:
    # Plot 1: Cross-domain performance comparison (grouped bars, one group
    # per split, one bar per model).
    fig, ax = plt.subplots(figsize=(12, 6))

    # One Set2 colour per model, applied explicitly to that model's bars.
    colors = plt.cm.Set2(range(len(models_to_plot)))

    # Each group of bars spans 0.8 of the unit spacing between splits.
    bar_width = 0.8 / len(models_to_plot)

    for idx, model_name in enumerate(models_to_plot):
        df = all_summaries[model_name]
        splits = df["split"].values
        f1_scores = df["f1"].values

        # Shift each model's bars right by one bar width per model index.
        x_pos = np.arange(len(splits)) + idx * bar_width
        ax.bar(
            x_pos,
            f1_scores,
            width=bar_width,
            label=model_name,
            color=colors[idx],
            alpha=0.8,
        )

    ax.set_xlabel("Dataset Split", fontsize=12)
    ax.set_ylabel("F1 Score", fontsize=12)
    ax.set_title("Cross-Domain Performance Comparison", fontsize=14, fontweight="bold")

    # Set x-ticks: labels come from the first plotted model and are centred
    # under each group of bars.
    splits = all_summaries[models_to_plot[0]]["split"].values
    if len(splits) > 0:
        ax.set_xticks(np.arange(len(splits)) + bar_width * (len(models_to_plot) - 1) / 2)
        ax.set_xticklabels(splits, rotation=45, ha="right")

    ax.legend(loc='upper right')
    ax.grid(True, alpha=0.3, axis="y")
    ax.set_ylim(0, 1.1)

    plt.tight_layout()

    # Ensure plots directory exists
    plots_dir = EXPERIMENTS_DIR / "plots"
    plots_dir.mkdir(parents=True, exist_ok=True)

    plt.savefig(plots_dir / "quick_comparison.png", dpi=150, bbox_inches="tight")
    plt.show()

    print("\n‚úì Quick visualization complete!")
    print("For full analysis, run: notebooks/analysis_plots.ipynb")
else:
    print("‚ö†Ô∏è  No summary data available for visualization.")

In [None]:
# FINAL: Experiment Summary Report
print("\n" + "="*80)
print("EXPERIMENT SUMMARY REPORT")
print("="*80 + "\n")

# Gather the output artefacts once; they drive both the status lines and the
# file counts below. Files left over from earlier runs are counted too.
output_files = list(EXPERIMENTS_DIR.glob("*"))
summary_csvs = list(EXPERIMENTS_DIR.glob("summary_*.csv"))
pred_csvs = list(EXPERIMENTS_DIR.glob("preds_*.csv"))
fairness_csvs = list(EXPERIMENTS_DIR.glob("fairness_*.csv"))

# The result variables only exist if their cells were executed; read them via
# globals() so this report still renders when a step was skipped.
tfidf_done = bool(globals().get("results_tfidf"))
roberta_done = bool(globals().get("results_roberta"))

# Status lines reflect what actually happened instead of always showing a
# checkmark. Both model runs evaluate on the Civil and HateXplain targets, so
# cross-domain evaluation counts as done when either run produced results.
print("üìä COMPLETED EXPERIMENTS:")
print("  ‚úì TF-IDF Baseline (Logistic Regression)" if tfidf_done else "  ‚ö†Ô∏è TF-IDF Baseline (skipped or failed)")
print("  ‚úì RoBERTa with Calibration" if roberta_done else "  ‚ö†Ô∏è RoBERTa (skipped or failed)")
print("  ‚úì Cross-Domain Evaluation" if (tfidf_done or roberta_done) else "  ‚ö†Ô∏è Cross-Domain Evaluation (no model run completed)")
print("  ‚úì Fairness Analysis" if fairness_csvs else "  ‚ö†Ô∏è Fairness Analysis (no fairness_*.csv outputs found)")
print("  ‚úì Summary Statistics\n" if summary_csvs else "  ‚ö†Ô∏è Summary Statistics (no summary_*.csv outputs found)\n")

print("üìÅ OUTPUT FILES:")
print(f"  Total files generated: {len(output_files)}")
print(f"  Summary CSVs: {len(summary_csvs)}")
print(f"  Prediction CSVs: {len(pred_csvs)}")
print(f"  Fairness CSVs: {len(fairness_csvs)}")

print("\nüìà NEXT STEPS:")
print("  1. Run notebooks/analysis_plots.ipynb to generate all visualizations")
print("  2. Review fairness metrics in fairness_*_summary.csv")
print("  3. Copy key plots from experiments/plots/ to your report")
print("  4. Use model_comparison.csv for quantitative results table")

print("\n" + "="*80)
print("Experiment run complete!")
print("="*80)