# Fine-tuning Experiment: Baseline vs SUBTLE_TOXICITY with Balanced Training

This notebook tests the effectiveness of SUBTLE_TOXICITY tagging by fine-tuning models with balanced training data and comparing their performance.

## Experiment Overview
- **Phase 1**: Fine-tune baseline model (no tagging) with balanced training data
- **Phase 2**: Fine-tune SUBTLE_TOXICITY tagged model with balanced training data
- **Phase 3**: Compare performance and analyze results
- **Phase 4**: Generate visualizations and recommendations

## Key Features
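- Uses label-balanced sampling (the same cap of examples per label, plus neutral comments) for a fairer comparison; because comments are multi-label, the resulting per-label counts are still uneven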
- Faster training (~45 minutes on the balanced subset vs 2+ hours on the full training set)
- Cleaner experimental design
- Comprehensive evaluation and analysis

## 1. Setup and Imports

In [1]:
import sys
import os
from pathlib import Path

# Add notebook directory to path.
# sys.path holds plain strings, so the membership test must use the string
# form: a Path object never compares equal to a str entry, which would make
# the check always succeed and append a duplicate entry on every re-run.
notebook_dir = Path.cwd()
notebook_dir_str = str(notebook_dir)
if notebook_dir_str not in sys.path:
    sys.path.append(notebook_dir_str)

# Core libraries
import torch
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, classification_report
import mlflow
import mlflow.pytorch
from tqdm import tqdm
import warnings

# Import our tagging class
# The `src` package is expected one directory above notebooks/, so that
# directory is appended to the import search path before importing from it.
parent_dir = os.path.abspath(os.pardir)
sys.path.append(parent_dir)
from src.training.toxic_tagging_methods import ToxicCommentTagger, ImprovedToxicCommentTagger

# Silence all Python warnings so the cell outputs stay readable
warnings.filterwarnings('ignore')

# Plot appearance used by every figure in this notebook
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Train on the GPU when CUDA is available, otherwise fall back to the CPU
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(


Using device: cuda


## 2. Load and Prepare Data

In [2]:
# Resolve the project root: when the kernel runs inside notebooks/ the data
# directory lives one level up, otherwise the working directory is the root.
base_dir = Path.cwd()
if base_dir.name == "notebooks":
    base_dir = base_dir.parent
data_dir = base_dir / "src" / "data" / "raw"

# The six toxicity label columns used throughout the notebook
labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Read the three raw CSV files (training set, test comments, test labels)
train, test, test_labels = (
    pd.read_csv(data_dir / csv_name)
    for csv_name in ("train.csv", "test.csv", "test_labels.csv")
)

print(f'Train samples: {len(train)}')
print(f'Test samples: {len(test)}')
print(f'Labels: {labels}')

Train samples: 159571
Test samples: 153164
Labels: ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [3]:
def prepare_test_for_evaluation(test_df, test_labels_df, label_cols=None):
    """Keep only the test rows that carry usable ground-truth labels.

    A row is considered invalid for evaluation when any of its label columns
    equals -1; such rows are removed from both frames.

    Args:
        test_df: DataFrame with the test comments.
        test_labels_df: DataFrame with the test labels. The row mask is
            computed on this frame and applied to both, so the two frames
            must share the same index.
        label_cols: Names of the label columns to inspect. When omitted, the
            notebook-level ``labels`` list is used, which keeps existing
            two-argument calls working unchanged.

    Returns:
        A tuple ``(valid_test_df, valid_test_labels_df)`` holding copies of
        the rows whose labels are all different from -1.
    """
    if label_cols is None:
        # Fall back to the notebook-global label list (original behaviour)
        label_cols = labels

    # Filter out samples with -1 labels (invalid for evaluation)
    valid_mask = (test_labels_df[label_cols] != -1).all(axis=1)
    print(f'Test samples: {len(test_df)} total, {valid_mask.sum()} valid for evaluation')
    return test_df[valid_mask].copy(), test_labels_df[valid_mask].copy()

def create_balanced_subset(df, labels, n_per_class=200, random_state=None):
    """Sample up to ``n_per_class`` rows per label, plus neutral rows.

    For every label, at most ``n_per_class`` rows with that label set to 1 are
    drawn without replacement; the same number of rows with no positive label
    ("neutral") is added. The selected row positions are merged as a set, so a
    row picked for several labels appears once. Because rows can carry several
    labels, the per-label positive counts of the result are generally NOT
    equal - only the number of rows drawn for each label is capped.

    Args:
        df: DataFrame containing one 0/1 column per entry of ``labels``.
        labels: Names of the label columns.
        n_per_class: Maximum number of rows drawn per label and for the
            neutral group.
        random_state: Optional seed. When given, sampling uses a dedicated
            ``np.random.RandomState(random_state)`` and is reproducible
            regardless of global state. When ``None`` (default), NumPy's
            global generator is used, so a preceding ``np.random.seed(...)``
            call keeps controlling the result as before.

    Returns:
        A copy of the selected rows of ``df``.
    """
    # np.random (module) and RandomState both expose the same `choice` API
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    idxs = set()
    print(f'Creating balanced subset with {n_per_class} samples per class...')

    # Sample equal number of examples for each toxicity label
    for label in labels:
        class_idxs = np.where(df[label].values == 1)[0]
        available = len(class_idxs)
        to_sample = min(n_per_class, available)

        if available > 0:
            chosen = rng.choice(class_idxs, to_sample, replace=False)
            idxs.update(chosen)
            print(f'  {label}: {to_sample}/{available} samples')
        else:
            print(f'  {label}: 0 samples available!')

    # Add neutral samples (comments with no positive labels)
    neutral_idxs = np.where(df[labels].sum(axis=1) == 0)[0]
    neutral_available = len(neutral_idxs)
    neutral_to_sample = min(n_per_class, neutral_available)

    if neutral_available > 0:
        chosen_neutral = rng.choice(neutral_idxs, neutral_to_sample, replace=False)
        idxs.update(chosen_neutral)
        print(f'  neutral: {neutral_to_sample}/{neutral_available} samples')

    # Positions are positional (from np.where), hence iloc rather than loc
    idxs = list(idxs)
    balanced_df = df.iloc[idxs].copy()
    print(f'Total balanced samples: {len(balanced_df)}')
    return balanced_df

def create_balanced_training_subset(X_train, y_train, labels, n_per_class=2000):
    """Balance raw training arrays by delegating to ``create_balanced_subset``.

    The texts and label matrix are packed into a temporary DataFrame (column
    ``comment_text`` plus one column per label), sampled, and unpacked again.

    Args:
        X_train: Training comment texts.
        y_train: Label values, one column per entry of ``labels``.
        labels: Names of the label columns.
        n_per_class: Per-class sample cap forwarded to ``create_balanced_subset``.

    Returns:
        Tuple ``(X_balanced, y_balanced)`` with the sampled texts and labels.
    """
    # Pack the arrays into the frame layout the sampler works on
    frame = pd.DataFrame({'comment_text': X_train})
    frame[labels] = y_train

    sampled = create_balanced_subset(frame, labels, n_per_class)

    # Unpack back into plain arrays for the training code
    return sampled['comment_text'].values, sampled[labels].values

In [4]:
# Drop the test rows that have no usable labels
test_eval, test_labels_eval = prepare_test_for_evaluation(test, test_labels)

# Texts and label matrix of the full training set
X = train['comment_text'].values
y = train[labels].values

# Hold out 10% for validation, stratified on the first label column only
# (y[:, 0], i.e. 'toxic' given the order of `labels`)
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y[:, 0],
)

print(f'Train size: {len(X_train)}')
print(f'Validation size: {len(X_val)}')
print(f'Test evaluation size: {len(test_eval)}')

Test samples: 153164 total, 63978 valid for evaluation
Train size: 143613
Validation size: 15958
Test evaluation size: 63978


In [5]:
# Seed NumPy's global generator: create_balanced_subset samples from it
np.random.seed(42)

# Re-assemble the validation split as a frame and draw the balanced subset
val_df_temp = pd.DataFrame({'comment_text': X_val})
val_df_temp[labels] = y_val
balanced_val_df = create_balanced_subset(val_df_temp, labels, n_per_class=150)

print('\nBalanced validation set created')
print('Class distribution:')
# Positive count per label in the sampled subset
for label, count in balanced_val_df[labels].sum().items():
    print(f'  {label}: {count} samples')

Creating balanced subset with 150 samples per class...
  toxic: 150/1529 samples
  severe_toxic: 149/149 samples
  obscene: 150/847 samples
  threat: 50/50 samples
  insult: 150/800 samples
  identity_hate: 150/153 samples
  neutral: 150/14355 samples
Total balanced samples: 759

Balanced validation set created
Class distribution:
  toxic: 584 samples
  severe_toxic: 149 samples
  obscene: 451 samples
  threat: 50 samples
  insult: 439 samples
  identity_hate: 153 samples


## 3. Create Balanced Training Dataset

In [6]:
# Re-seed so the training subset does not depend on how many random draws
# earlier cells consumed
np.random.seed(42)

# Smaller, label-capped training set: faster to train on than the full split
X_train_balanced, y_train_balanced = create_balanced_training_subset(
    X_train,
    y_train,
    labels,
    n_per_class=2000,
)

print('\nBalanced training set created')
print('Training class distribution:')
# Column-wise sums give the number of positives per label
for label, count in zip(labels, y_train_balanced.sum(axis=0)):
    print(f'  {label}: {count} samples')

Creating balanced subset with 2000 samples per class...
  toxic: 2000/13765 samples
  severe_toxic: 1446/1446 samples
  obscene: 2000/7602 samples
  threat: 428/428 samples
  insult: 2000/7077 samples
  identity_hate: 1252/1252 samples
  neutral: 2000/128991 samples
Total balanced samples: 8567

Balanced training set created
Training class distribution:
  toxic: 6216 samples
  severe_toxic: 1446 samples
  obscene: 4790 samples
  threat: 428 samples
  insult: 4606 samples
  identity_hate: 1252 samples


## 4. Initialize Tagger and MLflow

In [7]:
# Tagger built on the pretrained unitary/toxic-bert checkpoint
tagger = ImprovedToxicCommentTagger(base_model_name='unitary/toxic-bert', device=str(device))

# All runs of this notebook are tracked under one MLflow experiment
mlflow.set_experiment('Toxic_BERT_Fine_Tuning_Balanced')

# Evaluation inputs come from the balanced validation subset
eval_texts = balanced_val_df['comment_text'].tolist()
eval_labels = balanced_val_df[labels].values

print(f'Evaluating on {len(eval_texts)} balanced samples')
print('Label distribution in evaluation set:')
# Column-wise sums give the number of positives per label
for label, pos_samples in zip(labels, eval_labels.sum(axis=0)):
    print(f'  {label}: {pos_samples} positive samples')

Initialized ImprovedToxicCommentTagger with unitary/toxic-bert on cuda
Evaluating on 759 balanced samples
Label distribution in evaluation set:
  toxic: 584 positive samples
  severe_toxic: 149 positive samples
  obscene: 451 positive samples
  threat: 50 positive samples
  insult: 439 positive samples
  identity_hate: 153 positive samples


## 5. Import Fine-tuning Framework

In [8]:
# Import the integrated fine-tuning module.
# NOTE(review): this import sits mid-notebook rather than in the setup cell;
# it relies on the sys.path entries added there so that `src` resolves.

from src.training.integrated_fine_tuning import run_fine_tuning_experiment
# Confirmation messages only; nothing is executed by the framework yet
print("Fine-tuning framework imported successfully!")
print("Ready to run baseline vs SUBTLE_TOXICITY comparison with balanced training data")

Fine-tuning framework imported successfully!
Ready to run baseline vs SUBTLE_TOXICITY comparison with balanced training data


## 6. Configure Experiment Parameters

In [9]:
# Hyperparameters shared by both fine-tuning runs
EPOCHS = 3
BATCH_SIZE = 16
LEARNING_RATE = 2e-5

# Echo the configuration and data sizes in one write so the run is
# self-describing when the notebook output is read later
print("\n".join([
    "Experiment Configuration:",
    f"  Epochs: {EPOCHS}",
    f"  Batch Size: {BATCH_SIZE}",
    f"  Learning Rate: {LEARNING_RATE}",
    f"  Device: {device}",
    f"  Original training samples: {len(X_train)}",
    f"  Balanced training samples: {len(X_train_balanced)}",
    f"  Validation samples: {len(X_val)}",
    f"  Evaluation samples: {len(eval_texts)}",
    "Expected training time: ~45 minutes (much faster than full dataset)",
]))

Experiment Configuration:
  Epochs: 3
  Batch Size: 16
  Learning Rate: 2e-05
  Device: cuda
  Original training samples: 143613
  Balanced training samples: 8567
  Validation samples: 15958
  Evaluation samples: 759
Expected training time: ~45 minutes (much faster than full dataset)


## 7. Run Fine-tuning Experiment

This will run the complete experiment comparing baseline and SUBTLE_TOXICITY models using balanced training data.

In [None]:
# Launch both fine-tuning runs (baseline and SUBTLE_TOXICITY) in one call.
# NOTE(review): eval_texts/eval_labels were sampled from the same validation
# split that is passed as X_val/y_val, and the filtered Kaggle test split
# (test_eval) is never used. If the framework uses the validation data for
# model selection, the reported evaluation scores may be optimistic - confirm.
results, ft_manager = run_fine_tuning_experiment(
    # model and runtime
    tagger=tagger,
    device=device,
    labels=labels,
    # balanced training data
    X_train=X_train_balanced,
    y_train=y_train_balanced,
    # full (unbalanced) validation split
    X_val=X_val,
    y_val=y_val,
    # balanced evaluation subset
    eval_texts=eval_texts,
    eval_labels=eval_labels,
    # optimisation settings from the configuration cell
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    learning_rate=LEARNING_RATE,
)

🚀 STARTING FINE-TUNING EXPERIMENT
FineTuningManager initialized with unitary/toxic-bert on cuda

🔵 PHASE 1: BASELINE MODEL FINE-TUNING

PREPARING TRAINING DATA: BASELINE
Using baseline (no tagging)
Data prepared:
  Train: 8567 samples
  Validation: 15958 samples

FINE-TUNING MODEL: BASELINE


W0807 14:16:57.250000 25456 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.



Epoch 1/3
----------------------------------------


Training:  17%|█▋        | 89/536 [07:48<38:55,  5.23s/it, loss=0.2489]

## 8. Analyze Results

In [None]:
# Extract key results from the experiment
baseline_auc = results['baseline']['results']['test_auc']
tagged_auc = results['subtle_toxicity']['results']['test_auc']
improvement = results['comparison']['overall']['improvement']
improvement_pct = results['comparison']['overall']['improvement_pct']

print(f"EXPERIMENT RESULTS SUMMARY")
print(f"={'='*50}")
print(f"Baseline AUC:           {baseline_auc:.4f}")
print(f"SUBTLE_TOXICITY AUC:    {tagged_auc:.4f}")
print(f"Improvement:            {improvement:+.4f} ({improvement_pct:+.2f}%)")

# Absolute AUC gain treated as meaningful. This is the same 0.005 cut-off the
# recommendation, sample-prediction and summary cells apply, so an improvement
# between 0.005 and 0.01 no longer gets "stick with baseline" here and
# "use SUBTLE_TOXICITY" further down.
MEANINGFUL_IMPROVEMENT = 0.005

# Determine result interpretation
if improvement > MEANINGFUL_IMPROVEMENT:
    print(f"RESULT: SUBTLE_TOXICITY tagging shows meaningful improvement!")
    print(f"Recommendation: Use the SUBTLE_TOXICITY model for production")
elif improvement > -MEANINGFUL_IMPROVEMENT:
    print(f"RESULT: SUBTLE_TOXICITY tagging shows neutral results")
    print(f"Recommendation: Consider other tagging methods or stick with baseline")
else:
    print(f"RESULT: SUBTLE_TOXICITY tagging hurts performance")
    print(f"Recommendation: Use baseline model for production")

In [None]:
# Per-label breakdown of the baseline vs tagged comparison
print("PER-LABEL PERFORMANCE ANALYSIS")
print("=" + "=" * 60)

label_comparisons = results['comparison']['per_label']

# Largest improvement first, so the biggest winners head the table
sorted_labels = sorted(
    label_comparisons.items(),
    key=lambda entry: entry[1]['improvement'],
    reverse=True,
)

print(f"{'Label':<15} {'Baseline':<10} {'Tagged':<10} {'Improvement':<15} {'Status':<10}")
print('-' * 70)

for label, metrics in sorted_labels:
    baseline_val = metrics['baseline']
    tagged_val = metrics['tagged']
    improvement_val = metrics['improvement']
    improvement_pct_val = metrics['improvement_pct']

    # A change within +/-0.01 AUC counts as neutral
    status = (
        "Better" if improvement_val > 0.01
        else "Neutral" if improvement_val > -0.01
        else "Worse"
    )

    print(f"{label:<15} {baseline_val:<10.4f} {tagged_val:<10.4f} "
          f"{improvement_val:+.4f} ({improvement_pct_val:+.1f}%) {status:<10}")

In [None]:
# How each model handles comments that carry no toxicity label
baseline_neutral = results['baseline']['results']['neutral_analysis']
tagged_neutral = results['subtle_toxicity']['results']['neutral_analysis']

print("NEUTRAL COMMENT PERFORMANCE")
print("=" + "=" * 40)
print(f"Total neutral comments: {baseline_neutral['total_neutral']}")
print("Baseline Model:")
print(f"  Correctly classified: {baseline_neutral['correct_neutral']}")
print(f"  Accuracy: {baseline_neutral['neutral_accuracy']:.4f}")
print(f"  False positive rate: {baseline_neutral['fp_rate']:.4f}")
print("SUBTLE_TOXICITY Model:")
print(f"  Correctly classified: {tagged_neutral['correct_neutral']}")
print(f"  Accuracy: {tagged_neutral['neutral_accuracy']:.4f}")
print(f"  False positive rate: {tagged_neutral['fp_rate']:.4f}")

# Accuracy delta (tagged minus baseline); only a shift larger than 0.01 in
# either direction is reported as a real change
neutral_accuracy_change = tagged_neutral['neutral_accuracy'] - baseline_neutral['neutral_accuracy']
if neutral_accuracy_change > 0.01:
    print(f"SUBTLE_TOXICITY improved neutral comment handling by {neutral_accuracy_change:+.3f}")
elif neutral_accuracy_change < -0.01:
    print(f"SUBTLE_TOXICITY slightly hurt neutral comment handling by {neutral_accuracy_change:.3f}")
else:
    print("Neutral comment performance is similar between models")

## 9. Training Visualizations

In [None]:
# Draw the training curves of both runs via the manager returned by the experiment
training_plot = ft_manager.plot_training_history(
    ['baseline', 'subtle_toxicity'],
)
plt.show()

In [None]:
# Create a summary comparison chart: overall AUC on the left, per-label
# AUC deltas on the right
fig, axes = plt.subplots(1, 2, figsize=(15, 6))

# Overall AUC comparison
methods = ['Baseline', 'SUBTLE_TOXICITY']
aucs = [baseline_auc, tagged_auc]
colors = ['skyblue', 'lightcoral']

bars = axes[0].bar(methods, aucs, color=colors, alpha=0.7, edgecolor='black')
axes[0].set_title('Overall AUC Comparison', fontsize=14, fontweight='bold')
axes[0].set_ylabel('AUC Score')
# Keep the zoomed 0.95-1.0 window when both scores fit inside it; otherwise
# lower the bottom limit so a bar below 0.95 is not clipped out of view.
auc_axis_floor = min(0.95, max(0.0, min(aucs) - 0.02))
axes[0].set_ylim(auc_axis_floor, 1.0)

# Add value labels on bars
for bar, auc in zip(bars, aucs):
    axes[0].text(bar.get_x() + bar.get_width()/2, bar.get_height() + 0.001, 
                f'{auc:.4f}', ha='center', va='bottom', fontweight='bold')

# Per-label improvements
label_names = list(label_comparisons.keys())
improvements = [label_comparisons[label]['improvement'] for label in label_names]

# Green for a gain, red for a loss (or no change)
bar_colors = ['green' if imp > 0 else 'red' for imp in improvements]
bars2 = axes[1].bar(range(len(label_names)), improvements, color=bar_colors, alpha=0.7)
axes[1].set_title('Per-Label AUC Improvements', fontsize=14, fontweight='bold')
axes[1].set_ylabel('AUC Improvement')
axes[1].set_xticks(range(len(label_names)))
axes[1].set_xticklabels(label_names, rotation=45, ha='right')
axes[1].axhline(y=0, color='black', linestyle='-', alpha=0.3)

# Add value labels: above the bar for gains, below it for losses
for bar, imp in zip(bars2, improvements):
    axes[1].text(bar.get_x() + bar.get_width()/2, bar.get_height() + (0.001 if imp > 0 else -0.003), 
                f'{imp:+.3f}', ha='center', va='bottom' if imp > 0 else 'top', fontsize=10)

plt.tight_layout()
plt.show()

## 10. Model Selection and Recommendations

In [None]:
# Pick the model to recommend and explain the choice
print("FINAL RECOMMENDATIONS")
print("=" + "=" * 50)

if improvement > 0.005:  # Meaningful improvement threshold
    best_model = results['subtle_toxicity']['model']
    print("RECOMMENDATION: Use SUBTLE_TOXICITY model")
    print("Benefits:")
    print(f"   - Overall AUC improvement: {improvement:+.4f} ({improvement_pct:+.2f}%)")

    # Three labels with the largest gain; only those above the threshold are listed
    best_improvements = sorted(
        label_comparisons.items(),
        key=lambda entry: entry[1]['improvement'],
        reverse=True,
    )[:3]
    print("   - Best improvements in:")
    for label, metrics in best_improvements:
        if not metrics['improvement'] > 0.005:
            continue
        print(f"     * {label}: +{metrics['improvement']:.4f} ({metrics['improvement_pct']:+.1f}%)")

    print("\n".join([
        "Next Steps:",
        "   1. Save the SUBTLE_TOXICITY model for production use",
        "   2. Consider testing on larger evaluation sets",
        "   3. Explore combining with other tagging methods",
    ]))

else:
    best_model = results['baseline']['model']
    print("RECOMMENDATION: Use baseline model")
    print("Analysis:")
    # Distinguish "no real difference" from "tagging made things worse"
    if improvement > -0.005:
        print(f"   - SUBTLE_TOXICITY shows neutral results ({improvement:+.4f})")
        print("   - No significant performance gain from tagging")
    else:
        print(f"   - SUBTLE_TOXICITY hurts performance ({improvement:+.4f})")
        print("   - Tagging may be adding noise to the model")

    print("\n".join([
        "Next Steps:",
        "   1. Use baseline model for production",
        "   2. Try other tagging methods (EXPLICIT_MARKERS, CONTEXTUAL_INTENSITY)",
        "   3. Consider ensemble approaches",
        "   4. Analyze why tagging didn't help",
    ]))

# NOTE(review): these locations are only printed here; nothing in this cell
# checks that the artifacts exist - confirm against the fine-tuning module.
print("\n".join([
    "Model Artifacts:",
    "   - Baseline model saved as: fine_tuned_baseline_model/",
    "   - SUBTLE_TOXICITY model saved as: fine_tuned_subtle_toxicity_model/",
    "   - Training plots saved as: training_history_comparison.png",
    "   - MLflow experiment: Toxic_BERT_Fine_Tuning_Balanced",
]))

## 11. Optional: Test Best Model on Sample Texts

In [None]:
# Score a handful of sample texts, ranging from friendly to clearly toxic
sample_texts = [
    "This is a great article, thanks for sharing!",
    "You are such an idiot, go kill yourself",
    "I disagree with your opinion but respect your right to have it",
    "What a stupid waste of time this article is",
    "Can you please provide more information about this topic?"
]

print(f"TESTING BEST MODEL ON SAMPLE TEXTS")
print(f"={'='*60}")

# Determine which model the experiment selected (same 0.005 threshold as the
# recommendation cell)
model_name = 'subtle_toxicity' if improvement > 0.005 else 'baseline'
print(f"Selected model: {model_name.upper()}")
# The scores are produced by `tagger`, not by the selected checkpoint, so the
# output must not claim that the selected model made these predictions.
print("NOTE: the scores below come from tagger.predict(), not from the saved fine-tuned checkpoint\n")

# Use the tagger's predict method for demonstration
# In practice, you would load the fine-tuned model
# NOTE(review): whether `tagger` holds fine-tuned weights after the experiment
# is not visible from this notebook - load the selected model before relying
# on these numbers.
predictions = tagger.predict(sample_texts)

for i, (text, pred) in enumerate(zip(sample_texts, predictions)):
    print(f"Text {i+1}: {text}")
    print(f"Predictions:")
    for j, label in enumerate(labels):
        # assumes pred is indexable by label position, in `labels` order - TODO confirm
        score = pred[j]
        status = "TOXIC" if score > 0.5 else "SAFE"
        print(f"  {label:15}: {score:.3f} ({status})")
    print(f"{'-'*50}\n")

## 12. Experiment Summary

In [None]:
# One-screen recap of the setup, the configuration and the headline numbers,
# emitted in a single write
print("\n".join([
    "EXPERIMENT SUMMARY",
    "=" + "=" * 50,
    "Experiment: Baseline vs SUBTLE_TOXICITY Fine-tuning with Balanced Training",
    "Dataset: Kaggle Toxic Comment Classification",
    "Base Model: unitary/toxic-bert",
    f"Original Training Samples: {len(X_train):,}",
    f"Balanced Training Samples: {len(X_train_balanced):,}",
    f"Validation Samples: {len(X_val):,}",
    f"Evaluation Samples: {len(eval_texts):,}",
    "Training Configuration:",
    f"  Epochs: {EPOCHS}",
    f"  Batch Size: {BATCH_SIZE}",
    f"  Learning Rate: {LEARNING_RATE}",
    f"  Device: {device}",
    "Results:",
    f"  Baseline AUC: {baseline_auc:.4f}",
    f"  SUBTLE_TOXICITY AUC: {tagged_auc:.4f}",
    f"  Improvement: {improvement:+.4f} ({improvement_pct:+.2f}%)",
    # Same 0.005 threshold as the recommendation cell
    f"Conclusion: {'SUBTLE_TOXICITY is better' if improvement > 0.005 else 'Baseline is better or equivalent'}",
    "Experiment completed successfully!",
]))

## 13. Additional Analysis (Optional)

In [None]:
# Placeholder cell: no analysis is computed here, it only lists follow-up
# ideas that would need example-level comparison of the two models
print("\n".join([
    "ADDITIONAL ANALYSIS: Comment Types That Benefit From Tagging",
    "=" * 65,
    "This section could include:",
    "  - Examples where SUBTLE_TOXICITY model performs better",
    "  - Analysis of comment length vs tagging effectiveness",
    "  - Correlation between tagging frequency and improvement",
    "  - Error analysis comparing both models",
    "For now, refer to the detailed results above for insights.",
]))