<a href="https://colab.research.google.com/github/montben/ContentModAPI/blob/main/notebooks/colab_training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Muzzle Content Moderation - Model Training

This notebook trains a BERT model for multi-label content moderation using your collected datasets.

## Training Pipeline:
1. 📊 Merge and prepare datasets
2. 🤖 Fine-tune BERT model 
3. 📈 Evaluate performance
4. 💾 Save trained model


In [None]:
# Prerequisites: Run colab_setup.ipynb first!
# This assumes your project is already loaded and dependencies installed

import os
import sys
import pandas as pd
import numpy as np
import torch
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns

# If the 'scripts' folder is not visible from the current directory,
# switch to the location where the repository is expected on Colab.
if not Path('scripts').exists():
    print("📁 Changing to project directory...")
    os.chdir('/content/ContentModAPI')

# Put the project root at the front of sys.path so that the
# `scripts.*` packages imported below can be resolved.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.insert(0, project_root)

# Show where we ended up and what is there, as a quick sanity check.
print(f"📁 Working directory: {project_root}")
print(f"📂 Contents: {os.listdir(project_root)}")

# Import your project modules
from scripts.preprocessing.label_schema import (
    LABEL_SCHEMA, LABEL_NAMES, LABEL_DESCRIPTIONS,
    create_label_vector, vector_to_labels
)
from scripts.training.train_bert import MultiLabelTrainer

# Report that the imports worked and whether a CUDA device is visible to torch.
print("✅ Imports successful!")
print("🚀 GPU available: {}".format(torch.cuda.is_available()))
print("🎯 Ready for training!")


## Step 1: Data Preparation

First, let's merge all our collected datasets into a unified training format.


In [None]:
# Inline re-implementation of the merge_datasets.py functionality for Colab
from sklearn.model_selection import train_test_split
import json

def merge_datasets():
    """Merge every collected dataset into one DataFrame with train/val/test splits.

    Each sub-directory of ``data/datasets`` that contains a
    ``processed_data.csv`` is loaded, tagged with a ``dataset_source`` column
    (the directory name) and concatenated. Label columns named in
    ``LABEL_SCHEMA`` that a dataset does not provide are filled with ``False``
    for that dataset. The combined rows are then split 80/10/10 into
    train/val/test, stratified by ``dataset_source``, and a ``split`` column
    records the assignment.

    Returns:
        pandas.DataFrame: all samples (train rows first, then val, then test)
        with a fresh integer index and the extra ``dataset_source`` and
        ``split`` columns.

    Raises:
        FileNotFoundError: if ``data/datasets`` does not exist or none of its
            sub-directories contains a ``processed_data.csv``.
    """

    data_dir = Path("data/datasets")
    label_columns = list(LABEL_SCHEMA.keys())

    # Sort the directories: iterdir() yields entries in arbitrary,
    # filesystem-dependent order, and the seeded splits below are only
    # reproducible if the rows are always concatenated in the same order.
    dataset_dirs = sorted(
        (d for d in data_dir.iterdir() if d.is_dir()),
        key=lambda d: d.name,
    )
    print(f"Found datasets: {[d.name for d in dataset_dirs]}")

    all_data = []
    for dataset_dir in dataset_dirs:
        processed_file = dataset_dir / "processed_data.csv"
        if not processed_file.exists():
            print(f"  ❌ No processed_data.csv in {dataset_dir.name}")
            continue

        print(f"Loading {dataset_dir.name}...")
        df = pd.read_csv(processed_file)
        df['dataset_source'] = dataset_dir.name

        # Fill absent label columns per dataset, *before* concatenating.
        # Doing it only on the combined frame would miss labels that some
        # datasets have and others lack: pd.concat leaves NaN in those rows.
        missing_cols = [col for col in label_columns if col not in df.columns]
        if missing_cols:
            print(f"  ⚠️  Adding missing label columns: {missing_cols}")
            for col in missing_cols:
                df[col] = False

        all_data.append(df)
        print(f"  ✅ {len(df):,} samples from {dataset_dir.name}")

    if not all_data:
        raise FileNotFoundError("No datasets found! Run data collection first.")

    # Combine all datasets
    combined_df = pd.concat(all_data, ignore_index=True)
    print(f"\n📊 Combined dataset: {len(combined_df):,} total samples")

    # Create train/val/test splits
    print("Creating train/val/test splits...")

    # First split: 80% train, 20% held out. Stratifying on the source keeps
    # every dataset represented proportionally in each split.
    train_df, temp_df = train_test_split(
        combined_df, test_size=0.2, random_state=42,
        stratify=combined_df['dataset_source']
    )

    # Second split: halve the held-out 20% into 10% val and 10% test.
    val_df, test_df = train_test_split(
        temp_df, test_size=0.5, random_state=42,
        stratify=temp_df['dataset_source']
    )

    # Copy before adding the split column so we never write into a slice of
    # combined_df (avoids pandas' SettingWithCopy warning).
    train_df = train_df.copy()
    val_df = val_df.copy()
    test_df = test_df.copy()

    train_df['split'] = 'train'
    val_df['split'] = 'val'
    test_df['split'] = 'test'

    # Combine back into a single frame with a fresh index
    final_df = pd.concat([train_df, val_df, test_df], ignore_index=True)

    print("📈 Splits created:")
    print(f"  Train: {len(train_df):,} ({len(train_df)/len(final_df)*100:.1f}%)")
    print(f"  Val:   {len(val_df):,} ({len(val_df)/len(final_df)*100:.1f}%)")
    print(f"  Test:  {len(test_df):,} ({len(test_df)/len(final_df)*100:.1f}%)")

    return final_df

# Execute the merge. merged_data holds every sample in one DataFrame, with a
# 'split' column marking each row as 'train', 'val' or 'test'.
merged_data = merge_datasets()
