In [None]:
# ===============================================================================
# SECTION 1: INSTALLATIONS AND IMPORTS
# ===============================================================================

!pip install albumentations pillow cma scikit-learn matplotlib seaborn pandas numpy tqdm ipywidgets
!pip install --upgrade tokenizers
!pip install "transformers==4.33.2" "textattack" "sentence-transformers<=2.2.2" --upgrade
!pip install imbalanced-learn
!pip install easyocr
!pip install opencv-python-headless
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install git+https://github.com/openai/CLIP.git
!pip install accelerate

from google.colab import drive
drive.mount('/content/drive')

import os
import torch
import numpy as np
import pandas as pd
import json
import random
from PIL import Image
from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.feature_selection import SelectKBest, f_classif, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split, StratifiedShuffleSplit

from sklearn.metrics import (
    classification_report, confusion_matrix, f1_score,
    roc_auc_score, accuracy_score, precision_score,
    recall_score, matthews_corrcoef, balanced_accuracy_score,
    precision_recall_curve, roc_curve, auc
)

import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn.preprocessing import normalize

from transformers import Blip2Processor, Blip2Model
from transformers import (
    AutoTokenizer, AutoModel,
    RobertaTokenizer, RobertaModel,
    BertTokenizer, BertModel
)

try:
    import clip
    print("CLIP imported successfully")
except ImportError:
    print("Installing CLIP...")
    import subprocess
    import sys
    subprocess.check_call([sys.executable, "-m", "pip", "install", "git+https://github.com/openai/CLIP.git"])
    import clip
    print("CLIP installed and imported successfully")

import cma
from imblearn.over_sampling import SMOTE, ADASYN
import easyocr
import cv2

from IPython.display import display, HTML

print("All imports completed successfully")

Collecting cma
  Downloading cma-4.2.0-py3-none-any.whl.metadata (7.7 kB)
Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Downloading jedi-0.19.2-py2.py3-none-any.whl.metadata (22 kB)
Downloading cma-4.2.0-py3-none-any.whl (288 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m288.2/288.2 kB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading jedi-0.19.2-py2.py3-none-any.whl (1.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m82.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jedi, cma
Successfully installed cma-4.2.0 jedi-0.19.2
Collecting transformers==4.33.2
  Downloading transformers-4.33.2-py3-none-any.whl.metadata (119 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.9/119.9 kB[0m [31m10.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting textattack
  Downloading textattack-0.3.10-py3-none-any.whl.metadata (38 kB)
Collecting sentence-transformers<=2.2.2
  Downloadin

In [None]:
# ===============================================================================
# SECTION 2: CONFIGURATION AND CONSTANTS
# ===============================================================================

# Directory configuration: everything lives under one project folder on the mounted Drive.
BASE_DIR = "/content/drive/MyDrive/Colab_Notebooks/FinalYearProject"
DATA_DIR = os.path.join(BASE_DIR, "data")
IMG_DIR = os.path.join(DATA_DIR, "img")  # image files are resolved by basename inside this folder
FEATURES_DIR = os.path.join(BASE_DIR, "features")
RESULTS_DIR = os.path.join(BASE_DIR, "results")


# Create the output directories if they don't exist (the data directories are not created here).
for dir_path in [FEATURES_DIR, RESULTS_DIR]:
    os.makedirs(dir_path, exist_ok=True)

# Pretrained model identifiers; these match the default arguments of the feature extractor classes.
BLIP2_MODEL = "Salesforce/blip2-opt-2.7b"
CLIP_MODEL = "ViT-B/32"
TEXT_MODEL = "roberta-base"

# OCR settings (read by extract_shared_ocr)
OCR_CONFIDENCE_THRESHOLD = 0.75  # a detection is kept only if its confidence is strictly above this
OCR_MAX_WORDS = 20  # cap on the number of kept OCR text segments (a segment may hold several words)
USE_OCR = True  # when False, OCR is skipped and an empty string is used instead

# CMA-ES configuration
# NOTE(review): presumably initial step size, population size and iteration cap — confirm against the optimizer cell.
CMAES_SIGMA = 0.3
CMAES_POPSIZE = 8
CMAES_MAXITER = 12

# Feature processing settings
# NOTE(review): none of these four are read in the cells shown here — confirm where they are consumed.
FEATURE_DIM_TARGET = 512
USE_FEATURE_SELECTION = True
USE_SMOTE = True
HATE_CLASS_WEIGHT = 3.0

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
RANDOM_SEED = 42  # passed to set_all_seeds

print(f"Configuration completed. Device: {DEVICE}")


Configuration completed. Device: cuda


In [None]:
# ===============================================================================
# SECTION 3: UTILITY FUNCTIONS AND SEED SETTING
# ===============================================================================

def set_all_seeds(seed=42):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy and PyTorch (CPU plus current/all CUDA
    devices), switches cuDNN to deterministic mode with benchmarking off, and
    stores the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to all generators.
    """
    # Same generators, same order as a straight sequence of calls.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    # Deterministic cuDNN kernels, no autotuning.
    cudnn_backend = torch.backends.cudnn
    cudnn_backend.deterministic = True
    cudnn_backend.benchmark = False

    os.environ['PYTHONHASHSEED'] = str(seed)
    print(f"All seeds set to {seed}")

def safe_image_load(img_path):
    """Load an image from disk as an RGB PIL image without raising.

    Args:
        img_path: Filesystem path of the image to open.

    Returns:
        A fully decoded RGB ``PIL.Image.Image``, or ``None`` if the path does
        not exist or the file cannot be opened/decoded (a message is printed
        in either case).
    """
    try:
        if not os.path.exists(img_path):
            print(f"Image not found: {img_path}")
            return None
        # Image.open is lazy and keeps the file handle open. convert() decodes
        # the pixels into an independent image, so the handle can be closed
        # deterministically as soon as the conversion is done.
        with Image.open(img_path) as raw_image:
            return raw_image.convert("RGB")
    except Exception as e:
        # Best-effort loader: callers treat None as "skip this sample".
        print(f"Error loading image {img_path}: {e}")
        return None

def load_jsonl_safe(path):
    """Read a JSON Lines file into a DataFrame, never raising.

    Args:
        path: Location of the ``.jsonl`` file (one JSON object per line).

    Returns:
        The parsed ``pandas.DataFrame``; an empty DataFrame if anything goes
        wrong while reading (the error is printed).
    """
    try:
        frame = pd.read_json(path, lines=True)
        file_name = os.path.basename(path)
        print(f"Loaded {len(frame)} samples from {file_name}")
    except Exception as err:
        print(f"Error loading {path}: {err}")
        return pd.DataFrame()
    return frame

def validate_data(df, split_name):
    """Check that a split has the expected columns and report its contents.

    Args:
        df: DataFrame holding one dataset split.
        split_name: Label used in the printed messages.

    Returns:
        ``False`` when any of the ``img`` / ``text`` / ``label`` columns is
        absent; ``True`` otherwise. Null values are reported but do not fail
        validation. The label distribution is printed on success.
    """
    expected = ['img', 'text', 'label']
    absent = list(filter(lambda name: name not in df.columns, expected))
    if absent:
        print(f"Missing columns in {split_name}: {absent}")
        return False

    # Per-column null counts; only shown when at least one value is missing.
    null_counts = df[expected].isnull().sum()
    if null_counts.sum() > 0:
        print(f"Missing values in {split_name}:\n{null_counts}")

    distribution = df['label'].value_counts().sort_index()
    print(f"{split_name} label distribution:\n{distribution}")
    return True

# Apply the notebook-wide seed from the configuration cell.
set_all_seeds(RANDOM_SEED)

All seeds set to 42


In [None]:
# ===============================================================================
# SECTION 4: HATE-SPECIFIC FEATURE EXTRACTION
# ===============================================================================

class HateFeatureExtractor:
    """Hand-crafted lexical features for hateful-meme text.

    The extractor turns the meme caption (optionally combined with OCR text)
    into a fixed-length numeric vector built from simple counts and flags:
    length and punctuation statistics, vocabulary hits for several term
    lists, and a few coarse phrasing patterns.

    Vocabulary matching is substring-based on the lower-cased text, so a term
    also matches inside longer words (e.g. ``'terrorist'`` hits
    ``'terrorists'``, and ``'no'`` hits ``'know'``).
    """

    # Length of the vector returned by extract_features: shorter vectors are
    # zero-padded, longer ones truncated.
    FEATURE_DIM = 25

    def __init__(self):
        """Build the term lists used by the feature functions."""
        # Comprehensive hate speech indicators
        self.hate_keywords = [
            'illegal', 'terrorist', 'invasion', 'destroy', 'criminal', 'violence',
            'attack', 'threat', 'dangerous', 'enemy', 'savage', 'primitive',
            'backwards', 'inferior', 'subhuman', 'vermin', 'plague', 'disease',
            'cancer', 'virus', 'parasite', 'leech', 'scum', 'trash', 'garbage'
        ]

        self.religious_terms = [
            'muslim', 'islam', 'islamic', 'christian', 'jewish', 'jew', 'hindu',
            'buddhist', 'religion', 'religious', 'mosque', 'church', 'synagogue',
            'temple', 'allah', 'god', 'jesus', 'mohammed', 'prophet', 'holy'
        ]

        self.racial_terms = [
            'race', 'racial', 'ethnic', 'ethnicity', 'color', 'skin', 'black',
            'white', 'asian', 'hispanic', 'latino', 'african', 'european',
            'american', 'mexican', 'chinese', 'indian', 'arab', 'nationality'
        ]

        self.political_terms = [
            'liberal', 'conservative', 'democrat', 'republican', 'leftist',
            'rightist', 'communist', 'fascist', 'socialist', 'capitalist',
            'progressive', 'traditional', 'government', 'politics', 'political'
        ]

        self.intensity_words = [
            'very', 'extremely', 'totally', 'completely', 'absolutely', 'definitely',
            'always', 'never', 'all', 'none', 'every', 'no', 'only', 'just'
        ]

        self.negative_emotions = [
            'hate', 'angry', 'furious', 'disgusted', 'disgusting', 'horrible',
            'terrible', 'awful', 'pathetic', 'stupid', 'dumb', 'idiotic',
            'moronic', 'ridiculous', 'absurd', 'insane', 'crazy', 'sick'
        ]

        print("Hate Feature Extractor initialized with comprehensive vocabularies")

    @staticmethod
    def _count_present(vocabulary, text_lower):
        """Return how many vocabulary terms occur (as substrings) in text_lower."""
        return sum(1 for term in vocabulary if term in text_lower)

    def extract_linguistic_features(self, text):
        """Compute the 18 linguistic features for a piece of text.

        Args:
            text: Raw (original-case) text to analyse.

        Returns:
            ``np.float32`` array of length 18, in this order: word count,
            all-caps word count, ``!`` count, ``?`` count, words containing
            special characters, hate-term hits, hate density, religious /
            racial / political / intensity / negative-emotion term hits,
            direct-address flag, group-reference flag, generalization flag,
            ``-ing`` word count, ``-ed`` word count, leading-question flag.
        """
        text_lower = text.lower()
        words = text_lower.split()
        # Case must be judged on the original tokens: after lower-casing,
        # str.isupper() can never be true and the feature would always be 0.
        original_words = text.split()

        features = []

        # Basic counts
        features.append(len(words))  # Text length
        features.append(sum(1 for w in original_words if w.isupper()))  # ALL CAPS words
        features.append(text.count('!'))  # Exclamation marks
        features.append(text.count('?'))  # Question marks
        features.append(len([w for w in words if any(c in w for c in '!@#$%^&*')]))  # Special chars

        # Hate indicators
        hate_count = self._count_present(self.hate_keywords, text_lower)
        features.append(hate_count)
        # max(..., 1) guards against division by zero on empty text.
        features.append(min(hate_count / max(len(words), 1), 1.0))  # Hate density

        # Category counts
        for vocabulary in (
            self.religious_terms,
            self.racial_terms,
            self.political_terms,
            self.intensity_words,
            self.negative_emotions,
        ):
            features.append(self._count_present(vocabulary, text_lower))

        # Pattern features
        features.append(1 if any(word in text_lower for word in ['you', 'your', 'you\'re']) else 0)  # Direct address
        features.append(1 if any(word in text_lower for word in ['they', 'them', 'their', 'those']) else 0)  # Group reference
        features.append(1 if any(word in text_lower for word in ['all', 'every', 'always', 'never']) else 0)  # Generalizations

        # Sentiment patterns
        features.append(len([w for w in words if w.endswith('ing')]))  # Action words
        features.append(len([w for w in words if w.endswith('ed')]))  # Past actions
        features.append(1 if text_lower.startswith(('why', 'how', 'what', 'when', 'where')) else 0)  # Questions

        return np.array(features, dtype=np.float32)

    def extract_features(self, image, text, ocr_text=""):
        """Build the full hate-specific feature vector for one sample.

        Args:
            image: Unused; accepted so the signature matches the other
                extractors in this notebook.
            text: Caption text of the sample.
            ocr_text: Text recovered from the image by OCR ("" if none).

        Returns:
            1-D NumPy array of length ``FEATURE_DIM``: the 18 linguistic
            features of ``text`` + ``ocr_text``, followed by OCR word count,
            OCR character count and an OCR hate-term flag, zero-padded at the
            end.
        """
        # Combine text and OCR
        full_text = f"{text} {ocr_text}".strip()

        # Extract linguistic features
        linguistic_features = self.extract_linguistic_features(full_text)

        # Add OCR-specific features
        if ocr_text:
            ocr_features = [
                len(ocr_text.split()),  # OCR word count
                len(ocr_text),  # OCR character count
                1 if any(word in ocr_text.lower() for word in self.hate_keywords) else 0,  # OCR hate
            ]
        else:
            ocr_features = [0, 0, 0]

        # Combine all features
        all_features = np.concatenate([linguistic_features, ocr_features])

        # Pad (or truncate) to the standard size
        target_size = self.FEATURE_DIM
        if len(all_features) < target_size:
            all_features = np.pad(all_features, (0, target_size - len(all_features)))
        elif len(all_features) > target_size:
            all_features = all_features[:target_size]

        return all_features

In [None]:
# ===============================================================================
# SECTION 5: SHARED OCR EXTRACTION
# ===============================================================================

def extract_shared_ocr(image):
    """Run OCR on an image once and return the cleaned, space-joined text.

    Args:
        image: Image to read (converted with ``np.array``), or ``None``.

    Returns:
        The accepted OCR segments joined by single spaces, or ``""`` when OCR
        is disabled, the image is ``None``, nothing usable was detected, or
        OCR raised (the error is printed).
    """
    if image is None or not USE_OCR:
        return ""

    try:
        # The reader is built lazily on first use and cached on the function
        # object so every later call reuses it.
        if not hasattr(extract_shared_ocr, 'ocr_reader'):
            print("Initializing shared OCR reader...")
            extract_shared_ocr.ocr_reader = easyocr.Reader(['en'], gpu=torch.cuda.is_available())
            print("Shared OCR reader initialized")

        detections = extract_shared_ocr.ocr_reader.readtext(np.array(image))

        rejected_prefixes = ('©', '™', '®', '@', '#')

        def _is_usable(segment):
            # Keep 2-49 character segments that are not pure digits and do not
            # start with a watermark/handle-style symbol.
            return (
                1 < len(segment) < 50
                and not segment.startswith(rejected_prefixes)
                and not segment.isdigit()
            )

        confident_segments = (
            raw_text.strip()
            for _bbox, raw_text, score in detections
            if score > OCR_CONFIDENCE_THRESHOLD
        )
        # Note: the cap applies to detected segments, each of which may
        # contain more than one word.
        kept_segments = [seg for seg in confident_segments if _is_usable(seg)][:OCR_MAX_WORDS]

        joined = " ".join(kept_segments)
        if not joined.strip():
            return ""
        return joined
    except Exception as err:
        print(f"OCR extraction failed: {err}")
        return ""



In [None]:
# ===============================================================================
# SECTION 6: OPTIMIZED FEATURE EXTRACTORS
# ===============================================================================

class OptimizedCLIPFeatureExtractor:
    """CLIP image+text embedder that consumes OCR text supplied by the caller."""

    def __init__(self, model_name="ViT-B/32"):
        """Load the named CLIP model and its preprocessing onto ``DEVICE``."""
        self.device = DEVICE
        loaded_model, loaded_preprocess = clip.load(model_name, device=self.device)
        self.model = loaded_model
        self.preprocess = loaded_preprocess
        self.model.eval()
        print(f"Optimized CLIP {model_name} Feature Extractor initialized")

    def extract_features(self, image, text, ocr_text=""):
        """Embed one sample with CLIP.

        Args:
            image: Image accepted by the CLIP preprocessing transform.
            text: Caption text.
            ocr_text: Pre-extracted OCR text; appended to ``text`` when it
                contains non-whitespace characters.

        Returns:
            1-D NumPy array holding the image embedding followed by the text
            embedding, L2-normalized as a single vector; ``None`` if the
            result contains NaN/Inf or any step raises (the error is printed).
        """
        try:
            # Caption plus OCR text when OCR produced anything.
            combined_text = f"{text} {ocr_text}".strip() if ocr_text.strip() else text

            pixel_batch = self.preprocess(image).unsqueeze(0).to(self.device)
            token_batch = clip.tokenize([combined_text], truncate=True).to(self.device)

            with torch.no_grad():
                image_vec = self.model.encode_image(pixel_batch).squeeze()
                text_vec = self.model.encode_text(token_batch).squeeze()
                # Normalization is applied to the concatenated vector, not to
                # each modality separately.
                joint_vec = F.normalize(torch.cat([image_vec, text_vec], dim=0), p=2, dim=0)
                features = joint_vec.cpu().numpy()

                if not np.isfinite(features).all():
                    return None

                return features

        except Exception as err:
            print(f"CLIP feature extraction failed: {err}")
            return None

class OptimizedTextOnlyFeatureExtractor:
    """Transformer text embedder that consumes OCR text supplied by the caller."""

    def __init__(self, model_name="roberta-base"):
        """Load tokenizer and encoder for ``model_name`` onto ``DEVICE``.

        RoBERTa and BERT checkpoints use their dedicated classes (selected by
        substring match on the name, RoBERTa first); anything else goes
        through the ``Auto*`` classes.
        """
        self.device = DEVICE
        self.model_name = model_name

        lowered_name = model_name.lower()
        if 'roberta' in lowered_name:
            tokenizer_cls, encoder_cls = RobertaTokenizer, RobertaModel
        elif 'bert' in lowered_name:
            tokenizer_cls, encoder_cls = BertTokenizer, BertModel
        else:
            tokenizer_cls, encoder_cls = AutoTokenizer, AutoModel

        self.tokenizer = tokenizer_cls.from_pretrained(model_name)
        self.model = encoder_cls.from_pretrained(model_name)

        self.model.to(self.device).eval()
        print(f"Optimized Text-only {model_name} Feature Extractor initialized")

    def extract_features(self, image, text, ocr_text=""):
        """Embed the caption (plus OCR text) with the text encoder.

        Args:
            image: Unused; present for signature parity with the other
                extractors.
            text: Caption text.
            ocr_text: Pre-extracted OCR text; appended to ``text`` when it
                contains non-whitespace characters.

        Returns:
            1-D NumPy array: the mean of the last hidden state over the token
            axis (or the pooler output if no last hidden state is exposed);
            ``None`` on NaN/Inf or if any step raises (the error is printed).
        """
        try:
            combined_text = f"{text} {ocr_text}".strip() if ocr_text.strip() else text

            encoded = self.tokenizer(
                combined_text,
                return_tensors="pt",
                max_length=512,
                truncation=True,
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**encoded)

                if hasattr(outputs, 'last_hidden_state'):
                    pooled = outputs.last_hidden_state.mean(dim=1)
                else:
                    pooled = outputs.pooler_output

                features = pooled.squeeze().cpu().numpy()

                if not np.isfinite(features).all():
                    return None

                return features

        except Exception as err:
            print(f"Text feature extraction failed: {err}")
            return None

class OptimizedBLIP2FeatureExtractor:
    """BLIP-2 multimodal embedder that consumes OCR text supplied by the caller."""

    def __init__(self, model_name="Salesforce/blip2-opt-2.7b"):
        """Load the BLIP-2 processor and model onto ``DEVICE`` in eval mode."""
        self.device = DEVICE
        self.processor = Blip2Processor.from_pretrained(model_name, use_fast=False)
        blip2 = Blip2Model.from_pretrained(model_name)
        blip2 = blip2.to(self.device)
        self.model = blip2.eval()
        print("Optimized BLIP2 Feature Extractor initialized")

    def extract_features(self, image, text, ocr_text=""):
        """Embed one image/text pair with BLIP-2.

        Args:
            image: Image passed to the BLIP-2 processor.
            text: Caption text.
            ocr_text: Pre-extracted OCR text; appended to ``text`` when it
                contains non-whitespace characters.

        Returns:
            1-D NumPy array: the Q-Former last hidden state averaged over its
            sequence axis, followed by the language model's final hidden state
            averaged the same way (or zeros shaped like the Q-Former vector if
            no language hidden states are returned). ``None`` on NaN/Inf or if
            any step raises (the error is printed).
        """
        try:
            combined_text = f"{text} {ocr_text}".strip() if ocr_text.strip() else text

            model_inputs = self.processor(
                images=image,
                text=combined_text,
                return_tensors="pt",
                truncation=True,
                max_length=77,
                padding=True
            ).to(self.device)

            with torch.no_grad():
                outputs = self.model(**model_inputs, output_hidden_states=True)

                # Q-Former summary vector.
                qformer_vec = outputs.qformer_outputs.last_hidden_state.mean(dim=1).squeeze()

                # Language-model summary vector, when hidden states are present.
                lm_outputs = getattr(outputs, 'language_model_outputs', None)
                lm_hidden = None
                if lm_outputs is not None:
                    lm_hidden = getattr(lm_outputs, 'hidden_states', None)

                if lm_hidden:
                    language_vec = lm_hidden[-1].mean(dim=1).squeeze()
                else:
                    language_vec = torch.zeros_like(qformer_vec)

                features = torch.cat([qformer_vec, language_vec], dim=0).cpu().numpy()

                if not np.isfinite(features).all():
                    return None

                return features

        except Exception as err:
            print(f"BLIP2 feature extraction failed: {err}")
            return None



In [None]:
# ===============================================================================
# SECTION 7: OPTIMIZED FEATURE EXTRACTION PIPELINE
# ===============================================================================

def extract_optimized_ensemble_features(df, extractors, split_name):
    """Run every extractor over a split, sharing one OCR pass per image.

    A sample is kept only if its image loads and *all* extractors return a
    finite feature vector; otherwise it is counted as failed and skipped.

    Args:
        df: DataFrame with at least ``img`` and ``text`` columns (``label`` is
            optional and defaults to ``None``).
        extractors: Mapping of model name to an object exposing
            ``extract_features(image, text, ocr_text)``.
        split_name: Label used in progress and summary messages.

    Returns:
        ``(final_features, valid_samples)``: a dict of model name to stacked
        feature array (models with no valid features are omitted), and a list
        of dicts with ``id`` (the DataFrame index), ``label``, ``img`` and
        ``text`` for each kept sample, in the same order as the feature rows.
    """
    per_model_rows = {name: [] for name in extractors.keys()}
    valid_samples = []
    failed_count = 0
    ocr_cache = {}  # image path -> OCR text, so repeated images are read once

    print(f"Extracting optimized ensemble features for {split_name}...")

    for row_id, row in tqdm(df.iterrows(), total=len(df), desc=f"{split_name} extraction"):
        # Only the file name from the dataset is used; the folder is IMG_DIR.
        img_path = os.path.join(IMG_DIR, os.path.basename(row['img']))
        text = str(row['text'])
        label = row.get('label', None)

        image = safe_image_load(img_path)
        if image is None:
            failed_count += 1
            continue

        if img_path in ocr_cache:
            ocr_text = ocr_cache[img_path]
        else:
            ocr_text = extract_shared_ocr(image)
            ocr_cache[img_path] = ocr_text

        # Stop at the first extractor that fails: a sample needs all of them.
        sample_vectors = {}
        for model_name, extractor in extractors.items():
            vector = extractor.extract_features(image, text, ocr_text)
            if vector is None or np.isnan(vector).any() or np.isinf(vector).any():
                sample_vectors = None
                break
            sample_vectors[model_name] = vector

        if not sample_vectors or len(sample_vectors) != len(extractors):
            failed_count += 1
            continue

        for model_name, vector in sample_vectors.items():
            per_model_rows[model_name].append(vector)

        valid_samples.append({
            'id': row_id,
            'label': label,
            'img': row['img'],
            'text': text
        })

    # Convert the per-model row lists to 2-D arrays.
    final_features = {}
    for model_name in extractors.keys():
        rows = per_model_rows[model_name]
        if rows:
            final_features[model_name] = np.stack(rows)
        else:
            print(f"No valid features extracted for {model_name}")

    print(f"{split_name} optimized extraction: {len(valid_samples)} samples, {failed_count} failed")
    print(f"   Feature shapes: {[(name, arr.shape) for name, arr in final_features.items()]}")

    return final_features, valid_samples



In [None]:
# ===============================================================================
# SECTION 8: PSEUDO-LABELING APPROACH
# ===============================================================================

def pseudo_label_approach(train_features, val_features, test_features, y_train, y_val,
                          confidence_threshold=0.85, min_pseudo_samples=100):
    """Augment the labelled data with high-confidence pseudo-labels from the test set.

    Train and validation BLIP-2 features are merged, near-constant feature
    columns are dropped, and everything is scaled with a ``RobustScaler`` fit
    on the merged labelled data. Three MLPs (seeds 42, 43, 44) are trained on
    it; their test predictions are combined by majority vote and their maximum
    softmax probabilities are averaged. Test rows whose averaged confidence
    exceeds ``confidence_threshold`` are appended to the training set with
    their voted label.

    NOTE(review): pseudo-labelled *test* rows end up in the returned training
    set, so a later evaluation on the same test set is not a clean held-out
    estimate.

    Args:
        train_features: Dict of feature arrays; only the ``'blip2'`` entry is used.
        val_features: Same, for the validation split.
        test_features: Same, for the test split.
        y_train: Labels for the training rows.
        y_val: Labels for the validation rows.
        confidence_threshold: Minimum averaged confidence (strictly greater)
            for a pseudo-label to be used.
        min_pseudo_samples: Augmentation happens only if strictly more than
            this many test rows pass the confidence filter.

    Returns:
        ``(X_train, y_train, X_test_scaled, scaler, useful_features)`` where
        ``X_train`` / ``y_train`` are the scaled labelled data (plus pseudo-
        labelled rows when augmentation happened), ``scaler`` is the fitted
        ``RobustScaler`` and ``useful_features`` is the boolean column mask
        applied before scaling.
    """

    print(" Pseudo-labeling approach...")

    # First, train a strong model on train+val combined
    X_all_train = np.vstack([train_features['blip2'], val_features['blip2']])
    y_all_train = np.hstack([y_train, y_val])
    X_test = test_features['blip2']

    print(f"Combined training data: {X_all_train.shape}")
    print(f"Test data for pseudo-labeling: {X_test.shape}")

    # Clean and scale: drop (near-)constant columns, then robust-scale.
    feature_vars = np.var(X_all_train, axis=0)
    useful_features = feature_vars > 1e-8
    X_all_clean = X_all_train[:, useful_features]
    X_test_clean = X_test[:, useful_features]

    scaler = RobustScaler()
    X_all_scaled = scaler.fit_transform(X_all_clean)
    X_test_scaled = scaler.transform(X_test_clean)

    # Train ensemble for pseudo-labeling
    class PseudoLabelModel(nn.Module):
        """Three-hidden-layer MLP with batch norm and dropout, 2-way output."""

        def __init__(self, input_dim):
            super().__init__()
            self.network = nn.Sequential(
                nn.Linear(input_dim, 1024),
                nn.BatchNorm1d(1024),
                nn.ReLU(),
                nn.Dropout(0.2),
                nn.Linear(1024, 512),
                nn.BatchNorm1d(512),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(512, 256),
                nn.BatchNorm1d(256),
                nn.ReLU(),
                nn.Dropout(0.4),
                nn.Linear(256, 2)
            )

        def forward(self, x):
            return self.network(x)

    # Train multiple models for ensemble pseudo-labeling
    pseudo_predictions = []
    pseudo_confidences = []
    batch_size = 64

    for seed in [42, 43, 44]:
        torch.manual_seed(seed)

        model = PseudoLabelModel(X_all_scaled.shape[1]).to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, weight_decay=1e-4)
        criterion = nn.CrossEntropyLoss()

        # Train on all available labeled data
        for epoch in range(30):
            model.train()
            # NumPy indices keep the array indexing unambiguous even for
            # single-element batches.
            indices = torch.randperm(len(X_all_scaled)).numpy()

            for start in range(0, len(indices), batch_size):
                batch_indices = indices[start:start + batch_size]
                # BatchNorm1d cannot compute batch statistics from a single
                # sample in train mode, so a trailing batch of one is skipped.
                if len(batch_indices) < 2:
                    continue
                batch_x = torch.tensor(X_all_scaled[batch_indices], dtype=torch.float32).to(DEVICE)
                batch_y = torch.tensor(y_all_train[batch_indices], dtype=torch.long).to(DEVICE)

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()

        # Generate pseudo-labels for test set
        model.eval()
        with torch.no_grad():
            test_tensor = torch.tensor(X_test_scaled, dtype=torch.float32).to(DEVICE)
            test_outputs = model(test_tensor)
            test_probs = torch.softmax(test_outputs, dim=1).cpu().numpy()
            test_preds = test_outputs.argmax(dim=1).cpu().numpy()
            test_conf = test_probs.max(axis=1)

        pseudo_predictions.append(test_preds)
        pseudo_confidences.append(test_conf)

    # Ensemble pseudo-labels with high confidence filtering.
    # With three binary voters the rounded mean is a majority vote.
    ensemble_preds = np.round(np.mean(pseudo_predictions, axis=0)).astype(int)
    ensemble_conf = np.mean(pseudo_confidences, axis=0)

    # Only use high-confidence pseudo-labels
    high_conf_mask = ensemble_conf > confidence_threshold

    print(f"High-confidence pseudo-labels: {high_conf_mask.sum()}/{len(ensemble_preds)} ({high_conf_mask.mean()*100:.1f}%)")

    if high_conf_mask.sum() > min_pseudo_samples:  # Only if we have enough
        # Add pseudo-labeled data to training
        X_pseudo = X_test_scaled[high_conf_mask]
        y_pseudo = ensemble_preds[high_conf_mask]

        X_augmented = np.vstack([X_all_scaled, X_pseudo])
        y_augmented = np.hstack([y_all_train, y_pseudo])

        print(f"Augmented training data: {X_augmented.shape}")
        print(f"Pseudo-label distribution: {np.bincount(y_pseudo)}")

        return X_augmented, y_augmented, X_test_scaled, scaler, useful_features
    else:
        print("Not enough high-confidence pseudo-labels, skipping augmentation")
        # Same tuple layout as the augmented branch, just without extra rows.
        return X_all_scaled, y_all_train, X_test_scaled, scaler, useful_features

In [None]:

# ===============================================================================
# SECTION 9: SMART VALIDATION SPLIT
# ===============================================================================

def smart_validation_split(X_all, y_all, n_splits=10):
    """Try several stratified 85/15 splits and return the best-scoring one.

    For each candidate split (random states 42, 43, ...) a small MLP is
    trained for 20 epochs on the training part, and the split is scored by
    the highest weighted F1 reached on its validation part in any epoch.

    Args:
        X_all: 2-D feature array.
        y_all: Label array aligned with ``X_all``.
        n_splits: Number of candidate splits to evaluate.

    Returns:
        ``(train_idx, val_idx)`` index arrays of the split with the highest
        score.
    """

    class QuickModel(nn.Module):
        """Small two-hidden-layer MLP used only to score a split."""

        def __init__(self, input_dim):
            super().__init__()
            self.network = nn.Sequential(
                nn.Linear(input_dim, 512),
                nn.ReLU(),
                nn.Dropout(0.3),
                nn.Linear(512, 256),
                nn.ReLU(),
                nn.Dropout(0.4),
                nn.Linear(256, 2)
            )

        def forward(self, x):
            return self.network(x)

    print(f"Testing {n_splits} different train/val splits...")
    split_results = []
    batch_size = 64

    for split_no in range(n_splits):
        # A different shuffle per candidate, always stratified on the labels.
        splitter = StratifiedShuffleSplit(n_splits=1, test_size=0.15, random_state=42 + split_no)
        train_idx, val_idx = next(splitter.split(X_all, y_all))

        X_fit, y_fit = X_all[train_idx], y_all[train_idx]
        X_hold, y_hold = X_all[val_idx], y_all[val_idx]

        # Every candidate starts from identical initial weights.
        torch.manual_seed(42)
        model = QuickModel(X_fit.shape[1]).to(DEVICE)
        optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4)
        criterion = nn.CrossEntropyLoss()

        hold_tensor = torch.tensor(X_hold, dtype=torch.float32).to(DEVICE)

        best_f1 = 0
        for epoch in range(20):
            # One pass over the training part in fixed (unshuffled) order.
            model.train()
            for start in range(0, len(X_fit), batch_size):
                stop = start + batch_size
                batch_x = torch.tensor(X_fit[start:stop], dtype=torch.float32).to(DEVICE)
                batch_y = torch.tensor(y_fit[start:stop], dtype=torch.long).to(DEVICE)

                optimizer.zero_grad()
                loss = criterion(model(batch_x), batch_y)
                loss.backward()
                optimizer.step()

            # Score this epoch on the held-out part and keep the running best.
            model.eval()
            with torch.no_grad():
                hold_preds = model(hold_tensor).argmax(dim=1).cpu().numpy()
                epoch_f1 = f1_score(y_hold, hold_preds, average='weighted')
                best_f1 = max(best_f1, epoch_f1)

        split_results.append((split_no, best_f1, train_idx, val_idx))
        print(f"Split {split_no+1}: F1 = {best_f1:.4f}")

    # Choose best split
    best_split_idx, best_split_f1, best_train_idx, best_val_idx = max(
        split_results, key=lambda entry: entry[1]
    )

    print(f"\nBest split: #{best_split_idx+1} with F1 = {best_split_f1:.4f}")

    return best_train_idx, best_val_idx

In [None]:
# ===============================================================================
# SECTION 10: CMA-ES OPTIMIZER FOR DATA-CENTRIC APPROACH
# ===============================================================================

class DataCentricCMAESOptimizer:
    """CMA-ES hyperparameter search that keeps the best models it trains.

    Every candidate vector is decoded into hyperparameters, a
    ``CMAESOptimizedModel`` is trained on ``(X_train, y_train)`` and scored
    with weighted F1 on ``(X_val, y_val)``. Trials whose best validation F1
    exceeds ``f1_threshold`` get a snapshot of their best weights appended to
    ``best_models``.
    """

    def __init__(self, X_train, X_val, y_train, y_val):
        """Store the data split and the search configuration.

        Args:
            X_train, X_val: feature matrices; indexed with ``.shape[1]`` and
                by row-index batches (presumably 2-D NumPy arrays — confirm
                against the caller).
            y_train, y_val: integer class labels (fed to ``np.bincount``).
        """
        self.X_train = X_train
        self.X_val = X_val
        self.y_train = y_train
        self.y_val = y_val
        self.device = DEVICE

        # Storage for best models during optimization
        self.best_models = []  # Store (model_state, f1_score, params, trial_info)
        self.f1_threshold = 0.77  # Save models above this threshold

        # Simplified parameter bounds for data-centric model.
        # NOTE: decode_parameters() hard-codes the same limits; keep both in sync.
        self.param_bounds = {
            'learning_rate': (1e-5, 1e-3),
            'dropout1': (0.1, 0.3),      # First dropout
            'dropout2': (0.2, 0.4),      # Second dropout
            'dropout3': (0.3, 0.5),      # Third dropout
            'hidden1': (512, 1536),      # First hidden layer
            'hidden2': (256, 768),       # Second hidden layer
            'hidden3': (128, 384),       # Third hidden layer
            'batch_size': (32, 128),
            'weight_decay': (1e-6, 1e-3),
            'epochs': (30, 80)
        }

        # Initial parameter values (current best-known), in the order expected
        # by decode_parameters().
        self.initial_params = [
            -4.15,  # log10(learning_rate) = 7e-5
            0.15,   # dropout1
            0.25,   # dropout2
            0.35,   # dropout3
            1024,   # hidden1
            512,    # hidden2
            256,    # hidden3
            64,     # batch_size
            -4.3,   # log10(weight_decay) = 5e-5
            50      # epochs
        ]

        print("Data-Centric CMA-ES Optimizer initialized with model saving")

    def decode_parameters(self, raw_params):
        """Convert a raw CMA-ES vector into clipped, typed hyperparameters.

        Args:
            raw_params: sequence of 10 numbers ordered as
                (log10 lr, dropout1-3, hidden1-3, batch size, log10 weight
                decay, epochs).

        Returns:
            dict of hyperparameters. Learning rate and weight decay are
            exponentiated from log10 space; sizes and epochs are truncated to
            ``int`` after clipping.
        """
        log_lr, d1, d2, d3, h1, h2, h3, batch, log_wd, epochs = raw_params

        return {
            'learning_rate': 10 ** np.clip(log_lr, -5, -3),
            'dropout1': np.clip(d1, 0.1, 0.3),
            'dropout2': np.clip(d2, 0.2, 0.4),
            'dropout3': np.clip(d3, 0.3, 0.5),
            'hidden1': int(np.clip(h1, 512, 1536)),
            'hidden2': int(np.clip(h2, 256, 768)),
            'hidden3': int(np.clip(h3, 128, 384)),
            'batch_size': int(np.clip(batch, 32, 128)),
            'weight_decay': 10 ** np.clip(log_wd, -6, -3),
            'epochs': int(np.clip(epochs, 30, 80))
        }

    def objective_function(self, raw_params, trial_num=None, generation=None):
        """Train one candidate and return its negated best validation F1.

        Args:
            raw_params: raw CMA-ES vector (see ``decode_parameters``).
            trial_num: index of the candidate within its generation; enables
                progress printing when given.
            generation: generation number, recorded with saved models.

        Returns:
            ``-best_val_f1`` (CMA-ES minimizes). Any exception raised during
            the trial is reported and scored as ``-0.0`` so that a single bad
            candidate does not stop the search.
        """
        try:
            params = self.decode_parameters(raw_params)

            if trial_num is not None:
                print(f"Trial {trial_num}: LR={params['learning_rate']:.2e}, "
                      f"H=[{params['hidden1']},{params['hidden2']},{params['hidden3']}], "
                      f"D=[{params['dropout1']:.2f},{params['dropout2']:.2f},{params['dropout3']:.2f}]")

            # Create model with optimized parameters
            model = CMAESOptimizedModel(self.X_train.shape[1], params).to(self.device)

            # Setup training
            optimizer = torch.optim.AdamW(
                model.parameters(),
                lr=params['learning_rate'],
                weight_decay=params['weight_decay']
            )

            # Enhanced class weighting
            class_counts = np.bincount(self.y_train)
            weights = len(self.y_train) / (2 * class_counts)
            weights[1] *= 1.05  # Slight hate bias
            class_weights = torch.tensor(weights, dtype=torch.float32).to(self.device)
            criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.02)

            # The validation inputs never change within a trial: build them once
            # instead of on every validation step.
            val_tensor = torch.tensor(self.X_val, dtype=torch.float32).to(self.device)

            # Training loop
            best_val_f1 = 0
            best_model_state = None
            patience_counter = 0
            max_patience = 8
            diverged = False

            for epoch in range(params['epochs']):
                model.train()

                # Training
                indices = torch.randperm(len(self.X_train))
                for i in range(0, len(indices), params['batch_size']):
                    batch_indices = indices[i:i + params['batch_size']]

                    # BatchNorm1d cannot compute batch statistics from a single
                    # sample in train mode and raises; without this guard an
                    # unlucky batch size would make the whole trial score 0.
                    if len(batch_indices) < 2:
                        continue

                    batch_x = torch.tensor(self.X_train[batch_indices], dtype=torch.float32).to(self.device)
                    batch_y = torch.tensor(self.y_train[batch_indices], dtype=torch.long).to(self.device)

                    optimizer.zero_grad()
                    outputs = model(batch_x)
                    loss = criterion(outputs, batch_y)

                    if torch.isnan(loss) or torch.isinf(loss):
                        diverged = True
                        break

                    loss.backward()
                    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                    optimizer.step()

                # A non-finite loss means this run is unusable; stop the trial
                # rather than only the current epoch.
                if diverged:
                    break

                # Validation every 5 epochs to save time
                if epoch % 5 == 0 or epoch == params['epochs'] - 1:
                    model.eval()
                    with torch.no_grad():
                        val_outputs = model(val_tensor)
                        val_predictions = val_outputs.argmax(dim=1).cpu().numpy()

                    val_f1 = f1_score(self.y_val, val_predictions, average='weighted')

                    if val_f1 > best_val_f1:
                        best_val_f1 = val_f1
                        # state_dict() hands back references to the live tensors
                        # and dict.copy() is shallow, so clone every tensor to
                        # get a real snapshot of the best weights (kept on CPU
                        # so saved models do not pin GPU memory).
                        best_model_state = {
                            name: tensor.detach().cpu().clone()
                            for name, tensor in model.state_dict().items()
                        }
                        patience_counter = 0
                    else:
                        patience_counter += 1
                        if patience_counter >= max_patience:
                            break

            # Save model if it's good enough
            if best_val_f1 > self.f1_threshold and best_model_state is not None:
                model_info = {
                    'model_state': best_model_state,
                    'f1_score': best_val_f1,
                    'params': params.copy(),
                    'trial_info': {
                        'generation': generation,
                        'trial': trial_num,
                        'raw_params': raw_params.copy()
                    }
                }
                self.best_models.append(model_info)
                print(f" SAVED MODEL: F1={best_val_f1:.4f} (Total saved: {len(self.best_models)})")

            if trial_num is not None:
                print(f"  → F1 Score: {best_val_f1:.4f}")

            # Cleanup
            del model
            torch.cuda.empty_cache()

            return -best_val_f1  # CMA-ES minimizes

        except Exception as e:
            # Deliberate best-effort: report the failure and give the candidate
            # the worst possible score so the search can continue.
            print(f"Error in trial {trial_num}: {e}")
            return -0.0

    def get_best_ensemble_models(self, top_k=5):
        """Return up to ``top_k`` saved models, best validation F1 first.

        Returns:
            list of model-info dicts (keys ``model_state``, ``f1_score``,
            ``params``, ``trial_info``), or ``None`` when nothing was saved.
        """
        if len(self.best_models) == 0:
            print(" No models were saved during optimization!")
            return None

        # Sort by F1 score descending
        sorted_models = sorted(self.best_models, key=lambda x: x['f1_score'], reverse=True)

        # Take top K models
        top_models = sorted_models[:min(top_k, len(sorted_models))]

        print(f"\n Selected top {len(top_models)} models from optimization:")
        for i, model_info in enumerate(top_models):
            print(f"   Model {i+1}: F1={model_info['f1_score']:.4f} "
                  f"(Gen {model_info['trial_info']['generation']}, "
                  f"Trial {model_info['trial_info']['trial']})")

        return top_models

    def optimize(self, max_generations=CMAES_MAXITER, population_size=CMAES_POPSIZE):
        """Run the CMA-ES loop and report the best hyperparameters found.

        Args:
            max_generations: value passed as CMA-ES ``maxiter``.
            population_size: value passed as CMA-ES ``popsize``.

        Returns:
            tuple ``(best_params, best_f1, per_generation_best_f1)`` where
            ``best_params`` is the decoded dict of the best candidate seen.
        """
        print("🔬 Starting CMA-ES hyperparameter optimization with model saving...")
        print(f"   Generations: {max_generations}, Population: {population_size}")
        print(f"   Saving models with F1 > {self.f1_threshold}")

        # Initialize CMA-ES
        # NOTE(review): one CMAES_SIGMA is shared by coordinates whose scales
        # differ by orders of magnitude (log-lr around -4, hidden sizes around
        # 1000). Per-coordinate scaling (e.g. the 'CMA_stds' option) may be
        # what is wanted here — confirm.
        es = cma.CMAEvolutionStrategy(self.initial_params, CMAES_SIGMA, {
            'popsize': population_size,
            'maxiter': max_generations,
            'verb_disp': 1,
            'verb_log': 0,
            'tolfun': 1e-6,
            'tolx': 1e-8
        })

        generation_best_scores = []
        generation_best_params = []
        best_overall_f1 = 0.0

        generation = 0
        while not es.stop():
            generation += 1
            print(f"\n CMA-ES Generation {generation}/{max_generations}")
            print("=" * 60)

            solutions = es.ask()
            scores = []

            for i, params in enumerate(solutions):
                score = self.objective_function(params, trial_num=i+1, generation=generation)
                scores.append(score)

            es.tell(solutions, scores)

            # Track best solution (scores are negated F1, so the minimum wins)
            best_idx = np.argmin(scores)
            best_score_this_gen = -scores[best_idx]
            best_params_this_gen = solutions[best_idx]

            generation_best_scores.append(best_score_this_gen)
            generation_best_params.append(best_params_this_gen)

            if best_score_this_gen > best_overall_f1:
                best_overall_f1 = best_score_this_gen
                print(f" NEW BEST F1 SCORE: {best_score_this_gen:.4f}")
            else:
                print(f"Best F1 this generation: {best_score_this_gen:.4f} (overall best: {best_overall_f1:.4f})")

            print(f"Generation {generation} Summary:")
            print(f"   Best: {best_score_this_gen:.4f}, "
                  f"Avg: {-np.mean(scores):.4f}, "
                  f"Worst: {-np.max(scores):.4f}, "
                  f"Models saved: {len(self.best_models)}")

        # If the strategy stopped before evaluating any generation there is
        # nothing to take an argmax over; fall back to the initial guess
        # instead of raising on an empty sequence.
        if not generation_best_scores:
            print("CMA-ES stopped before evaluating any generation; returning initial parameters")
            return self.decode_parameters(self.initial_params), 0.0, generation_best_scores

        # Find overall best parameters
        overall_best_idx = np.argmax(generation_best_scores)
        overall_best_score = generation_best_scores[overall_best_idx]
        overall_best_params_raw = generation_best_params[overall_best_idx]
        overall_best_params = self.decode_parameters(overall_best_params_raw)

        print(f"\n CMA-ES Optimization completed!")
        print(f"   Best F1 Score: {overall_best_score:.4f}")
        print(f"   Models saved during optimization: {len(self.best_models)}")
        print(f"   Best Parameters: {overall_best_params}")

        return overall_best_params, overall_best_score, generation_best_scores

In [None]:
# ===============================================================================
# SECTION 11: OPTIMIZED MODEL ARCHITECTURE
# ===============================================================================

class CMAESOptimizedModel(nn.Module):
    """Feed-forward classifier whose layer widths and dropout rates come from ``params``.

    The network stacks three ``Linear -> BatchNorm1d -> ReLU -> Dropout``
    stages followed by a final ``Linear`` layer producing 2 logits.

    Args:
        input_dim: number of input features.
        params: mapping with keys ``hidden1``..``hidden3`` (layer widths) and
            ``dropout1``..``dropout3`` (dropout probabilities).
    """

    def __init__(self, input_dim, params):
        super().__init__()

        # (width, dropout) for each hidden stage, in forward order.
        stage_specs = (
            (params['hidden1'], params['dropout1']),
            (params['hidden2'], params['dropout2']),
            (params['hidden3'], params['dropout3']),
        )

        layers = []
        fan_in = input_dim
        for width, drop_rate in stage_specs:
            layers.append(nn.Linear(fan_in, width))
            layers.append(nn.BatchNorm1d(width))
            layers.append(nn.ReLU())
            layers.append(nn.Dropout(drop_rate))
            fan_in = width

        # Output head: two logits.
        layers.append(nn.Linear(fan_in, 2))

        self.network = nn.Sequential(*layers)

    def forward(self, x):
        """Return the raw (un-normalised) class logits for ``x``."""
        logits = self.network(x)
        return logits

In [None]:
# ===============================================================================
# SECTION 12: METRICS AND EVALUATION
# ===============================================================================

def calculate_all_metrics(y_true, y_pred, y_proba=None, class_names=['Non-Hate', 'Hate']):
    """Calculate comprehensive evaluation metrics.

    Args:
        y_true: ground-truth integer labels; class ``i`` is assumed to be
            described by ``class_names[i]``.
        y_pred: predicted integer labels, same length as ``y_true``.
        y_proba: optional scores for the positive class; used for ROC AUC.
        class_names: display names, one per class, ordered by label value.
            (The default list is never mutated.)

    Returns:
        dict with ``accuracy``, ``balanced_accuracy``, ``f1_weighted``,
        ``f1_macro``, ``f1_micro``, ``matthews_corrcoef``, per-class
        ``precision_<name>`` / ``recall_<name>`` / ``f1_<name>`` (names
        lower-cased), and ``roc_auc`` when ``y_proba`` is given and
        ``y_true`` contains exactly two distinct labels.
    """
    metrics = {}

    # Basic metrics
    metrics['accuracy'] = accuracy_score(y_true, y_pred)
    metrics['balanced_accuracy'] = balanced_accuracy_score(y_true, y_pred)
    metrics['f1_weighted'] = f1_score(y_true, y_pred, average='weighted')
    metrics['f1_macro'] = f1_score(y_true, y_pred, average='macro')
    metrics['f1_micro'] = f1_score(y_true, y_pred, average='micro')

    # Per-class metrics.
    # Without an explicit ``labels`` list, ``average=None`` only returns scores
    # for labels that occur in the data, so a missing class would shift every
    # later score onto the wrong class name. Pinning the label set keeps
    # position i tied to class i (absent classes score 0 via zero_division).
    class_labels = list(range(len(class_names)))
    precision_per_class = precision_score(
        y_true, y_pred, labels=class_labels, average=None, zero_division=0
    )
    recall_per_class = recall_score(
        y_true, y_pred, labels=class_labels, average=None, zero_division=0
    )
    f1_per_class = f1_score(
        y_true, y_pred, labels=class_labels, average=None, zero_division=0
    )

    for i, class_name in enumerate(class_names):
        metrics[f'precision_{class_name.lower()}'] = precision_per_class[i]
        metrics[f'recall_{class_name.lower()}'] = recall_per_class[i]
        metrics[f'f1_{class_name.lower()}'] = f1_per_class[i]

    # Matthews correlation coefficient
    metrics['matthews_corrcoef'] = matthews_corrcoef(y_true, y_pred)

    # Probability-based metrics (ROC AUC is undefined with a single class)
    if y_proba is not None:
        if len(np.unique(y_true)) == 2:
            metrics['roc_auc'] = roc_auc_score(y_true, y_proba)

    return metrics


In [None]:

# ===============================================================================
# SECTION 13: MAIN EXECUTION PIPELINE
# ===============================================================================

print("\n" + "="*70)
print(" CMA-ES + DATA-CENTRIC HYBRID APPROACH")
print("="*70)
print("Strategy: Pseudo-labeling + Smart validation + CMA-ES optimization")


def _load_or_extract_features(cache_path, frame, extractors, split_label):
    """Return ``(features, samples)`` for one split, using the on-disk cache.

    If ``cache_path`` exists it is loaded; otherwise features are extracted
    from ``frame`` with ``extract_optimized_ensemble_features`` and written to
    ``cache_path`` for the next run.

    Args:
        cache_path: location of the ``.pt`` cache file.
        frame: DataFrame of the split to extract from on a cache miss.
        extractors: dict of feature extractors handed to the extraction call.
        split_label: human-readable split name ("Training", "Validation",
            "Test"); used in progress messages and passed to the extractor.
    """
    split_name = split_label.lower()

    if os.path.exists(cache_path):
        print(f"Loading cached {split_name} features...")
        # weights_only=False unpickles arbitrary Python objects: only point
        # this at cache files produced by this notebook.
        cached = torch.load(cache_path, weights_only=False)
        return cached['features'], cached['samples']

    print(f"Extracting {split_name} features...")
    features, samples = extract_optimized_ensemble_features(
        frame, extractors, split_label
    )
    torch.save({
        'features': features,
        'samples': samples
    }, cache_path)
    return features, samples


# ================================
# DATA LOADING AND FEATURE EXTRACTION
# ================================
print("\n Loading data...")
train_df = load_jsonl_safe(os.path.join(DATA_DIR, "train.jsonl"))
val_df = load_jsonl_safe(os.path.join(DATA_DIR, "dev.jsonl"))
test_df = load_jsonl_safe(os.path.join(DATA_DIR, "test.jsonl"))

for df, name in [(train_df, "Train"), (val_df, "Validation"), (test_df, "Test")]:
    if not df.empty:
        validate_data(df, name)

# Initialize extractors
# NOTE(review): the extractors are built even when every split is served from
# cache; if nothing later in the notebook needs them, creating them lazily
# would skip the large model loads — confirm before changing.
print("\n Initializing feature extractors...")
extractors = {
    'blip2': OptimizedBLIP2FeatureExtractor(BLIP2_MODEL),
    'clip': OptimizedCLIPFeatureExtractor(CLIP_MODEL),
    'text': OptimizedTextOnlyFeatureExtractor(TEXT_MODEL),
    'hate_features': HateFeatureExtractor()
}

# Extract or load cached features
print("\n Extracting features...")
ensemble_train_path = os.path.join(FEATURES_DIR, "ensemble_train.pt")
ensemble_val_path = os.path.join(FEATURES_DIR, "ensemble_val.pt")
ensemble_test_path = os.path.join(FEATURES_DIR, "ensemble_test.pt")

train_features, train_samples = _load_or_extract_features(
    ensemble_train_path, train_df, extractors, "Training"
)
val_features, val_samples = _load_or_extract_features(
    ensemble_val_path, val_df, extractors, "Validation"
)
test_features, test_samples = _load_or_extract_features(
    ensemble_test_path, test_df, extractors, "Test"
)

# Prepare labels (the test split has no labels, so only train/val are built)
y_train = np.array([sample['label'] for sample in train_samples])
y_val = np.array([sample['label'] for sample in val_samples])

print(f"\n Data Summary:")
print(f"   Training samples: {len(y_train)} (distribution: {np.bincount(y_train)})")
print(f"   Validation samples: {len(y_val)} (distribution: {np.bincount(y_val)})")
print(f"   Test samples: {len(test_samples)}")

# ================================
# HYBRID PIPELINE EXECUTION
# ================================

try:
    # Step 1: Pseudo-labeling approach
    print(f"\n" + "="*50)
    print(" STEP 1: PSEUDO-LABELING")
    print("="*50)

    X_augmented, y_augmented, X_test_processed, scaler, features_mask = pseudo_label_approach(
        train_features, val_features, test_features, y_train, y_val
    )

    # Step 2: Smart validation splits
    print(f"\n" + "="*50)
    print("STEP 2: SMART VALIDATION SPLIT")
    print("="*50)

    best_train_idx, best_val_idx = smart_validation_split(X_augmented, y_augmented)

    X_train_smart = X_augmented[best_train_idx]
    X_val_smart = X_augmented[best_val_idx]
    y_train_smart = y_augmented[best_train_idx]
    y_val_smart = y_augmented[best_val_idx]

    # NOTE(review): the validation rows are drawn from X_augmented; if that
    # array contains pseudo-labelled test rows (as the name suggests), every
    # F1 reported below is measured partly on model-generated labels and on
    # the best of several candidate splits, so it is optimistic — confirm.
    print(f"\nSmart split results:")
    print(f"   Train: {X_train_smart.shape} (distribution: {np.bincount(y_train_smart)})")
    print(f"   Val: {X_val_smart.shape} (distribution: {np.bincount(y_val_smart)})")

    # Step 3: CMA-ES Optimization
    print(f"\n" + "="*50)
    print(" STEP 3: CMA-ES HYPERPARAMETER OPTIMIZATION")
    print("="*50)

    cmaes_optimizer = DataCentricCMAESOptimizer(X_train_smart, X_val_smart, y_train_smart, y_val_smart)

    best_params, best_cmaes_score, optimization_history = cmaes_optimizer.optimize(
        max_generations=CMAES_MAXITER, population_size=CMAES_POPSIZE
    )

    print(f"\n CMA-ES Optimization Results:")
    print(f"   Best F1 Score: {best_cmaes_score:.4f}")
    print(f"   Optimized Parameters:")
    for param, value in best_params.items():
        if 'learning_rate' in param or 'weight_decay' in param:
            print(f"     {param}: {value:.2e}")
        elif 'dropout' in param:
            print(f"     {param}: {value:.3f}")
        else:
            print(f"     {param}: {value}")

    # Step 4: Final Ensemble Training
    print(f"\n" + "="*50)
    print(" STEP 4: FINAL ENSEMBLE TRAINING")
    print("="*50)

    print(f"Training ensemble with optimized hyperparameters...")

    final_models = []
    final_predictions = []
    final_probabilities = []

    # Enhanced loss with class weighting. Labels and weights are the same for
    # every ensemble member, so they are computed once.
    class_counts = np.bincount(y_train_smart)
    weights = len(y_train_smart) / (2 * class_counts)
    weights[1] *= 1.05  # Slight hate bias
    class_weights = torch.tensor(weights, dtype=torch.float32).to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=class_weights, label_smoothing=0.02)

    # Validation inputs are identical for every model and epoch.
    val_tensor = torch.tensor(X_val_smart, dtype=torch.float32).to(DEVICE)

    # Train 5 models with optimized parameters but different seeds
    for seed_idx, seed in enumerate([42, 43, 44, 45, 46]):
        print(f"\n Training model {seed_idx+1}/5 (seed={seed})...")

        torch.manual_seed(seed)

        model = CMAESOptimizedModel(X_train_smart.shape[1], best_params).to(DEVICE)
        optimizer = torch.optim.AdamW(
            model.parameters(),
            lr=best_params['learning_rate'],
            weight_decay=best_params['weight_decay']
        )

        # Training with optimized parameters
        best_model_f1 = 0
        best_preds = None
        best_probs = None
        best_state = None  # snapshot of the weights that produced best_preds
        patience_counter = 0
        max_patience = 10

        for epoch in range(best_params['epochs']):
            model.train()

            indices = torch.randperm(len(X_train_smart))
            for i in range(0, len(indices), best_params['batch_size']):
                batch_indices = indices[i:i + best_params['batch_size']]

                # BatchNorm1d raises on a single-sample batch in train mode;
                # skip a leftover batch of one instead of aborting the run.
                if len(batch_indices) < 2:
                    continue

                batch_x = torch.tensor(X_train_smart[batch_indices], dtype=torch.float32).to(DEVICE)
                batch_y = torch.tensor(y_train_smart[batch_indices], dtype=torch.long).to(DEVICE)

                optimizer.zero_grad()
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=0.5)
                optimizer.step()

            # Validation every 5 epochs
            if epoch % 5 == 0 or epoch == best_params['epochs'] - 1:
                model.eval()
                with torch.no_grad():
                    val_outputs = model(val_tensor)
                    val_predictions = val_outputs.argmax(dim=1).cpu().numpy()
                    val_probabilities = torch.softmax(val_outputs, dim=1).cpu().numpy()

                val_f1 = f1_score(y_val_smart, val_predictions, average='weighted')

                if val_f1 > best_model_f1:
                    best_model_f1 = val_f1
                    best_preds = val_predictions.copy()
                    best_probs = val_probabilities.copy()
                    # Clone the tensors: state_dict() returns references that
                    # keep changing as training continues.
                    best_state = {
                        name: tensor.detach().clone()
                        for name, tensor in model.state_dict().items()
                    }
                    patience_counter = 0
                else:
                    patience_counter += 1
                    if patience_counter >= max_patience:
                        break

        # Restore the best checkpoint so the model used for test predictions
        # is the same one whose validation predictions are reported.
        if best_state is not None:
            model.load_state_dict(best_state)

        final_models.append(model)
        final_predictions.append(best_preds)
        final_probabilities.append(best_probs)
        print(f"   Model {seed_idx+1} best F1: {best_model_f1:.4f}")

    # Final ensemble prediction WITH probabilities
    # (hard majority vote for labels, mean of softmax outputs for scores)
    ensemble_preds = np.round(np.mean(final_predictions, axis=0)).astype(int)
    ensemble_probabilities = np.mean(final_probabilities, axis=0)
    final_ensemble_f1 = f1_score(y_val_smart, ensemble_preds, average='weighted')

    # Calculate comprehensive metrics WITH probabilities
    final_metrics = calculate_all_metrics(
        y_val_smart,
        ensemble_preds,
        y_proba=ensemble_probabilities[:, 1]
    )

    # ================================
    # RESULTS AND COMPARISON
    # ================================

    print(f"\n" + "="*70)
    print(" FINAL HYBRID RESULTS")
    print("="*70)

    # Reference F1 the result is compared against.
    # NOTE(review): the origin of 0.78 is not shown in this notebook section.
    baseline_f1 = 0.78

    print(f"\n Performance Summary:")
    print(f"   CMA-ES Optimized F1: {final_ensemble_f1:.4f}")
    print(f"   Improvement: {final_ensemble_f1 - baseline_f1:+.4f}")

    print(f"\n Detailed Metrics:")
    key_metrics = ['accuracy', 'balanced_accuracy', 'f1_weighted', 'f1_macro', 'matthews_corrcoef']
    for metric in key_metrics:
        if metric in final_metrics:
            print(f"   {metric.replace('_', ' ').title()}: {final_metrics[metric]:.4f}")

    print(f"\n Optimized Hyperparameters:")
    print(f"   Architecture: {best_params['hidden1']} → {best_params['hidden2']} → {best_params['hidden3']} → 2")
    print(f"   Learning Rate: {best_params['learning_rate']:.2e}")
    print(f"   Dropouts: [{best_params['dropout1']:.3f}, {best_params['dropout2']:.3f}, {best_params['dropout3']:.3f}]")
    print(f"   Batch Size: {best_params['batch_size']}")
    print(f"   Weight Decay: {best_params['weight_decay']:.2e}")


    # Save results
    results_summary = {
        'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'approach': 'CMA-ES + Data-Centric Hybrid',
        'final_f1_score': final_ensemble_f1,
        'optimized_parameters': best_params,
        'cmaes_best_score': best_cmaes_score,
        'optimization_history': optimization_history,
        'final_metrics': final_metrics,
        'ensemble_size': len(final_models)
    }

    results_path = os.path.join(
        RESULTS_DIR,
        f"hybrid_cmaes_results_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
    )

    # default=str stringifies any value the json module cannot encode natively.
    with open(results_path, 'w') as f:
        json.dump(results_summary, f, indent=2, default=str)

    print(f"\n💾 Results saved to: {results_path}")

    if 'X_test_processed' in locals() and X_test_processed is not None:
        print(f"\n🔮 Generating test predictions...")

        test_predictions_all = []
        test_probabilities_all = []

        # Same inputs for every ensemble member: build the tensor once.
        test_tensor = torch.tensor(X_test_processed, dtype=torch.float32).to(DEVICE)

        for model in final_models:
            model.eval()
            with torch.no_grad():
                test_outputs = model(test_tensor)
                test_probs = torch.softmax(test_outputs, dim=1).cpu().numpy()
                test_preds = test_outputs.argmax(dim=1).cpu().numpy()
                test_predictions_all.append(test_preds)
                test_probabilities_all.append(test_probs)

        # Ensemble test predictions
        final_test_predictions = np.round(np.mean(test_predictions_all, axis=0)).astype(int)
        final_test_probabilities = np.mean(test_probabilities_all, axis=0)

        # Save as JSON
        json_path = os.path.join(
            RESULTS_DIR,
            f"hybrid_test_predictions_{datetime.now().strftime('%Y%m%d_%H%M%S')}.json"
        )

        # Create comprehensive results
        test_results = {
            'metadata': {
                'timestamp': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                'model_approach': 'CMA-ES + Data-Centric Hybrid',
                'ensemble_size': len(final_models),
                'total_predictions': len(final_test_predictions),
                'prediction_distribution': {
                    'non_hate': int(np.sum(final_test_predictions == 0)),
                    'hate': int(np.sum(final_test_predictions == 1))
                },
                'average_confidence': float(np.mean(final_test_probabilities.max(axis=1)))
            },
            'predictions': []
        }

        for i, sample in enumerate(test_samples):
            test_results['predictions'].append({
                'image_id': sample['img'],
                'text': sample['text'],
                'predicted_label': int(final_test_predictions[i]),
                'predicted_class': 'hate' if final_test_predictions[i] == 1 else 'non-hate',
                'confidence_scores': {
                    'non_hate': float(final_test_probabilities[i, 0]),
                    'hate': float(final_test_probabilities[i, 1])
                },
                'max_confidence': float(final_test_probabilities[i].max())
            })

        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(test_results, f, indent=2, ensure_ascii=False)

        print(f"    Test predictions saved to: {json_path}")
        print(f"    Predictions shape: {final_test_predictions.shape}")
        print(f"    Prediction distribution: {np.bincount(final_test_predictions)}")

    print(f"\n CMA-ES + Data-Centric Hybrid Pipeline completed successfully!")
    print(f"Final F1 Score: {final_ensemble_f1:.4f}")


except Exception as e:
    # Report the failure with a full traceback instead of letting the cell die
    # without context.
    print(f" Error in hybrid pipeline: {e}")
    import traceback
    traceback.print_exc()


 CMA-ES + DATA-CENTRIC HYBRID APPROACH
Strategy: Pseudo-labeling + Smart validation + CMA-ES optimization

 Loading data...
Loaded 12826 samples from train.jsonl
Loaded 726 samples from dev.jsonl
Loaded 1505 samples from test.jsonl
Train label distribution:
label
0    7657
1    5169
Name: count, dtype: int64
Validation label distribution:
label
0    361
1    365
Name: count, dtype: int64
Missing columns in Test: ['label']

 Initializing feature extractors...


preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

model.safetensors.index.json: 0.00B [00:00, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Optimized BLIP2 Feature Extractor initialized


100%|████████████████████████████████████████| 338M/338M [00:01<00:00, 290MiB/s]


Optimized CLIP ViT-B/32 Feature Extractor initialized


vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Optimized Text-only roberta-base Feature Extractor initialized
Hate Feature Extractor initialized with comprehensive vocabularies

 Extracting features...
Loading cached training features...
Loading cached validation features...
Loading cached test features...

 Data Summary:
   Training samples: 12826 (distribution: [7657 5169])
   Validation samples: 726 (distribution: [361 365])
   Test samples: 1505

 STEP 1: PSEUDO-LABELING
 Pseudo-labeling approach...
Combined training data: (13552, 3328)
Test data for pseudo-labeling: (1505, 3328)
High-confidence pseudo-labels: 1222/1505 (81.2%)
Augmented training data: (14774, 3328)
Pseudo-label distribution: [699 523]

STEP 2: SMART VALIDATION SPLIT
Testing 10 different train/val splits...
Split 1: F1 = 0.7520
Split 2: F1 = 0.7634
Split 3: F1 = 0.7667
Split 4: F1 = 0.7571
Split 5: F1 = 0.7537
Split 6: F1 = 0.7598
Split 7: F1 = 0.7499
Split 8: F1 = 0.7619
Split 9: F1 = 0.7499
Split 10: F1 = 0.7747

Best split: #10 with F1 = 0.7747

Smart split 