In [None]:
"""
================================================================================
COMPLETE NOTEBOOK 2: MULTI-STRATEGY MODEL TRAINING
================================================================================
This notebook trains THREE different approaches:
1. Traditional GridSearchCV with regularization
2. Confidence-based classification
3. Disagreement-based ensemble

Then compares all approaches and selects the best.
"""



In [None]:
# ============================================================================
# SETUP
# ============================================================================

import pandas as pd
import numpy as np
import joblib
import time
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')


In [None]:
# ML
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    f1_score
)
from scipy.stats import entropy

# Imbalance handling
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks

# XGBoost is optional: remember whether the import worked so later cells can
# decide whether to include it.
try:
    from xgboost import XGBClassifier
except ImportError:
    XGBOOST_AVAILABLE = False
    print("‚ö† XGBoost not available")
else:
    XGBOOST_AVAILABLE = True

# One seed reused by every stochastic step in this notebook.
RANDOM_STATE = 42
np.random.seed(RANDOM_STATE)

# Notebook banner with a start timestamp.
print("\n".join([
    "=" * 80,
    "MBIC BIAS DETECTION - COMPLETE PIPELINE",
    "Notebook 2: Multi-Strategy Model Training",
    "=" * 80,
    "Started at: " + datetime.now().strftime('%Y-%m-%d %H:%M:%S') + "\n",
]))

MBIC BIAS DETECTION - COMPLETE PIPELINE
Notebook 2: Multi-Strategy Model Training
Started at: 2025-12-11 15:34:57



In [None]:
# Colab-only: attach Google Drive so the artifacts written by Notebook 1 are
# reachable from this runtime.
from google.colab import drive

drive.mount('/content/drive')

# Same persistent folder Notebook 1 saved into. Keep the trailing slash: the
# loading cell builds file paths by plain string concatenation.
DRIVE_PATH = '/content/drive/MyDrive/MBIC_Artifacts/'
print("\n‚úì Google Drive mounted. Artifacts will be loaded from: " + DRIVE_PATH)

Mounted at /content/drive

‚úì Google Drive mounted. Artifacts will be loaded from: /content/drive/MyDrive/MBIC_Artifacts/


In [None]:
# ============================================================================
# STEP 1: LOAD FEATURES AND LABELS
# ============================================================================

print(f"\n{'=' * 80}")
print("STEP 1: LOADING DATA")
print("=" * 80)

# Feature matrix and integer-encoded labels saved as .npy arrays in the
# shared Drive folder.
X = np.load(f"{DRIVE_PATH}hybrid_features.npy")
y = np.load(f"{DRIVE_PATH}labels_encoded.npy")
# joblib.load unpickles arbitrary objects; only load files you produced yourself.
label_encoder = joblib.load(f"{DRIVE_PATH}label_encoder.pkl")

print(f"‚úì Features loaded: {X.shape}")
print(f"‚úì Labels loaded: {y.shape}")

# Decode the integer labels back to their class names for reporting.
y_labels = label_encoder.inverse_transform(y)

print("\nClass distribution:")
for label in label_encoder.classes_:
    count = np.count_nonzero(y_labels == label)
    print(f" ¬†{label:15s}: {count:4d} ({count/len(y)*100:.1f}%)")


STEP 1: LOADING DATA
‚úì Features loaded: (1700, 1170)
‚úì Labels loaded: (1700,)

Class distribution:
 ¬†Biased         : 1018 (59.9%)
 ¬†No agreement   :  149 (8.8%)
 ¬†Non-biased     :  533 (31.4%)


In [None]:
# ============================================================================
# STEP 2: SPLIT DATA
# ============================================================================

print(f"\n{'=' * 80}")
print("STEP 2: SPLITTING DATA")
print("=" * 80)

# Stage 1: keep 60% for training, hold out 40% (stratified on the label).
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=RANDOM_STATE, stratify=y
)

# Stage 2: cut the hold-out in half -> 20% validation, 20% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=RANDOM_STATE, stratify=y_temp
)

# String-label views of each split, used by the strategies that predict
# class names instead of encoded integers.
y_train_labels, y_val_labels, y_test_labels = (
    label_encoder.inverse_transform(encoded)
    for encoded in (y_train, y_val, y_test)
)

print("\nData split:")
for split_name, split_X in (("Train:", X_train), ("Val:", X_val), ("Test:", X_test)):
    share = len(split_X) / len(X) * 100
    print(f"  {split_name:<6} {len(split_X):4d} samples ({share:.1f}%)")


STEP 2: SPLITTING DATA

Data split:
  Train: 1020 samples (60.0%)
  Val:    340 samples (20.0%)
  Test:   340 samples (20.0%)


In [None]:
# ============================================================================
# STEP 3: DATA BALANCING (TOMEK LINKS + CONSERVATIVE SMOTE)
# ============================================================================
# Only the training split is resampled; validation and test keep their
# original class distribution.

print("\n" + "="*80)
print("STEP 3: BALANCING TRAINING DATA")
print("="*80)

print("\nOriginal training distribution:")
for idx, label in enumerate(label_encoder.classes_):
    count = (y_train == idx).sum()
    print(f"  {label:15s}: {count:4d}")

# Tomek Links under-sampling: drops samples that form cross-class
# nearest-neighbour pairs. With sampling_strategy='auto' the minority class
# is left untouched, so only the larger classes shrink here.
print("\nApplying Tomek Links to clean boundaries...")
tomek = TomekLinks(sampling_strategy='auto')
X_train_clean, y_train_clean = tomek.fit_resample(X_train, y_train)

removed = len(X_train) - len(X_train_clean)
print(f"‚úì Removed {removed} noisy samples")

# Conservative SMOTE: raise any class that is below 40% of the largest class
# up to that level; classes already above it keep their current size.
print("\nApplying Conservative SMOTE...")
unique, counts = np.unique(y_train_clean, return_counts=True)
max_count = counts.max()

target_count = int(max_count * 0.4)
sampling_strategy = {
    cls: max(count, target_count)
    for cls, count in zip(unique, counts)
}

smote = SMOTE(sampling_strategy=sampling_strategy, random_state=RANDOM_STATE, k_neighbors=3)
X_train_balanced, y_train_balanced = smote.fit_resample(X_train_clean, y_train_clean)

print(f"\nBalanced training distribution:")
for idx, label in enumerate(label_encoder.classes_):
    count = (y_train_balanced == idx).sum()
    original = (y_train == idx).sum()
    change = count - original
    # Signed format: classes reduced by Tomek Links have a negative delta, and
    # a hard-coded "+" prefix used to render them as "+-100".
    print(f"  {label:15s}: {count:4d} (original: {original}, {change:+d})")


STEP 3: BALANCING TRAINING DATA

Original training distribution:
  Biased         :  611
  No agreement   :   89
  Non-biased     :  320

Applying Tomek Links to clean boundaries...
‚úì Removed 182 noisy samples

Applying Conservative SMOTE...

Balanced training distribution:
  Biased         :  511 (original: 611, +-100)
  No agreement   :  204 (original: 89, +115)
  Non-biased     :  238 (original: 320, +-82)


In [None]:
# ============================================================================
# STRATEGY 1: TRADITIONAL GRIDSEARCHCV WITH STRONG REGULARIZATION
# ============================================================================

print("\n" + "="*80)
print("STRATEGY 1: TRADITIONAL GRIDSEARCHCV")
print("="*80)

# Seeded, shuffled 5-fold splitter shared by every grid search below so all
# models are tuned on the same folds.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=RANDOM_STATE)

# Each entry pairs an unfitted estimator ('model') with the hyper-parameter
# grid ('params') that GridSearchCV searches for it.
models_strategy1 = {
    'Logistic Regression': {
        'model': LogisticRegression(
            random_state=RANDOM_STATE,
            max_iter=1000,
            class_weight='balanced'
        ),
        'params': {
            'C': [0.01, 0.1, 0.5, 1.0],  # smaller C = stronger L2 regularization
            'penalty': ['l2'],
            'solver': ['lbfgs']
        }
    },
    'SVM': {
        'model': SVC(
            random_state=RANDOM_STATE,
            class_weight='balanced',
            probability=True
        ),
        'params': {
            'C': [0.1, 0.5, 1.0],  # capped at 1.0 to favour regularized fits
            'kernel': ['rbf'],
            'gamma': ['scale']
        }
    },
    'Random Forest': {
        'model': RandomForestClassifier(
            random_state=RANDOM_STATE,
            class_weight='balanced',
            n_jobs=-1
        ),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [10, 15, 20],
            'min_samples_split': [10, 20],
            'min_samples_leaf': [5, 10]
        }
    },
    'MLP': {
        'model': MLPClassifier(
            random_state=RANDOM_STATE,
            early_stopping=True,
            validation_fraction=0.1,
            n_iter_no_change=10
        ),
        'params': {
            'hidden_layer_sizes': [(128, 64), (256, 128)],
            'alpha': [0.01, 0.001, 0.0001],  # L2 penalty strength
            'learning_rate_init': [0.001]
        }
    }
}

if XGBOOST_AVAILABLE:
    # early_stopping_rounds is deliberately NOT set: XGBoost early stopping
    # needs an eval_set passed to fit(), which GridSearchCV does not provide,
    # so every fit failed with "Must have at least 1 validation dataset for
    # early stopping." The number of trees is tuned via n_estimators instead.
    models_strategy1['XGBoost'] = {
        'model': XGBClassifier(
            random_state=RANDOM_STATE,
            use_label_encoder=False,
            eval_metric='mlogloss',
            n_jobs=-1
        ),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [3, 5],
            'learning_rate': [0.01, 0.05],  # slow learning rates
            'subsample': [0.8],
            'reg_alpha': [0.1, 1.0, 5.0],   # L1 regularization
            'reg_lambda': [1.0, 10.0]       # L2 regularization
        }
    }

# Tune every Strategy 1 model with GridSearchCV on the balanced training set,
# then score the refit best estimator on the untouched validation split.
# NOTE(review): the training data was resampled before cross-validation, so
# synthetic samples can land in the CV validation folds and CV scores may be
# optimistic compared with the validation score -- confirm this is acceptable.
results_strategy1 = {}
separator = '-' * 80

for model_name, config in models_strategy1.items():
    print(f"\n{separator}")
    print(f"Training: {model_name}")
    print(separator)

    start_time = time.time()
    grid_search = GridSearchCV(
        config['model'],
        config['params'],
        scoring='f1_macro',
        cv=cv,
        n_jobs=-1,
        verbose=1,
    )
    grid_search.fit(X_train_balanced, y_train_balanced)
    training_time = time.time() - start_time

    # predict() on the search object uses the refit best estimator.
    y_val_pred = grid_search.predict(X_val)
    val_f1_macro = f1_score(y_val, y_val_pred, average='macro')
    val_f1_weighted = f1_score(y_val, y_val_pred, average='weighted')

    results_strategy1[model_name] = dict(
        estimator=grid_search.best_estimator_,
        best_params=grid_search.best_params_,
        cv_f1_macro=grid_search.best_score_,
        val_f1_macro=val_f1_macro,
        val_f1_weighted=val_f1_weighted,
        training_time=training_time,
    )

    print(f"\n‚úì Completed in {training_time:.2f}s")
    print(f" ¬†Best CV F1 Macro: {grid_search.best_score_:.4f}")
    print(f" ¬†Val F1 Macro: {val_f1_macro:.4f}")
    print(f" ¬†Best params: {grid_search.best_params_}")


STRATEGY 1: TRADITIONAL GRIDSEARCHCV

--------------------------------------------------------------------------------
Training: Logistic Regression
--------------------------------------------------------------------------------
Fitting 5 folds for each of 4 candidates, totalling 20 fits

‚úì Completed in 8.47s
 ¬†Best CV F1 Macro: 0.6771
 ¬†Val F1 Macro: 0.4816
 ¬†Best params: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}

--------------------------------------------------------------------------------
Training: SVM
--------------------------------------------------------------------------------
Fitting 5 folds for each of 3 candidates, totalling 15 fits

‚úì Completed in 33.79s
 ¬†Best CV F1 Macro: 0.6085
 ¬†Val F1 Macro: 0.4160
 ¬†Best params: {'C': 1.0, 'gamma': 'scale', 'kernel': 'rbf'}

--------------------------------------------------------------------------------
Training: Random Forest
--------------------------------------------------------------------------------
Fitting

ValueError: 
All the 240 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
240 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.12/dist-packages/sklearn/model_selection/_validation.py", line 866, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 774, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/sklearn.py", line 1806, in fit
    self._Booster = train(
                    ^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/core.py", line 774, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/training.py", line 200, in train
    if cb_container.after_iteration(bst, i, dtrain, evals):
       ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/callback.py", line 269, in after_iteration
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/callback.py", line 269, in <genexpr>
    ret = any(c.after_iteration(model, epoch, self.history) for c in self.callbacks)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.12/dist-packages/xgboost/callback.py", line 461, in after_iteration
    raise ValueError(msg)
ValueError: Must have at least 1 validation dataset for early stopping.


In [None]:
# ============================================================================
# STRATEGY 2: CONFIDENCE-BASED CLASSIFICATION
# ============================================================================

print(f"\n{'=' * 80}")
print("STRATEGY 2: CONFIDENCE-BASED CLASSIFICATION")
print("=" * 80)

class ConfidenceBasedClassifier:
    """Binary Biased/Non-biased model that abstains when it is not confident.

    The model is trained only on 'Biased' and 'Non-biased' samples. At
    prediction time, any sample whose highest calibrated class probability is
    below ``confidence_threshold`` is labelled 'No agreement'.
    """

    def __init__(self, confidence_threshold=0.6):
        self.threshold = confidence_threshold
        self.binary_classifier = LogisticRegression(
            random_state=RANDOM_STATE,
            max_iter=1000,
            class_weight='balanced',
            C=1.0
        )
        # Set by fit(); predict() requires it.
        self.calibrated_classifier = None

    def fit(self, X, y, y_labels):
        """Fit the binary model and its isotonic calibrator.

        Args:
            X: Feature matrix, row-aligned with ``y_labels``.
            y: Encoded labels; accepted for interface symmetry, not used.
            y_labels: Array of string class names, one per row of ``X``.
        """
        is_biased = (y_labels == 'Biased')
        # Rows labelled 'No agreement' are excluded from training.
        keep = is_biased | (y_labels == 'Non-biased')

        X_binary = X[keep]
        y_binary = is_biased[keep].astype(int)  # 1 = Biased, 0 = Non-biased

        self.binary_classifier.fit(X_binary, y_binary)

        # Wrap the logistic regression in a 5-fold isotonic calibrator.
        self.calibrated_classifier = CalibratedClassifierCV(
            self.binary_classifier,
            method='isotonic',
            cv=5
        )
        self.calibrated_classifier.fit(X_binary, y_binary)

    def predict(self, X):
        """Return an array of class-name strings, one per row of ``X``."""
        proba = self.calibrated_classifier.predict_proba(X)
        confidence = proba.max(axis=1)
        winner = proba.argmax(axis=1)

        return np.array([
            ('Biased' if cls == 1 else 'Non-biased') if conf >= self.threshold
            else 'No agreement'
            for conf, cls in zip(confidence, winner)
        ])

# Optimize threshold
print("\nOptimizing confidence threshold...")

# Labels must be decoded from the *balanced* target so the boolean masks
# built inside fit() have the same length as X_train_balanced.
y_train_balanced_labels = label_encoder.inverse_transform(y_train_balanced)

# The threshold is only read in predict(), so the fitted model is the same
# for every candidate. Fit once and sweep the threshold on that model instead
# of refitting an identical model per candidate.
confidence_classifier = ConfidenceBasedClassifier()
confidence_classifier.fit(X_train_balanced, y_train_balanced, y_train_balanced_labels)

best_threshold = None
best_val_f1 = -1

for threshold in np.arange(0.4, 0.8, 0.05):
    confidence_classifier.threshold = threshold

    y_val_pred = confidence_classifier.predict(X_val)
    val_f1 = f1_score(y_val_labels, y_val_pred, average='macro',
                     labels=['Biased', 'Non-biased', 'No agreement'])

    # Strict '>' keeps the lowest threshold when several candidates tie.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_threshold = float(threshold)  # plain float for the saved metadata

print(f"‚úì Optimal threshold: {best_threshold:.2f}")
print(f"  Val F1 Macro: {best_val_f1:.4f}")

# Final confidence-based model: the already-fitted model at the best threshold.
confidence_classifier.threshold = best_threshold

results_strategy2 = {
    'Confidence-Based': {
        'estimator': confidence_classifier,
        'best_params': {'threshold': best_threshold},
        'val_f1_macro': best_val_f1
    }
}


STRATEGY 2: CONFIDENCE-BASED CLASSIFICATION

Optimizing confidence threshold...
‚úì Optimal threshold: 0.55
  Val F1 Macro: 0.5095


In [None]:
# ============================================================================
# STRATEGY 3: DISAGREEMENT-BASED ENSEMBLE
# ============================================================================

print(f"\n{'=' * 80}")
print("STRATEGY 3: DISAGREEMENT-BASED ENSEMBLE")
print("=" * 80)

class DisagreementEnsemble:
    """Three binary models whose disagreement signals 'No agreement'.

    Each base model is trained on 'Biased' vs 'Non-biased' samples only. For a
    new sample, the entropy of the models' hard votes is compared with
    ``disagreement_threshold``: at or above it the sample is labelled
    'No agreement', otherwise the class with the highest averaged probability
    is returned.
    """

    def __init__(self, disagreement_threshold=0.5):
        self.threshold = disagreement_threshold
        self.base_models = {
            'lr': LogisticRegression(random_state=RANDOM_STATE, max_iter=1000, C=1.0, class_weight='balanced'),
            'svm': SVC(kernel='rbf', probability=True, random_state=RANDOM_STATE, C=1.0, class_weight='balanced'),
            'rf': RandomForestClassifier(n_estimators=100, max_depth=15, random_state=RANDOM_STATE,
                                        class_weight='balanced', n_jobs=-1)
        }

    def fit(self, X, y_labels):
        """Fit every base model on the Biased/Non-biased rows of ``X``.

        ``y_labels`` must be an array of string class names with one entry per
        row of ``X``; it is used as a boolean mask over ``X``.
        """
        is_biased = (y_labels == 'Biased')
        keep = is_biased | (y_labels == 'Non-biased')

        X_binary = X[keep]
        y_binary = is_biased[keep].astype(int)  # 1 = Biased, 0 = Non-biased

        for model in self.base_models.values():
            model.fit(X_binary, y_binary)

    def _calculate_disagreement(self, X):
        """Return, per sample, the entropy of the base models' hard votes."""
        # Shape: (n_models, n_samples)
        votes = np.array([model.predict(X) for model in self.base_models.values()])

        def vote_entropy(sample_votes):
            _, tallies = np.unique(sample_votes, return_counts=True)
            return entropy(tallies / len(sample_votes))

        return np.array([vote_entropy(votes[:, i]) for i in range(X.shape[0])])

    def _get_ensemble_proba(self, X):
        """Return the base models' class probabilities averaged element-wise."""
        per_model = [model.predict_proba(X) for model in self.base_models.values()]
        return np.mean(per_model, axis=0)

    def predict(self, X):
        """Return an array of class-name strings, one per row of ``X``."""
        disagreements = self._calculate_disagreement(X)
        winner = self._get_ensemble_proba(X).argmax(axis=1)

        return np.array([
            'No agreement' if score >= self.threshold
            else ('Biased' if cls == 1 else 'Non-biased')
            for score, cls in zip(disagreements, winner)
        ])


STRATEGY 3: DISAGREEMENT-BASED ENSEMBLE


In [None]:
# Optimize disagreement threshold
print("\nOptimizing disagreement threshold...")

# The ensemble masks X with a boolean array built from the labels, so the
# labels must be decoded from the *balanced* target. y_train_labels still has
# the pre-resampling length, which raised
# "IndexError: boolean index did not match indexed array".
y_train_balanced_labels = label_encoder.inverse_transform(y_train_balanced)

# The threshold is only read in predict(), so the fitted base models are the
# same for every candidate. Fit once and sweep the threshold on that model.
disagreement_ensemble = DisagreementEnsemble()
disagreement_ensemble.fit(X_train_balanced, y_train_balanced_labels)

# Candidate thresholds are percentiles of the validation disagreement scores.
# Vote entropy over a few models takes few distinct values, so drop repeats.
disagreements_val = disagreement_ensemble._calculate_disagreement(X_val)
thresholds = np.unique(np.percentile(disagreements_val, np.arange(20, 80, 10)))

best_disagreement_threshold = None
best_val_f1 = -1

for threshold in thresholds:
    disagreement_ensemble.threshold = threshold

    y_val_pred = disagreement_ensemble.predict(X_val)
    val_f1 = f1_score(y_val_labels, y_val_pred, average='macro',
                     labels=['Biased', 'Non-biased', 'No agreement'])

    # Strict '>' keeps the lowest threshold when several candidates tie.
    if val_f1 > best_val_f1:
        best_val_f1 = val_f1
        best_disagreement_threshold = float(threshold)  # plain float for the saved metadata

print(f"‚úì Optimal disagreement threshold: {best_disagreement_threshold:.3f}")
print(f"  Val F1 Macro: {best_val_f1:.4f}")

# Final ensemble: the already-fitted model at the best threshold.
disagreement_ensemble.threshold = best_disagreement_threshold

results_strategy3 = {
    'Disagreement Ensemble': {
        'estimator': disagreement_ensemble,
        'best_params': {'threshold': best_disagreement_threshold},
        'val_f1_macro': best_val_f1
    }
}


Optimizing disagreement threshold...


IndexError: boolean index did not match indexed array along axis 0; size of axis is 953 but size of corresponding boolean axis is 1020

In [None]:
# ============================================================================
# STEP 4: COMPARE ALL STRATEGIES ON VALIDATION SET
# ============================================================================

print(f"\n{'=' * 80}")
print("STEP 4: COMPARING ALL STRATEGIES")
print("=" * 80)

# Merge the per-strategy result dicts; on a name clash later strategies win.
all_results = {**results_strategy1, **results_strategy2, **results_strategy3}

# Rank every candidate by validation macro F1, best first.
sorted_results = sorted(
    all_results.items(),
    key=lambda item: item[1]['val_f1_macro'],
    reverse=True,
)

print(f"\n{'Model':<30} {'Val F1 Macro':<15}")
print("-" * 50)
for rank, (model_name, result) in enumerate(sorted_results, 1):
    if rank == 1:
        marker = "üèÜ"
    else:
        marker = f"{rank}."
    print(f"{marker} {model_name:<28} {result['val_f1_macro']:.4f}")

# The top-ranked entry is the model carried into test evaluation and saving.
best_model_name, best_entry = sorted_results[0]
best_model = best_entry['estimator']

print(f"\n‚úÖ BEST MODEL: {best_model_name}")
print(f"   Validation F1 Macro: {best_entry['val_f1_macro']:.4f}")


In [None]:

# ============================================================================
# STEP 5: FINAL EVALUATION ON TEST SET
# ============================================================================

print(f"\n{'=' * 80}")
print("STEP 5: FINAL EVALUATION ON TEST SET")
print("=" * 80)

class_order = ['Biased', 'Non-biased', 'No agreement']
rule = "-" * 80

y_test_pred = best_model.predict(X_test)

# Strategy 1 estimators predict encoded integers; the custom strategy classes
# already return class-name strings. Normalise both to strings.
y_test_pred_labels = (
    label_encoder.inverse_transform(y_test_pred)
    if isinstance(y_test_pred[0], (int, np.integer))
    else y_test_pred
)

# Per-class precision / recall / F1
print(f"\nBest Model: {best_model_name}")
print("\nClassification Report:")
print(rule)
print(classification_report(y_test_labels, y_test_pred_labels, digits=4))

# Confusion matrix with rows = true class, columns = predicted class
print("\nConfusion Matrix:")
print(rule)
cm = confusion_matrix(y_test_labels, y_test_pred_labels, labels=class_order)
cm_df = pd.DataFrame(cm, index=class_order, columns=class_order)
print(cm_df)

# Headline metrics reused by the saving and summary cells
test_f1_macro = f1_score(y_test_labels, y_test_pred_labels,
                         average='macro', labels=class_order)
test_f1_weighted = f1_score(y_test_labels, y_test_pred_labels, average='weighted')
test_accuracy = accuracy_score(y_test_labels, y_test_pred_labels)

print("\nüìä FINAL TEST METRICS:")
for metric_label, metric_value in (
    ("Macro F1:   ", test_f1_macro),
    ("Weighted F1:", test_f1_weighted),
    ("Accuracy:   ", test_accuracy),
):
    print(f"  {metric_label} {metric_value:.4f}")

In [None]:
# ============================================================================
# STEP 6: SAVE BEST MODEL
# ============================================================================

print(f"\n{'=' * 80}")
print("STEP 6: SAVING BEST MODEL")
print("=" * 80)

# NOTE(review): these files go to /content, not to the Drive folder the
# inputs were loaded from -- confirm this is where Notebook 3 expects them.
joblib.dump(best_model, '/content/best_model_final.pkl')
print("‚úì Saved: best_model_final.pkl")

# Companion metadata: which model won, its test scores and its parameters.
winning_entry = sorted_results[0][1]
metadata = dict(
    model_name=best_model_name,
    test_f1_macro=test_f1_macro,
    test_f1_weighted=test_f1_weighted,
    test_accuracy=test_accuracy,
    best_params=winning_entry['best_params'],
    training_date=datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
)
joblib.dump(metadata, '/content/model_metadata.pkl')
print("‚úì Saved: model_metadata.pkl")

In [None]:
# ============================================================================
# FINAL SUMMARY
# ============================================================================

TARGET_F1 = 0.80  # macro-F1 level the printed messages call the "target"
summary_rule = "=" * 80

print(f"\n{summary_rule}")
print("NOTEBOOK 2 COMPLETE - FINAL SUMMARY")
print(summary_rule)

target_note = '‚úÖ TARGET MET!' if test_f1_macro >= TARGET_F1 else '‚ùå Below 0.80 target'

print(f"\nüèÜ BEST MODEL: {best_model_name}")
print("\nüìä TEST SET PERFORMANCE:")
print(f"  Macro F1:    {test_f1_macro:.4f} {target_note}")
print(f"  Weighted F1: {test_f1_weighted:.4f}")
print(f"  Accuracy:    {test_accuracy:.4f}")

print("\nüíæ SAVED FILES:")
print("  1. best_model_final.pkl - Best trained model")
print("  2. model_metadata.pkl - Model metadata")

# Either suggest follow-up work or announce that the target was reached.
if test_f1_macro < TARGET_F1:
    print("\nüí° RECOMMENDATIONS TO IMPROVE:")
    for recommendation in (
        "  1. Review 'samples_for_review.xlsx' and clean data",
        "  2. Consider treating 'No agreement' as binary uncertainty",
        "  3. Collect more training data, especially for 'No agreement'",
        "  4. Try domain-specific fine-tuning of embeddings",
    ):
        print(recommendation)
else:
    print("\nüéâ TARGET ACHIEVED! Model ready for deployment.")

print("\n‚û°Ô∏è  NEXT STEP:")
print("  Run Notebook 3 for inference and LIME explainability")

print(f"\n{summary_rule}")
print(f"Completed at: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print(summary_rule)

# Task
The current error is in the `DisagreementEnsemble.fit` method, where `X_train_balanced` is passed along with `y_train_labels`. The fix is to use `y_train_balanced_labels` instead of `y_train_labels` when calling `fit` for the `DisagreementEnsemble` class, similar to how it was resolved for the `ConfidenceBasedClassifier`. This ensures that the feature matrix `X` and the corresponding labels `y_labels` have matching dimensions for the binary classification task within the ensemble models.

## Fix the `DisagreementEnsemble` label mismatch

### Subtask:
Fix the IndexError in the DisagreementEnsemble.fit method by using `y_train_balanced_labels` instead of `y_train_labels`.


## Summary:

### Data Analysis Key Findings
*   The primary issue encountered was an `IndexError` within the `DisagreementEnsemble.fit` method.
*   This error stemmed from a mismatch between the input feature matrix `X_train_balanced` and the label vector `y_train_labels`, specifically during the training of the binary classification task models within the ensemble.
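*   `y_train_labels` still has the 1,020 rows of the original training split, while `X_train_balanced` has 953 rows after Tomek Links and SMOTE, so the boolean mask built from the labels could not index the feature matrix, which raised the `IndexError`.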
*   The resolution involved replacing `y_train_labels` with `y_train_balanced_labels` when calling the `fit` method, ensuring that both the features and labels corresponded to the balanced dataset.
*   This fix was consistent with a prior resolution for the `ConfidenceBasedClassifier`, highlighting a recurring pattern in handling balanced datasets.

### Insights or Next Steps
*   Always ensure that feature matrices and their corresponding label vectors have compatible dimensions and represent the same data subsets, especially after data balancing or sampling operations.
*   Consider implementing pre-training validation checks for data shape and consistency to proactively catch such alignment issues before they lead to runtime errors.
