# Enhanced Bi-GRU with Confidence-Weighted Ensemble

## Key Improvements:
1. **Multi-Model Ensemble**: Train 5 models with different seeds per fold
2. **Confidence-Weighted Voting**: Use softmax probabilities instead of majority voting
3. **Adaptive Temporal Window**: Adjust voting window based on confidence
4. **Timestamp Gap Features**: Utilize temporal gaps as additional input
5. **Two-Stage Filtering**: High-confidence predictions stabilize uncertain ones

**Target**: Improve macro F1 from 0.3854 → 0.43–0.45

In [None]:
# Mount Google Drive (Colab only) so the fold CSVs under /content/drive/MyDrive
# can be read by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Setup and Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking, Input, Concatenate
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization
from tensorflow.keras.layers import Bidirectional, GRU
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
from scipy.stats import mode
from scipy.special import softmax
import warnings
warnings.filterwarnings('ignore')

# Reaching this line means every import in this cell resolved.
print("✓ All imports successful")

In [None]:
def load_and_filter_fold(i, base_dir='/content/drive/MyDrive/split_data'):
    """Load one cross-validation fold and keep only rooms present in both splits.

    Args:
        i: Fold identifier; the CSVs are read from ``{base_dir}/fold{i}/``.
        base_dir: Directory that contains the ``fold{i}`` sub-directories.
            Defaults to the Google Drive location used by this notebook.

    Returns:
        (train_df, test_df): both restricted to rows whose ``room`` value
        occurs in the train split *and* in the test split, each with a fresh
        0..n-1 index.
    """
    fold_dir = os.path.join(base_dir, f'fold{i}')
    train_df = pd.read_csv(os.path.join(fold_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(fold_dir, 'test.csv'))

    # A room that appears on only one side can be neither learned nor scored.
    common_labels = list(
        set(train_df['room'].unique()) & set(test_df['room'].unique())
    )

    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load all 4 folds; each pair is already restricted to the rooms shared by
# its train and test split (see load_and_filter_fold).
train_df_1, test_df_1 = load_and_filter_fold(1)
train_df_2, test_df_2 = load_and_filter_fold(2)
train_df_3, test_df_3 = load_and_filter_fold(3)
train_df_4, test_df_4 = load_and_filter_fold(4)

print("✓ All folds loaded")

In [None]:
def set_seeds(seed=42):
    """Seed the random sources used by this notebook.

    Seeds Python's ``random``, NumPy's global generator and TensorFlow's
    global generator with ``seed``, and exports three environment variables.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): whether TensorFlow honours these two determinism flags
    # depends on the TF version and on when they are set - confirm.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

def create_room_groups(df):
    """Number each uninterrupted run of identical ``room`` values.

    Rows are ordered by ``timestamp`` first; ``room_group`` then starts at 1
    and increases by one every time ``room`` differs from the previous row.

    Args:
        df: DataFrame with ``timestamp`` and ``room`` columns.

    Returns:
        A sorted, re-indexed copy of ``df`` with a ``room_group`` column.
    """
    ordered = df.sort_values('timestamp').reset_index(drop=True)
    previous_room = ordered['room'].shift()
    room_changed = ordered['room'].ne(previous_room)
    ordered['room_group'] = room_changed.cumsum()
    return ordered

def compute_time_gaps(df):
    """Add the elapsed time since the previous row, raw and bucketed.

    Args:
        df: DataFrame with a ``timestamp`` column parseable by
            ``pd.to_datetime``.

    Returns:
        A copy sorted by ``timestamp`` with three extra columns:
        ``timestamp_dt`` (parsed timestamp), ``time_gap`` (seconds since the
        previous row, 0 for the first row) and ``gap_category`` (int 0-4).
    """
    ordered = df.sort_values('timestamp').reset_index(drop=True)
    ordered['timestamp_dt'] = pd.to_datetime(ordered['timestamp'])

    elapsed = ordered['timestamp_dt'].diff().dt.total_seconds()
    ordered['time_gap'] = elapsed.fillna(0)  # first row has no predecessor

    # pd.cut intervals are right-closed:
    # 0: <=1s, 1: (1, 3], 2: (3, 5], 3: (5, 10], 4: >10s
    bin_edges = [-np.inf, 1, 3, 5, 10, np.inf]
    bucketed = pd.cut(ordered['time_gap'], bins=bin_edges, labels=[0, 1, 2, 3, 4])
    ordered['gap_category'] = bucketed.astype(int)

    return ordered

def create_beacon_count_vectors_with_gaps(df):
    """Collapse raw readings into one row per timestamp, with gap features.

    For every distinct ``timestamp`` a 23-slot vector is built in which slot
    ``b - 1`` holds the fraction of that timestamp's readings whose
    ``mac address`` value equals ``b`` (values outside 1..23 are ignored).

    Args:
        df: Readings with ``timestamp``, ``room`` and ``mac address`` columns,
            and optionally ``room_group``.

    Returns:
        DataFrame with columns ``timestamp``, ``room``, ``beacon_vector``,
        ``time_gap``, ``gap_category`` and, if present in the input,
        ``room_group``.
    """
    keep_groups = 'room_group' in df.columns

    # Gap columns are computed on the raw readings before aggregation.
    df = compute_time_gaps(df)

    rows = []
    for ts, readings in df.groupby('timestamp'):
        n_readings = len(readings)

        fractions = [0.0] * 23
        for beacon, hits in readings['mac address'].value_counts().items():
            if 1 <= beacon <= 23:
                fractions[int(beacon) - 1] = hits / n_readings

        # The group's first row supplies room and gap values for the timestamp;
        # in the sorted frame that row carries the gap to the previous timestamp.
        record = dict(
            timestamp=ts,
            room=readings['room'].iloc[0],
            beacon_vector=fractions,
            time_gap=readings['time_gap'].iloc[0],
            gap_category=readings['gap_category'].iloc[0],
        )
        if keep_groups:
            record['room_group'] = readings['room_group'].iloc[0]

        rows.append(record)

    return pd.DataFrame(rows)

def create_sequences_from_groups(vector_df, min_length=3, max_length=50):
    """Turn each (room, room_group) run into one training sequence.

    Args:
        vector_df: One row per timestamp with ``room``, ``room_group``,
            ``timestamp``, ``beacon_vector`` (list), ``time_gap`` and
            ``gap_category`` columns.
        min_length: Runs with fewer rows than this are dropped.
        max_length: Longer runs keep only their last ``max_length`` rows.

    Returns:
        (sequences, labels): ``sequences[k]`` is a list of per-timestamp
        vectors (beacon vector followed by time_gap and gap_category) and
        ``labels[k]`` is the room of that run.
    """
    sequences, labels = [], []

    for (room, _run_id), run in vector_df.groupby(['room', 'room_group']):
        run = run.sort_values('timestamp').reset_index(drop=False)

        if len(run) < min_length:
            continue
        if len(run) > max_length:
            run = run.tail(max_length)  # keep the most recent readings

        # Beacon fractions + time_gap + gap_category for every timestep
        steps = [
            beacons + [gap, category]
            for beacons, gap, category in zip(
                run['beacon_vector'], run['time_gap'], run['gap_category']
            )
        ]

        sequences.append(steps)
        labels.append(room)

    return sequences, labels

def create_sliding_windows_by_day(vector_df, window_size=10):
    """Create fixed-length sliding windows that never span two calendar days.

    Rows are grouped by the calendar date of ``timestamp``, sorted by
    ``timestamp`` within each day, and every run of ``window_size``
    consecutive rows becomes one window. Days with fewer than ``window_size``
    rows produce no windows. ``vector_df`` itself is left unmodified.

    Args:
        vector_df: One row per timestamp with ``timestamp``, ``room``,
            ``beacon_vector`` (list), ``time_gap`` and ``gap_category``.
        window_size: Number of consecutive rows per window (>= 1).

    Returns:
        (sequences, labels, timestamps): ``sequences[k]`` is a list of
        ``window_size`` vectors (beacon vector + time_gap + gap_category);
        ``labels[k]`` and ``timestamps[k]`` are the room and timestamp of the
        window's last row.

    Raises:
        ValueError: If ``window_size`` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be >= 1, got {window_size}")

    sequences = []
    labels = []
    timestamps = []  # Track timestamps for later analysis

    # Group on a derived key instead of writing helper columns into the
    # caller's DataFrame.
    day_key = pd.to_datetime(vector_df['timestamp']).dt.date.to_numpy()

    for _, day_group in vector_df.groupby(day_key):
        day_group = day_group.sort_values('timestamp')
        n_rows = len(day_group)
        if n_rows < window_size:
            continue

        # Build each row's combined vector once; windows then only slice.
        features = [
            beacons + [gap, category]
            for beacons, gap, category in zip(
                day_group['beacon_vector'],
                day_group['time_gap'],
                day_group['gap_category'],
            )
        ]
        rooms = day_group['room'].tolist()
        stamps = day_group['timestamp'].tolist()

        for start in range(n_rows - window_size + 1):
            last = start + window_size - 1
            # Copy the row vectors so windows do not alias each other.
            sequences.append([row[:] for row in features[start:last + 1]])
            labels.append(rooms[last])
            timestamps.append(stamps[last])

    return sequences, labels, timestamps

def build_bidirectional_gru_model_with_gaps(input_shape, num_classes):
    """Build and compile the two-layer bidirectional GRU classifier.

    Layer stack: Masking(0.0) -> BiGRU(128, sequences) -> Dropout(0.3)
    -> BiGRU(64) -> Dropout(0.3) -> Dense(32, relu) -> Dropout(0.2)
    -> Dense(num_classes, softmax).

    Args:
        input_shape: ``(sequence_length, n_features)``; the callers in this
            notebook pass 25 features (23 beacons + time_gap + gap_category).
        num_classes: Number of output classes.

    Returns:
        A compiled Keras ``Sequential`` model (Adam optimizer,
        sparse categorical cross-entropy loss, accuracy metric).
    """
    # The mask value matches the 0.0 padding value used with pad_sequences.
    layers = [Masking(mask_value=0.0, input_shape=input_shape)]

    # Two recurrent stages; only the first one hands a full sequence onward.
    for units, keep_sequence in ((128, True), (64, False)):
        layers.append(Bidirectional(GRU(units, return_sequences=keep_sequence)))
        layers.append(Dropout(0.3))

    layers += [
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ]

    network = Sequential(layers)
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

def apply_confidence_weighted_temporal_voting(predictions_proba, vote_window=5, confidence_threshold=0.7):
    """
    Confidence-weighted temporal voting with an adaptive window.

    For each sample the class probabilities of its temporal neighbours are
    summed, each neighbour weighted by its own top-class probability, and the
    class with the largest weighted sum wins. Samples whose top-class
    probability is at least ``confidence_threshold`` use a shorter window.

    Args:
        predictions_proba: (n_samples, n_classes) probability matrix
        vote_window: base window size for voting
        confidence_threshold: threshold to determine if prediction is "confident"

    Returns:
        voted_predictions: (n_samples,) final class predictions
        confidences: (n_samples,) winning weighted sum divided by the total
            weight in the window; 0.0 when the total weight is zero
    """
    predictions_proba = np.asarray(predictions_proba)
    n_samples, n_classes = predictions_proba.shape
    voted_predictions = np.zeros(n_samples, dtype=int)
    confidences = np.zeros(n_samples)

    # Each sample's weight is the probability of its top class.
    sample_confidence = predictions_proba.max(axis=1)

    # Confident samples get the shorter window. Capping it at vote_window keeps
    # it from exceeding the uncertain window when vote_window is below 3.
    confident_window = min(vote_window, max(3, vote_window // 2))

    for i in range(n_samples):
        if sample_confidence[i] >= confidence_threshold:
            current_window = confident_window
        else:
            current_window = vote_window

        # Window is centred on i and clipped at the array boundaries.
        half_window = current_window // 2
        start = max(0, i - half_window)
        end = min(n_samples, i + half_window + 1)

        weights = sample_confidence[start:end]
        # Accumulate in float64 regardless of the input dtype.
        weighted_votes = (predictions_proba[start:end] * weights[:, None]).sum(
            axis=0, dtype=np.float64
        )

        voted_predictions[i] = np.argmax(weighted_votes)

        total_weight = weights.sum()
        if total_weight > 0:
            confidences[i] = weighted_votes.max() / total_weight
        # else: leave 0.0 rather than producing NaN from 0/0

    return voted_predictions, confidences

def apply_two_stage_confidence_filtering(predictions, confidences, threshold=0.3):
    """
    Replace low-confidence predictions with the consensus of confident neighbours.

    A prediction whose confidence is below ``threshold`` is overwritten with
    the most frequent prediction among neighbours within 7 positions on
    either side whose confidence exceeds ``1 - threshold``. If no such
    neighbour exists it is left unchanged.

    Args:
        predictions: (n_samples,) class predictions
        confidences: (n_samples,) confidence scores
        threshold: predictions below this confidence will be reconsidered

    Returns:
        filtered_predictions: (n_samples,) refined predictions (a copy; the
        input array is not modified)
    """
    refined = predictions.copy()
    total = len(predictions)
    reach = 7                       # neighbours inspected on each side
    trusted_cutoff = 1 - threshold  # confidence needed to count as a voter

    for idx in range(total):
        if not (confidences[idx] < threshold):
            continue

        lo = max(0, idx - reach)
        hi = min(total, idx + reach + 1)

        trusted = confidences[lo:hi] > trusted_cutoff
        if not np.any(trusted):
            continue

        # Votes come from the original predictions, not from values already
        # rewritten earlier in this pass.
        voters = predictions[lo:hi][trusted]
        refined[idx] = mode(voters, keepdims=False).mode

    return refined

# Cell-completion marker for the notebook output.
print("✓ Enhanced functions with confidence-weighted voting defined")

In [None]:
def train_ensemble_models(train_df, n_models=5, base_seed=42, verbose=True):
    """
    Train several identically-configured models that differ only in their seed.

    The training data is prepared once (room grouping, per-timestamp beacon
    vectors with gap features, per-run sequences, pre-padding with 0.0) and
    reused for every model. Model ``i`` is seeded with ``base_seed + i * 1000``.

    Args:
        train_df: Raw training readings for one fold.
        n_models: Number of models in the ensemble.
        base_seed: Seed of the first model.
        verbose: Print progress while training.

    Returns:
        models: List of trained Keras models
        label_encoder: Fitted label encoder

    Raises:
        ValueError: If no training sequence survives sequence construction.
    """
    if verbose:
        print(f"  Training ensemble of {n_models} models...")

    # Prepare data (same for all models)
    train_df_grouped = create_room_groups(train_df)
    train_vector_df = create_beacon_count_vectors_with_gaps(train_df_grouped)
    X_train_seq, y_train_labels = create_sequences_from_groups(train_vector_df)

    if not X_train_seq:
        # Fail here with a clear message instead of on a shape lookup below.
        raise ValueError(
            "No training sequences were produced; every room run is shorter "
            "than the minimum sequence length."
        )

    # Encode labels
    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train_labels)

    # Pad sequences
    X_train_padded = pad_sequences(X_train_seq, padding='pre', dtype='float32', value=0.0)

    # Balanced class weights, keyed by encoded class index
    class_weights_array = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(enumerate(class_weights_array))

    # Train multiple models
    models = []
    for i in range(n_models):
        model_seed = base_seed + i * 1000
        set_seeds(model_seed)

        if verbose:
            print(f"    Model {i+1}/{n_models} (seed {model_seed})...", end=" ")

        model = build_bidirectional_gru_model_with_gaps(
            input_shape=(X_train_padded.shape[1], X_train_padded.shape[2]),
            num_classes=len(label_encoder.classes_)
        )

        # Both callbacks watch the training loss: fit() gets no validation data.
        early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True, verbose=0)
        reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, verbose=0, min_lr=1e-6)

        # Train
        model.fit(
            X_train_padded, y_train,
            epochs=30,
            batch_size=32,
            class_weight=class_weights,
            callbacks=[early_stop, reduce_lr],
            verbose=0
        )

        models.append(model)

        if verbose:
            print("✓")

    return models, label_encoder

def ensemble_predict_with_confidence(models, X_test_padded):
    """
    Average the probability outputs of every model in the ensemble.

    Args:
        models: Iterable of objects exposing ``predict(X, verbose=0)``.
        X_test_padded: Input batch handed unchanged to each model.

    Returns:
        ensemble_proba: element-wise mean of the per-model outputs, i.e. an
        (n_samples, n_classes) matrix when each model returns one
    """
    per_model_proba = [
        member.predict(X_test_padded, verbose=0) for member in models
    ]

    # Mean over the model axis
    return np.mean(per_model_proba, axis=0)

# Cell-completion marker for the notebook output.
print("✓ Ensemble training functions defined")

In [None]:
def run_enhanced_pipeline_single_seed(
    train_df,
    test_df,
    seed=42,
    n_ensemble_models=5,
    window_size=10,
    vote_window=5,
    confidence_threshold=0.7,
    use_two_stage_filtering=True,
    verbose=True,
    low_confidence_threshold=0.3
):
    """
    Train an ensemble on one fold, predict the test split and score it.

    Steps: train the ensemble, build sliding test windows with gap features,
    average the ensemble probabilities, apply confidence-weighted temporal
    voting, optionally apply two-stage confidence filtering, then compute
    macro and per-class F1.

    Args:
        train_df: Raw training readings of the fold.
        test_df: Raw test readings of the fold.
        seed: Base seed for the ensemble.
        n_ensemble_models: Number of models trained for the ensemble.
        window_size: Rows per sliding test window.
        vote_window: Base window size for temporal voting.
        confidence_threshold: Top-class probability from which a prediction
            counts as confident during voting.
        use_two_stage_filtering: Whether to run the second filtering stage.
        verbose: Print progress and results.
        low_confidence_threshold: Confidence below which a prediction is
            reconsidered by the two-stage filter and counted in
            ``low_conf_ratio``.

    Returns:
        dict with keys ``seed``, ``macro_f1``, ``per_class_f1`` (label -> F1),
        ``avg_confidence`` and ``low_conf_ratio``.
    """
    if verbose:
        print(f"\n{'='*60}")
        print(f"Running Enhanced Pipeline - Seed {seed}")
        print(f"{'='*60}")
        print(f"Ensemble: {n_ensemble_models} models | Window: {window_size}s | Vote: {vote_window}s")

    # 1. Train Ensemble Models
    models, label_encoder = train_ensemble_models(
        train_df,
        n_models=n_ensemble_models,
        base_seed=seed,
        verbose=verbose
    )

    # 2. Prepare Test Data with Gap Features
    if verbose:
        print("  Preparing test data with gap features...")

    test_vector_df = create_beacon_count_vectors_with_gaps(test_df)
    X_test_seq, y_test, timestamps = create_sliding_windows_by_day(test_vector_df, window_size=window_size)
    X_test_padded = pad_sequences(X_test_seq, padding='pre', dtype='float32', value=0.0)

    # 3. Ensemble Prediction
    if verbose:
        print("  Running ensemble predictions...")

    ensemble_proba = ensemble_predict_with_confidence(models, X_test_padded)

    # 4. Confidence-Weighted Temporal Voting
    if verbose:
        print("  Applying confidence-weighted temporal voting...")

    y_pred_voted_encoded, confidences = apply_confidence_weighted_temporal_voting(
        ensemble_proba,
        vote_window=vote_window,
        confidence_threshold=confidence_threshold
    )

    # 5. Two-Stage Confidence Filtering (Optional)
    if use_two_stage_filtering:
        if verbose:
            print("  Applying two-stage confidence filtering...")
        y_pred_voted_encoded = apply_two_stage_confidence_filtering(
            y_pred_voted_encoded,
            confidences,
            threshold=low_confidence_threshold
        )

    # 6. Decode Predictions
    y_pred = label_encoder.inverse_transform(y_pred_voted_encoded)

    # 7. Final Evaluation
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    per_class_f1 = f1_score(y_test, y_pred, average=None, labels=label_encoder.classes_, zero_division=0)

    # 8. Confidence Statistics (confidences come from the voting step; the
    #    two-stage filter does not update them)
    avg_confidence = np.mean(confidences)
    low_conf_ratio = np.mean(confidences < low_confidence_threshold)

    if verbose:
        print(f"\n  Results:")
        print(f"    Macro F1: {macro_f1:.4f}")
        print(f"    Avg Confidence: {avg_confidence:.4f}")
        print(f"    Low Confidence Ratio: {low_conf_ratio:.2%}")

    return {
        'seed': seed,
        'macro_f1': macro_f1,
        'per_class_f1': dict(zip(label_encoder.classes_, per_class_f1)),
        'avg_confidence': avg_confidence,
        'low_conf_ratio': low_conf_ratio
    }

# Cell-completion marker for the notebook output.
print("✓ Enhanced pipeline function defined")

In [None]:
# Report the GPUs TensorFlow can see: first the count, then the device list.
_gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(_gpu_devices))
print(_gpu_devices)

## Run Enhanced Pipeline

### Configuration Options:
- `n_ensemble_models`: Number of models per seed (default: 5)
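- `window_size`: Sliding window length, counted in consecutive aggregated timestamps (default: 10); this equals seconds only when there is exactly one timestamp per second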
- `vote_window`: Temporal voting window size (default: 5)
- `confidence_threshold`: Threshold for adaptive window (default: 0.7)
- `use_two_stage_filtering`: Enable second stage filtering (default: True)

In [None]:
# Run enhanced pipeline with 10 seeds for each of 4 folds
seeds = [42, 123, 456, 789, 2024, 3141, 5926, 8888, 1337, 9999]
folds = {
    1: (train_df_1, test_df_1),
    2: (train_df_2, test_df_2),
    3: (train_df_3, test_df_3),
    4: (train_df_4, test_df_4)
}

# Configuration (unpacked as keyword arguments into the pipeline function)
CONFIG = {
    'n_ensemble_models': 5,  # Train 5 models per seed
    'window_size': 10,
    'vote_window': 5,
    'confidence_threshold': 0.7,
    'use_two_stage_filtering': True
}

print("\n" + "="*80)
print("ENHANCED BI-GRU WITH CONFIDENCE-WEIGHTED ENSEMBLE")
print("="*80)
print(f"Configuration: {CONFIG}")
print("="*80 + "\n")

# fold number -> list of per-seed result dicts
all_fold_results = {}

for fold_num, (train_df, test_df) in folds.items():
    print(f"\n{'='*80}")
    print(f"PROCESSING FOLD {fold_num}")
    print(f"{'='*80}\n")

    fold_results = []

    for seed in seeds:
        print(f"\n[Fold {fold_num}, Seed {seed}]")
        result = run_enhanced_pipeline_single_seed(
            train_df, test_df,
            seed=seed,
            **CONFIG,
            verbose=True
        )
        fold_results.append(result)

        # Many models are built in this loop; release Keras' global state
        # after each run so memory does not grow over the whole experiment.
        # `result` holds only plain numbers, so nothing live is invalidated.
        tf.keras.backend.clear_session()

    all_fold_results[fold_num] = fold_results

    # Calculate fold statistics
    macro_f1_scores = [r['macro_f1'] for r in fold_results]
    avg_confidences = [r['avg_confidence'] for r in fold_results]

    print(f"\n{'='*80}")
    print(f"FOLD {fold_num} SUMMARY")
    print(f"{'='*80}")
    print(f"  Mean Macro F1: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}")
    print(f"  Min: {np.min(macro_f1_scores):.4f}, Max: {np.max(macro_f1_scores):.4f}")
    print(f"  Mean Confidence: {np.mean(avg_confidences):.4f}")

print("\n" + "="*80)
print("ALL FOLDS COMPLETED!")
print("="*80)

In [None]:
# Save enhanced results to text file.
# The encoding is explicit because the report contains non-ASCII characters
# ("±"); the platform default encoding may not be able to represent them.
with open('enhanced_4fold_10seed_results.txt', 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("ENHANCED BI-GRU WITH CONFIDENCE-WEIGHTED ENSEMBLE\n")
    f.write("4-FOLD CROSS-VALIDATION WITH 10 SEEDS PER FOLD\n")
    f.write("="*80 + "\n\n")

    f.write("Configuration:\n")
    f.write("-"*80 + "\n")
    for key, value in CONFIG.items():
        f.write(f"  {key}: {value}\n")
    f.write("\n")

    # Overall summary: pool the macro F1 of every (fold, seed) run
    all_macro_f1 = []
    for fold_num in sorted(all_fold_results):
        fold_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
        all_macro_f1.extend(fold_scores)

    f.write(f"OVERALL RESULTS ({len(all_macro_f1)} runs total):\n")
    f.write("-"*80 + "\n")
    f.write(f"Mean Macro F1: {np.mean(all_macro_f1):.4f} ± {np.std(all_macro_f1):.4f}\n")
    f.write(f"Min: {np.min(all_macro_f1):.4f}, Max: {np.max(all_macro_f1):.4f}\n")

    # Calculate improvement over baseline (0.3854)
    baseline = 0.3854
    improvement = np.mean(all_macro_f1) - baseline
    improvement_pct = (improvement / baseline) * 100
    f.write(f"\nImprovement over baseline ({baseline:.4f}):\n")
    # The "+" format flag prints the real sign; a literal "+" prefix would
    # render a regression as "+-0.0123".
    f.write(f"  Absolute: {improvement:+.4f}\n")
    f.write(f"  Relative: {improvement_pct:+.2f}%\n")
    f.write("\n")

    # Per-fold results
    for fold_num in sorted(all_fold_results):
        f.write(f"\n{'='*80}\n")
        f.write(f"FOLD {fold_num} RESULTS\n")
        f.write(f"{'='*80}\n\n")

        fold_results = all_fold_results[fold_num]
        macro_f1_scores = [r['macro_f1'] for r in fold_results]
        avg_confidences = [r['avg_confidence'] for r in fold_results]

        f.write(f"Macro F1 Scores ({len(fold_results)} seeds):\n")
        f.write("-"*80 + "\n")
        for result in fold_results:
            f.write(f"  Seed {result['seed']:5d}: {result['macro_f1']:.4f} (conf: {result['avg_confidence']:.3f})\n")

        f.write(f"\nStatistics:\n")
        f.write(f"  Mean Macro F1: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}\n")
        f.write(f"  Min:  {np.min(macro_f1_scores):.4f}\n")
        f.write(f"  Max:  {np.max(macro_f1_scores):.4f}\n")
        f.write(f"  Mean Confidence: {np.mean(avg_confidences):.4f}\n")

        # Per-class F1 (averaged across seeds)
        f.write(f"\nPer-Class F1 Scores (averaged across {len(fold_results)} seeds):\n")
        f.write("-"*80 + "\n")

        # Collect all class names
        all_classes = set()
        for result in fold_results:
            all_classes.update(result['per_class_f1'].keys())

        # Average per-class F1 across seeds; a class missing from a seed's
        # result counts as 0 for that seed.
        for class_name in sorted(all_classes):
            class_f1_scores = [r['per_class_f1'].get(class_name, 0) for r in fold_results]
            mean_f1 = np.mean(class_f1_scores)
            std_f1 = np.std(class_f1_scores)
            # str() keeps the width format valid for non-string room labels.
            f.write(f"  {str(class_name):20s}: {mean_f1:.4f} ± {std_f1:.4f}\n")

print("✅ Enhanced results saved to enhanced_4fold_10seed_results.txt")

In [None]:
# Display comparison summary
print("\n" + "="*80)
print("COMPARISON: BASELINE vs ENHANCED")
print("="*80 + "\n")

# Per-fold macro F1 of the baseline model (reference values entered by hand)
baseline_scores = {
    1: 0.4120,
    2: 0.3677,
    3: 0.3910,
    4: 0.3709
}

print(f"{'Fold':<6} {'Baseline':<12} {'Enhanced':<12} {'Improvement':<12} {'%Change'}")
print("-"*60)

for fold_num in [1, 2, 3, 4]:
    enhanced_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
    enhanced_mean = np.mean(enhanced_scores)
    baseline = baseline_scores[fold_num]
    improvement = enhanced_mean - baseline
    pct_change = (improvement / baseline) * 100

    print(f"Fold {fold_num}  {baseline:.4f}      {enhanced_mean:.4f}      "
          f"{improvement:+.4f}      {pct_change:+.2f}%")

# Overall comparison: mean over every (fold, seed) run
baseline_overall = 0.3854
all_enhanced_scores = []
for fold_num in [1, 2, 3, 4]:
    all_enhanced_scores.extend([r['macro_f1'] for r in all_fold_results[fold_num]])

enhanced_overall = np.mean(all_enhanced_scores)
overall_improvement = enhanced_overall - baseline_overall
overall_pct_change = (overall_improvement / baseline_overall) * 100

print("-"*60)
print(f"Overall {baseline_overall:.4f}      {enhanced_overall:.4f}      "
      f"{overall_improvement:+.4f}      {overall_pct_change:+.2f}%")
print("="*80)

# Target assessment
target = 0.45
distance_to_target = target - enhanced_overall
# Total gap the enhancement is meant to close (0.45 - 0.3854 = 0.0646);
# derived from the two values above so it cannot drift out of sync.
improvement_goal = target - baseline_overall
print(f"\nTarget: {target:.4f}")
print(f"Distance to target: {distance_to_target:.4f}")
if enhanced_overall >= target:
    print("🎉 TARGET ACHIEVED!")
elif distance_to_target < 0.01:
    print("💪 VERY CLOSE! Almost there!")
else:
    print(f"📊 Progress: {(1 - distance_to_target/improvement_goal) * 100:.1f}% of improvement goal")

## Additional Analysis

### Ablation Study (Optional)

To understand which components contribute most to the improvement, you can run ablation studies by disabling features one at a time:

1. **Without Gap Features**: Set gap features to 0
2. **Without Ensemble**: Use `n_ensemble_models=1`
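3. **Without Adaptive Window**: Set `confidence_threshold` above 1.0 (e.g. `1.01`) so that no prediction counts as confident and the full window is always used. Note that `confidence_threshold=0` does the opposite: every prediction counts as confident and gets the shortened window.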
4. **Without Two-Stage Filtering**: Set `use_two_stage_filtering=False`

In [None]:
# Quick ablation test on Fold 1 (optional - uncomment to run)
# print("\n" + "="*80)
# print("ABLATION STUDY ON FOLD 1")
# print("="*80 + "\n")

# ablation_configs = {
#     'Full (All features)': {
#         'n_ensemble_models': 5,
#         'use_two_stage_filtering': True,
#         'confidence_threshold': 0.7
#     },
#     'No Ensemble': {
#         'n_ensemble_models': 1,
#         'use_two_stage_filtering': True,
#         'confidence_threshold': 0.7
#     },
#     'No Two-Stage Filtering': {
#         'n_ensemble_models': 5,
#         'use_two_stage_filtering': False,
#         'confidence_threshold': 0.7
#     },
#     'No Adaptive Window': {
#         'n_ensemble_models': 5,
#         'use_two_stage_filtering': True,
#         'confidence_threshold': 1.01  # Above any probability: always use full window
#     }
# }

# for config_name, config in ablation_configs.items():
#     print(f"\nTesting: {config_name}")
#     result = run_enhanced_pipeline_single_seed(
#         train_df_1, test_df_1,
#         seed=42,
#         window_size=10,
#         vote_window=5,
#         **config,
#         verbose=False
#     )
#     print(f"  Macro F1: {result['macro_f1']:.4f}")