In [1]:
# Mount Google Drive at /content/drive (Colab-only API); the fold CSVs read
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Setup and Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization
from tensorflow.keras.layers import Bidirectional, GRU
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

print("‚úì All imports successful")

‚úì All imports successful


In [3]:
def load_and_filter_fold(i, base_dir='/content/drive/MyDrive/split_data'):
    """Load one cross-validation fold and keep only rooms present in both splits.

    Reads ``<base_dir>/fold<i>/train.csv`` and ``<base_dir>/fold<i>/test.csv``,
    then drops every row whose ``room`` value does not occur in *both* files,
    so that train and test share the same label set.

    Args:
        i: Fold number used to build the ``fold<i>`` directory name.
        base_dir: Directory containing the ``fold<i>`` sub-directories.
            Defaults to the Colab Drive location originally hard-coded here,
            so existing calls behave exactly as before.

    Returns:
        Tuple ``(train_df, test_df)`` of filtered DataFrames, each with a
        fresh 0..n-1 index.
    """
    fold_dir = os.path.join(base_dir, f'fold{i}')
    train_df = pd.read_csv(os.path.join(fold_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(fold_dir, 'test.csv'))

    # Rooms seen in only one of the two splits cannot be both learned and
    # evaluated, so they are removed from both frames.
    train_labels = set(train_df['room'].unique())
    test_labels = set(test_df['room'].unique())
    common_labels = list(train_labels & test_labels)

    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load all 4 folds. Each call returns (train_df, test_df) already restricted
# to the rooms that appear in both splits of that fold.
train_df_1, test_df_1 = load_and_filter_fold(1)
train_df_2, test_df_2 = load_and_filter_fold(2)
train_df_3, test_df_3 = load_and_filter_fold(3)
train_df_4, test_df_4 = load_and_filter_fold(4)

print("‚úì All folds loaded")

‚úì All folds loaded


In [4]:
def set_seeds(seed=42):
    """Seed the Python, NumPy and TensorFlow RNGs and set determinism env flags.

    Args:
        seed: Integer seed applied to every random source below.
    """
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # NOTE(review): PYTHONHASHSEED is normally read when the interpreter
    # starts, so setting it here may only affect child processes - confirm.
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

def create_room_groups(df):
    """Return a time-sorted copy of ``df`` with a ``room_group`` run counter.

    Rows are ordered by ``timestamp``; ``room_group`` starts at 1 and is
    incremented every time ``room`` differs from the previous row, so each
    uninterrupted stay in a room gets its own id.
    """
    ordered = df.sort_values('timestamp').reset_index(drop=True)
    # True on the first row and wherever the room changes from the row before.
    room_changed = ordered['room'].ne(ordered['room'].shift())
    ordered['room_group'] = room_changed.cumsum()
    return ordered

def create_beacon_count_vectors(df, n_beacons=23):
    """Aggregate raw readings into one beacon-frequency vector per timestamp.

    For every distinct ``timestamp`` the readings are counted per
    ``mac address`` (a numeric beacon id) and divided by the number of
    readings at that timestamp, giving the share of readings per beacon.
    Works on data with or without a ``room_group`` column.

    Args:
        df: Readings with ``timestamp``, ``mac address`` and ``room`` columns,
            optionally ``room_group``.
        n_beacons: Length of the output vector; beacon ids outside
            ``1..n_beacons`` are ignored. Defaults to 23, the value that was
            previously hard-coded.

    Returns:
        DataFrame with columns ``timestamp``, ``room``, ``beacon_vector``
        (list of ``n_beacons`` floats) and, if present in the input,
        ``room_group``. The room / room_group of a timestamp are taken from
        its first reading. An empty input yields an empty frame that still
        carries these columns.
    """
    has_groups = 'room_group' in df.columns
    columns = ['timestamp', 'room', 'beacon_vector']
    if has_groups:
        columns.append('room_group')

    vectors = []
    for _, group in df.groupby('timestamp'):
        beacon_counts = group['mac address'].value_counts()
        total_readings = len(group)

        vector = [0.0] * n_beacons
        for beacon_id, count in beacon_counts.items():
            # Beacon ids are 1-based; anything out of range is dropped.
            if 1 <= beacon_id <= n_beacons:
                vector[int(beacon_id) - 1] = count / total_readings

        entry = {
            'timestamp': group['timestamp'].iloc[0],
            'room': group['room'].iloc[0],
            'beacon_vector': vector
        }

        if has_groups:
            entry['room_group'] = group['room_group'].iloc[0]

        vectors.append(entry)

    # Passing the column list keeps the schema stable even when no rows exist.
    return pd.DataFrame(vectors, columns=columns)

def create_sequences_from_groups(vector_df, min_length=3, max_length=50):
    """Build one training sequence per (room, room_group) run.

    Every run is ordered by ``timestamp``. Runs with fewer than
    ``min_length`` vectors are skipped; runs longer than ``max_length`` keep
    only their last ``max_length`` vectors.

    Returns:
        ``(sequences, labels)`` where ``sequences[k]`` is the list of
        ``beacon_vector`` values of run ``k`` and ``labels[k]`` is its room.
    """
    sequences, labels = [], []

    for (room, _group_id), visit in vector_df.groupby(['room', 'room_group']):
        if len(visit) < min_length:
            continue

        visit = visit.sort_values('timestamp')
        if len(visit) > max_length:
            # Keep the most recent part of an over-long stay.
            visit = visit.tail(max_length)

        sequences.append(visit['beacon_vector'].tolist())
        labels.append(room)

    return sequences, labels

def build_bidirectional_gru_model(input_shape, num_classes):
    """Build and compile the two-layer bidirectional GRU classifier.

    Layer order: Masking (mask value 0.0) -> BiGRU(128, returns sequences)
    -> Dropout(0.3) -> BiGRU(64) -> Dropout(0.3) -> Dense(32, relu)
    -> Dropout(0.2) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one input sample, passed to the Masking layer.
        num_classes: Size of the softmax output layer.

    Returns:
        Model compiled with the Adam optimizer, sparse categorical
        cross-entropy loss and an accuracy metric.
    """
    model = Sequential()

    # Zero-valued padding steps are masked out for the recurrent layers.
    model.add(Masking(mask_value=0.0, input_shape=input_shape))

    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(Dropout(0.3))

    model.add(Bidirectional(GRU(64, return_sequences=False)))
    model.add(Dropout(0.3))

    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.2))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return model

print("‚úÖ Basic functions defined")

‚úÖ Basic functions defined


In [5]:
def create_extended_multidirectional_windows(vector_df):
    """Create 7 types of sliding windows around every timestamp of each day.

    Each window type is described by how many steps before and after the
    anchor position ``i`` it includes; the label of a window is always the
    room at the anchor. A window is emitted only when it fits entirely
    inside the day, so positions near the start or end of a day are missing
    from some window types.

    The input frame is NOT modified: the helper ``dt`` / ``date`` columns are
    added to an internal copy (previously they were written into the
    caller's DataFrame).

    Args:
        vector_df: Frame with ``timestamp``, ``room`` and ``beacon_vector``
            columns.

    Returns:
        Dict keyed by window name. Each value holds parallel lists
        ``sequences`` (list of beacon vectors), ``labels`` (room at the
        anchor) and ``indices`` (``(date, i)`` with ``i`` the anchor's
        position within that day's time-sorted rows).
    """
    # name -> (steps before the anchor, steps after the anchor)
    window_specs = {
        'backward_10': (9, 0),
        'centered_10': (4, 5),
        'forward_10': (0, 9),
        'backward_15': (14, 0),
        'forward_15': (0, 14),
        'asymm_past': (11, 3),
        'asymm_future': (3, 11),
    }

    # assign() returns a new frame, leaving the caller's vector_df untouched.
    dated_df = vector_df.assign(
        dt=lambda frame: pd.to_datetime(frame['timestamp']),
        date=lambda frame: frame['dt'].dt.date,
    )

    results = {
        name: {'sequences': [], 'labels': [], 'indices': []}
        for name in window_specs
    }

    # Windows never cross a day boundary.
    for _, day_group in dated_df.groupby('date'):
        day_group = day_group.sort_values('timestamp').reset_index(drop=True)
        vectors = list(day_group['beacon_vector'])
        rooms = list(day_group['room'])
        n = len(vectors)
        day = day_group['date'].iloc[0]

        for i in range(n):
            for name, (past, future) in window_specs.items():
                if i >= past and i + future < n:
                    bucket = results[name]
                    bucket['sequences'].append(vectors[i - past : i + future + 1])
                    bucket['labels'].append(rooms[i])
                    bucket['indices'].append((day, i))

    return results

print("‚úÖ Multi-directional window function defined (7 directions)")

‚úÖ Multi-directional window function defined (7 directions)


In [6]:
def train_ensemble_models(train_df, n_models=5, base_seed=42, verbose=False):
    """Train an ensemble of bidirectional GRU models on constant-room sequences.

    The training frame is turned into room runs, per-timestamp beacon
    vectors and then sequences (at most 50 steps, post-padded with zeros).
    Each ensemble member is seeded with ``base_seed + k * 1000`` and trained
    for up to 30 epochs with balanced class weights. Early stopping and
    learning-rate reduction both monitor the training loss.

    Args:
        train_df: Raw training readings.
        n_models: Number of ensemble members.
        base_seed: Seed of the first member.
        verbose: Print progress when True.

    Returns:
        ``(models, label_encoder)``: the trained models and the encoder
        fitted on the training room labels.
    """
    if verbose:
        print(f"  Training ensemble of {n_models} models...")

    vectors = create_beacon_count_vectors(create_room_groups(train_df))
    sequences, room_labels = create_sequences_from_groups(vectors, max_length=50)

    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(room_labels)

    X_train_padded = pad_sequences(sequences, maxlen=50, padding='post', dtype='float32', value=0.0)

    # Balanced weights, keyed by encoded class id.
    balanced = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(enumerate(balanced))
    n_classes = len(label_encoder.classes_)

    models = []
    for member in range(n_models):
        model_seed = base_seed + member * 1000
        set_seeds(model_seed)

        if verbose:
            print(f"    Model {member+1}/{n_models} (seed {model_seed})...", end=" ")

        model = build_bidirectional_gru_model(input_shape=(50, 23), num_classes=n_classes)

        # Fresh callback objects for every member.
        callbacks = [
            EarlyStopping(monitor='loss', patience=5, restore_best_weights=True, verbose=0),
            ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, verbose=0, min_lr=1e-6),
        ]

        model.fit(
            X_train_padded, y_train,
            epochs=30,
            batch_size=32,
            class_weight=class_weights,
            callbacks=callbacks,
            verbose=0,
        )
        models.append(model)

        if verbose:
            print("‚úì")

    return models, label_encoder

print("‚úì Ensemble training function defined")

‚úì Ensemble training function defined


In [7]:
def predict_single_direction(models, sequences, max_seq_length=50):
    """Average the class probabilities of all ensemble members for one window type.

    Args:
        models: Objects exposing ``predict(X, verbose=0)``.
        sequences: Windows to classify; post-padded with zeros to
            ``max_seq_length`` steps.
        max_seq_length: Padded sequence length.

    Returns:
        Element-wise mean over the models' ``predict`` outputs.
    """
    X_padded = pad_sequences(sequences, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)
    member_probas = [member.predict(X_padded, verbose=0) for member in models]
    return np.mean(member_probas, axis=0)

def calculate_percentile_thresholds(direction_results, verbose=False):
    """Derive the adaptive-weighting thresholds from confidence percentiles.

    The confidence of a prediction is its largest class probability.
    Confidences of all 7 window types are pooled and the thresholds are read
    from that empirical distribution:

    - ``very_high_conf``: 85th percentile of the pooled confidences
    - ``high_conf``:      70th percentile of the pooled confidences
    - ``low_conf``:       40th percentile of the pooled confidences
    - ``centered_boost``: 60th percentile of the ``centered_10``
      confidences only

    Args:
        direction_results: Dict keyed by window name; each value has a
            ``proba`` array with one row of class probabilities per window.
        verbose: Print the confidence distribution and thresholds when True.

    Returns:
        Dict with keys ``very_high_conf``, ``high_conf``, ``centered_boost``
        and ``low_conf``.
    """
    direction_names = ['backward_10', 'centered_10', 'forward_10',
                      'backward_15', 'forward_15',
                      'asymm_past', 'asymm_future']

    # Max class probability per window, for every window type.
    per_direction = {
        name: np.max(direction_results[name]['proba'], axis=1)
        for name in direction_names
    }

    all_confidences = np.concatenate([per_direction[name] for name in direction_names])
    centered_confidences = np.array(per_direction['centered_10'])

    thresholds = {
        'very_high_conf': np.percentile(all_confidences, 85),
        'high_conf': np.percentile(all_confidences, 70),
        'centered_boost': np.percentile(centered_confidences, 60),
        'low_conf': np.percentile(all_confidences, 40),
    }

    if verbose:
        print(f"  Percentile-based thresholds:")
        print(f"    Confidence distribution: min={np.min(all_confidences):.3f}, "
              f"median={np.median(all_confidences):.3f}, max={np.max(all_confidences):.3f}")
        print(f"    ‚Üí Very high conf (85th percentile): {thresholds['very_high_conf']:.3f}")
        print(f"    ‚Üí High conf (70th percentile): {thresholds['high_conf']:.3f}")
        print(f"    ‚Üí Centered boost (60th of centered): {thresholds['centered_boost']:.3f}")
        print(f"    ‚Üí Low conf (40th percentile): {thresholds['low_conf']:.3f}")

    return thresholds

def combine_directional_predictions_percentile(direction_results,
                                               percentile_thresholds,
                                               verbose=False):
    """Fuse the per-direction probabilities of each position with adaptive weights.

    For every position (an entry of some direction's ``indices`` list) the
    probabilities of all directions that cover it are combined as a weighted
    average. The weighting strategy is chosen per position, first match wins:

    1. Some direction reaches ``very_high_conf``: directions at or above
       ``very_high_conf`` get ``conf * 2.5``, those at or above ``high_conf``
       get ``conf * 1.2``, the rest ``conf * 0.5``.
    2. ``centered_10`` reaches ``centered_boost``: centered gets
       ``conf * 1.8``, all others ``conf * 0.8``.
    3. The mean confidence is below ``low_conf``: equal weights.
    4. Otherwise: weight equals confidence.

    Position lookups use a per-direction dict built once up front instead of
    repeated ``list.index`` scans, so the cost is linear in the number of
    windows rather than quadratic. If a position appears more than once in a
    direction, its first occurrence is used, matching ``list.index``.

    Args:
        direction_results: Dict keyed by window name; each value holds
            ``proba`` (2-D array, one row per window) and ``indices``
            (hashable, sortable position keys aligned with ``proba`` rows).
        percentile_thresholds: Dict with ``very_high_conf``, ``high_conf``,
            ``centered_boost`` and ``low_conf``.
        verbose: Accepted for interface compatibility; currently unused.

    Returns:
        ``(combined_proba, position_map)`` where ``combined_proba`` has one
        row per distinct position in sorted order and ``position_map`` maps
        each position to its row number.
    """
    # Unpack percentile thresholds
    very_high_conf_threshold = percentile_thresholds['very_high_conf']
    high_conf_threshold = percentile_thresholds['high_conf']
    centered_boost_threshold = percentile_thresholds['centered_boost']
    low_conf_threshold = percentile_thresholds['low_conf']

    direction_names = ['backward_10', 'centered_10', 'forward_10',
                      'backward_15', 'forward_15',
                      'asymm_past', 'asymm_future']

    # position -> row of that position in the direction's proba array.
    # setdefault keeps the first occurrence, like list.index would.
    row_lookup = {}
    all_positions = set()
    for direction_name in direction_names:
        rows = {}
        for row, pos in enumerate(direction_results[direction_name]['indices']):
            rows.setdefault(pos, row)
        row_lookup[direction_name] = rows
        all_positions.update(rows)

    all_positions = sorted(all_positions)
    position_map = {pos: idx for idx, pos in enumerate(all_positions)}

    n_classes = direction_results['backward_10']['proba'].shape[1]
    n_positions = len(all_positions)

    combined_proba = np.zeros((n_positions, n_classes))

    # Pre-compute confidences (max class probability per window)
    direction_confidences = {}
    for direction_name in direction_names:
        proba = direction_results[direction_name]['proba']
        direction_confidences[direction_name] = np.max(proba, axis=1)

    # Apply adaptive weighting per position
    for pos_idx, pos in enumerate(all_positions):
        position_directions = {}
        position_confs = {}

        for direction_name in direction_names:
            idx = row_lookup[direction_name].get(pos)
            if idx is not None:
                position_directions[direction_name] = direction_results[direction_name]['proba'][idx]
                position_confs[direction_name] = direction_confidences[direction_name][idx]

        if not position_directions:
            continue

        weights = {}

        max_conf = max(position_confs.values())
        avg_conf = np.mean(list(position_confs.values()))
        # A position without a centered window can never trigger Strategy 2.
        centered_conf = position_confs.get('centered_10', 0)

        # Strategy 1: at least one very confident direction
        if max_conf >= very_high_conf_threshold:
            for direction_name, conf in position_confs.items():
                if conf >= very_high_conf_threshold:
                    weights[direction_name] = conf * 2.5  # Strong boost
                elif conf >= high_conf_threshold:
                    weights[direction_name] = conf * 1.2  # Moderate boost
                else:
                    weights[direction_name] = conf * 0.5  # Reduce low-conf

        # Strategy 2: the centered window is confident
        elif centered_conf >= centered_boost_threshold:
            for direction_name, conf in position_confs.items():
                if direction_name == 'centered_10':
                    weights[direction_name] = conf * 1.8  # Boost centered
                else:
                    weights[direction_name] = conf * 0.8  # Slight reduction for others

        # Strategy 3: low average confidence -> equal weighting
        elif avg_conf < low_conf_threshold:
            for direction_name, conf in position_confs.items():
                weights[direction_name] = 1.0

        # Strategy 4: standard confidence weighting
        else:
            for direction_name, conf in position_confs.items():
                weights[direction_name] = conf

        # Weighted average of the available directions
        total_weight = sum(weights.values())
        for direction_name, weight in weights.items():
            combined_proba[pos_idx] += position_directions[direction_name] * weight

        if total_weight > 0:
            combined_proba[pos_idx] /= total_weight

    return combined_proba, position_map

print("‚úÖ Percentile-based threshold calculation and combination defined")

‚úÖ Percentile-based threshold calculation and combination defined


In [8]:
def apply_confidence_weighted_voting(predictions_proba, vote_window=5):
    """Smooth predictions with a confidence-weighted vote over neighbouring rows.

    For row ``i`` the rows ``i - vote_window // 2`` to
    ``i + vote_window // 2`` (clipped to the array bounds) each contribute
    their probability vector scaled by their own maximum probability; the
    class with the largest summed score wins.

    Args:
        predictions_proba: 2-D array, one row of class probabilities per
            sample.
        vote_window: Nominal width of the voting window.

    Returns:
        1-D integer array with the voted class index of every row.
    """
    total, n_classes = predictions_proba.shape
    radius = vote_window // 2
    voted = np.zeros(total, dtype=int)

    for center in range(total):
        first = max(0, center - radius)
        stop = min(total, center + radius + 1)

        tally = np.zeros(n_classes)
        for row in predictions_proba[first:stop]:
            # Each neighbour votes with strength equal to its own confidence.
            tally += row * np.max(row)

        voted[center] = np.argmax(tally)

    return voted

print("‚úÖ Temporal voting function defined")

‚úÖ Temporal voting function defined


In [9]:
def run_percentile_adaptive_pipeline(train_df, test_df, seed, n_ensemble=5,
                                     vote_window=5,
                                     verbose=False):
    """
    EXPERIMENT 6: 7 directions + PERCENTILE-BASED adaptive thresholds

    Pipeline:
    1. Train ensemble (``n_ensemble`` models, seeded from ``seed``)
    2. Create 7 directional windows over the test data
    3. Get ensemble predictions for all directions
    4. Calculate percentile-based thresholds from the confidence distributions
    5. Combine directions with adaptive weighting using those thresholds
    6. Temporal voting over the combined probabilities
    7. Score with macro and per-class F1

    Args:
        train_df: Raw training readings of one fold.
        test_df: Raw test readings of the same fold.
        seed: Seed for this run; also the base seed of the ensemble.
        n_ensemble: Number of ensemble members.
        vote_window: Window width for the temporal vote.
        verbose: Print progress when True.

    Returns:
        Dict with ``seed``, ``macro_f1``, ``per_class_f1`` (room -> F1) and
        ``percentile_thresholds``.
    """
    tf.keras.backend.clear_session()
    set_seeds(seed)

    if verbose:
        print(f"\n  Seed {seed}: Training ensemble...")

    # 1. Train Ensemble
    models, label_encoder = train_ensemble_models(
        train_df,
        n_models=n_ensemble,
        base_seed=seed,
        verbose=verbose
    )

    if verbose:
        print("  Creating multi-directional windows (7 directions)...")

    # 2. Create Windows
    test_vectors = create_beacon_count_vectors(test_df)
    direction_windows = create_extended_multidirectional_windows(test_vectors)

    if verbose:
        print("  Getting directional predictions...")

    # 3. Get Predictions
    direction_results = {}
    direction_names = ['backward_10', 'centered_10', 'forward_10',
                      'backward_15', 'forward_15',
                      'asymm_past', 'asymm_future']

    for direction_name in direction_names:
        sequences = direction_windows[direction_name]['sequences']
        proba = predict_single_direction(models, sequences, max_seq_length=50)

        direction_results[direction_name] = {
            'proba': proba,
            'indices': direction_windows[direction_name]['indices'],
            'labels': direction_windows[direction_name]['labels']
        }

    if verbose:
        print("  Calculating percentile-based thresholds...")

    # 4. Calculate percentile thresholds
    percentile_thresholds = calculate_percentile_thresholds(direction_results, verbose=verbose)

    if verbose:
        print("  Combining directions with percentile-based adaptive weighting...")

    # 5. Adaptive Combination with percentile thresholds
    combined_proba, position_map = combine_directional_predictions_percentile(
        direction_results,
        percentile_thresholds,
        verbose=verbose
    )

    # Ground truth: the label of each position, taken from the first
    # direction (in direction_names order) that contains it. A dict built in
    # one pass replaces the former per-position list scans, which were
    # quadratic in the number of windows; setdefault keeps the first label
    # seen, so the result is unchanged.
    label_by_position = {}
    for direction_name in direction_names:
        entry = direction_results[direction_name]
        for pos, label in zip(entry['indices'], entry['labels']):
            label_by_position.setdefault(pos, label)

    y_test = [label_by_position[pos] for pos in sorted(position_map.keys())]

    if verbose:
        print(f"  Applying temporal voting (window={vote_window})...")

    # 6. Temporal Voting
    y_pred_voted_encoded = apply_confidence_weighted_voting(combined_proba, vote_window=vote_window)
    y_pred = label_encoder.inverse_transform(y_pred_voted_encoded)

    # 7. Evaluation
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    per_class_f1 = f1_score(y_test, y_pred, average=None, labels=label_encoder.classes_, zero_division=0)

    if verbose:
        print(f"  ‚úì Macro F1: {macro_f1:.4f}")

    return {
        'seed': seed,
        'macro_f1': macro_f1,
        'per_class_f1': {label: f1 for label, f1 in zip(label_encoder.classes_, per_class_f1)},
        'percentile_thresholds': percentile_thresholds
    }

print("‚úÖ Complete percentile-based adaptive pipeline defined")

‚úÖ Complete percentile-based adaptive pipeline defined


In [10]:
# FULL EXPERIMENT
# Runs the percentile-adaptive pipeline for every (fold, seed) pair and keeps
# the result dicts in all_fold_results[fold_num] (one list entry per seed).
print("="*80)
print("FULL 4-FOLD CROSS-VALIDATION - EXPERIMENT 6")
print("="*80)

# Each fold is repeated with these seeds; the fold score is their mean.
seeds = [42, 123, 456]
folds = {
    1: (train_df_1, test_df_1),
    2: (train_df_2, test_df_2),
    3: (train_df_3, test_df_3),
    4: (train_df_4, test_df_4)
}

# fold number -> list of result dicts returned by the pipeline
all_fold_results = {}

for fold_num, (train_df, test_df) in folds.items():
    print(f"\n{'='*80}")
    print(f"PROCESSING FOLD {fold_num}")
    print(f"{'='*80}\n")

    fold_results = []

    for seed in seeds:
        print(f"  Running seed {seed}...", end=" ")
        result = run_percentile_adaptive_pipeline(
            train_df, test_df,
            seed=seed,
            n_ensemble=5,
            vote_window=5,
            verbose=False
        )
        fold_results.append(result)
        print(f"Macro F1: {result['macro_f1']:.4f}")

    all_fold_results[fold_num] = fold_results

    # Mean and standard deviation of macro F1 across the seeds of this fold.
    macro_f1_scores = [r['macro_f1'] for r in fold_results]
    print(f"\n  Fold {fold_num} Summary:")
    print(f"    Mean Macro F1: {np.mean(macro_f1_scores):.4f} ¬± {np.std(macro_f1_scores):.4f}")

    # Show average percentile thresholds for this fold
    avg_thresholds = {
        'very_high': np.mean([r['percentile_thresholds']['very_high_conf'] for r in fold_results]),
        'high': np.mean([r['percentile_thresholds']['high_conf'] for r in fold_results]),
        'centered': np.mean([r['percentile_thresholds']['centered_boost'] for r in fold_results]),
        'low': np.mean([r['percentile_thresholds']['low_conf'] for r in fold_results])
    }
    print(f"    Avg percentile thresholds: centered={avg_thresholds['centered']:.3f}, "
          f"high={avg_thresholds['high']:.3f}, very_high={avg_thresholds['very_high']:.3f}, "
          f"low={avg_thresholds['low']:.3f}")

print("\n" + "="*80)
print("ALL FOLDS COMPLETED!")
print("="*80)

FULL 4-FOLD CROSS-VALIDATION - EXPERIMENT 6

PROCESSING FOLD 1

  Running seed 42... Macro F1: 0.5275
  Running seed 123... Macro F1: 0.4963
  Running seed 456... Macro F1: 0.4904

  Fold 1 Summary:
    Mean Macro F1: 0.5047 ¬± 0.0163
    Avg percentile thresholds: centered=0.684, high=0.757, very_high=0.847, low=0.613

PROCESSING FOLD 2

  Running seed 42... Macro F1: 0.4142
  Running seed 123... Macro F1: 0.4303
  Running seed 456... Macro F1: 0.4269

  Fold 2 Summary:
    Mean Macro F1: 0.4238 ¬± 0.0070
    Avg percentile thresholds: centered=0.682, high=0.734, very_high=0.786, low=0.575

PROCESSING FOLD 3

  Running seed 42... Macro F1: 0.4056
  Running seed 123... Macro F1: 0.4313
  Running seed 456... Macro F1: 0.3963

  Fold 3 Summary:
    Mean Macro F1: 0.4111 ¬± 0.0148
    Avg percentile thresholds: centered=0.677, high=0.748, very_high=0.808, low=0.603

PROCESSING FOLD 4

  Running seed 42... Macro F1: 0.4239
  Running seed 123... Macro F1: 0.4070
  Running seed 456... Macro 

In [11]:
# Final Summary
# Reads all_fold_results produced by the experiment cell and prints per-fold
# and overall macro F1, then compares against earlier experiments.
print("\n" + "="*80)
print("FINAL SUMMARY - EXPERIMENT 6 (PERCENTILE-BASED)")
print("="*80 + "\n")

for fold_num in [1, 2, 3, 4]:
    macro_f1_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
    print(f"Fold {fold_num}: {np.mean(macro_f1_scores):.4f} ¬± {np.std(macro_f1_scores):.4f}")

# Pool every (fold, seed) score; the overall mean/std is taken over all runs.
all_macro_f1 = []
for fold_num in [1, 2, 3, 4]:
    all_macro_f1.extend([r['macro_f1'] for r in all_fold_results[fold_num]])

print(f"\n{'='*80}")
print(f"Overall Mean: {np.mean(all_macro_f1):.4f} ¬± {np.std(all_macro_f1):.4f}")
print(f"{'='*80}")

# The scores of the earlier experiments below are hard-coded literals; they
# are not recomputed in this notebook.
print("\n" + "="*80)
print("COMPLETE PROGRESSION:")
print("="*80)
print("Baseline (Approach 24): 0.4106")
print("Exp 2 (7 dir, standard): 0.4384")
print("Exp 4 (fixed thresholds): 0.4392 (Fold 1 great, others declined)")
print("Exp 5 (mean+std adaptive): 0.4390")
print(f"Exp 6 (percentile adaptive): {np.mean(all_macro_f1):.4f}")

# Per-fold comparison
# Hard-coded per-fold means of Exp 2 / 4 / 5, indexed by fold_num - 1.
exp2_folds = [0.4896, 0.4295, 0.4113, 0.4230]
exp4_folds = [0.5094, 0.4231, 0.4072, 0.4172]
exp5_folds = [0.4993, 0.4230, 0.4135, 0.4204]

print(f"\n{'='*80}")
print("PER-FOLD COMPARISON:")
print(f"{'='*80}")
print(f"{'Fold':>6} {'Exp2':>8} {'Exp4':>8} {'Exp5':>8} {'Exp6':>8} {'vs Exp2':>9} {'vs Exp4':>9}")
print("-" * 80)
for fold_num in [1, 2, 3, 4]:
    exp6_mean = np.mean([r['macro_f1'] for r in all_fold_results[fold_num]])
    exp2_mean = exp2_folds[fold_num - 1]
    exp4_mean = exp4_folds[fold_num - 1]
    # Positive differences mean Exp 6 scored higher on this fold.
    vs_exp2 = exp6_mean - exp2_mean
    vs_exp4 = exp6_mean - exp4_mean
    print(f"{fold_num:>6} {exp2_mean:>8.4f} {exp4_mean:>8.4f} {exp5_folds[fold_num-1]:>8.4f} "
          f"{exp6_mean:>8.4f} {vs_exp2:>+9.4f} {vs_exp4:>+9.4f}")

# Differences relative to the baseline (0.4106), Exp 2 (0.4384) and the
# 0.45 target.
total_gain = np.mean(all_macro_f1) - 0.4106
gain_from_exp2 = np.mean(all_macro_f1) - 0.4384
target_gap = 0.45 - np.mean(all_macro_f1)

print(f"\n{'='*80}")
print(f"Total gain from baseline (0.4106): {total_gain:+.4f}")
print(f"Gain from Exp 2 (0.4384): {gain_from_exp2:+.4f}")
print(f"Gap to target (0.45): {target_gap:.4f}")
print(f"{'='*80}")

if np.mean(all_macro_f1) >= 0.45:
    print("\nüéØüéØüéØ TARGET ACHIEVED! 0.45 F1 REACHED! üéØüéØüéØ")
    print("\nüéâ PERCENTILE-BASED THRESHOLDS WORKED!")
elif np.mean(all_macro_f1) > 0.4390:
    print(f"\n‚úÖ Percentile thresholds improve over previous! +{gain_from_exp2:.4f}")
    if target_gap < 0.01:
        print(f"   SO CLOSE! Gap is only {target_gap:.4f}")
        print("   Try: Hyperparameter tuning (vote_window, ensemble_size)")
    else:
        print(f"   Remaining gap: {target_gap:.4f}")
elif gain_from_exp2 > -0.005:
    print("\nüìä Percentiles maintain performance (similar to Exp 2)")
    print("   But MORE ROBUST: adapts to each fold's distribution!")
    print(f"   Next: Try hyperparameter tuning to close {target_gap:.4f} gap")
else:
    print("\n‚ö†Ô∏è  Percentiles didn't improve")
    print("   Recommendation: Try hyperparameter tuning instead")


FINAL SUMMARY - EXPERIMENT 6 (PERCENTILE-BASED)

Fold 1: 0.5047 ¬± 0.0163
Fold 2: 0.4238 ¬± 0.0070
Fold 3: 0.4111 ¬± 0.0148
Fold 4: 0.4180 ¬± 0.0078

Overall Mean: 0.4394 ¬± 0.0399

COMPLETE PROGRESSION:
Baseline (Approach 24): 0.4106
Exp 2 (7 dir, standard): 0.4384
Exp 4 (fixed thresholds): 0.4392 (Fold 1 great, others declined)
Exp 5 (mean+std adaptive): 0.4390
Exp 6 (percentile adaptive): 0.4394

PER-FOLD COMPARISON:
  Fold     Exp2     Exp4     Exp5     Exp6   vs Exp2   vs Exp4
--------------------------------------------------------------------------------
     1   0.4896   0.5094   0.4993   0.5047   +0.0151   -0.0047
     2   0.4295   0.4231   0.4230   0.4238   -0.0057   +0.0007
     3   0.4113   0.4072   0.4135   0.4111   -0.0002   +0.0039
     4   0.4230   0.4172   0.4204   0.4180   -0.0050   +0.0008

Total gain from baseline (0.4106): +0.0288
Gain from Exp 2 (0.4384): +0.0010
Gap to target (0.45): 0.0106

‚úÖ Percentile thresholds improve over previous! +0.0010
   Remaining gap: 0.0106

In [12]:
# Save results
# Writes a plain-text report of this experiment to experiment6_results.txt in
# the current working directory, overwriting any existing file. Reads
# all_fold_results from the experiment cell.
with open('experiment6_results.txt', 'w') as f:
    f.write("="*80 + "\n")
    f.write("EXPERIMENT 6: PERCENTILE-BASED ADAPTIVE THRESHOLDS\n")
    f.write("="*80 + "\n\n")

    f.write("Key Innovation:\n")
    f.write("-"*80 + "\n")
    f.write("Use PERCENTILES instead of mean+std for threshold calculation:\n")
    f.write("  - Very high conf: 85th percentile (top 15%)\n")
    f.write("  - High conf: 70th percentile (top 30%)\n")
    f.write("  - Centered boost: 60th percentile of centered predictions\n")
    f.write("  - Low conf: 40th percentile (bottom 40%)\n")
    f.write("\nAdvantages:\n")
    f.write("  - Robust to outliers\n")
    f.write("  - Distribution-agnostic (no normal distribution assumption)\n")
    f.write("  - Directly interpretable\n")
    f.write("  - Automatically adapts to each fold\n\n")

    # Rebuilt here so this cell only needs all_fold_results to exist.
    all_macro_f1 = []
    for fold_num in [1, 2, 3, 4]:
        fold_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
        all_macro_f1.extend(fold_scores)

    f.write("OVERALL RESULTS:\n")
    f.write("-"*80 + "\n")
    f.write(f"Mean Macro F1: {np.mean(all_macro_f1):.4f} ¬± {np.std(all_macro_f1):.4f}\n\n")

    # Scores of the earlier experiments are hard-coded literals.
    f.write("PROGRESSION:\n")
    f.write("-"*80 + "\n")
    f.write("Baseline: 0.4106\n")
    f.write("Exp 2: 0.4384 (7 directions, standard weighting)\n")
    f.write("Exp 4: 0.4392 (fixed thresholds, fold-specific)\n")
    f.write("Exp 5: 0.4390 (mean+std adaptive)\n")
    f.write(f"Exp 6: {np.mean(all_macro_f1):.4f} (percentile-based)\n\n")

    f.write("Per-Fold Results:\n")
    f.write("-"*80 + "\n")
    for fold_num in [1, 2, 3, 4]:
        f.write(f"\nFold {fold_num}:\n")
        fold_results = all_fold_results[fold_num]
        macro_f1_scores = [r['macro_f1'] for r in fold_results]

        # One line per seed: its macro F1 and the thresholds that run used.
        for result in fold_results:
            t = result['percentile_thresholds']
            f.write(f"  Seed {result['seed']:5d}: {result['macro_f1']:.4f}")
            f.write(f" (thresholds: centered={t['centered_boost']:.3f}, "
                   f"high={t['high_conf']:.3f}, very_high={t['very_high_conf']:.3f}, "
                   f"low={t['low_conf']:.3f})\n")

        f.write(f"  Mean: {np.mean(macro_f1_scores):.4f} ¬± {np.std(macro_f1_scores):.4f}\n")

print("‚úÖ Results saved to experiment6_results.txt")

‚úÖ Results saved to experiment6_results.txt
