In [None]:
# Colab-only setup: mount Google Drive at /content/drive so that files stored
# under /content/drive/MyDrive can be read with ordinary file paths.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Setup and Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization
from tensorflow.keras.layers import Bidirectional, GRU
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

# Status message (check mark repaired from mis-encoded text).
print("✓ All imports successful")

In [None]:
def load_and_filter_fold(i, data_dir='/content/drive/MyDrive/split_data'):
    """Load one cross-validation fold and keep only rooms present in both splits.

    Args:
        i: Fold number; selects the ``fold{i}`` sub-directory of ``data_dir``.
        data_dir: Directory that contains ``fold{i}/train.csv`` and
            ``fold{i}/test.csv``. Defaults to the Google Drive location used
            by this notebook, so existing calls behave as before.

    Returns:
        ``(train_df, test_df)``: both frames restricted to rows whose ``room``
        value occurs in the train AND the test split, with a fresh 0..n-1 index.
    """
    fold_dir = f'{data_dir}/fold{i}'
    train_df = pd.read_csv(f'{fold_dir}/train.csv')
    test_df = pd.read_csv(f'{fold_dir}/test.csv')

    # A room seen in only one split can be neither learned nor evaluated.
    common_labels = list(set(train_df['room'].unique()) & set(test_df['room'].unique()))

    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load all 4 folds. Each call returns a (train, test) pair that has already
# been restricted to the rooms shared by both splits of that fold.
train_df_1, test_df_1 = load_and_filter_fold(1)
train_df_2, test_df_2 = load_and_filter_fold(2)
train_df_3, test_df_3 = load_and_filter_fold(3)
train_df_4, test_df_4 = load_and_filter_fold(4)

print("✓ All folds loaded")

In [None]:
def set_seeds(seed=42):
    """Seed the Python, NumPy and TensorFlow RNGs and set the determinism env vars.

    Args:
        seed: Integer seed applied to ``random``, ``numpy.random`` and
            ``tf.random``; also written (as a string) to ``PYTHONHASHSEED``.
    """
    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

def create_room_groups(df):
    """Return a time-sorted copy of ``df`` with a ``room_group`` run id column.

    Rows are ordered by ``timestamp``; ``room_group`` starts at 1 and increases
    by one every time ``room`` differs from the previous row, so each run of
    consecutive same-room rows shares one id. The input frame is not modified.
    """
    ordered = df.sort_values('timestamp').reset_index(drop=True)
    room_changed = ordered['room'].ne(ordered['room'].shift())
    ordered['room_group'] = room_changed.cumsum()
    return ordered

def create_beacon_count_vectors(df, n_beacons=23):
    """Aggregate raw readings into one beacon-frequency vector per timestamp.

    For every distinct ``timestamp`` the readings are counted per
    ``mac address`` value and divided by the number of readings at that
    timestamp. Beacon id ``k`` (1-based) is stored at vector position ``k - 1``;
    ids outside ``1..n_beacons`` are ignored but still count towards the total.

    Args:
        df: Frame with ``timestamp``, ``room`` and ``mac address`` columns and,
            optionally, ``room_group``. ``mac address`` is compared against
            integers, so it is assumed to hold numeric beacon ids.
        n_beacons: Length of each vector (number of known beacon ids).
            Defaults to 23, the value previously hard-coded here.

    Returns:
        DataFrame with columns ``timestamp``, ``room``, ``beacon_vector`` (a
        list of ``n_beacons`` floats) and ``room_group`` when the input had it.
        ``room``/``room_group`` are taken from the first reading of each
        timestamp. An empty input yields an empty frame with these columns.
    """
    has_groups = 'room_group' in df.columns
    columns = ['timestamp', 'room', 'beacon_vector']
    if has_groups:
        columns.append('room_group')

    vectors = []
    for _, group in df.groupby('timestamp'):
        total_readings = len(group)

        vector = [0.0] * n_beacons
        for beacon_id, count in group['mac address'].value_counts().items():
            if 1 <= beacon_id <= n_beacons:
                vector[int(beacon_id) - 1] = count / total_readings

        entry = {
            'timestamp': group['timestamp'].iloc[0],
            'room': group['room'].iloc[0],
            'beacon_vector': vector,
        }
        if has_groups:
            entry['room_group'] = group['room_group'].iloc[0]

        vectors.append(entry)

    # Passing `columns` keeps the schema stable even when no rows were produced.
    return pd.DataFrame(vectors, columns=columns)

def create_sequences_from_groups(vector_df, min_length=3, max_length=50):
    """Build one training sequence per (room, room_group) run.

    Runs with fewer than ``min_length`` vectors are dropped; runs longer than
    ``max_length`` keep only their last ``max_length`` vectors.

    Returns:
        ``(sequences, labels)``: ``sequences[k]`` is the time-ordered list of
        beacon vectors of run ``k`` and ``labels[k]`` is that run's room.
    """
    sequences, labels = [], []

    for (room, _run_id), run in vector_df.groupby(['room', 'room_group']):
        run = run.sort_values('timestamp').reset_index(drop=False)

        if len(run) < min_length:
            continue
        if len(run) > max_length:
            # Keep the most recent part of an over-long run.
            run = run.tail(max_length)

        sequences.append(run['beacon_vector'].tolist())
        labels.append(room)

    return sequences, labels

def build_bidirectional_gru_model(input_shape, num_classes):
    """Build and compile the two-layer bidirectional GRU classifier.

    Args:
        input_shape: ``(timesteps, features)`` shape given to the Masking layer.
        num_classes: Size of the softmax output layer.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    # Timesteps equal to 0.0 are masked before reaching the recurrent layers.
    recurrent_stack = [
        Masking(mask_value=0.0, input_shape=input_shape),
        Bidirectional(GRU(128, return_sequences=True)),
        Dropout(0.3),
        Bidirectional(GRU(64, return_sequences=False)),
        Dropout(0.3),
    ]
    classifier_head = [
        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax'),
    ]

    model = Sequential(recurrent_stack + classifier_head)
    model.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# Status message (emoji repaired from mis-encoded text).
print("✅ Basic functions defined")

# EXPERIMENT 5: Fully Adaptive Thresholds (Auto-Calibration)

Problem with Exp 4: the fixed thresholds (0.68, 0.70, 0.75) worked well for Fold 1 but hurt the other folds.

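Solution: Derive the thresholds dynamically from each fold's own confidence distribution (the mean and standard deviation of the ensemble's top-class probabilities on that fold's test windows) instead of hard-coding them.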

In [None]:
def create_extended_multidirectional_windows(vector_df):
    """Create 7 kinds of sliding windows around every per-day position.

    Vectors are grouped by calendar date (parsed from ``timestamp``) and sorted
    by ``timestamp`` within each day. For position ``i`` of a day with ``n``
    vectors, a window type that looks ``past`` steps back and ``future`` steps
    ahead is emitted when ``i >= past`` and ``i + future < n``; it contains
    ``vectors[i - past : i + future + 1]`` and is labelled with ``rooms[i]``.

    Unlike the previous version, the caller's ``vector_df`` is left untouched:
    the helper ``date`` column lives on an internal copy only.

    Args:
        vector_df: Frame with ``timestamp``, ``room`` and ``beacon_vector``.

    Returns:
        dict mapping each window name to ``{'sequences', 'labels', 'indices'}``
        lists, where ``indices`` holds ``(date, i)`` tuples identifying the
        labelled position.
    """
    # name -> (steps into the past, steps into the future) relative to i
    window_specs = {
        'backward_10': (9, 0),
        'centered_10': (4, 5),
        'forward_10': (0, 9),
        'backward_15': (14, 0),
        'forward_15': (0, 14),
        'asymm_past': (11, 3),
        'asymm_future': (3, 11),
    }
    results = {
        name: {'sequences': [], 'labels': [], 'indices': []}
        for name in window_specs
    }

    # assign() returns a new frame, so no columns are added to the input.
    dated = vector_df.assign(date=pd.to_datetime(vector_df['timestamp']).dt.date)

    for _, day_group in dated.groupby('date'):
        day_group = day_group.sort_values('timestamp').reset_index(drop=True)
        vectors = list(day_group['beacon_vector'])
        rooms = list(day_group['room'])
        day = day_group['date'].iloc[0]
        n = len(vectors)

        for i in range(n):
            for name, (past, future) in window_specs.items():
                # Windows never cross a day boundary: skip positions that are
                # too close to the start or end of the day for this window type.
                if i < past or i + future >= n:
                    continue
                bucket = results[name]
                bucket['sequences'].append(vectors[i - past : i + future + 1])
                bucket['labels'].append(rooms[i])
                bucket['indices'].append((day, i))

    return results

# Status message (emoji repaired from mis-encoded text).
print("✅ Multi-directional window function defined")

In [None]:
def train_ensemble_models(train_df, n_models=5, base_seed=42, verbose=False,
                          max_seq_length=50):
    """Train an ensemble of bidirectional GRU models on constant-room sequences.

    The training frame is split into runs of consecutive same-room readings,
    converted to beacon vectors, turned into sequences and post-padded with
    zeros to ``max_seq_length``. Each ensemble member is trained on the same
    data with its own seed (``base_seed + i * 1000``) and balanced class
    weights. Early stopping and LR reduction both monitor the training loss
    (no validation split is used).

    Args:
        train_df: Raw training readings for one fold.
        n_models: Number of ensemble members.
        base_seed: Seed of the first member.
        verbose: Print progress when True.
        max_seq_length: Sequence length used for truncation and padding.
            Defaults to 50, the value previously hard-coded in three places.

    Returns:
        ``(models, label_encoder)``: the trained models and the fitted
        ``LabelEncoder`` mapping room names to class ids.

    Raises:
        ValueError: If no training sequence could be built from ``train_df``.
    """
    if verbose:
        print(f"  Training ensemble of {n_models} models...")

    train_df_grouped = create_room_groups(train_df)
    train_vector_df = create_beacon_count_vectors(train_df_grouped)
    X_train_seq, y_train_labels = create_sequences_from_groups(
        train_vector_df, max_length=max_seq_length
    )
    if not X_train_seq:
        # Fail early with a clear message instead of an obscure Keras shape error.
        raise ValueError("No training sequences were produced from train_df")

    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(y_train_labels)

    X_train_padded = pad_sequences(
        X_train_seq, maxlen=max_seq_length, padding='post', dtype='float32', value=0.0
    )

    # LabelEncoder ids are 0..k-1, so the position in the weight array is the class id.
    class_weights_array = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(enumerate(class_weights_array))

    # (timesteps, features) taken from the padded data rather than hard-coded.
    input_shape = tuple(X_train_padded.shape[1:])
    num_classes = len(label_encoder.classes_)

    models = []
    for i in range(n_models):
        model_seed = base_seed + i * 1000
        set_seeds(model_seed)

        if verbose:
            print(f"    Model {i+1}/{n_models} (seed {model_seed})...", end=" ")

        model = build_bidirectional_gru_model(
            input_shape=input_shape,
            num_classes=num_classes
        )

        early_stop = EarlyStopping(monitor='loss', patience=5, restore_best_weights=True, verbose=0)
        reduce_lr = ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, verbose=0, min_lr=1e-6)

        model.fit(
            X_train_padded, y_train,
            epochs=30,
            batch_size=32,
            class_weight=class_weights,
            callbacks=[early_stop, reduce_lr],
            verbose=0
        )

        models.append(model)

        if verbose:
            print("✓")

    return models, label_encoder

# Status message (check mark repaired from mis-encoded text).
print("✓ Ensemble training function defined")

# NEW: Auto-Calibrating Adaptive Thresholds

In [None]:
def predict_single_direction(models, sequences, max_seq_length=50):
    """Return the ensemble-averaged class probabilities for one window type.

    Args:
        models: Trained models, each exposing ``predict(X, verbose=0)``.
        sequences: List of windows (each a list of beacon vectors).
        max_seq_length: Length to which every window is zero-padded at the end.

    Returns:
        Array with one row per window: the element-wise mean of the
        per-model probability outputs.
    """
    batch = pad_sequences(sequences, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)
    member_probas = [member.predict(batch, verbose=0) for member in models]
    return np.mean(member_probas, axis=0)

def calculate_adaptive_thresholds(direction_results, verbose=False,
                                  centered_offset=0.025,
                                  high_sigma=1.0,
                                  very_high_sigma=1.5):
    """Derive the weighting thresholds from the observed confidence distribution.

    A prediction's confidence is its largest class probability. Confidences
    are collected from all 7 window types; the thresholds are then placed
    relative to those statistics instead of being fixed constants:

    * ``centered_boost`` = mean confidence of ``centered_10`` + ``centered_offset``
    * ``high_conf``      = overall mean + ``high_sigma`` * overall std
    * ``very_high_conf`` = overall mean + ``very_high_sigma`` * overall std

    Args:
        direction_results: dict with one entry per window type, each holding a
            ``'proba'`` array of shape (n_windows, n_classes).
        verbose: Print the computed statistics and thresholds when True.
        centered_offset: Amount added to the centered mean (default 0.025).
        high_sigma: Std multiplier for ``high_conf`` (default 1.0).
        very_high_sigma: Std multiplier for ``very_high_conf`` (default 1.5).

    Returns:
        dict with keys ``'centered_boost'``, ``'high_conf'``, ``'very_high_conf'``.
    """
    direction_names = ['backward_10', 'centered_10', 'forward_10',
                       'backward_15', 'forward_15',
                       'asymm_past', 'asymm_future']

    # Per-direction confidence arrays (max probability of every window).
    confidences_by_direction = {
        name: np.max(direction_results[name]['proba'], axis=1)
        for name in direction_names
    }
    all_confidences = np.concatenate(
        [confidences_by_direction[name] for name in direction_names]
    )

    overall_mean = np.mean(all_confidences)
    overall_std = np.std(all_confidences)
    centered_mean = np.mean(confidences_by_direction['centered_10'])

    # Thresholds are positioned relative to the observed distribution.
    centered_boost_threshold = centered_mean + centered_offset
    high_conf_threshold = overall_mean + high_sigma * overall_std
    very_high_conf_threshold = overall_mean + very_high_sigma * overall_std

    thresholds = {
        'centered_boost': centered_boost_threshold,
        'high_conf': high_conf_threshold,
        'very_high_conf': very_high_conf_threshold
    }

    if verbose:
        print(f"  Auto-calibrated thresholds:")
        print(f"    Overall mean confidence: {overall_mean:.3f} ± {overall_std:.3f}")
        print(f"    Centered mean confidence: {centered_mean:.3f}")
        print(f"    → Centered boost threshold: {centered_boost_threshold:.3f}")
        print(f"    → High confidence threshold: {high_conf_threshold:.3f}")
        print(f"    → Very high confidence threshold: {very_high_conf_threshold:.3f}")

    return thresholds

def combine_directional_predictions_auto_adaptive(direction_results, 
                                                  auto_thresholds,
                                                  verbose=False):
    """Merge the per-direction probabilities into one distribution per position.

    Every position (an entry of any direction's ``'indices'``) gets a weighted
    average of the probability rows of the directions that cover it. The
    weights depend on each direction's confidence (max probability) there:

    1. Some direction reaches ``very_high_conf``: weight ``conf * 2.5`` for
       those, ``conf * 1.2`` for directions at or above ``high_conf``,
       ``conf * 0.5`` for the rest.
    2. Otherwise, ``centered_10`` reaches ``centered_boost``: ``conf * 1.8``
       for centered, ``conf * 0.8`` for the others.
    3. Otherwise, mean confidence below ``high_conf - 0.10``: equal weights.
    4. Otherwise: weight equals confidence.

    Row lookups use one dict per direction (first occurrence of a position
    wins), replacing the former ``list.index`` scans that made this function
    quadratic in the number of positions.

    Args:
        direction_results: dict per direction with ``'proba'`` (n, n_classes)
            and ``'indices'`` (list of hashable, sortable position keys).
        auto_thresholds: dict with ``'centered_boost'``, ``'high_conf'`` and
            ``'very_high_conf'``.
        verbose: Accepted for interface compatibility; not used.

    Returns:
        ``(combined_proba, position_map)``: array (n_positions, n_classes) with
        rows in sorted-position order, and a dict position -> row index.
    """
    centered_boost_threshold = auto_thresholds['centered_boost']
    high_conf_threshold = auto_thresholds['high_conf']
    very_high_conf_threshold = auto_thresholds['very_high_conf']

    direction_names = ['backward_10', 'centered_10', 'forward_10', 
                      'backward_15', 'forward_15', 
                      'asymm_past', 'asymm_future']

    # position -> row number inside each direction's proba array.
    row_lookup = {}
    direction_confidences = {}
    for direction_name in direction_names:
        lookup = {}
        for row, pos in enumerate(direction_results[direction_name]['indices']):
            lookup.setdefault(pos, row)  # keep the first occurrence, like list.index
        row_lookup[direction_name] = lookup
        direction_confidences[direction_name] = np.max(
            direction_results[direction_name]['proba'], axis=1
        )

    all_positions = sorted(set().union(*row_lookup.values()))
    position_map = {pos: idx for idx, pos in enumerate(all_positions)}

    n_classes = direction_results['backward_10']['proba'].shape[1]
    combined_proba = np.zeros((len(all_positions), n_classes))

    for pos_idx, pos in enumerate(all_positions):
        # Probability row and confidence of every direction covering this position.
        position_directions = {}
        position_confs = {}
        for direction_name in direction_names:
            row = row_lookup[direction_name].get(pos)
            if row is None:
                continue
            position_directions[direction_name] = direction_results[direction_name]['proba'][row]
            position_confs[direction_name] = direction_confidences[direction_name][row]

        max_conf = max(position_confs.values())
        avg_conf = np.mean(list(position_confs.values()))
        centered_conf = position_confs.get('centered_10', 0)

        weights = {}
        if max_conf >= very_high_conf_threshold:
            # Strategy 1: trust the very confident directions most.
            for direction_name, conf in position_confs.items():
                if conf >= very_high_conf_threshold:
                    weights[direction_name] = conf * 2.5
                elif conf >= high_conf_threshold:
                    weights[direction_name] = conf * 1.2
                else:
                    weights[direction_name] = conf * 0.5
        elif centered_conf >= centered_boost_threshold:
            # Strategy 2: boost the centered window when it is confident.
            for direction_name, conf in position_confs.items():
                factor = 1.8 if direction_name == 'centered_10' else 0.8
                weights[direction_name] = conf * factor
        elif avg_conf < (high_conf_threshold - 0.10):
            # Strategy 3: everything is uncertain, so use a plain average.
            for direction_name in position_confs:
                weights[direction_name] = 1.0
        else:
            # Strategy 4: ordinary confidence weighting.
            weights = dict(position_confs)

        total_weight = sum(weights.values())
        for direction_name, weight in weights.items():
            combined_proba[pos_idx] += position_directions[direction_name] * weight

        if total_weight > 0:
            combined_proba[pos_idx] /= total_weight

    return combined_proba, position_map

# Status message (emoji repaired from mis-encoded text).
print("✅ Auto-adaptive threshold calculation and combination defined")

In [None]:
def apply_confidence_weighted_voting(predictions_proba, vote_window=5):
    """Smooth predictions with a confidence-weighted vote over neighbouring rows.

    For row ``i`` the rows within ``vote_window // 2`` positions on either side
    (clipped at the array ends) each contribute their probability vector
    multiplied by their own maximum probability; the class with the largest
    accumulated score is the prediction for ``i``.

    Args:
        predictions_proba: Array of shape (n_samples, n_classes).
        vote_window: Nominal window width.

    Returns:
        Integer array of length n_samples with the voted class ids.
    """
    n_samples, n_classes = predictions_proba.shape
    reach = vote_window // 2
    voted = np.zeros(n_samples, dtype=int)

    for center in range(n_samples):
        lo = max(0, center - reach)
        hi = min(n_samples, center + reach + 1)
        neighbourhood = predictions_proba[lo:hi]

        tally = np.zeros(n_classes)
        for row, confidence in zip(neighbourhood, np.max(neighbourhood, axis=1)):
            tally += row * confidence

        voted[center] = np.argmax(tally)

    return voted

# Status message (emoji repaired from mis-encoded text).
print("✅ Temporal voting function defined")

# Complete Pipeline with Auto-Calibrating Thresholds

In [None]:
def run_auto_adaptive_pipeline(train_df, test_df, seed, n_ensemble=5, 
                               vote_window=5,
                               verbose=False):
    """Run Experiment 5 end to end for one fold and one seed.

    Pipeline:
    1. Train the ensemble on ``train_df``.
    2. Build the 7 directional window sets from ``test_df``.
    3. Predict class probabilities for every window set.
    4. Auto-calibrate the weighting thresholds from those confidences.
    5. Combine the directions per position with the adaptive weights.
    6. Apply confidence-weighted temporal voting.
    7. Score against the labels of the covered positions.

    Args:
        train_df: Raw training readings of the fold.
        test_df: Raw test readings of the fold.
        seed: Seed for this run; also the base seed of the ensemble.
        n_ensemble: Number of ensemble members.
        vote_window: Window width for temporal voting.
        verbose: Print progress when True.

    Returns:
        dict with ``'seed'``, ``'macro_f1'``, ``'per_class_f1'`` (room -> F1)
        and ``'auto_thresholds'``.
    """
    tf.keras.backend.clear_session()
    set_seeds(seed)

    if verbose:
        print(f"\n  Seed {seed}: Training ensemble...")

    # 1. Train Ensemble
    models, label_encoder = train_ensemble_models(
        train_df,
        n_models=n_ensemble,
        base_seed=seed,
        verbose=verbose
    )

    if verbose:
        print("  Creating multi-directional windows...")

    # 2. Create Windows
    test_vectors = create_beacon_count_vectors(test_df)
    direction_windows = create_extended_multidirectional_windows(test_vectors)

    if verbose:
        print("  Getting directional predictions...")

    # 3. Get Predictions
    direction_names = ['backward_10', 'centered_10', 'forward_10', 
                      'backward_15', 'forward_15', 
                      'asymm_past', 'asymm_future']
    direction_results = {}
    for direction_name in direction_names:
        windows = direction_windows[direction_name]
        direction_results[direction_name] = {
            'proba': predict_single_direction(models, windows['sequences'], max_seq_length=50),
            'indices': windows['indices'],
            'labels': windows['labels']
        }

    if verbose:
        print("  Auto-calibrating thresholds from confidence distributions...")

    # 4. Auto-calculate thresholds
    auto_thresholds = calculate_adaptive_thresholds(direction_results, verbose=verbose)

    if verbose:
        print("  Combining directions with auto-adaptive weighting...")

    # 5. Adaptive Combination with auto-thresholds
    combined_proba, position_map = combine_directional_predictions_auto_adaptive(
        direction_results,
        auto_thresholds,
        verbose=verbose
    )

    # Ground truth: the label of each position, taken from the first direction
    # (in direction_names order) that covers it. A dict replaces the previous
    # per-position list scans, which were quadratic in the number of positions.
    label_at = {}
    for direction_name in direction_names:
        covered = direction_results[direction_name]
        for pos, label in zip(covered['indices'], covered['labels']):
            label_at.setdefault(pos, label)
    # Same ordering as the rows of combined_proba (sorted positions).
    y_test = [label_at[pos] for pos in sorted(position_map)]

    if verbose:
        print(f"  Applying temporal voting (window={vote_window})...")

    # 6. Temporal Voting
    y_pred_voted_encoded = apply_confidence_weighted_voting(combined_proba, vote_window=vote_window)
    y_pred = label_encoder.inverse_transform(y_pred_voted_encoded)

    # 7. Evaluation
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    per_class_f1 = f1_score(y_test, y_pred, average=None, labels=label_encoder.classes_, zero_division=0)

    if verbose:
        print(f"  ✓ Macro F1: {macro_f1:.4f}")

    return {
        'seed': seed,
        'macro_f1': macro_f1,
        'per_class_f1': dict(zip(label_encoder.classes_, per_class_f1)),
        'auto_thresholds': auto_thresholds
    }

# Status message (emoji repaired from mis-encoded text).
print("✅ Complete auto-adaptive pipeline defined")

In [None]:
# Check GPU: list the GPU devices visible to TensorFlow and report how many there are.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))
print(gpu_devices)

# Run Experiment 5: Auto-Calibrating Adaptive Thresholds

## Quick Test on Fold 1

In [None]:
# QUICK TEST: Fold 1, 3 seeds
# Hard-coded reference scores (mean, std) printed for comparison; presumably
# the Fold 1 results of the earlier experiments - verify against those runs.
EXP2_FOLD1_F1, EXP2_FOLD1_STD = 0.4896, 0.0151
EXP4_FOLD1_F1, EXP4_FOLD1_STD = 0.5094, 0.0115

print("="*80)
print("EXPERIMENT 5: AUTO-CALIBRATING ADAPTIVE THRESHOLDS")
print("Testing on Fold 1 with 3 seeds")
print("="*80)

seeds = [42, 123, 456]
train_df, test_df = train_df_1, test_df_1

results_auto = []

for seed in seeds:
    print(f"\nRunning seed {seed}...")
    result = run_auto_adaptive_pipeline(
        train_df, test_df, 
        seed=seed,
        n_ensemble=5,
        vote_window=5,
        verbose=True
    )
    results_auto.append(result)

macro_f1_scores = [r['macro_f1'] for r in results_auto]
# Computed once and reused by every report line below.
mean_f1 = np.mean(macro_f1_scores)
std_f1 = np.std(macro_f1_scores)

print("\n" + "="*80)
print("EXPERIMENT 5 RESULTS (Fold 1, 3 seeds)")
print("="*80)
print(f"\nAuto-Calibrating Adaptive:")
print(f"  Mean Macro F1: {mean_f1:.4f} ± {std_f1:.4f}")
print(f"  Individual runs:")
for result in results_auto:
    thresholds = result['auto_thresholds']
    print(f"    Seed {result['seed']}: {result['macro_f1']:.4f}")
    print(f"      Auto-thresholds: centered={thresholds['centered_boost']:.3f}, "
          f"high={thresholds['high_conf']:.3f}, "
          f"very_high={thresholds['very_high_conf']:.3f}")

print("\n" + "="*80)
print("COMPARISON:")
print("="*80)
print(f"Exp 2 (standard weighting):       {EXP2_FOLD1_F1:.4f} ± {EXP2_FOLD1_STD:.4f}")
print(f"Exp 4 (fixed thresholds):         {EXP4_FOLD1_F1:.4f} ± {EXP4_FOLD1_STD:.4f}")
print(f"Exp 5 (auto-calibrated):          {mean_f1:.4f} ± {std_f1:.4f}")
print(f"\nChange from Exp 4: {mean_f1 - EXP4_FOLD1_F1:+.4f}")

if mean_f1 >= EXP4_FOLD1_F1:
    print("\n✅ Auto-calibration maintains or improves Fold 1 performance!")
else:
    print("\n📊 Auto-calibration slightly different but should generalize better to other folds")

# Full 4-Fold Cross-Validation

In [None]:
# FULL EXPERIMENT: run the auto-adaptive pipeline for every fold and seed.
print("="*80)
print("FULL 4-FOLD CROSS-VALIDATION - EXPERIMENT 5")
print("="*80)

seeds = [42, 123, 456]
folds = {
    1: (train_df_1, test_df_1),
    2: (train_df_2, test_df_2),
    3: (train_df_3, test_df_3),
    4: (train_df_4, test_df_4)
}

# fold number -> list of result dicts (one per seed)
all_fold_results = {}

for fold_num, (train_df, test_df) in folds.items():
    print(f"\n{'='*80}")
    print(f"PROCESSING FOLD {fold_num}")
    print(f"{'='*80}\n")

    fold_results = []

    for seed in seeds:
        print(f"  Running seed {seed}...", end=" ")
        result = run_auto_adaptive_pipeline(
            train_df, test_df, 
            seed=seed,
            n_ensemble=5,
            vote_window=5,
            verbose=False
        )
        fold_results.append(result)
        print(f"Macro F1: {result['macro_f1']:.4f}")

    all_fold_results[fold_num] = fold_results

    macro_f1_scores = [r['macro_f1'] for r in fold_results]
    # Average each auto-calibrated threshold over the seeds of this fold.
    avg_thresholds = {
        key: np.mean([r['auto_thresholds'][key] for r in fold_results])
        for key in ('centered_boost', 'high_conf', 'very_high_conf')
    }
    avg_centered = avg_thresholds['centered_boost']
    avg_high = avg_thresholds['high_conf']
    avg_very_high = avg_thresholds['very_high_conf']

    print(f"\n  Fold {fold_num} Summary:")
    print(f"    Mean Macro F1: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}")
    print(f"    Auto-calibrated thresholds (averaged):")
    print(f"      Centered: {avg_centered:.3f}, High: {avg_high:.3f}, Very high: {avg_very_high:.3f}")

print("\n" + "="*80)
print("ALL FOLDS COMPLETED!")
print("="*80)

In [None]:
# Final Summary
# Hard-coded reference scores printed for comparison; presumably the overall
# macro-F1 values of the earlier experiments - verify against those runs.
BASELINE_F1 = 0.4106
EXP2_F1 = 0.4384
EXP4_F1 = 0.4392
TARGET_F1 = 0.45
exp4_folds = [0.5094, 0.4231, 0.4072, 0.4172]  # Exp 4 mean macro F1 for folds 1-4
fold_nums = [1, 2, 3, 4]

print("\n" + "="*80)
print("FINAL SUMMARY - EXPERIMENT 5 (AUTO-CALIBRATING)")
print("="*80 + "\n")

for fold_num in fold_nums:
    macro_f1_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
    print(f"Fold {fold_num}: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}")

# Pool every (fold, seed) run for the overall statistics.
all_macro_f1 = [r['macro_f1'] for fold_num in fold_nums for r in all_fold_results[fold_num]]
overall_mean = np.mean(all_macro_f1)
overall_std = np.std(all_macro_f1)

print(f"\n{'='*80}")
print(f"Overall Mean: {overall_mean:.4f} ± {overall_std:.4f}")
print(f"{'='*80}")

print("\n" + "="*80)
print("COMPLETE PROGRESSION:")
print("="*80)
print(f"Baseline: {BASELINE_F1:.4f}")
print(f"Exp 2 (7 dir, standard): {EXP2_F1:.4f}")
print(f"Exp 4 (7 dir, fixed thresholds): {EXP4_F1:.4f} (Fold 1: {exp4_folds[0]:.4f}, others: declined)")
print(f"Exp 5 (7 dir, auto-calibrated): {overall_mean:.4f}")

# Per-fold comparison with Exp 4
print(f"\n{'='*80}")
print("PER-FOLD COMPARISON (Exp 4 vs Exp 5):")
print(f"{'='*80}")
for fold_num in fold_nums:
    exp5_mean = np.mean([r['macro_f1'] for r in all_fold_results[fold_num]])
    exp4_mean = exp4_folds[fold_num - 1]
    change = exp5_mean - exp4_mean
    print(f"Fold {fold_num}: {exp5_mean:.4f} (Exp 4: {exp4_mean:.4f}, change: {change:+.4f})")

total_gain = overall_mean - BASELINE_F1
gain_from_exp2 = overall_mean - EXP2_F1
gain_from_exp4 = overall_mean - EXP4_F1
target_gap = TARGET_F1 - overall_mean

print(f"\n{'='*80}")
print(f"Total gain from baseline: {total_gain:+.4f}")
print(f"Gain from Exp 2: {gain_from_exp2:+.4f}")
print(f"Gain from Exp 4: {gain_from_exp4:+.4f}")
print(f"Gap to target (0.45): {target_gap:.4f}")
print(f"{'='*80}")

if overall_mean >= TARGET_F1:
    print("\n🎯🎯🎯 TARGET ACHIEVED! 0.45 F1 REACHED! 🎯🎯🎯")
    print("\n🎉 AUTO-CALIBRATION WORKED! All folds benefit from adaptive thresholds!")
elif gain_from_exp4 > 0.005:
    print(f"\n✅ Auto-calibration improves consistency! +{gain_from_exp4:.4f} overall")
    print(f"   Remaining gap: {target_gap:.4f}")
    if target_gap < 0.01:
        print("   SO CLOSE! Try hyperparameter tuning for final push")
elif gain_from_exp4 > -0.002:
    print("\n📊 Auto-calibration maintains performance (similar to fixed)")
    print("   But likely MORE ROBUST across different data distributions!")
else:
    print("\n⚠️  Auto-calibration needs refinement")
    print("   Try: Different threshold calculation strategies")

In [None]:
# Save results
# The report contains non-ASCII characters (±), so the encoding is set
# explicitly instead of relying on the platform default.
with open('experiment5_results.txt', 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("EXPERIMENT 5: AUTO-CALIBRATING ADAPTIVE THRESHOLDS\n")
    f.write("="*80 + "\n\n")

    f.write("Key Innovation:\n")
    f.write("-"*80 + "\n")
    f.write("Instead of fixed thresholds (Exp 4), calculate them dynamically per fold:\n")
    f.write("  - Analyze confidence distributions from all 7 directions\n")
    f.write("  - centered_boost = centered_mean + 0.025\n")
    f.write("  - high_conf = overall_mean + 1.0 * std\n")
    f.write("  - very_high_conf = overall_mean + 1.5 * std\n")
    f.write("This auto-calibrates for each fold's unique characteristics!\n\n")

    # Pool every (fold, seed) run; recomputed here so this cell does not
    # depend on variables left behind by the summary cell.
    all_macro_f1 = []
    for fold_num in [1, 2, 3, 4]:
        fold_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
        all_macro_f1.extend(fold_scores)

    f.write("OVERALL RESULTS:\n")
    f.write("-"*80 + "\n")
    f.write(f"Mean Macro F1: {np.mean(all_macro_f1):.4f} ± {np.std(all_macro_f1):.4f}\n\n")

    f.write("PROGRESSION:\n")
    f.write("-"*80 + "\n")
    f.write("Baseline: 0.4106\n")
    f.write("Exp 2: 0.4384\n")
    f.write("Exp 4: 0.4392 (fold-specific: Fold 1 great, others declined)\n")
    f.write(f"Exp 5: {np.mean(all_macro_f1):.4f} (auto-calibrated per fold)\n\n")

    f.write("Per-Fold Results:\n")
    f.write("-"*80 + "\n")
    exp4_folds = [0.5094, 0.4231, 0.4072, 0.4172]  # Exp 4 mean macro F1 for folds 1-4
    for fold_num in [1, 2, 3, 4]:
        f.write(f"\nFold {fold_num}:\n")
        fold_results = all_fold_results[fold_num]
        macro_f1_scores = [r['macro_f1'] for r in fold_results]

        for result in fold_results:
            f.write(f"  Seed {result['seed']:5d}: {result['macro_f1']:.4f}")
            f.write(f" (thresholds: {result['auto_thresholds']['centered_boost']:.3f}, ")
            f.write(f"{result['auto_thresholds']['high_conf']:.3f}, ")
            f.write(f"{result['auto_thresholds']['very_high_conf']:.3f})\n")

        exp5_mean = np.mean(macro_f1_scores)
        exp4_mean = exp4_folds[fold_num - 1]
        f.write(f"  Mean: {exp5_mean:.4f} (Exp 4: {exp4_mean:.4f}, change: {exp5_mean - exp4_mean:+.4f})\n")

print("✅ Results saved to experiment5_results.txt")