In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Setup and Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking, Input, Multiply, Permute, Reshape, Lambda
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization
from tensorflow.keras.layers import Bidirectional, GRU, Layer
from tensorflow.keras import backend as K
from tensorflow.keras import regularizers  # üÜï ADD THIS LINE
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

print("‚úì All imports successful")

‚úì All imports successful


## 🆕 Custom Attention Layer

This attention mechanism learns to focus on the most important timesteps in the sequence.
It computes attention weights for each timestep and creates a weighted representation.

In [3]:
class AttentionLayer(Layer):
    """
    Additive attention pooling over the time axis, with L2 regularization.

    For an input of shape (batch_size, timesteps, features) the layer:
    1. Projects every timestep to a scalar score: tanh(x . W + b)
    2. Pushes the scores of masked timesteps to a very negative value
    3. Normalizes the scores over the time axis with a softmax
    4. Returns the attention-weighted sum of the timesteps,
       shape (batch_size, features)

    Both W and b carry an L2 penalty of 0.01.

    Masking contract: the layer consumes the incoming timestep mask in
    `call` and, because the time axis is collapsed, emits no mask of its
    own (`compute_mask` returns None).
    """

    def __init__(self, **kwargs):
        super(AttentionLayer, self).__init__(**kwargs)
        # The mask is used inside call(); declaring support makes that
        # explicit to Keras instead of leaving the mask "unsupported".
        self.supports_masking = True

    def build(self, input_shape):
        # input_shape: (batch_size, timesteps, features)
        # One projection weight per feature -> a single score per timestep.
        self.W = self.add_weight(
            name='attention_weight',
            shape=(input_shape[-1], 1),
            initializer='glorot_uniform',
            regularizer=regularizers.l2(0.01),
            trainable=True
        )
        # Scalar bias shared by every timestep (also L2-regularized).
        self.b = self.add_weight(
            name='attention_bias',
            shape=(1,),
            initializer='zeros',
            regularizer=regularizers.l2(0.01),
            trainable=True
        )
        super(AttentionLayer, self).build(input_shape)

    def call(self, inputs, mask=None):
        # inputs shape: (batch_size, timesteps, features)

        # Attention scores: (batch_size, timesteps, 1), bounded to [-1, 1] by tanh
        attention_scores = K.tanh(K.dot(inputs, self.W) + self.b)

        # Apply mask if provided (for padded sequences)
        if mask is not None:
            # Expand mask to (batch_size, timesteps, 1) to match the scores
            mask = K.cast(mask, K.floatx())
            mask = K.expand_dims(mask, axis=-1)
            # Masked positions get a very negative score so the softmax
            # assigns them (effectively) zero weight
            attention_scores = attention_scores * mask + (1 - mask) * (-1e10)

        # Attention weights: softmax over the time axis, (batch_size, timesteps, 1)
        attention_weights = K.softmax(attention_scores, axis=1)

        # Weighted sum over time: (batch_size, features)
        context_vector = K.sum(inputs * attention_weights, axis=1)

        return context_vector

    def compute_mask(self, inputs, mask=None):
        # The output has no time axis, so a per-timestep mask must not be
        # forwarded to downstream layers.
        return None

    def compute_output_shape(self, input_shape):
        # Output shape: (batch_size, features)
        return (input_shape[0], input_shape[-1])

    def get_config(self):
        # No constructor arguments beyond the base Layer ones.
        return super(AttentionLayer, self).get_config()

print("‚úÖ Custom Attention Layer defined (with L2 regularization)")

‚úÖ Custom Attention Layer defined (with L2 regularization)


In [4]:
def load_and_filter_fold(i, base_dir='/content/drive/MyDrive/split_data'):
    """
    Load one cross-validation fold and keep only rooms present in both splits.

    Args:
        i: Fold identifier; the data is read from `<base_dir>/fold<i>/`.
        base_dir: Directory that contains the `fold<i>` sub-directories.
            Defaults to the Google Drive location used by this notebook, so
            existing calls such as `load_and_filter_fold(1)` are unchanged.

    Returns:
        (train_df, test_df): DataFrames read from `train.csv` and `test.csv`,
        restricted to rows whose 'room' value occurs in both files, each with
        a fresh 0..n-1 index.
    """
    fold_dir = os.path.join(base_dir, f'fold{i}')
    train_df = pd.read_csv(os.path.join(fold_dir, 'train.csv'))
    test_df = pd.read_csv(os.path.join(fold_dir, 'test.csv'))

    # Rooms seen in only one split cannot be both learned and evaluated,
    # so they are dropped from both sides.
    common_labels = list(set(train_df['room'].unique()) & set(test_df['room'].unique()))

    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load all 4 folds
# Each fold yields a (train, test) pair already filtered to the rooms the two
# splits share; the pairs are unpacked into the names used by later cells.
(
    (train_df_1, test_df_1),
    (train_df_2, test_df_2),
    (train_df_3, test_df_3),
    (train_df_4, test_df_4),
) = [load_and_filter_fold(fold_id) for fold_id in (1, 2, 3, 4)]

print("‚úì All folds loaded")

‚úì All folds loaded


In [5]:
def set_seeds(seed=42):
    """
    Seed NumPy, TensorFlow and Python's `random`, and export the
    determinism-related environment variables.

    Args:
        seed: Integer seed applied to every generator (default 42).

    NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    setting it here presumably only affects child processes - confirm.
    """
    for seed_fn in (np.random.seed, tf.random.set_seed, random.seed):
        seed_fn(seed)

    os.environ.update({
        'PYTHONHASHSEED': str(seed),
        'TF_DETERMINISTIC_OPS': '1',
        'TF_CUDNN_DETERMINISTIC': '1',
    })

def create_room_groups(df):
    """
    Return a timestamp-sorted copy of `df` with a 'room_group' column.

    'room_group' is a running counter that increases by one every time the
    'room' value differs from the previous row, so each uninterrupted stay in
    a room gets its own id (starting at 1). The input frame is not modified.
    """
    ordered = df.sort_values('timestamp').reset_index(drop=True)
    room_changed = ordered['room'].ne(ordered['room'].shift())
    ordered['room_group'] = room_changed.cumsum()
    return ordered

def create_beacon_count_vectors(df, num_beacons=23):
    """
    Aggregate raw readings into one beacon-frequency vector per timestamp.

    For every distinct 'timestamp' the readings are counted per 'mac address'
    (a beacon id) and divided by the total number of readings at that
    timestamp. Ids outside 1..num_beacons still count towards the total but
    have no slot in the vector, so a vector may sum to less than 1.

    Args:
        df: DataFrame with 'timestamp', 'mac address' and 'room' columns, and
            optionally 'room_group' (present for training data).
        num_beacons: Length of the output vector; beacon id k is stored at
            position k - 1. Defaults to 23.

    Returns:
        DataFrame with columns 'timestamp', 'room', 'beacon_vector' (plus
        'room_group' when the input has it), one row per timestamp in
        ascending timestamp order. 'room' and 'room_group' are taken from the
        first reading of each timestamp. Empty input yields an empty frame
        that still has these columns.
    """
    has_groups = 'room_group' in df.columns  # only training data carries groups
    columns = ['timestamp', 'room', 'beacon_vector']
    if has_groups:
        columns.append('room_group')

    vectors = []
    for _, group in df.groupby('timestamp'):
        beacon_counts = group['mac address'].value_counts()
        total_readings = len(group)

        vector = [0.0] * num_beacons
        for beacon_id, count in beacon_counts.items():
            if 1 <= beacon_id <= num_beacons:
                vector[int(beacon_id) - 1] = count / total_readings

        entry = {
            'timestamp': group['timestamp'].iloc[0],
            'room': group['room'].iloc[0],
            'beacon_vector': vector
        }

        if has_groups:
            entry['room_group'] = group['room_group'].iloc[0]

        vectors.append(entry)

    # Passing `columns` keeps the schema stable even when there are no rows.
    return pd.DataFrame(vectors, columns=columns)

def create_sequences_from_groups(vector_df, min_length=3, max_length=50):
    """
    Build training sequences in which the room never changes.

    Rows are grouped by ('room', 'room_group'); each group is ordered by
    timestamp and turned into one sequence of beacon vectors.

    Args:
        vector_df: Frame with 'room', 'room_group', 'timestamp' and
            'beacon_vector' columns.
        min_length: Groups with fewer rows than this are skipped.
        max_length: Longer groups keep only their last `max_length` rows.

    Returns:
        (sequences, labels): parallel lists - each sequence is a list of
        beacon vectors, each label is the room of that sequence.
    """
    sequences, labels = [], []

    for (room, _group_id), visit in vector_df.groupby(['room', 'room_group']):
        ordered = visit.sort_values('timestamp').reset_index(drop=False)

        if len(ordered) < min_length:
            continue  # too short to be a useful training example

        if len(ordered) > max_length:
            ordered = ordered.tail(max_length)  # keep the most recent steps

        sequences.append(ordered['beacon_vector'].tolist())
        labels.append(room)

    return sequences, labels

print("‚úÖ Basic functions defined")

‚úÖ Basic functions defined


## 🆕 Updated Model Architecture with Attention

Two attention architectures are defined below:
1. **Shallow**: a single Bi-GRU extracts features, and the attention layer (used in place of a second Bi-GRU) pools its timesteps into a context vector
2. **Deep**: two stacked Bi-GRU layers, with attention applied to the second Bi-GRU's output to create the final context vector

This allows the model to:
- Learn which parts of the 10s/15s window are most discriminative
- Focus on room entry/exit moments rather than treating all timesteps equally
- Handle noisy transition periods better

In [6]:
def build_bidirectional_gru_model_with_attention(input_shape, num_classes):
    """
    Build and compile the "shallow" attention classifier.

    Layer stack, in order:
        Masking(0.0) -> Bi-GRU(128, return_sequences=True) -> Dropout(0.3)
        -> AttentionLayer -> Dropout(0.3)
        -> Dense(64, relu) -> Dropout(0.3)
        -> Dense(32, relu) -> Dropout(0.2)
        -> Dense(num_classes, softmax)

    A single Bi-GRU is used; the attention layer collapses its per-timestep
    outputs into one context vector instead of a second recurrent layer.

    Args:
        input_shape: Shape of one sample, (timesteps, features).
        num_classes: Number of output classes.

    Returns:
        A Keras Model compiled with Adam, sparse categorical cross-entropy
        and an accuracy metric.
    """
    model_input = Input(shape=input_shape, name='input_layer')

    # Timesteps whose features are all 0.0 (e.g. post-padding) are masked.
    x = Masking(mask_value=0.0, name='masking')(model_input)

    # Recurrent feature extractor that keeps one output per timestep.
    recurrent_cell = GRU(128, return_sequences=True, name='gru_layer_1')
    x = Bidirectional(recurrent_cell, name='bidirectional_gru_1')(x)
    x = Dropout(0.3, name='dropout_1')(x)

    # Attention pooling over time, followed by its own dropout.
    x = AttentionLayer(name='attention_layer')(x)
    x = Dropout(0.3, name='dropout_after_attention')(x)

    # Classification head.
    x = Dense(64, activation='relu', name='dense_1')(x)
    x = Dropout(0.3, name='dropout_2')(x)
    x = Dense(32, activation='relu', name='dense_2')(x)
    x = Dropout(0.2, name='dropout_3')(x)
    class_probabilities = Dense(num_classes, activation='softmax', name='output_layer')(x)

    network = Model(inputs=model_input, outputs=class_probabilities, name='BiGRU_with_Attention')
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

print("‚úÖ Regularized Attention-based Bi-GRU model architecture defined")

‚úÖ Regularized Attention-based Bi-GRU model architecture defined


In [7]:
def build_bidirectional_gru_model_with_deep_attention(input_shape, num_classes):
    """
    Build and compile the "deep" attention classifier.

    Layer stack, in order:
        Masking(0.0) -> Bi-GRU(128, return_sequences=True) -> Dropout(0.3)
        -> Bi-GRU(64, return_sequences=True) -> Dropout(0.3)
        -> AttentionLayer
        -> Dense(32, relu) -> Dropout(0.2)
        -> Dense(num_classes, softmax)

    Compared with the shallow variant, both recurrent layers are kept and
    attention pools the output of the second one.

    Args:
        input_shape: Shape of one sample, (timesteps, features).
        num_classes: Number of output classes.

    Returns:
        A Keras Model compiled with Adam, sparse categorical cross-entropy
        and an accuracy metric.
    """
    model_input = Input(shape=input_shape, name='input_layer')

    # Timesteps whose features are all 0.0 (e.g. post-padding) are masked.
    x = Masking(mask_value=0.0, name='masking')(model_input)

    # First recurrent block.
    first_cell = GRU(128, return_sequences=True, name='gru_layer_1')
    x = Bidirectional(first_cell, name='bidirectional_gru_1')(x)
    x = Dropout(0.3, name='dropout_1')(x)

    # Second recurrent block, still emitting one output per timestep.
    second_cell = GRU(64, return_sequences=True, name='gru_layer_2')
    x = Bidirectional(second_cell, name='bidirectional_gru_2')(x)
    x = Dropout(0.3, name='dropout_2')(x)

    # Attention pooling over the second block's sequence output.
    x = AttentionLayer(name='attention_layer')(x)

    # Classification head.
    x = Dense(32, activation='relu', name='dense_1')(x)
    x = Dropout(0.2, name='dropout_3')(x)
    class_probabilities = Dense(num_classes, activation='softmax', name='output_layer')(x)

    network = Model(inputs=model_input, outputs=class_probabilities, name='Deep_BiGRU_with_Attention')
    network.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

print("‚úÖ Deep Attention-based Bi-GRU model architecture defined")

‚úÖ Deep Attention-based Bi-GRU model architecture defined


In [8]:
# üÜï MODEL SELECTION FLAG
# Set this to choose which attention architecture to use
# Options: 'shallow' or 'deep'
ATTENTION_MODEL_TYPE = 'shallow'  # Start with shallow, can try deep if needed

def build_model_with_attention(input_shape, num_classes, model_type='shallow'):
    """
    Dispatch to the attention architecture named by `model_type`.

    Args:
        input_shape: Input shape for the model.
        num_classes: Number of room classes.
        model_type: 'shallow' (single Bi-GRU + attention) or
            'deep' (two Bi-GRUs + attention).

    Returns:
        The compiled Keras model produced by the selected builder.

    Raises:
        ValueError: If `model_type` is neither 'shallow' nor 'deep'.
    """
    if model_type == 'shallow':
        return build_bidirectional_gru_model_with_attention(input_shape, num_classes)

    if model_type == 'deep':
        return build_bidirectional_gru_model_with_deep_attention(input_shape, num_classes)

    raise ValueError(f"Unknown model_type: {model_type}. Use 'shallow' or 'deep'.")

print(f"‚úÖ Model selection wrapper defined. Current selection: {ATTENTION_MODEL_TYPE}")

‚úÖ Model selection wrapper defined. Current selection: shallow


In [9]:
# (direction name, timesteps taken before i, timesteps taken after i).
# The window for position i is vectors[i - before : i + after + 1].
_DIRECTION_WINDOW_SPECS = (
    ('backward_10', 9, 0),    # [i-9,  i]     10 steps of history
    ('centered_10', 4, 5),    # [i-4,  i+5]   10 steps around i
    ('forward_10', 0, 9),     # [i,    i+9]   10 steps of future
    ('backward_15', 14, 0),   # [i-14, i]     15 steps of history
    ('forward_15', 0, 14),    # [i,    i+14]  15 steps of future
    ('asymm_past', 11, 3),    # [i-11, i+3]   12 past-side steps + 3 future
    ('asymm_future', 3, 11),  # [i-3,  i+11]  3 past + 12 future-side steps
)


def create_extended_multidirectional_windows(vector_df):
    """
    Create the 7 families of sliding windows used for multi-directional
    prediction.

    Rows are split by calendar day (derived from 'timestamp'), ordered by
    'timestamp' within the day, and every position i of a day produces one
    window per direction whenever the whole window fits inside that day.
    Windows never span two days.

    The input frame is left untouched (no helper columns are added to it).

    Args:
        vector_df: Frame with 'timestamp', 'room' and 'beacon_vector' columns.

    Returns:
        Dict keyed by direction name ('backward_10', 'centered_10',
        'forward_10', 'backward_15', 'forward_15', 'asymm_past',
        'asymm_future'). Each value is a dict with three parallel lists:
            'sequences': the window (list of beacon vectors),
            'labels':    the room at position i,
            'indices':   (date, i) identifying the predicted position.
    """
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    frame = vector_df.assign(date=pd.to_datetime(vector_df['timestamp']).dt.date)

    results = {
        name: {'sequences': [], 'labels': [], 'indices': []}
        for name, _, _ in _DIRECTION_WINDOW_SPECS
    }

    for _, day_group in frame.groupby('date'):
        day_group = day_group.sort_values('timestamp').reset_index(drop=True)
        vectors = list(day_group['beacon_vector'])
        rooms = list(day_group['room'])
        day = day_group['date'].iloc[0]
        n = len(vectors)

        for name, before, after in _DIRECTION_WINDOW_SPECS:
            bucket = results[name]
            # Valid centres satisfy i - before >= 0 and i + after < n.
            for i in range(before, n - after):
                bucket['sequences'].append(vectors[i - before : i + after + 1])
                bucket['labels'].append(rooms[i])
                bucket['indices'].append((day, i))

    return results

print("‚úÖ Extended multi-directional window function defined (7 directions)")

‚úÖ Extended multi-directional window function defined (7 directions)


In [10]:
def train_ensemble_models(train_df, n_models=5, base_seed=42, model_type='shallow', verbose=False):
    """
    Train an ensemble of attention models on constant-room sequences.

    The training data is prepared once (room groups -> per-timestamp beacon
    vectors -> sequences of at most 50 steps, post-padded to length 50) and
    shared by all members. Member k is seeded with `base_seed + k * 1000`
    and trained for up to 30 epochs with balanced class weights, early
    stopping and learning-rate reduction, both monitoring the training loss.

    Args:
        train_df: Raw training readings.
        n_models: Number of ensemble members.
        base_seed: Seed of the first member.
        model_type: 'shallow' or 'deep' attention architecture.
        verbose: Print progress when True.

    Returns:
        models: List of trained Keras models.
        label_encoder: LabelEncoder fitted on the training room labels.
    """
    if verbose:
        print(f"  Training ensemble of {n_models} models with ATTENTION ({model_type})...")

    # Data preparation, shared by every ensemble member.
    grouped_readings = create_room_groups(train_df)
    vector_frame = create_beacon_count_vectors(grouped_readings)
    raw_sequences, raw_labels = create_sequences_from_groups(vector_frame, max_length=50)

    label_encoder = LabelEncoder()
    y_train = label_encoder.fit_transform(raw_labels)

    X_train_padded = pad_sequences(raw_sequences, maxlen=50, padding='post', dtype='float32', value=0.0)

    # 'balanced' weights, keyed by encoded class id.
    balanced_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
    class_weights = dict(enumerate(balanced_weights))

    models = []
    for member_idx in range(n_models):
        # Seeds are spaced 1000 apart: base_seed, base_seed + 1000, ...
        model_seed = base_seed + member_idx * 1000
        set_seeds(model_seed)

        if verbose:
            print(f"    Model {member_idx+1}/{n_models} (seed {model_seed})...", end=" ")

        model = build_model_with_attention(
            input_shape=(50, 23),
            num_classes=len(label_encoder.classes_),
            model_type=model_type,
        )

        # No validation split is used, so both callbacks watch the training loss.
        training_callbacks = [
            EarlyStopping(monitor='loss', patience=5, restore_best_weights=True, verbose=0),
            ReduceLROnPlateau(monitor='loss', factor=0.5, patience=3, verbose=0, min_lr=1e-6),
        ]

        model.fit(
            X_train_padded, y_train,
            epochs=30,
            batch_size=32,
            class_weight=class_weights,
            callbacks=training_callbacks,
            verbose=0,
        )
        models.append(model)

        if verbose:
            print("‚úì")

    return models, label_encoder

print("‚úÖ Ensemble training function defined (with attention support)")

‚úÖ Ensemble training function defined (with attention support)


In [11]:
def predict_single_direction(models, sequences, max_seq_length=50):
    """
    Average the class probabilities of all ensemble members for one direction.

    Args:
        models: Trained models exposing `predict`.
        sequences: List of windows (each a list of beacon vectors).
        max_seq_length: Length every window is post-padded to with 0.0.

    Returns:
        ensemble_proba: (n_samples, n_classes) mean probability matrix.
    """
    padded = pad_sequences(
        sequences, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0
    )
    member_probabilities = [member.predict(padded, verbose=0) for member in models]
    return np.mean(member_probabilities, axis=0)

def combine_directional_predictions(direction_results, method='confidence_weighted'):
    """
    Merge the per-direction probability matrices into one matrix per position.

    Every direction contributes, for each position it covers, its class
    probabilities multiplied by a weight; the sum is then divided by the
    total weight received by that position.

    Args:
        direction_results: Dict keyed by direction name. Recognised names are
            'backward_10', 'centered_10', 'forward_10', 'backward_15',
            'forward_15', 'asymm_past' and 'asymm_future'; other keys are
            ignored. Each value is a dict with
                'proba':   (n_windows, n_classes) array,
                'indices': list of hashable, sortable position ids, one per
                           row of 'proba'.
            Directions that are absent or have no windows are skipped.
        method:
            'confidence_weighted' - weight = max class probability of the row.
            'equal'               - weight = 1 for every row.
            'softmax'             - currently identical to
                                    'confidence_weighted' (no softmax step is
                                    applied); kept for compatibility.

    Returns:
        combined_proba: (n_positions, n_classes) array, rows ordered by
            sorted position id. Shape (0, 0) when no direction has windows.
        position_map: Dict mapping position id -> row index.

    Raises:
        ValueError: If `method` is not one of the three names above.
    """
    supported_methods = ('confidence_weighted', 'equal', 'softmax')
    if method not in supported_methods:
        raise ValueError(
            f"Unknown method: {method!r}. Use one of {supported_methods}."
        )

    direction_names = ['backward_10', 'centered_10', 'forward_10',
                       'backward_15', 'forward_15',
                       'asymm_past', 'asymm_future']

    # Only directions that are present and actually produced windows take part.
    active = [
        name for name in direction_names
        if name in direction_results and len(direction_results[name]['indices']) > 0
    ]

    # Sorted for a consistent row order across runs.
    all_positions = sorted(
        {pos for name in active for pos in direction_results[name]['indices']}
    )
    position_map = {pos: idx for idx, pos in enumerate(all_positions)}
    n_positions = len(all_positions)

    if n_positions == 0:
        return np.zeros((0, 0)), position_map

    # Number of classes comes from the first direction that has data.
    n_classes = np.asarray(direction_results[active[0]]['proba']).shape[1]

    combined_proba = np.zeros((n_positions, n_classes))
    total_weight = np.zeros(n_positions)

    for name in active:
        proba = np.asarray(direction_results[name]['proba'])
        indices = direction_results[name]['indices']
        rows = np.fromiter(
            (position_map[pos] for pos in indices), dtype=np.intp, count=len(indices)
        )

        if method == 'equal':
            weights = np.ones(len(indices))
        else:
            # Confidence = highest class probability of each prediction.
            weights = np.max(proba, axis=1)

        # add.at accumulates correctly even if a row index repeats.
        np.add.at(combined_proba, rows, proba * weights[:, None])
        np.add.at(total_weight, rows, weights)

    # Normalize by the total weight each position received.
    has_weight = total_weight > 0
    combined_proba[has_weight] /= total_weight[has_weight][:, None]

    return combined_proba, position_map

print("‚úÖ Multi-directional prediction functions defined (handles 7 directions)")

‚úÖ Multi-directional prediction functions defined (handles 7 directions)


In [12]:
def apply_confidence_weighted_voting(predictions_proba, vote_window=5):
    """
    Smooth predictions with a confidence-weighted vote over neighbouring rows.

    For row i the rows in [i - vote_window // 2, i + vote_window // 2]
    (clipped to the array bounds) each add their probability vector scaled by
    their own confidence (their maximum probability). The class with the
    largest accumulated score becomes the prediction for row i.

    Args:
        predictions_proba: (n_samples, n_classes) probability matrix.
        vote_window: Width of the voting window, in rows.

    Returns:
        voted_predictions: (n_samples,) integer array of class indices.
    """
    n_samples, n_classes = predictions_proba.shape
    radius = vote_window // 2

    # A row's confidence does not depend on the window, so compute it once.
    row_confidence = np.max(predictions_proba, axis=1)

    voted_predictions = np.zeros(n_samples, dtype=int)
    for center in range(n_samples):
        lo = max(0, center - radius)
        hi = min(n_samples, center + radius + 1)

        tally = np.zeros(n_classes)
        for row, confidence in zip(predictions_proba[lo:hi], row_confidence[lo:hi]):
            tally += row * confidence

        voted_predictions[center] = np.argmax(tally)

    return voted_predictions

print("‚úÖ Temporal voting function defined")

‚úÖ Temporal voting function defined


In [13]:
def run_extended_multidirectional_pipeline(train_df, test_df, seed, n_ensemble=5,
                                           vote_window=5,
                                           combination_method='confidence_weighted',
                                           model_type='shallow',
                                           verbose=False):
    """
    Train, predict and score one (fold, seed) run of the attention pipeline.

    Pipeline:
    1. Clear the Keras session, seed everything, train the ensemble.
    2. Turn the test readings into per-timestamp vectors and build the
       7 directional windows.
    3. Get averaged ensemble probabilities for each direction.
    4. Combine the directions per position.
    5. Apply confidence-weighted temporal voting.
    6. Score with macro F1 and per-class F1.

    Args:
        train_df: Raw training readings.
        test_df: Raw test readings.
        seed: Seed for this run; also the base seed of the ensemble.
        n_ensemble: Number of ensemble members.
        vote_window: Window width for temporal voting.
        combination_method: 'confidence_weighted', 'equal', or 'softmax'.
        model_type: 'shallow' or 'deep' attention architecture.
        verbose: Print progress when True.

    Returns:
        Dict with keys 'seed', 'macro_f1', 'per_class_f1' (room -> F1),
        'combination_method' and 'model_type'.
    """
    direction_names = ['backward_10', 'centered_10', 'forward_10',
                       'backward_15', 'forward_15',
                       'asymm_past', 'asymm_future']

    # 0. Clear session and set seeds
    tf.keras.backend.clear_session()
    set_seeds(seed)

    if verbose:
        print(f"\n  Seed {seed}: Training ensemble with {model_type} attention...")

    # 1. Train the ensemble
    models, label_encoder = train_ensemble_models(
        train_df,
        n_models=n_ensemble,
        base_seed=seed,
        model_type=model_type,
        verbose=verbose
    )

    if verbose:
        print("  Creating extended multi-directional windows (7 directions)...")

    # 2. Prepare test data as directional windows
    test_vectors = create_beacon_count_vectors(test_df)
    direction_windows = create_extended_multidirectional_windows(test_vectors)

    if verbose:
        for direction_name in direction_names:
            # capitalize() turns e.g. 'backward_10' into 'Backward_10'
            n_windows = len(direction_windows[direction_name]['sequences'])
            print(f"    {direction_name.capitalize()} windows: {n_windows}")
        print("  Getting directional predictions...")

    # 3. Ensemble predictions per direction
    direction_results = {}
    for direction_name in direction_names:
        if verbose:
            print(f"    Predicting {direction_name}...", end=" ")

        windows = direction_windows[direction_name]
        proba = predict_single_direction(models, windows['sequences'], max_seq_length=50)

        direction_results[direction_name] = {
            'proba': proba,
            'indices': windows['indices'],
            'labels': windows['labels']
        }

        if verbose:
            avg_conf = np.mean(np.max(proba, axis=1))
            print(f"avg confidence: {avg_conf:.3f}")

    if verbose:
        print(f"  Combining 7 directions using {combination_method}...")

    # 4. Combine directions
    combined_proba, position_map = combine_directional_predictions(
        direction_results,
        method=combination_method
    )

    # Ground truth in the same row order as combined_proba. The label of a
    # position is taken from the first direction (in direction_names order)
    # that covers it; a dict lookup avoids rescanning the index lists for
    # every position.
    label_by_position = {}
    for direction_name in direction_names:
        entry = direction_results[direction_name]
        for pos, label in zip(entry['indices'], entry['labels']):
            label_by_position.setdefault(pos, label)

    y_test = [
        label_by_position[pos]
        for pos in sorted(position_map.keys())
        if pos in label_by_position
    ]

    if verbose:
        print(f"  Applying temporal voting (window={vote_window})...")

    # 5. Temporal voting, then map class indices back to room labels
    y_pred_voted_encoded = apply_confidence_weighted_voting(combined_proba, vote_window=vote_window)
    y_pred = label_encoder.inverse_transform(y_pred_voted_encoded)

    # 6. Evaluation
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    per_class_f1 = f1_score(y_test, y_pred, average=None, labels=label_encoder.classes_, zero_division=0)

    if verbose:
        print(f"  ‚úì Macro F1: {macro_f1:.4f}")

    return {
        'seed': seed,
        'macro_f1': macro_f1,
        'per_class_f1': {label: f1 for label, f1 in zip(label_encoder.classes_, per_class_f1)},
        'combination_method': combination_method,
        'model_type': model_type
    }

print("‚úÖ Complete extended multi-directional pipeline defined (7 directions + ATTENTION)")

‚úÖ Complete extended multi-directional pipeline defined (7 directions + ATTENTION)


## 🚀 EXPERIMENT 3: Multi-Directional Windows with Attention

This experiment tests whether attention mechanisms can improve upon Experiment 2's results.

**Key Changes:**
- Replace standard Bi-GRU with attention-enhanced Bi-GRU
- Two variants available: 'shallow' (faster) and 'deep' (more parameters)
- All other components remain the same (7 directions, confidence weighting, temporal voting)

**Expected Benefits:**
- Attention helps model focus on critical timesteps (e.g., room transitions)
- Should handle noisy periods better
- May improve per-class F1 for difficult rooms

**Current Target:** 0.45 macro F1  
**Experiment 2 Result:** 0.4384 ± 0.0329  
**Gap to close:** 0.0116

In [14]:
# Check GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.config.list_physical_devices('GPU'))

Num GPUs Available:  1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [15]:
# FULL EXPERIMENT: All 4 folds, 3 seeds each, WITH ATTENTION
# Every (fold, seed) pair trains a fresh 5-model ensemble, so this cell runs
# 4 x 3 x 5 = 60 model trainings.
print("="*80)
print("FULL 4-FOLD CROSS-VALIDATION - EXPERIMENT 3 (WITH ATTENTION)")
print("="*80)
print(f"Model Type: {ATTENTION_MODEL_TYPE} attention")
print("="*80)

# Seeds evaluated per fold, and the (train, test) frames loaded earlier.
seeds = [42, 123, 456]
folds = {
    1: (train_df_1, test_df_1),
    2: (train_df_2, test_df_2),
    3: (train_df_3, test_df_3),
    4: (train_df_4, test_df_4)
}

# fold number -> list of result dicts (one per seed); read by the summary cell.
all_fold_results = {}

for fold_num, (train_df, test_df) in folds.items():
    print(f"\n{'='*80}")
    print(f"PROCESSING FOLD {fold_num}")
    print(f"{'='*80}\n")

    fold_results = []

    for seed in seeds:
        print(f"  Running seed {seed}...", end=" ")
        result = run_extended_multidirectional_pipeline(
            train_df, test_df,
            seed=seed,
            n_ensemble=5,
            vote_window=5,
            combination_method='confidence_weighted',
            model_type=ATTENTION_MODEL_TYPE,  # architecture chosen in the model-selection cell
            verbose=False
        )
        fold_results.append(result)
        print(f"Macro F1: {result['macro_f1']:.4f}")

    all_fold_results[fold_num] = fold_results

    # Per-fold statistics across seeds (np.std is the population std, ddof=0)
    macro_f1_scores = [r['macro_f1'] for r in fold_results]
    print(f"\n  Fold {fold_num} Summary:")
    print(f"    Mean Macro F1: {np.mean(macro_f1_scores):.4f} ¬± {np.std(macro_f1_scores):.4f}")
    print(f"    Min: {np.min(macro_f1_scores):.4f}, Max: {np.max(macro_f1_scores):.4f}")

print("\n" + "="*80)
print("ALL FOLDS COMPLETED!")
print("="*80)

FULL 4-FOLD CROSS-VALIDATION - EXPERIMENT 3 (WITH ATTENTION)
Model Type: shallow attention

PROCESSING FOLD 1

  Running seed 42... Macro F1: 0.5059
  Running seed 123... Macro F1: 0.4626
  Running seed 456... Macro F1: 0.5083

  Fold 1 Summary:
    Mean Macro F1: 0.4923 ¬± 0.0210
    Min: 0.4626, Max: 0.5083

PROCESSING FOLD 2

  Running seed 42... Macro F1: 0.3871
  Running seed 123... Macro F1: 0.3610
  Running seed 456... Macro F1: 0.3774

  Fold 2 Summary:
    Mean Macro F1: 0.3752 ¬± 0.0108
    Min: 0.3610, Max: 0.3871

PROCESSING FOLD 3

  Running seed 42... Macro F1: 0.5068
  Running seed 123... Macro F1: 0.3845
  Running seed 456... Macro F1: 0.4949

  Fold 3 Summary:
    Mean Macro F1: 0.4621 ¬± 0.0551
    Min: 0.3845, Max: 0.5068

PROCESSING FOLD 4

  Running seed 42... Macro F1: 0.4648
  Running seed 123... Macro F1: 0.4236
  Running seed 456... Macro F1: 0.4257

  Fold 4 Summary:
    Mean Macro F1: 0.4380 ¬± 0.0189
    Min: 0.4236, Max: 0.4648

ALL FOLDS COMPLETED!


In [16]:
# Display summary and comparison
# Prints the per-fold and pooled macro F1 of this experiment, sets the pooled
# mean against the scores quoted for the earlier experiments, and closes with
# a recommendation chosen from the size of the gain over Experiment 2.
divider = "=" * 80

print("\n" + divider)
print(f"FINAL SUMMARY - EXPERIMENT 3 (7 DIRECTIONS + {ATTENTION_MODEL_TYPE.upper()} ATTENTION)")
print(divider + "\n")

# Single pass: report each fold and pool its scores for the overall statistics.
all_macro_f1 = []
for fold_num in (1, 2, 3, 4):
    macro_f1_scores = [run['macro_f1'] for run in all_fold_results[fold_num]]
    print(f"Fold {fold_num}: {np.mean(macro_f1_scores):.4f} ¬± {np.std(macro_f1_scores):.4f}")
    all_macro_f1.extend(macro_f1_scores)

overall_mean = np.mean(all_macro_f1)
overall_std = np.std(all_macro_f1)

print("\n" + divider)
print(f"Overall Mean: {overall_mean:.4f} ¬± {overall_std:.4f}")
print(divider)

# Differences against the reference scores quoted in the progression below.
total_gain = overall_mean - 0.4106
attention_gain = overall_mean - 0.4384
target_gap = 0.45 - overall_mean

print("\n" + divider)
print("PROGRESSION:")
print(divider)
for history_line in (
    "Baseline (Approach 24 - single direction):",
    "  Overall: 0.4106 ¬± 0.0266",
    "\nExperiment 1 (3 directions):",
    "  Overall: 0.4273 ¬± 0.0312  (+0.0167 vs baseline)",
    "\nExperiment 2 (7 directions - NO attention):",
    "  Overall: 0.4384 ¬± 0.0329  (+0.0278 vs baseline)",
):
    print(history_line)
print(f"\nüÜï Experiment 3 (7 directions + {ATTENTION_MODEL_TYPE} attention):")
print(f"  Overall: {overall_mean:.4f} ¬± {overall_std:.4f}  ({attention_gain:+.4f} vs Exp2)")

print("\n" + divider)
print(f"Total gain from baseline: {total_gain:+.4f}")
print(f"üÜï Attention improvement: {attention_gain:+.4f}")
print(f"Gap to target (0.45): {target_gap:.4f}")
print(divider)

# Verdict: target reached > clear gain (> 0.005) > marginal gain > no gain.
if overall_mean >= 0.45:
    print("\nüéØüéØüéØ TARGET ACHIEVED! 0.45 F1 REACHED! üéØüéØüéØ")
    print("‚úÖ Attention mechanism successfully closed the gap!")
elif attention_gain > 0.005:
    print(f"\n‚úÖ Attention mechanism improved results by {attention_gain:+.4f}!")
    near_target = target_gap < 0.01
    print(
        f"   Almost there! Only {target_gap:.4f} away from target."
        if near_target
        else f"   Still {target_gap:.4f} away from target."
    )
    print(
        "   Recommendation: Try hyperparameter tuning (vote_window, ensemble_size)"
        if near_target
        else "   Recommendation: Try deep attention or hyperparameter tuning"
    )
elif attention_gain > 0:
    print(f"\n‚ö†Ô∏è  Attention provided minor improvement ({attention_gain:+.4f})")
    print("   Recommendation: Try deep attention or focus on hyperparameter tuning")
else:
    print(f"\n‚ùå Attention didn't improve results ({attention_gain:+.4f})")
    print("   Recommendation: Revert to Experiment 2 and focus on hyperparameter tuning")


FINAL SUMMARY - EXPERIMENT 3 (7 DIRECTIONS + SHALLOW ATTENTION)

Fold 1: 0.4923 ¬± 0.0210
Fold 2: 0.3752 ¬± 0.0108
Fold 3: 0.4621 ¬± 0.0551
Fold 4: 0.4380 ¬± 0.0189

Overall Mean: 0.4419 ¬± 0.0533

PROGRESSION:
Baseline (Approach 24 - single direction):
  Overall: 0.4106 ¬± 0.0266

Experiment 1 (3 directions):
  Overall: 0.4273 ¬± 0.0312  (+0.0167 vs baseline)

Experiment 2 (7 directions - NO attention):
  Overall: 0.4384 ¬± 0.0329  (+0.0278 vs baseline)

üÜï Experiment 3 (7 directions + shallow attention):
  Overall: 0.4419 ¬± 0.0533  (+0.0035 vs Exp2)

Total gain from baseline: +0.0313
üÜï Attention improvement: +0.0035
Gap to target (0.45): 0.0081

‚ö†Ô∏è  Attention provided minor improvement (+0.0035)
   Recommendation: Try deep attention or focus on hyperparameter tuning


In [17]:
# Save results to text file
#
# Writes a plain-text report of Experiment 3 and then prints a short
# comparison against Experiment 2. The report contains, in order: a literal
# description of the configuration, the macro-F1 statistics pooled over every
# fold/seed run, the comparison with the earlier experiments, and a per-fold
# section with per-seed scores and per-class F1 averaged across seeds.
#
# Reads from earlier cells:
#   ATTENTION_MODEL_TYPE - string naming the attention variant
#   all_fold_results     - fold number -> list of result dicts; this cell uses
#                          the keys 'seed', 'macro_f1' and 'per_class_f1'

# Reference macro-F1 values used for the comparisons below (the same figures
# quoted in the progression text of this report).
BASELINE_MACRO_F1 = 0.4106
EXP2_MACRO_F1 = 0.4384
TARGET_MACRO_F1 = 0.45

RESULTS_PATH = 'experiment3_attention_results.txt'

rule_heavy = "=" * 80
rule_light = "-" * 80

# Pool the scores before the file is opened, so a missing or malformed result
# raises here instead of leaving a freshly truncated, half-written report.
# Folds are taken from the results themselves rather than a fixed [1, 2, 3, 4].
fold_numbers = sorted(all_fold_results)
all_macro_f1 = []
for fold_num in fold_numbers:
    fold_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
    all_macro_f1.extend(fold_scores)

overall_mean = np.mean(all_macro_f1)
overall_std = np.std(all_macro_f1)

# Explicit UTF-8: the report contains non-ASCII characters, and the default
# text encoding of open() depends on the platform/locale.
with open(RESULTS_PATH, 'w', encoding='utf-8') as f:
    f.write(rule_heavy + "\n")
    f.write(f"EXPERIMENT 3: MULTI-DIRECTIONAL (7 DIRECTIONS) + {ATTENTION_MODEL_TYPE.upper()} ATTENTION\n")
    f.write(rule_heavy + "\n\n")

    # NOTE: everything in this section except the model type is literal text;
    # the ensemble size and voting window are not read from the call that ran
    # the experiment and must be kept in sync with it by hand.
    f.write("Configuration:\n")
    f.write(rule_light + "\n")
    f.write(f"Model Architecture: Bidirectional GRU with {ATTENTION_MODEL_TYPE} Attention\n")
    f.write("Directions (7 total):\n")
    f.write("  1. backward_10:  [t-9 to t] - 10s history\n")
    f.write("  2. centered_10:  [t-4 to t+5] - 10s centered\n")
    f.write("  3. forward_10:   [t to t+9] - 10s future\n")
    f.write("  4. backward_15:  [t-14 to t] - 15s history (more context)\n")
    f.write("  5. forward_15:   [t to t+14] - 15s future (earlier transition)\n")
    f.write("  6. asymm_past:   [t-11 to t+3] - heavy past bias\n")
    f.write("  7. asymm_future: [t-3 to t+11] - heavy future bias\n")
    f.write("\nCombination method: Confidence-weighted\n")
    f.write("Ensemble size: 5 models\n")
    f.write("Temporal voting window: 5 seconds\n\n")

    # Overall summary
    f.write("OVERALL RESULTS:\n")
    f.write(rule_light + "\n")
    f.write(f"Mean Macro F1: {overall_mean:.4f} ¬± {overall_std:.4f}\n")
    f.write(f"Min: {np.min(all_macro_f1):.4f}, Max: {np.max(all_macro_f1):.4f}\n\n")

    # Comparison
    f.write("PROGRESSION:\n")
    f.write(rule_light + "\n")
    f.write("Baseline (single direction backward): 0.4106 ¬± 0.0266\n")
    f.write("Experiment 1 (3 directions): 0.4273 ¬± 0.0312\n")
    f.write("Experiment 2 (7 directions - no attention): 0.4384 ¬± 0.0329\n")
    f.write(f"Experiment 3 (7 directions + {ATTENTION_MODEL_TYPE} attention): {overall_mean:.4f} ¬± {overall_std:.4f}\n\n")

    f.write(f"Gain vs Baseline: {overall_mean - BASELINE_MACRO_F1:+.4f}\n")
    f.write(f"Gain vs Experiment 2 (attention improvement): {overall_mean - EXP2_MACRO_F1:+.4f}\n")
    f.write(f"Gap to target (0.45): {TARGET_MACRO_F1 - overall_mean:.4f}\n\n")

    # Per-fold results
    for fold_num in fold_numbers:
        f.write("\n" + rule_heavy + "\n")
        f.write(f"FOLD {fold_num} RESULTS\n")
        f.write(rule_heavy + "\n\n")

        fold_results = all_fold_results[fold_num]
        macro_f1_scores = [r['macro_f1'] for r in fold_results]

        f.write("Macro F1 Scores:\n")
        f.write(rule_light + "\n")
        for result in fold_results:
            f.write(f"  Seed {result['seed']:5d}: {result['macro_f1']:.4f}\n")

        f.write("\nStatistics:\n")
        f.write(f"  Mean: {np.mean(macro_f1_scores):.4f} ¬± {np.std(macro_f1_scores):.4f}\n")
        f.write(f"  Min:  {np.min(macro_f1_scores):.4f}\n")
        f.write(f"  Max:  {np.max(macro_f1_scores):.4f}\n")

        # Per-class F1 (averaged across seeds)
        f.write("\nPer-Class F1 Scores (averaged across seeds):\n")
        f.write(rule_light + "\n")

        # Union of the class names reported by any seed of this fold
        all_classes = set()
        for result in fold_results:
            all_classes.update(result['per_class_f1'].keys())

        # A class missing from one seed's per_class_f1 counts as 0 for that seed.
        for class_name in sorted(all_classes):
            class_f1_scores = [r['per_class_f1'].get(class_name, 0) for r in fold_results]
            mean_f1 = np.mean(class_f1_scores)
            std_f1 = np.std(class_f1_scores)
            f.write(f"  {class_name:20s}: {mean_f1:.4f} ¬± {std_f1:.4f}\n")

print(f"‚úÖ Results saved to {RESULTS_PATH}")

# Also print comparison
print("\n" + rule_heavy)
print("ATTENTION MECHANISM IMPACT")
print(rule_heavy)
attention_gain = overall_mean - EXP2_MACRO_F1
print(f"Experiment 2 (no attention): {EXP2_MACRO_F1:.4f}")
print(f"Experiment 3 (with attention): {overall_mean:.4f}")
print(f"Improvement: {attention_gain:+.4f} ({attention_gain / EXP2_MACRO_F1 * 100:+.2f}%)")
print(rule_heavy)

‚úÖ Results saved to experiment3_attention_results.txt

ATTENTION MECHANISM IMPACT
Experiment 2 (no attention): 0.4384
Experiment 3 (with attention): 0.4419
Improvement: +0.0035 (+0.80%)
