In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setup and Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization
from tensorflow.keras.layers import Bidirectional, GRU
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

print("✓ All imports successful")

✓ All imports successful


In [3]:
def load_and_filter_fold(i):
    train_dir = f'/content/drive/MyDrive/split_data/fold{i}/train.csv'
    test_dir = f'/content/drive/MyDrive/split_data/fold{i}/test.csv'
    train_df = pd.read_csv(train_dir)
    test_df = pd.read_csv(test_dir)

    train_labels = list(train_df['room'].unique())
    test_labels = list(test_df['room'].unique())
    common_labels = list(set(train_labels) & set(test_labels))

    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load all 4 folds
train_df_1, test_df_1 = load_and_filter_fold(1)
train_df_2, test_df_2 = load_and_filter_fold(2)
train_df_3, test_df_3 = load_and_filter_fold(3)
train_df_4, test_df_4 = load_and_filter_fold(4)

print("✓ All folds loaded")

✓ All folds loaded


In [8]:
def set_seeds(seed=42):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

def create_room_groups(df):
    df = df.sort_values('timestamp').reset_index(drop=True)
    df['room_group'] = (df['room'] != df['room'].shift()).cumsum()
    return df

def create_beacon_count_vectors(df):
    """Aggregates readings into 1s vectors. Handles data with or without 'room_group'."""
    vectors = []
    has_groups = 'room_group' in df.columns # Check if we are in 'training' mode

    for _, group in df.groupby('timestamp'):
        beacon_counts = group['mac address'].value_counts()
        total_readings = len(group)

        vector = [0.0] * 23
        for beacon_id, count in beacon_counts.items():
            if 1 <= beacon_id <= 23:
                vector[int(beacon_id) - 1] = count / total_readings

        entry = {
            'timestamp': group['timestamp'].iloc[0],
            'room': group['room'].iloc[0],
            'beacon_vector': vector
        }

        if has_groups:
            entry['room_group'] = group['room_group'].iloc[0]

        vectors.append(entry)

    return pd.DataFrame(vectors)

def create_sequences_from_groups(vector_df, min_length=3, max_length=50):
    """Used for Training: Creates clean sequences where the room is constant."""
    sequences = []
    labels = []

    for (room, room_group), group in vector_df.groupby(['room', 'room_group']):
        group = group.sort_values('timestamp').reset_index(drop=False)
        seq_length = len(group)

        if seq_length < min_length:
            continue

        if seq_length > max_length:
            group = group.tail(max_length)

        sequence = [row['beacon_vector'] for _, row in group.iterrows()]
        sequences.append(sequence)
        labels.append(room)

    return sequences, labels

def create_sliding_windows_by_day(vector_df, window_size=10):
    """Used for Inference: Creates a sequence for every frame, respecting day boundaries."""
    sequences = []
    labels = []

    # Ensure chronological order and group by day
    vector_df['dt'] = pd.to_datetime(vector_df['timestamp'])
    vector_df['date'] = vector_df['dt'].dt.date

    for _, day_group in vector_df.groupby('date'):
        day_group = day_group.sort_values('timestamp').reset_index(drop=True)

        if len(day_group) >= window_size:
            vectors = list(day_group['beacon_vector'])
            rooms = list(day_group['room'])

            for i in range(len(vectors) - window_size + 1):
                window = vectors[i : i + window_size]
                sequences.append(window)
                # Goal: Predict the room at the final timestamp of the window
                labels.append(rooms[i + window_size - 1])

    return sequences, labels

def build_bidirectional_gru_model(input_shape, num_classes):
    """
    Bidirectional GRU Architecture
    """
    model = Sequential([
        Masking(mask_value=0.0, input_shape=input_shape),

        Bidirectional(GRU(128, return_sequences=True)),
        Dropout(0.3),

        Bidirectional(GRU(64, return_sequences=False)),
        Dropout(0.3),

        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

def create_multi_directional_windows_by_day(vector_df, window_size=10):
    """
    Creates 3 types of windows for each timestamp:
    1. Backward-looking: [t-9, t-8, ..., t-1, t] + right-padding
    2. Centered: [t-5, t-4, ..., t, ..., t+3, t+4] + right-padding
    3. Forward-looking: [t, t+1, ..., t+8, t+9] + right-padding

    All windows use RIGHT-PADDING for cuDNN compatibility.
    """
    backward_sequences = []
    centered_sequences = []
    forward_sequences = []
    labels = []
    window_types = []

    vector_df['dt'] = pd.to_datetime(vector_df['timestamp'])
    vector_df['date'] = vector_df['dt'].dt.date

    for _, day_group in vector_df.groupby('date'):
        day_group = day_group.sort_values('timestamp').reset_index(drop=True)
        vectors = list(day_group['beacon_vector'])
        rooms = list(day_group['room'])
        day_length = len(vectors)

        if day_length < window_size:
            continue

        # For each position in the day
        for i in range(day_length):
            valid_windows = []
            zero_vector = [0.0] * len(vectors[0])

            # === BACKWARD WINDOW: [t-9, ..., t] ===
            if i >= window_size - 1:
                # Full backward window available
                backward_window = vectors[i - window_size + 1 : i + 1]
            else:
                # Not enough history - take what we have and RIGHT-PAD
                backward_window = vectors[0 : i + 1]
                padding_needed = window_size - len(backward_window)
                backward_window = backward_window + [zero_vector] * padding_needed

            backward_sequences.append(backward_window)
            valid_windows.append('backward')

            # === CENTERED WINDOW: [t-5, ..., t, ..., t+4] ===
            half_window = window_size // 2
            start = max(0, i - half_window)
            end = min(day_length, i + half_window)

            centered_window = vectors[start : end]
            # Right-pad if needed
            if len(centered_window) < window_size:
                padding_needed = window_size - len(centered_window)
                centered_window = centered_window + [zero_vector] * padding_needed

            centered_sequences.append(centered_window)
            valid_windows.append('centered')

            # === FORWARD WINDOW: [t, t+1, ..., t+9] ===
            if i + window_size <= day_length:
                # Full forward window available
                forward_window = vectors[i : i + window_size]
            else:
                # Not enough future - take what we have and RIGHT-PAD
                forward_window = vectors[i : day_length]
                padding_needed = window_size - len(forward_window)
                forward_window = forward_window + [zero_vector] * padding_needed

            forward_sequences.append(forward_window)
            valid_windows.append('forward')

            labels.append(rooms[i])
            window_types.append(valid_windows)

    return backward_sequences, centered_sequences, forward_sequences, labels, window_types

print("✅ Multi-directional ensemble functions defined (RIGHT-PADDING for cuDNN)")

def ensemble_predictions_with_confidence(model, X_backward, X_centered, X_forward, window_types):
    """
    Gets predictions from all 3 window types and selects the one with highest confidence.
    Handles edge cases where certain windows are padded/invalid.
    """
    # Get probability predictions from all 3 models
    probs_backward = model.predict(X_backward, verbose=0)
    probs_centered = model.predict(X_centered, verbose=0)
    probs_forward = model.predict(X_forward, verbose=0)

    # Extract max confidence and predicted class for each
    conf_backward = np.max(probs_backward, axis=1)
    pred_backward = np.argmax(probs_backward, axis=1)

    conf_centered = np.max(probs_centered, axis=1)
    pred_centered = np.argmax(probs_centered, axis=1)

    conf_forward = np.max(probs_forward, axis=1)
    pred_forward = np.argmax(probs_forward, axis=1)

    # For each sample, choose prediction with highest confidence
    final_predictions = []

    for i in range(len(pred_backward)):
        # Compare confidences from all 3 windows
        confidences = [conf_backward[i], conf_centered[i], conf_forward[i]]
        predictions = [pred_backward[i], pred_centered[i], pred_forward[i]]

        # Choose the prediction with highest confidence
        max_conf_idx = np.argmax(confidences)
        final_predictions.append(predictions[max_conf_idx])

    return np.array(final_predictions)

print("✅ Multi-directional ensemble functions defined")
print("✅ Bidirectional GRU model function defined")

✅ Multi-directional ensemble functions defined (RIGHT-PADDING for cuDNN)
✅ Multi-directional ensemble functions defined
✅ Bidirectional GRU model function defined


In [9]:
def run_pipeline_single_seed(train_df, test_df, seed, verbose=False):
    """
    Run realistic pipeline with Multi-Directional Ensemble + Temporal Voting.
    1. Resets memory/seeds.
    2. Trains on pure segments.
    3. Infers with 3 sliding window directions (backward, centered, forward).
    4. Ensemble: Chooses prediction with highest confidence.
    5. Smooths predictions with a Majority Vote filter.
    """
    # 0. PREVENT MEMORY LEAKS & ENSURE SEEDING
    tf.keras.backend.clear_session()
    set_seeds(seed)

    # HYPERPARAMETERS
    window_size = 10     # Window size for all 3 directions
    vote_window = 5      # The smoothing neighborhood (5 seconds)
    max_seq_length = 50

    # 1. Preprocessing
    train_df = create_room_groups(train_df)
    train_vectors = create_beacon_count_vectors(train_df)
    test_vectors = create_beacon_count_vectors(test_df)

    # 2. Sequence Creation
    X_train, y_train = create_sequences_from_groups(train_vectors, max_length=max_seq_length)

    # NEW: Create multi-directional windows for test
    X_test_backward, X_test_centered, X_test_forward, y_test, window_types = \
        create_multi_directional_windows_by_day(test_vectors, window_size=window_size)

    # 3. Encoding & Padding
    label_encoder = LabelEncoder()
    label_encoder.fit(list(y_train) + list(y_test))
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    X_train_padded = pad_sequences(X_train, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)

    # Pad all 3 test window types
    X_test_backward_padded = pad_sequences(X_test_backward, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)
    X_test_centered_padded = pad_sequences(X_test_centered, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)
    X_test_forward_padded = pad_sequences(X_test_forward, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)

    # 4. Train Model with Macro F1 Optimization
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
    class_weight_dict = dict(enumerate(class_weights))

    model = build_bidirectional_gru_model(input_shape=(max_seq_length, 23), num_classes=len(label_encoder.classes_))

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=0)
    ]

    # Use backward window for validation (standard approach)
    model.fit(
        X_train_padded, y_train_encoded,
        validation_data=(X_test_backward_padded, y_test_encoded),
        epochs=100, batch_size=32,
        class_weight=class_weight_dict,
        callbacks=callbacks, verbose=0
    )

    # 5. MULTI-DIRECTIONAL ENSEMBLE INFERENCE
    y_pred_raw_encoded = ensemble_predictions_with_confidence(
        model,
        X_test_backward_padded,
        X_test_centered_padded,
        X_test_forward_padded,
        window_types
    )

    # 6. TEMPORAL VOTING (Smoothing)
    def apply_temporal_voting(preds, v_window):
        """Applies a majority vote filter to smooth room predictions."""
        smoothed = []
        for i in range(len(preds)):
            start = max(0, i - v_window // 2)
            end = min(len(preds), i + v_window // 2 + 1)
            neighborhood = preds[start:end]
            smoothed.append(np.bincount(neighborhood).argmax())
        return np.array(smoothed)

    y_pred_voted_encoded = apply_temporal_voting(y_pred_raw_encoded, vote_window)
    y_pred = label_encoder.inverse_transform(y_pred_voted_encoded)

    # 7. Final Evaluation
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    per_class_f1 = f1_score(y_test, y_pred, average=None, labels=label_encoder.classes_, zero_division=0)

    return {
        'seed': seed,
        'macro_f1': macro_f1,
        'per_class_f1': {label: f1 for label, f1 in zip(label_encoder.classes_, per_class_f1)}
    }

print("✓ Pipeline updated: Now uses Multi-Directional Ensemble (Backward + Centered + Forward)")

✓ Pipeline updated: Now uses Multi-Directional Ensemble (Backward + Centered + Forward)


In [10]:
# Check GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.config.list_physical_devices('GPU'))

Num GPUs Available:  1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [11]:
# Run 10 seeds for each of 4 folds
seeds = [42, 123, 456, 789, 2024, 3141, 5926, 8888, 1337, 9999]
folds = {
    1: (train_df_1, test_df_1),
    2: (train_df_2, test_df_2),
    3: (train_df_3, test_df_3),
    4: (train_df_4, test_df_4)
}

all_fold_results = {}

for fold_num, (train_df, test_df) in folds.items():
    print(f"\n{'='*80}")
    print(f"PROCESSING FOLD {fold_num}")
    print(f"{'='*80}\n")

    fold_results = []

    for seed in seeds:
        print(f"  Running seed {seed}...", end=" ")
        result = run_pipeline_single_seed(train_df, test_df, seed, verbose=False)
        fold_results.append(result)
        print(f"Macro F1: {result['macro_f1']:.4f}")

    all_fold_results[fold_num] = fold_results

    # Calculate fold statistics
    macro_f1_scores = [r['macro_f1'] for r in fold_results]
    print(f"\n  Fold {fold_num} Summary:")
    print(f"    Mean Macro F1: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}")
    print(f"    Min: {np.min(macro_f1_scores):.4f}, Max: {np.max(macro_f1_scores):.4f}")

print("\n" + "="*80)
print("ALL FOLDS COMPLETED!")
print("="*80)


PROCESSING FOLD 1

  Running seed 42... Macro F1: 0.3051
  Running seed 123... Macro F1: 0.4499
  Running seed 456... Macro F1: 0.3712
  Running seed 789... Macro F1: 0.5420
  Running seed 2024... Macro F1: 0.4566
  Running seed 3141... Macro F1: 0.3650
  Running seed 5926... Macro F1: 0.5293
  Running seed 8888... Macro F1: 0.4271
  Running seed 1337... Macro F1: 0.3962
  Running seed 9999... Macro F1: 0.4287

  Fold 1 Summary:
    Mean Macro F1: 0.4271 ± 0.0692
    Min: 0.3051, Max: 0.5420

PROCESSING FOLD 2

  Running seed 42... Macro F1: 0.4110
  Running seed 123... Macro F1: 0.3425
  Running seed 456... Macro F1: 0.4468
  Running seed 789... Macro F1: 0.3036
  Running seed 2024... Macro F1: 0.4156
  Running seed 3141... Macro F1: 0.3752
  Running seed 5926... Macro F1: 0.3789
  Running seed 8888... Macro F1: 0.3647
  Running seed 1337... Macro F1: 0.3998
  Running seed 9999... Macro F1: 0.3487

  Fold 2 Summary:
    Mean Macro F1: 0.3787 ± 0.0394
    Min: 0.3036, Max: 0.4468

PRO

In [12]:
# Save results to text file
with open('4fold_10seed_results.txt', 'w') as f:
    f.write("="*80 + "\n")
    f.write("4-FOLD CROSS-VALIDATION WITH 10 SEEDS PER FOLD\n")
    f.write("="*80 + "\n\n")

    # Overall summary
    all_macro_f1 = []
    for fold_num in [1, 2, 3, 4]:
        fold_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
        all_macro_f1.extend(fold_scores)

    f.write("OVERALL RESULTS (40 runs total):\n")
    f.write("-"*80 + "\n")
    f.write(f"Mean Macro F1: {np.mean(all_macro_f1):.4f} ± {np.std(all_macro_f1):.4f}\n")
    f.write(f"Min: {np.min(all_macro_f1):.4f}, Max: {np.max(all_macro_f1):.4f}\n\n")

    # Per-fold results
    for fold_num in [1, 2, 3, 4]:
        f.write(f"\n{'='*80}\n")
        f.write(f"FOLD {fold_num} RESULTS\n")
        f.write(f"{'='*80}\n\n")

        fold_results = all_fold_results[fold_num]
        macro_f1_scores = [r['macro_f1'] for r in fold_results]

        f.write(f"Macro F1 Scores (10 seeds):\n")
        f.write("-"*80 + "\n")
        for i, result in enumerate(fold_results):
            f.write(f"  Seed {result['seed']:5d}: {result['macro_f1']:.4f}\n")

        f.write(f"\nStatistics:\n")
        f.write(f"  Mean: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}\n")
        f.write(f"  Min:  {np.min(macro_f1_scores):.4f}\n")
        f.write(f"  Max:  {np.max(macro_f1_scores):.4f}\n")

        # Per-class F1 (averaged across 10 seeds)
        f.write(f"\nPer-Class F1 Scores (averaged across 10 seeds):\n")
        f.write("-"*80 + "\n")

        # Collect all class names
        all_classes = set()
        for result in fold_results:
            all_classes.update(result['per_class_f1'].keys())

        # Average per-class F1 across seeds
        for class_name in sorted(all_classes):
            class_f1_scores = [r['per_class_f1'].get(class_name, 0) for r in fold_results]
            mean_f1 = np.mean(class_f1_scores)
            std_f1 = np.std(class_f1_scores)
            f.write(f"  {class_name:20s}: {mean_f1:.4f} ± {std_f1:.4f}\n")

print("✅ Results saved to 4fold_10seed_results.txt")

✅ Results saved to 4fold_10seed_results.txt


In [13]:
# Display summary
print("\n" + "="*80)
print("SUMMARY - 4 FOLDS × 10 SEEDS = 40 TOTAL RUNS")
print("="*80 + "\n")

for fold_num in [1, 2, 3, 4]:
    macro_f1_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
    print(f"Fold {fold_num}: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}")

all_macro_f1 = []
for fold_num in [1, 2, 3, 4]:
    all_macro_f1.extend([r['macro_f1'] for r in all_fold_results[fold_num]])

print(f"\n{'='*80}")
print(f"Overall Mean: {np.mean(all_macro_f1):.4f} ± {np.std(all_macro_f1):.4f}")
print(f"{'='*80}")


SUMMARY - 4 FOLDS × 10 SEEDS = 40 TOTAL RUNS

Fold 1: 0.4271 ± 0.0692
Fold 2: 0.3787 ± 0.0394
Fold 3: 0.3670 ± 0.0685
Fold 4: 0.3718 ± 0.0439

Overall Mean: 0.3862 ± 0.0618
