In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Setup and Imports
import pandas as pd
import numpy as np
import tensorflow as tf
import random
import os
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Masking
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.layers import Conv1D, MaxPooling1D, BatchNormalization
from tensorflow.keras.layers import Bidirectional, GRU
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings('ignore')

print("âœ“ All imports successful")

âœ“ All imports successful


In [3]:
def load_and_filter_fold(i):
    train_dir = f'/content/drive/MyDrive/split_data/fold{i}/train.csv'
    test_dir = f'/content/drive/MyDrive/split_data/fold{i}/test.csv'
    train_df = pd.read_csv(train_dir)
    test_df = pd.read_csv(test_dir)

    train_labels = list(train_df['room'].unique())
    test_labels = list(test_df['room'].unique())
    common_labels = list(set(train_labels) & set(test_labels))

    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load all 4 folds
train_df_1, test_df_1 = load_and_filter_fold(1)
train_df_2, test_df_2 = load_and_filter_fold(2)
train_df_3, test_df_3 = load_and_filter_fold(3)
train_df_4, test_df_4 = load_and_filter_fold(4)

print("âœ“ All folds loaded")

âœ“ All folds loaded


In [4]:
def set_seeds(seed=42):
    np.random.seed(seed)
    tf.random.set_seed(seed)
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    os.environ['TF_DETERMINISTIC_OPS'] = '1'
    os.environ['TF_CUDNN_DETERMINISTIC'] = '1'

def create_room_groups(df):
    df = df.sort_values('timestamp').reset_index(drop=True)
    df['room_group'] = (df['room'] != df['room'].shift()).cumsum()
    return df

def create_beacon_count_vectors(df):
    """
    Enhanced version that includes temporal features.
    Aggregates readings into 1s vectors with:
    - 23 beacon percentage features
    - 1 time_delta feature (seconds since last reading)
    - 1 time_of_day feature (normalized hour)
    Total: 25 features per timestamp
    """
    vectors = []
    has_groups = 'room_group' in df.columns

    # Ensure chronological order
    df = df.sort_values('timestamp').reset_index(drop=True)
    df['dt'] = pd.to_datetime(df['timestamp'])

    prev_timestamp = None

    for timestamp, group in df.groupby('timestamp', sort=False):
        # Calculate time delta
        current_timestamp = group['dt'].iloc[0]
        if prev_timestamp is not None:
            time_delta = (current_timestamp - prev_timestamp).total_seconds()
            # Clip to reasonable range (max 60 seconds)
            time_delta = min(time_delta, 60.0)
        else:
            time_delta = 1.0  # Default for first reading

        prev_timestamp = current_timestamp

        # Original beacon features
        beacon_counts = group['mac address'].value_counts()
        total_readings = len(group)

        percentage_vector = [0.0] * 23
        for beacon_id, count in beacon_counts.items():
            if 1 <= beacon_id <= 23:
                percentage_vector[int(beacon_id) - 1] = count / total_readings

        # NEW: Add temporal features
        time_delta_normalized = min(time_delta / 10.0, 1.0)  # Normalize: 10s = 1.0
        time_of_day = current_timestamp.hour / 24.0  # Hour of day [0, 1]

        # Combine: beacon features (23) + temporal features (2) = 25 total
        combined_vector = percentage_vector + [time_delta_normalized, time_of_day]

        entry = {
            'timestamp': group['timestamp'].iloc[0],
            'room': group['room'].iloc[0],
            'beacon_vector': combined_vector
        }

        if has_groups:
            entry['room_group'] = group['room_group'].iloc[0]

        vectors.append(entry)

    return pd.DataFrame(vectors)

print("âœ… Beacon vector creation updated with temporal features (25 features)")

def create_sequences_from_groups(vector_df, min_length=3, max_length=50):
    """
    Creates TWO types of training sequences:
    1. Pure sequences (original approach): all same room
    2. Transition sequences (NEW): includes room changes with temporal gaps

    This helps the model learn to handle boundaries during inference.
    """
    pure_sequences = []
    pure_labels = []

    transition_sequences = []
    transition_labels = []

    # Part 1: Create pure sequences (original)
    for (room, room_group), group in vector_df.groupby(['room', 'room_group']):
        group = group.sort_values('timestamp').reset_index(drop=True)
        seq_length = len(group)

        if seq_length < min_length:
            continue

        if seq_length > max_length:
            group = group.tail(max_length)

        sequence = [row['beacon_vector'] for _, row in group.iterrows()]
        pure_sequences.append(sequence)
        pure_labels.append(room)

    # Part 2: Create transition sequences (NEW!)
    # Collect all room groups with metadata
    all_groups = []
    for (room, room_group), group in vector_df.groupby(['room', 'room_group']):
        group = group.sort_values('timestamp').reset_index(drop=True)
        if len(group) >= min_length:
            all_groups.append({
                'room': room,
                'group': group,
                'start_time': pd.to_datetime(group['timestamp'].iloc[0]),
                'end_time': pd.to_datetime(group['timestamp'].iloc[-1])
            })

    # Sort by time
    all_groups.sort(key=lambda x: x['start_time'])

    # Create transition sequences between consecutive room visits
    for i in range(len(all_groups) - 1):
        current = all_groups[i]
        next_room = all_groups[i + 1]

        # Skip if same room (shouldn't happen, but safety check)
        if current['room'] == next_room['room']:
            continue

        # Check temporal proximity (within 30 seconds)
        time_gap = (next_room['start_time'] - current['end_time']).total_seconds()

        if time_gap < 30:  # Only create transition if rooms are temporally close
            # Take last 5-7 timesteps of current room
            tail_size = min(7, len(current['group']))
            current_tail = current['group'].tail(tail_size)

            # Take first 3-5 timesteps of next room
            head_size = min(5, len(next_room['group']))
            next_head = next_room['group'].head(head_size)

            # Combine them
            transition_data = pd.concat([current_tail, next_head])

            # Ensure we don't exceed max_length
            if len(transition_data) > max_length:
                transition_data = transition_data.tail(max_length)

            # Only add if we have enough data
            if len(transition_data) >= min_length:
                transition_seq = [row['beacon_vector'] for _, row in transition_data.iterrows()]

                # Label is the room we're transitioning TO (destination)
                transition_sequences.append(transition_seq)
                transition_labels.append(next_room['room'])

    # Combine pure and transition sequences
    all_sequences = pure_sequences + transition_sequences
    all_labels = pure_labels + transition_labels

    print(f"ðŸ“Š Training data created:")
    print(f"   Pure sequences:       {len(pure_sequences):5d} ({100*len(pure_sequences)/len(all_sequences):5.1f}%)")
    print(f"   Transition sequences: {len(transition_sequences):5d} ({100*len(transition_sequences)/len(all_sequences):5.1f}%)")
    print(f"   Total:                {len(all_sequences):5d}")

    return all_sequences, all_labels

print("âœ… Sequence creation updated with transition sequences")

def create_sliding_windows_by_day(vector_df, window_size=10):
    """Used for Inference: Creates a sequence for every frame, respecting day boundaries."""
    sequences = []
    labels = []

    # Ensure chronological order and group by day
    vector_df['dt'] = pd.to_datetime(vector_df['timestamp'])
    vector_df['date'] = vector_df['dt'].dt.date

    for _, day_group in vector_df.groupby('date'):
        day_group = day_group.sort_values('timestamp').reset_index(drop=True)

        if len(day_group) >= window_size:
            vectors = list(day_group['beacon_vector'])
            rooms = list(day_group['room'])

            for i in range(len(vectors) - window_size + 1):
                window = vectors[i : i + window_size]
                sequences.append(window)
                # Goal: Predict the room at the final timestamp of the window
                labels.append(rooms[i + window_size - 1])

    return sequences, labels

def build_bidirectional_gru_model(input_shape, num_classes):
    """
    Bidirectional GRU Architecture
    NOTE: input_shape is now (max_seq_length, 25) instead of (max_seq_length, 23)
    Features: 23 beacons + time_delta + time_of_day = 25 total
    """
    model = Sequential([
        Masking(mask_value=0.0, input_shape=input_shape),

        Bidirectional(GRU(128, return_sequences=True)),
        Dropout(0.3),

        Bidirectional(GRU(64, return_sequences=False)),
        Dropout(0.3),

        Dense(32, activation='relu'),
        Dropout(0.2),
        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )
    return model

print("âœ… Bidirectional GRU model function defined (supports 25 features)")

âœ… Beacon vector creation updated with temporal features (25 features)
âœ… Sequence creation updated with transition sequences
âœ… Bidirectional GRU model function defined (supports 25 features)


In [5]:
def run_pipeline_single_seed(train_df, test_df, seed, verbose=False):
    """
    Run realistic pipeline with Temporal Features + Transition Training + Temporal Voting.
    1. Resets memory/seeds.
    2. Trains on pure sequences + transition sequences.
    3. Infers with sliding window (now with temporal features).
    4. Smooths predictions with a Majority Vote filter.
    """
    # 0. PREVENT MEMORY LEAKS & ENSURE SEEDING
    tf.keras.backend.clear_session()
    set_seeds(seed)

    # HYPERPARAMETERS
    window_size = 10     # Your sliding window size
    vote_window = 5      # The smoothing neighborhood (5 seconds)
    max_seq_length = 50

    # 1. Preprocessing
    train_df = create_room_groups(train_df)
    train_vectors = create_beacon_count_vectors(train_df)  # Now returns 25 features
    test_vectors = create_beacon_count_vectors(test_df)    # Now returns 25 features

    # 2. Sequence Creation (now includes transition sequences!)
    X_train, y_train = create_sequences_from_groups(train_vectors, max_length=max_seq_length)
    X_test, y_test = create_sliding_windows_by_day(test_vectors, window_size=window_size)

    # 3. Encoding & Padding
    label_encoder = LabelEncoder()
    label_encoder.fit(list(y_train) + list(y_test))
    y_train_encoded = label_encoder.transform(y_train)
    y_test_encoded = label_encoder.transform(y_test)

    X_train_padded = pad_sequences(X_train, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)
    X_test_padded = pad_sequences(X_test, maxlen=max_seq_length, dtype='float32', padding='post', value=0.0)

    # 4. Train Model with Macro F1 Optimization
    class_weights = compute_class_weight('balanced', classes=np.unique(y_train_encoded), y=y_train_encoded)
    class_weight_dict = dict(enumerate(class_weights))

    # UPDATED: Now using 25 features instead of 23
    model = build_bidirectional_gru_model(input_shape=(max_seq_length, 25), num_classes=len(label_encoder.classes_))

    callbacks = [
        EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True, verbose=0),
        ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-6, verbose=0)
    ]

    model.fit(
        X_train_padded, y_train_encoded,
        validation_data=(X_test_padded, y_test_encoded),
        epochs=100, batch_size=32,
        class_weight=class_weight_dict,
        callbacks=callbacks, verbose=0
    )

    # 5. INFERENCE
    y_pred_probs = model.predict(X_test_padded, verbose=0)
    y_pred_raw_encoded = np.argmax(y_pred_probs, axis=1)

    # 6. TEMPORAL VOTING (Smoothing)
    def apply_temporal_voting(preds, v_window):
        """Applies a majority vote filter to smooth room predictions."""
        smoothed = []
        for i in range(len(preds)):
            start = max(0, i - v_window // 2)
            end = min(len(preds), i + v_window // 2 + 1)
            neighborhood = preds[start:end]
            smoothed.append(np.bincount(neighborhood).argmax())
        return np.array(smoothed)

    y_pred_voted_encoded = apply_temporal_voting(y_pred_raw_encoded, vote_window)
    y_pred = label_encoder.inverse_transform(y_pred_voted_encoded)

    # 7. Final Evaluation
    macro_f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
    per_class_f1 = f1_score(y_test, y_pred, average=None, labels=label_encoder.classes_, zero_division=0)

    return {
        'seed': seed,
        'macro_f1': macro_f1,
        'per_class_f1': {label: f1 for label, f1 in zip(label_encoder.classes_, per_class_f1)}
    }

print("âœ“ Pipeline updated: Temporal Features (25-dim) + Transition Sequences + Voting")

âœ“ Pipeline updated: Temporal Features (25-dim) + Transition Sequences + Voting


In [6]:
# Check GPU availability
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))
print(tf.config.list_physical_devices('GPU'))

Num GPUs Available:  1
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]


In [7]:
# Run 10 seeds for each of 4 folds
seeds = [42, 123, 456, 789, 2024, 3141, 5926, 8888, 1337, 9999]
folds = {
    1: (train_df_1, test_df_1),
    2: (train_df_2, test_df_2),
    3: (train_df_3, test_df_3),
    4: (train_df_4, test_df_4)
}

all_fold_results = {}

for fold_num, (train_df, test_df) in folds.items():
    print(f"\n{'='*80}")
    print(f"PROCESSING FOLD {fold_num}")
    print(f"{'='*80}\n")

    fold_results = []

    for seed in seeds:
        print(f"  Running seed {seed}...", end=" ")
        result = run_pipeline_single_seed(train_df, test_df, seed, verbose=False)
        fold_results.append(result)
        print(f"Macro F1: {result['macro_f1']:.4f}")

    all_fold_results[fold_num] = fold_results

    # Calculate fold statistics
    macro_f1_scores = [r['macro_f1'] for r in fold_results]
    print(f"\n  Fold {fold_num} Summary:")
    print(f"    Mean Macro F1: {np.mean(macro_f1_scores):.4f} Â± {np.std(macro_f1_scores):.4f}")
    print(f"    Min: {np.min(macro_f1_scores):.4f}, Max: {np.max(macro_f1_scores):.4f}")

print("\n" + "="*80)
print("ALL FOLDS COMPLETED!")
print("="*80)


PROCESSING FOLD 1

  Running seed 42... ðŸ“Š Training data created:
   Pure sequences:         204 ( 57.8%)
   Transition sequences:   149 ( 42.2%)
   Total:                  353
Macro F1: 0.4283
  Running seed 123... ðŸ“Š Training data created:
   Pure sequences:         204 ( 57.8%)
   Transition sequences:   149 ( 42.2%)
   Total:                  353
Macro F1: 0.4443
  Running seed 456... ðŸ“Š Training data created:
   Pure sequences:         204 ( 57.8%)
   Transition sequences:   149 ( 42.2%)
   Total:                  353
Macro F1: 0.3391
  Running seed 789... ðŸ“Š Training data created:
   Pure sequences:         204 ( 57.8%)
   Transition sequences:   149 ( 42.2%)
   Total:                  353
Macro F1: 0.4538
  Running seed 2024... ðŸ“Š Training data created:
   Pure sequences:         204 ( 57.8%)
   Transition sequences:   149 ( 42.2%)
   Total:                  353
Macro F1: 0.4406
  Running seed 3141... ðŸ“Š Training data created:
   Pure sequences:         204 ( 57.8%)

In [8]:
# Save results to text file
with open('4fold_10seed_results.txt', 'w') as f:
    f.write("="*80 + "\n")
    f.write("4-FOLD CROSS-VALIDATION WITH 10 SEEDS PER FOLD\n")
    f.write("="*80 + "\n\n")

    # Overall summary
    all_macro_f1 = []
    for fold_num in [1, 2, 3, 4]:
        fold_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
        all_macro_f1.extend(fold_scores)

    f.write("OVERALL RESULTS (40 runs total):\n")
    f.write("-"*80 + "\n")
    f.write(f"Mean Macro F1: {np.mean(all_macro_f1):.4f} Â± {np.std(all_macro_f1):.4f}\n")
    f.write(f"Min: {np.min(all_macro_f1):.4f}, Max: {np.max(all_macro_f1):.4f}\n\n")

    # Per-fold results
    for fold_num in [1, 2, 3, 4]:
        f.write(f"\n{'='*80}\n")
        f.write(f"FOLD {fold_num} RESULTS\n")
        f.write(f"{'='*80}\n\n")

        fold_results = all_fold_results[fold_num]
        macro_f1_scores = [r['macro_f1'] for r in fold_results]

        f.write(f"Macro F1 Scores (10 seeds):\n")
        f.write("-"*80 + "\n")
        for i, result in enumerate(fold_results):
            f.write(f"  Seed {result['seed']:5d}: {result['macro_f1']:.4f}\n")

        f.write(f"\nStatistics:\n")
        f.write(f"  Mean: {np.mean(macro_f1_scores):.4f} Â± {np.std(macro_f1_scores):.4f}\n")
        f.write(f"  Min:  {np.min(macro_f1_scores):.4f}\n")
        f.write(f"  Max:  {np.max(macro_f1_scores):.4f}\n")

        # Per-class F1 (averaged across 10 seeds)
        f.write(f"\nPer-Class F1 Scores (averaged across 10 seeds):\n")
        f.write("-"*80 + "\n")

        # Collect all class names
        all_classes = set()
        for result in fold_results:
            all_classes.update(result['per_class_f1'].keys())

        # Average per-class F1 across seeds
        for class_name in sorted(all_classes):
            class_f1_scores = [r['per_class_f1'].get(class_name, 0) for r in fold_results]
            mean_f1 = np.mean(class_f1_scores)
            std_f1 = np.std(class_f1_scores)
            f.write(f"  {class_name:20s}: {mean_f1:.4f} Â± {std_f1:.4f}\n")

print("âœ… Results saved to 4fold_10seed_results.txt")

âœ… Results saved to 4fold_10seed_results.txt


In [9]:
# Display summary
print("\n" + "="*80)
print("SUMMARY - 4 FOLDS Ã— 10 SEEDS = 40 TOTAL RUNS")
print("="*80 + "\n")

for fold_num in [1, 2, 3, 4]:
    macro_f1_scores = [r['macro_f1'] for r in all_fold_results[fold_num]]
    print(f"Fold {fold_num}: {np.mean(macro_f1_scores):.4f} Â± {np.std(macro_f1_scores):.4f}")

all_macro_f1 = []
for fold_num in [1, 2, 3, 4]:
    all_macro_f1.extend([r['macro_f1'] for r in all_fold_results[fold_num]])

print(f"\n{'='*80}")
print(f"Overall Mean: {np.mean(all_macro_f1):.4f} Â± {np.std(all_macro_f1):.4f}")
print(f"{'='*80}")


SUMMARY - 4 FOLDS Ã— 10 SEEDS = 40 TOTAL RUNS

Fold 1: 0.4115 Â± 0.0417
Fold 2: 0.3407 Â± 0.0196
Fold 3: 0.3616 Â± 0.0391
Fold 4: 0.3961 Â± 0.0409

Overall Mean: 0.3775 Â± 0.0459
