In [1]:
from contextlib import redirect_stdout

import pandas as pd

def load_and_filter_fold(i, base_dir='../cleaned_dataset/split_data'):
    """Load one fold's train/test CSVs and keep only rooms present in both.

    Reads ``{base_dir}/fold{i}/train.csv`` and ``{base_dir}/fold{i}/test.csv``,
    then drops every row whose ``room`` value does not occur in *both* files.

    Parameters
    ----------
    i : int or str
        Fold identifier, interpolated into the ``fold{i}`` directory name.
    base_dir : str or path-like, optional
        Directory containing the ``fold*`` sub-directories. The default is the
        location this notebook has always read from, so existing
        ``load_and_filter_fold(i)`` calls behave exactly as before.

    Returns
    -------
    tuple[pandas.DataFrame, pandas.DataFrame]
        ``(train_df, test_df)``, each filtered to the shared room labels and
        re-indexed from 0.

    Raises
    ------
    FileNotFoundError
        If either CSV file does not exist.
    KeyError
        If either file has no ``room`` column.
    """
    train_path = f'{base_dir}/fold{i}/train.csv'
    test_path = f'{base_dir}/fold{i}/test.csv'

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Rooms seen on only one side of the split are discarded from both frames.
    common_labels = list(set(train_df['room'].unique()) & set(test_df['room'].unique()))

    # Filtering leaves gaps in the index, so renumber the surviving rows.
    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load folds 1 through 4 in order; each call yields that fold's
# (train, test) pair, already restricted to the rooms both halves share.
(
    (train_df_1, test_df_1),
    (train_df_2, test_df_2),
    (train_df_3, test_df_3),
    (train_df_4, test_df_4),
) = (load_and_filter_fold(fold_id) for fold_id in range(1, 5))

In [2]:
# Map fold number (1-4) to its (train, test) frame pair.
folds = dict(
    enumerate(
        [
            (train_df_1, test_df_1),
            (train_df_2, test_df_2),
            (train_df_3, test_df_3),
            (train_df_4, test_df_4),
        ],
        start=1,
    )
)

def display_raw_info(train_df, test_df):
    """Print the per-room row counts of the train and test frames.

    Every row of the input frames is counted as one sample (the notebook
    treats these as raw BLE records).

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a ``room`` column.

    Returns
    -------
    tuple[dict, dict]
        ``(train_counts, test_counts)`` mapping each room label to its number
        of rows in the respective frame.
    """
    train_counts = train_df['room'].value_counts().to_dict()
    test_counts = test_df['room'].value_counts().to_dict()

    # Rows follow first appearance in train, then any rooms seen only in test.
    # Taking labels from train alone would drop test-only rooms from the table
    # even though they are counted in the "Total" line below. When both frames
    # share the same labels, the output is the same as with train-only labels.
    labels = list(dict.fromkeys([*train_df['room'].unique(), *test_df['room'].unique()]))

    print(f"{'Room':<20} {'Train Count':<15} {'Test Count':<15}")
    print("-" * 50)

    for label in labels:
        train_count = train_counts.get(label, 0)
        test_count = test_counts.get(label, 0)
        print(f"{label:<20} {train_count:<15} {test_count:<15}")

    print(f"\nTotal: {len(train_df)} train samples, {len(test_df)} test samples")

    return train_counts, test_counts

def display_window_info(train_df, test_df):
    """Print the per-room window counts of the train and test frames.

    A "window" is one distinct ``timestamp`` value (the notebook treats these
    as 1-second aggregations). Each window is labelled with the first ``room``
    value found for that timestamp.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with ``timestamp`` and ``room`` columns.

    Returns
    -------
    tuple
        ``(train_windowed, test_windowed, train_counts, test_counts)``: the two
        one-row-per-timestamp frames (columns ``timestamp``, ``room``) and the
        room -> window-count dicts derived from them.
    """
    # NOTE(review): 'first' is only a faithful label if every timestamp maps
    # to a single room, which is what this notebook assumes - confirm upstream.
    train_windowed = train_df.groupby('timestamp').agg({'room': 'first'}).reset_index()
    test_windowed = test_df.groupby('timestamp').agg({'room': 'first'}).reset_index()

    train_counts = train_windowed['room'].value_counts().to_dict()
    test_counts = test_windowed['room'].value_counts().to_dict()

    # Rows follow first appearance in train, then any rooms seen only in test.
    # Taking labels from train alone would drop test-only rooms from the table
    # even though they are counted in the "Total" line below.
    labels = list(dict.fromkeys([*train_windowed['room'].unique(), *test_windowed['room'].unique()]))

    print(f"{'Room':<20} {'Train Count':<15} {'Test Count':<15}")
    print("-" * 50)

    for label in labels:
        train_count = train_counts.get(label, 0)
        test_count = test_counts.get(label, 0)
        print(f"{label:<20} {train_count:<15} {test_count:<15}")

    print(f"\nTotal: {len(train_windowed)} train windows, {len(test_windowed)} test windows")

    return train_windowed, test_windowed, train_counts, test_counts

def display_sequence_info(train_df, test_df):
    """Print per-room sequence counts and mean sequence lengths.

    A "sequence" is a maximal run of consecutive rows (in timestamp order)
    sharing the same ``room`` value; its length is the number of rows in the
    run.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with ``timestamp`` and ``room`` columns. Neither is modified.

    Returns
    -------
    tuple
        ``(train_sequences, test_sequences, train_counts, test_counts)``: the
        two one-row-per-sequence frames (columns ``room_group``, ``room``,
        ``sequence_length``) and the room -> sequence-count dicts.
    """
    train_sequences = _room_sequences(train_df)
    test_sequences = _room_sequences(test_df)

    # Union of both sides so a room missing from one split still gets a row.
    labels = sorted(list(set(train_sequences['room'].unique()) | set(test_sequences['room'].unique())))

    train_counts = train_sequences['room'].value_counts().to_dict()
    test_counts = test_sequences['room'].value_counts().to_dict()

    train_avg_length = train_sequences.groupby('room')['sequence_length'].mean().to_dict()
    test_avg_length = test_sequences.groupby('room')['sequence_length'].mean().to_dict()

    print(f"{'Room':<20} {'Train Seq':<12} {'Avg Len':<10} {'Test Seq':<12} {'Avg Len':<10}")
    print("-" * 70)

    for label in labels:
        train_count = train_counts.get(label, 0)
        test_count = test_counts.get(label, 0)
        train_avg = train_avg_length.get(label, 0)
        test_avg = test_avg_length.get(label, 0)
        print(f"{label:<20} {train_count:<12} {train_avg:<10.1f} {test_count:<12} {test_avg:<10.1f}")

    print(f"\nTotal: {len(train_sequences)} train sequences, {len(test_sequences)} test sequences")
    print(f"Overall avg length: train={train_sequences['sequence_length'].mean():.1f}, test={test_sequences['sequence_length'].mean():.1f}")

    return train_sequences, test_sequences, train_counts, test_counts


def _room_sequences(df):
    """Collapse ``df`` into one row per consecutive same-room run."""
    # A stable sort keeps rows that share a timestamp in their original
    # relative order. The default algorithm gives no such guarantee, so ties
    # could be shuffled and split one run into several.
    # sort_values/reset_index return new frames, so the caller's is untouched.
    ordered = df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    # A new group starts wherever the room differs from the previous row.
    ordered['room_group'] = (ordered['room'] != ordered['room'].shift()).cumsum()

    sequences = ordered.groupby('room_group').agg({
        'room': 'first',
        'timestamp': 'count',  # number of rows (frames) in the run
    }).reset_index()
    sequences.columns = ['room_group', 'room', 'sequence_length']
    return sequences

# Confirmation marker: this line only runs if every definition above executed without raising.
print("✅ All distribution analysis functions defined")

✅ All distribution analysis functions defined


In [3]:
import sys
from io import StringIO

# Write the frame-, window- and sequence-level tables of every fold to a text report.
def _render_fold_report(fold_num, train_df, test_df):
    """Return the three-level distribution report for one fold as a string."""
    sections = (
        ("RAW DATA (Frame-level)", display_raw_info),
        ("WINDOWED DATA (1-second windows)", display_window_info),
        ("SEQUENCE DATA (Room visits)", display_sequence_info),
    )
    banner = "=" * 70
    captured = StringIO()

    # redirect_stdout restores sys.stdout even when a display function raises.
    # Swapping sys.stdout by hand would leave the notebook silent after an error.
    with redirect_stdout(captured):
        for title, show in sections:
            print(banner)
            # NOTE(review): "DAY 5 - fold" assumes fold k holds out day 5-k;
            # confirm against the script that produced the split.
            print(f"FOLD {fold_num} - {title} - DAY {5 - fold_num} AS TEST DATA")
            print(banner)
            show(train_df, test_df)
            print("\n")
        # The final section is followed by one more blank line than the others.
        print()

    return captured.getvalue()


# An explicit encoding keeps the report independent of the platform default.
with open('folds_analysis.txt', 'w', encoding='utf-8') as f:
    for fold_num, (train_df, test_df) in folds.items():
        f.write(_render_fold_report(fold_num, train_df, test_df))

        # Progress goes to the real stdout, not into the report.
        print(f"✓ Completed fold {fold_num}")

print("\n✅ Analysis saved to 'folds_analysis.txt'")
print("\nThe file contains 3 levels of analysis for each fold:")
print("  1. Frame-level: Raw BLE records")
print("  2. Window-level: 1-second aggregations")
print("  3. Sequence-level: Room visit sequences (used by the model)")

✓ Completed fold 1
✓ Completed fold 2
✓ Completed fold 3
✓ Completed fold 4

✅ Analysis saved to 'folds_analysis.txt'

The file contains 3 levels of analysis for each fold:
  1. Frame-level: Raw BLE records
  2. Window-level: 1-second aggregations
  3. Sequence-level: Room visit sequences (used by the model)
