In [1]:
import pandas as pd

def load_and_filter_fold(i, data_root='../cleaned_dataset/split_data', label_col='room'):
    """Load one fold's train/test CSVs and keep only the labels both splits share.

    Reads ``<data_root>/fold<i>/train.csv`` and ``<data_root>/fold<i>/test.csv``
    and drops every row whose label does not occur in *both* files, so the two
    returned frames cover exactly the same set of labels.

    Parameters
    ----------
    i : int or str
        Fold identifier, interpolated into the ``fold{i}`` directory name.
    data_root : str, optional
        Directory that contains the ``fold*`` sub-directories. The default is
        the location this notebook has always read from.
    label_col : str, optional
        Name of the label column used for filtering (default ``'room'``).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` after filtering, each re-indexed from 0.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pd.read_csv`` when a split file does not exist.
    KeyError
        If ``label_col`` is missing from either file.
    """
    # These are file paths (not directories), one per split.
    train_path = f'{data_root}/fold{i}/train.csv'
    test_path = f'{data_root}/fold{i}/test.csv'

    train_df = pd.read_csv(train_path)
    test_df = pd.read_csv(test_path)

    # Labels that appear in BOTH train and test.
    common_labels = set(train_df[label_col].unique()) & set(test_df[label_col].unique())

    # Keep only records whose label is shared; reset the index so positions are contiguous.
    train_df = train_df[train_df[label_col].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df[label_col].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load folds 1-4 in one pass; each call returns that fold's (train, test)
# pair already restricted to the rooms present in both splits.
(
    (train_df_1, test_df_1),
    (train_df_2, test_df_2),
    (train_df_3, test_df_3),
    (train_df_4, test_df_4),
) = [load_and_filter_fold(fold_id) for fold_id in (1, 2, 3, 4)]

In [2]:
# Fold number -> (train, test) frames, numbered from 1 in loading order,
# so later cells can iterate over every fold uniformly.
folds = dict(enumerate(
    [
        (train_df_1, test_df_1),
        (train_df_2, test_df_2),
        (train_df_3, test_df_3),
        (train_df_4, test_df_4),
    ],
    start=1,
))

def display_raw_info(train_df, test_df):
    """Print per-room record counts for the train and test frames.

    One table row is printed for every room found in ``train_df`` (in order of
    first appearance), followed by the total number of rows in each frame.
    Rooms missing from a split are reported with a count of 0.

    Returns
    -------
    tuple of (dict, dict)
        ``(train_counts, test_counts)`` mapping room -> number of records.
    """
    per_room_train = train_df['room'].value_counts().to_dict()
    per_room_test = test_df['room'].value_counts().to_dict()

    # Table header and rule
    print(f"{'Room':<20} {'Train Count':<15} {'Test Count':<15}")
    print("-" * 50)

    # Rows follow the train frame's rooms; the caller is expected to pass
    # frames that already share the same set of rooms.
    for room in train_df['room'].unique():
        n_train = per_room_train.get(room, 0)
        n_test = per_room_test.get(room, 0)
        print(f"{room:<20} {n_train:<15} {n_test:<15}")

    n_train_rows, n_test_rows = len(train_df), len(test_df)
    print(f"\nTotal: {n_train_rows} train samples, {n_test_rows} test samples")

    return per_room_train, per_room_test

def display_window_info(train_df, test_df):
    """Print per-room counts after collapsing each frame to one row per timestamp.

    Every distinct ``timestamp`` becomes a single "window" whose room is the
    first room recorded for that timestamp. One table row is printed per room
    found in the collapsed train frame, followed by the window totals.

    Returns
    -------
    tuple
        ``(train_windowed, test_windowed, train_counts, test_counts)``: the two
        collapsed frames (columns ``timestamp`` and ``room``) and the
        room -> window-count dicts.
    """
    def _one_row_per_timestamp(frame):
        # 'first' keeps the first room seen for the timestamp; the notebook
        # treats each timestamp as belonging to exactly one room.
        grouped = frame.groupby('timestamp').agg({'room': 'first'})
        return grouped.reset_index()

    train_windowed = _one_row_per_timestamp(train_df)
    test_windowed = _one_row_per_timestamp(test_df)

    windows_per_room_train = train_windowed['room'].value_counts().to_dict()
    windows_per_room_test = test_windowed['room'].value_counts().to_dict()

    # Table header and rule
    print(f"{'Room':<20} {'Train Count':<15} {'Test Count':<15}")
    print("-" * 50)

    for room in train_windowed['room'].unique():
        n_train = windows_per_room_train.get(room, 0)
        n_test = windows_per_room_test.get(room, 0)
        print(f"{room:<20} {n_train:<15} {n_test:<15}")

    print(f"\nTotal: {len(train_windowed)} train windows, {len(test_windowed)} test windows")

    return train_windowed, test_windowed, windows_per_room_train, windows_per_room_test

def _room_visit_sequences(df):
    """Collapse time-ordered records into one row per uninterrupted room visit.

    Returns a frame with columns ``room_group`` (1-based visit number),
    ``room`` and ``sequence_length`` (number of rows with a non-null
    timestamp in that visit). The input frame is not modified.
    """
    # A stable sort keeps rows that share a timestamp in their original
    # order, so the visit boundaries are deterministic from run to run.
    ordered = df.sort_values('timestamp', kind='mergesort').reset_index(drop=True)

    # A new visit starts wherever the room differs from the previous row's room;
    # the running sum of those change flags numbers the visits.
    ordered['room_group'] = (ordered['room'] != ordered['room'].shift()).cumsum()

    sequences = ordered.groupby('room_group').agg({
        'room': 'first',
        'timestamp': 'count',  # Count how many frames in this sequence
    }).reset_index()
    sequences.columns = ['room_group', 'room', 'sequence_length']
    return sequences


def display_sequence_info(train_df, test_df):
    """Display sequence-level (room visit) distribution.

    Each frame is sorted by ``timestamp`` and split into runs of consecutive
    rows with the same ``room``; every run is one sequence. For each room
    (sorted union of the rooms in both splits) the number of sequences and
    their mean length are printed, followed by overall totals.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with at least ``timestamp`` and ``room`` columns. Neither
        frame is modified.

    Returns
    -------
    tuple
        ``(train_sequences, test_sequences, train_counts, test_counts)``: the
        per-visit frames (columns ``room_group``, ``room``,
        ``sequence_length``) and the room -> sequence-count dicts.
    """
    train_sequences = _room_visit_sequences(train_df)
    test_sequences = _room_visit_sequences(test_df)

    # Rooms seen in either split, in sorted order
    labels = sorted(set(train_sequences['room'].unique()) | set(test_sequences['room'].unique()))

    # Count sequences per room
    train_counts = train_sequences['room'].value_counts().to_dict()
    test_counts = test_sequences['room'].value_counts().to_dict()

    # Average sequence length per room
    train_avg_length = train_sequences.groupby('room')['sequence_length'].mean().to_dict()
    test_avg_length = test_sequences.groupby('room')['sequence_length'].mean().to_dict()

    # Display the information
    print(f"{'Room':<20} {'Train Seq':<12} {'Avg Len':<10} {'Test Seq':<12} {'Avg Len':<10}")
    print("-" * 70)

    for label in labels:
        train_count = train_counts.get(label, 0)
        test_count = test_counts.get(label, 0)
        train_avg = train_avg_length.get(label, 0)
        test_avg = test_avg_length.get(label, 0)
        print(f"{label:<20} {train_count:<12} {train_avg:<10.1f} {test_count:<12} {test_avg:<10.1f}")

    print(f"\nTotal: {len(train_sequences)} train sequences, {len(test_sequences)} test sequences")
    print(f"Overall avg length: train={train_sequences['sequence_length'].mean():.1f}, test={test_sequences['sequence_length'].mean():.1f}")

    return train_sequences, test_sequences, train_counts, test_counts

# Confirmation message; the leading character is the check-mark emoji U+2705.
print("✅ All distribution analysis functions defined")

✅ All distribution analysis functions defined


In [3]:
import sys
from io import StringIO

# Write the three distribution reports (frame, window, sequence level) for
# every fold into a single text file.
from contextlib import redirect_stdout

# (heading, report function, spacer printed after the report)
report_sections = [
    ("RAW DATA (Frame-level)", display_raw_info, "\n"),
    ("WINDOWED DATA (1-second windows)", display_window_info, "\n"),
    ("SEQUENCE DATA (Room visits)", display_sequence_info, "\n\n"),
]

with open('folds_analysis.txt', 'w', encoding='utf-8') as f:
    for fold_num, (train_df, test_df) in folds.items():
        # Fold k is labelled with day 5 - k as its test day
        # (fold 1 -> day 4, ..., fold 4 -> day 1).
        test_day = 5 - fold_num

        # The display_* helpers print their tables, so capture stdout while
        # they run. redirect_stdout restores the real stdout even if one of
        # them raises, so the notebook never ends up silently muted.
        buffer = StringIO()
        with redirect_stdout(buffer):
            for heading, show_report, spacer in report_sections:
                print("=" * 70)
                print(f"FOLD {fold_num} - {heading} - DAY {test_day} AS TEST DATA")
                print("=" * 70)
                show_report(train_df, test_df)
                print(spacer)

        # A fold's report is written only once all three sections succeeded.
        f.write(buffer.getvalue())

        # Also print to console so you can see progress
        print(f"✓ Completed fold {fold_num}")

print("\n✅ Analysis saved to 'folds_analysis.txt'")
print("\nThe file contains 3 levels of analysis for each fold:")
print("  1. Frame-level: Raw BLE records")
print("  2. Window-level: 1-second aggregations")
print("  3. Sequence-level: Room visit sequences (used by the model)")

✓ Completed fold 1
✓ Completed fold 2
✓ Completed fold 3
✓ Completed fold 4

✅ Analysis saved to 'folds_analysis.txt'

The file contains 3 levels of analysis for each fold:
  1. Frame-level: Raw BLE records
  2. Window-level: 1-second aggregations
  3. Sequence-level: Room visit sequences (used by the model)


In [4]:
import pandas as pd
import numpy as np

def _size_stats_lines(summary, heading, plural, ratio_col):
    """Format the key-statistics lines for one granularity level.

    summary   : per-fold table with 'Fold', 'Train <plural>', 'Test <plural>'
                and ``ratio_col`` columns.
    heading   : section title, e.g. 'FRAME-LEVEL (Raw BLE Records)'.
    plural    : capitalised unit used in the column names, e.g. 'Frames'.
    ratio_col : name of the train/test ratio column for this level.

    Returns a list of text lines (no trailing newlines).
    """
    train_col, test_col = f'Train {plural}', f'Test {plural}'
    unit = plural.lower()
    # Labels are padded so all values start in the same column; the longest
    # label in a section is "Total Train <plural>:".
    width = len(f"Total Train {plural}:") + 1

    def row(label, value):
        return f"  {label:<{width}}{value}"

    def extreme(column, pick):
        # pick is 'idxmax' or 'idxmin': the row label of the largest/smallest value
        idx = getattr(summary[column], pick)()
        return f"Fold {summary.loc[idx, 'Fold']} ({summary.loc[idx, column]:,} {unit})"

    return [
        f"\n📊 {heading}:",
        row(f"Total Train {plural}:", f"{summary[train_col].sum():,}"),
        row(f"Total Test {plural}:", f"{summary[test_col].sum():,}"),
        row("Train/Test Ratios:", f"{summary[ratio_col].min():.2f} - {summary[ratio_col].max():.2f}x"),
        row("Largest train set:", extreme(train_col, 'idxmax')),
        row("Smallest train set:", extreme(train_col, 'idxmin')),
        row("Largest test set:", extreme(test_col, 'idxmax')),
        row("Smallest test set:", extreme(test_col, 'idxmin')),
    ]


def _class_diversity_lines(summary):
    """Format the lines naming the folds with the most / fewest train classes."""
    most = summary['Train Classes'].idxmax()
    least = summary['Train Classes'].idxmin()
    return [
        "\n📊 CLASS DIVERSITY:",
        f"  Most diverse:  Fold {summary.loc[most, 'Fold']} ({summary.loc[most, 'Train Classes']} classes)",
        f"  Least diverse: Fold {summary.loc[least, 'Fold']} ({summary.loc[least, 'Train Classes']} classes)",
    ]


# Fold sizes copied by hand from the fold analysis output.
# NOTE(review): these numbers are not recomputed from `folds`; re-check them
# whenever the split data changes.
fold_data = {
    'Fold': [1, 2, 3, 4],
    'Test Day': ['Day 4', 'Day 3', 'Day 2', 'Day 1'],

    # Frame-level (raw BLE records)
    'Train Frames': [962_294, 951_141, 747_816, 465_004],
    'Test Frames': [30_619, 143_401, 333_507, 590_447],

    # Window-level (1-second aggregations)
    'Train Windows': [19_280, 17_506, 15_136, 15_000],
    'Test Windows': [2_481, 5_932, 7_739, 7_353],

    # Sequence-level (room visits)
    'Train Sequences': [210, 221, 231, 177],
    'Test Sequences': [51, 87, 69, 108],

    # Additional metrics
    'Train Classes': [13, 18, 15, 12],
    'Test Classes': [13, 18, 15, 12],
}

df = pd.DataFrame(fold_data)

# Train/test size ratio at each granularity
df['Frame Ratio'] = df['Train Frames'] / df['Test Frames']
df['Window Ratio'] = df['Train Windows'] / df['Test Windows']
df['Seq Ratio'] = df['Train Sequences'] / df['Test Sequences']

display_cols_1 = ['Fold', 'Test Day', 'Train Frames', 'Test Frames',
                  'Train Windows', 'Test Windows', 'Train Sequences', 'Test Sequences']
display_cols_2 = ['Fold', 'Test Day', 'Frame Ratio', 'Window Ratio',
                  'Seq Ratio', 'Train Classes', 'Test Classes']

# Build the key statistics once so the console and the saved file show the
# same text.
stats_lines = (
    _size_stats_lines(df, "FRAME-LEVEL (Raw BLE Records)", "Frames", "Frame Ratio")
    + _size_stats_lines(df, "WINDOW-LEVEL (1-second Aggregations)", "Windows", "Window Ratio")
    + _size_stats_lines(df, "SEQUENCE-LEVEL (Room Visits)", "Sequences", "Seq Ratio")
    + _class_diversity_lines(df)
)
stats_text = "\n".join(stats_lines)

# Display full summary table
print("=" * 120)
print("COMPREHENSIVE DATA SIZE ANALYSIS - ALL 4 FOLDS")
print("=" * 120)
print()

# Part 1: Absolute counts
print("ABSOLUTE COUNTS:")
print("-" * 120)
print(df[display_cols_1].to_string(index=False))
print()

# Part 2: Train/Test ratios
print("\nTRAIN/TEST RATIOS:")
print("-" * 120)
print(df[display_cols_2].to_string(index=False))
print()

# Part 3: Key statistics
print("\nKEY STATISTICS:")
print("-" * 120)
print(stats_text)

print("\n" + "=" * 120)

# Save to file. UTF-8 is required because the statistics headings contain
# non-ASCII characters.
output_file = 'fold_size_comparison.txt'
with open(output_file, 'w', encoding='utf-8') as f:
    f.write("=" * 120 + "\n")
    f.write("COMPREHENSIVE DATA SIZE ANALYSIS - ALL 4 FOLDS\n")
    f.write("=" * 120 + "\n\n")

    f.write("ABSOLUTE COUNTS:\n")
    f.write("-" * 120 + "\n")
    f.write(df[display_cols_1].to_string(index=False) + "\n\n")

    f.write("TRAIN/TEST RATIOS:\n")
    f.write("-" * 120 + "\n")
    f.write(df[display_cols_2].to_string(index=False) + "\n\n")

    # The same statistics that were printed above
    f.write("KEY STATISTICS:\n")
    f.write("-" * 120 + "\n")
    f.write(stats_text + "\n")
    f.write("\n" + "=" * 120 + "\n")

print(f"\n✅ Summary saved to '{output_file}'")

# Leave the dataframe as the cell's last expression so the notebook renders it
df

COMPREHENSIVE DATA SIZE ANALYSIS - ALL 4 FOLDS

ABSOLUTE COUNTS:
------------------------------------------------------------------------------------------------------------------------
 Fold Test Day  Train Frames  Test Frames  Train Windows  Test Windows  Train Sequences  Test Sequences
    1    Day 4        962294        30619          19280          2481              210              51
    2    Day 3        951141       143401          17506          5932              221              87
    3    Day 2        747816       333507          15136          7739              231              69
    4    Day 1        465004       590447          15000          7353              177             108


TRAIN/TEST RATIOS:
------------------------------------------------------------------------------------------------------------------------
 Fold Test Day  Frame Ratio  Window Ratio  Seq Ratio  Train Classes  Test Classes
    1    Day 4    31.428002      7.771060   4.117647             13   

Unnamed: 0,Fold,Test Day,Train Frames,Test Frames,Train Windows,Test Windows,Train Sequences,Test Sequences,Train Classes,Test Classes,Frame Ratio,Window Ratio,Seq Ratio
0,1,Day 4,962294,30619,19280,2481,210,51,13,13,31.428002,7.77106,4.117647
1,2,Day 3,951141,143401,17506,5932,221,87,18,18,6.632736,2.951113,2.54023
2,3,Day 2,747816,333507,15136,7739,231,69,15,15,2.24228,1.955808,3.347826
3,4,Day 1,465004,590447,15000,7353,177,108,12,12,0.787546,2.039984,1.638889
