In [3]:
import pandas as pd

def load_and_filter_fold(i, data_root='../cleaned_dataset/split_data'):
    """Load one fold's train/test CSVs, keeping only rows whose room occurs in both.

    Parameters
    ----------
    i : int or str
        Fold identifier; files are read from ``{data_root}/fold{i}/train.csv``
        and ``{data_root}/fold{i}/test.csv``.
    data_root : str, optional
        Directory that holds the ``fold{i}`` sub-directories. The default is
        the location this notebook originally hard-coded, so existing calls
        such as ``load_and_filter_fold(1)`` read the same files as before.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        ``(train_df, test_df)`` restricted to rows whose ``'room'`` value is
        present in both files, each with a fresh 0..n-1 index.
    """
    fold_dir = f'{data_root}/fold{i}'

    # Load the data
    train_df = pd.read_csv(f'{fold_dir}/train.csv')
    test_df = pd.read_csv(f'{fold_dir}/test.csv')

    # Labels that appear in BOTH train and test
    common_labels = list(set(train_df['room'].unique()) & set(test_df['room'].unique()))

    # Keep only the records whose label is shared; reset the index so row
    # positions are contiguous after filtering
    train_df = train_df[train_df['room'].isin(common_labels)].reset_index(drop=True)
    test_df = test_df[test_df['room'].isin(common_labels)].reset_index(drop=True)

    return train_df, test_df

# Load folds 1-4; each call returns one (train, test) pair, unpacked in fold order
(
    (train_df_1, test_df_1),
    (train_df_2, test_df_2),
    (train_df_3, test_df_3),
    (train_df_4, test_df_4),
) = [load_and_filter_fold(fold_id) for fold_id in (1, 2, 3, 4)]

In [5]:
# Fold number (1-based) -> (train, test) frames for the per-fold analysis
folds = dict(
    enumerate(
        [
            (train_df_1, test_df_1),
            (train_df_2, test_df_2),
            (train_df_3, test_df_3),
            (train_df_4, test_df_4),
        ],
        start=1,
    )
)

def display_raw_info(train_df, test_df):
    """Print a per-room table of raw row counts for a train/test pair.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with a ``'room'`` column.

    Returns
    -------
    tuple of (dict, dict)
        ``(train_counts, test_counts)`` mapping each room to its number of
        rows, as produced by ``value_counts()``.
    """
    # Row counts per room for each frame
    train_counts = train_df['room'].value_counts().to_dict()
    test_counts = test_df['room'].value_counts().to_dict()

    # Rooms in order of first appearance in train, followed by rooms that only
    # occur in test. Without the second part a test-only room would be left
    # out of the table even though its rows are included in the total below.
    labels = list(train_df['room'].unique())
    labels.extend(room for room in test_counts if room not in train_counts)

    # Display the information
    print(f"{'Room':<20} {'Train Count':<15} {'Test Count':<15}")
    print("-" * 50)

    for label in labels:
        train_count = train_counts.get(label, 0)
        test_count = test_counts.get(label, 0)
        print(f"{label:<20} {train_count:<15} {test_count:<15}")

    print(f"\nTotal: {len(train_df)} train samples, {len(test_df)} test samples")

    return train_counts, test_counts

def display_window_info(train_df, test_df):
    """Print a per-room table of window counts, one window per distinct timestamp.

    Rows are grouped by ``'timestamp'`` and each group is reduced to its first
    ``'room'`` value, so every distinct timestamp contributes one window.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Frames with ``'timestamp'`` and ``'room'`` columns.

    Returns
    -------
    tuple
        ``(train_windowed, test_windowed, train_counts, test_counts)``: the two
        one-row-per-timestamp frames and dicts mapping each room to its number
        of windows.
    """
    # Group by timestamp and take the first room value
    # (assumes each timestamp carries a single room — the code does not check this)
    train_windowed = train_df.groupby('timestamp').agg({'room': 'first'}).reset_index()
    test_windowed = test_df.groupby('timestamp').agg({'room': 'first'}).reset_index()

    # Window counts per room for each frame
    train_counts = train_windowed['room'].value_counts().to_dict()
    test_counts = test_windowed['room'].value_counts().to_dict()

    # Rooms in order of first appearance in train, followed by rooms that only
    # occur in test. Without the second part a test-only room would be left
    # out of the table even though its windows are included in the total below.
    labels = list(train_windowed['room'].unique())
    labels.extend(room for room in test_counts if room not in train_counts)

    # Display the information
    print(f"{'Room':<20} {'Train Count':<15} {'Test Count':<15}")
    print("-" * 50)

    for label in labels:
        train_count = train_counts.get(label, 0)
        test_count = test_counts.get(label, 0)
        print(f"{label:<20} {train_count:<15} {test_count:<15}")

    print(f"\nTotal: {len(train_windowed)} train windows, {len(test_windowed)} test windows")

    return train_windowed, test_windowed, train_counts, test_counts


In [6]:
import sys
from contextlib import redirect_stdout
from io import StringIO

# Write the raw and windowed count tables of every fold to a text report
with open('folds_analysis.txt', 'w') as f:
    for fold_num, (train_df, test_df) in folds.items():
        # Capture everything the display helpers print for this fold.
        # redirect_stdout restores sys.stdout even if a helper raises, so a
        # failure cannot leave the notebook's output pointing at the buffer.
        captured = StringIO()
        with redirect_stdout(captured):
            # Display raw info
            print("=" * 60)
            print(f"FOLD {fold_num} - RAW DATA")
            print("=" * 60)
            display_raw_info(train_df, test_df)

            print("\n")

            # Display windowed info
            print("=" * 60)
            print(f"FOLD {fold_num} - WINDOWED DATA")
            print("=" * 60)
            display_window_info(train_df, test_df)

            print("\n\n")

        # Write the fold's section only once it has been produced in full
        f.write(captured.getvalue())

        # Also print to console so you can see progress
        print(f"Completed fold {fold_num}")

print("Analysis saved to 'folds_analysis.txt'")

Completed fold 1
Completed fold 2
Completed fold 3
Completed fold 4
Analysis saved to 'folds_analysis.txt'
