In [1]:
# load and prepare all of 4 folds data
import pandas as pd

def load_and_filter_fold(i):
    """Read one cross-validation fold and keep only rooms shared by both splits.

    Reads ``train.csv`` and ``test.csv`` from
    ``../../cleaned_dataset/split_data/fold{i}/`` and discards every row whose
    ``room`` value does not occur in both files.

    Args:
        i: Fold identifier interpolated into the directory name.

    Returns:
        Tuple ``(train_df, test_df)`` of the filtered DataFrames, each with a
        fresh 0-based index.
    """
    fold_dir = f'../../cleaned_dataset/split_data/fold{i}'
    raw_train = pd.read_csv(f'{fold_dir}/train.csv')
    raw_test = pd.read_csv(f'{fold_dir}/test.csv')

    # Rooms that are present on both sides of the split.
    shared_rooms = set(raw_train['room'].unique()).intersection(raw_test['room'].unique())

    def keep_shared(frame):
        # Drop rows with a room outside the shared set and renumber the index.
        return frame[frame['room'].isin(shared_rooms)].reset_index(drop=True)

    return keep_shared(raw_train), keep_shared(raw_test)

# Load folds 1-4 in order; each call returns that fold's (train, test) pair,
# already restricted to the rooms the two splits have in common.
(
    (train_df_1, test_df_1),
    (train_df_2, test_df_2),
    (train_df_3, test_df_3),
    (train_df_4, test_df_4),
) = [load_and_filter_fold(fold_id) for fold_id in (1, 2, 3, 4)]

In [2]:
# Function to add 25 beacon_1, beacon_2, ... , beacon_25
def add_beacon_features(df, num_beacons=25):
    """Spread each reading's RSSI into one column per beacon id.

    For every id ``k`` in ``1..num_beacons`` a column ``beacon_k`` is added
    that carries the row's ``RSSI`` where ``mac address == k`` and 0 on every
    other row. The input frame is left untouched.

    Args:
        df: DataFrame with at least the columns ``RSSI`` and ``mac address``.
        num_beacons: Highest beacon id to create a column for.

    Returns:
        A new DataFrame: the input columns followed by ``beacon_1`` ..
        ``beacon_{num_beacons}``.
    """
    rssi = df['RSSI']
    mac = df['mac address']

    beacon_columns = {
        f'beacon_{beacon_id}': rssi.where(mac == beacon_id, 0)
        for beacon_id in range(1, num_beacons + 1)
    }

    # assign() works on a copy, so the caller's frame is not modified.
    return df.assign(**beacon_columns)

def aggregate_by_timestamp(df, num_beacons=25):
    """
    Aggregate beacon data into one row per distinct timestamp value.

    For each beacon column within a window:
    - If ALL values are 0: mean, std, min, max and count are set to 0
    - If ANY non-zero values exist: statistics are computed ONLY on the
      non-zero values (std is NumPy's population std, and is 0.0 when there
      is a single non-zero value)

    Windows in which no beacon has a non-zero reading are dropped. The
    decision is based on the per-beacon counts rather than on the sum of the
    means, so a window whose non-zero means happen to cancel out is kept.

    Args:
        df: DataFrame with columns: timestamp, room, beacon_1, ...,
            beacon_{num_beacons}
        num_beacons: Number of beacon columns to aggregate (default 25,
            matching the default of add_beacon_features).

    Returns:
        windowed_df: DataFrame with a fresh 0-based index and columns:
            - timestamp
            - room (the room of the first row in the window)
            - beacon_k_mean, beacon_k_std, beacon_k_min, beacon_k_max,
              beacon_k_count for k in 1..num_beacons
        An input with no rows yields an empty DataFrame with these columns.
    """
    stat_names = ('mean', 'std', 'min', 'max', 'count')
    beacon_cols = [f'beacon_{beacon_id}' for beacon_id in range(1, num_beacons + 1)]
    output_cols = ['timestamp', 'room'] + [
        f'{beacon_col}_{stat}' for beacon_col in beacon_cols for stat in stat_names
    ]

    result_rows = []

    # groupby only reads from df, so no defensive copy is needed.
    for timestamp, group in df.groupby('timestamp'):
        row_data = {
            'timestamp': timestamp,
            'room': group['room'].iloc[0],  # Take first room value (should be consistent)
        }

        for beacon_col in beacon_cols:
            beacon_values = group[beacon_col].values

            # A value of 0 means "this row is not a reading from this beacon".
            non_zero_values = beacon_values[beacon_values != 0]

            if len(non_zero_values) > 0:
                row_data[f'{beacon_col}_mean'] = non_zero_values.mean()
                row_data[f'{beacon_col}_std'] = non_zero_values.std() if len(non_zero_values) > 1 else 0.0
                row_data[f'{beacon_col}_min'] = non_zero_values.min()
                row_data[f'{beacon_col}_max'] = non_zero_values.max()
                row_data[f'{beacon_col}_count'] = len(non_zero_values)
            else:
                row_data[f'{beacon_col}_mean'] = 0.0
                row_data[f'{beacon_col}_std'] = 0.0
                row_data[f'{beacon_col}_min'] = 0.0
                row_data[f'{beacon_col}_max'] = 0.0
                row_data[f'{beacon_col}_count'] = 0

        result_rows.append(row_data)

    # Passing the column list keeps the schema stable even when there are no rows.
    windowed_df = pd.DataFrame(result_rows, columns=output_cols)

    if windowed_df.empty:
        removed_count = 0
    else:
        # A window is empty exactly when no beacon contributed a reading.
        count_cols = [f'{beacon_col}_count' for beacon_col in beacon_cols]
        valid_windows = windowed_df[count_cols].sum(axis=1) > 0

        removed_count = (~valid_windows).sum()
        windowed_df = windowed_df[valid_windows].reset_index(drop=True)

    print(f"Total windows after aggregation: {len(windowed_df)}")
    print(f"Removed {removed_count} empty windows (all beacons = 0)")
    print(f"Features: {num_beacons} beacons × 5 statistics = {num_beacons * 5} features")

    return windowed_df

In [3]:
# Libraries used by the training / evaluation cell below.
import xgboost as xgb
from sklearn.metrics import classification_report, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_sample_weight
import warnings
# NOTE(review): with no category or module argument this ignores every
# warning raised from this point on, not only those of the libraries above —
# confirm that blanket suppression is intended.
warnings.filterwarnings('ignore')

In [4]:
def train_evaluate_fold(i, train_df, test_df, n_minority=3, minority_weight=3.0):
    """Train an XGBoost room classifier on one fold and score it on the test split.

    Steps: add the per-beacon RSSI columns, aggregate both splits per
    timestamp, compute balanced sample weights, multiply the weights of the
    ``n_minority`` rarest training classes by ``minority_weight``, fit the
    model, and compute macro and per-class F1 on the test windows.

    Args:
        i: Fold identifier; only used in log messages and in the result dict.
        train_df: Training rows as returned by load_and_filter_fold.
        test_df: Test rows as returned by load_and_filter_fold.
        n_minority: How many of the rarest training classes get the extra
            weight (default 3).
        minority_weight: Factor applied on top of the balanced weight for
            those classes (default 3.0).

    Returns:
        dict with keys ``fold``, ``macro_f1``, ``per_class_f1`` (label -> F1),
        ``classes`` (labels known to the encoder), ``minority_classes`` and
        ``class_counts`` (label -> number of training windows).

    Raises:
        ValueError: from ``LabelEncoder.transform`` if a test window carries a
            room label that does not occur in the training windows.
    """
    print(f"Loading the data frame from fold {i}")

    print("Adding 25 beacon vector features for both set")
    train_df = add_beacon_features(train_df)
    test_df = add_beacon_features(test_df)

    print("Applying windowing for both sets")
    windowed_train_df = aggregate_by_timestamp(train_df)
    windowed_test_df = aggregate_by_timestamp(test_df)

    # Every aggregated column except the label and the window key is a feature.
    feature_cols = [col for col in windowed_train_df.columns
                    if col not in ['room', 'timestamp']]

    X_train = windowed_train_df[feature_cols]
    y_train = windowed_train_df['room']
    X_test = windowed_test_df[feature_cols]
    y_test = windowed_test_df['room']

    # XGBoost needs integer class ids.
    label_encoder = LabelEncoder()
    y_train_encoded = label_encoder.fit_transform(y_train)

    # The encoded test labels are not needed for scoring; the call is kept
    # (and made before training) so unseen test labels fail fast.
    label_encoder.transform(y_test)

    # =====================================================================
    # IDENTIFY THE MOST MINORITY CLASSES AND BOOST THEIR WEIGHTS
    # =====================================================================

    class_counts = y_train.value_counts()
    minority_classes = class_counts.nsmallest(n_minority).index.tolist()

    print(f"\n{'='*60}")
    print(f"CLASS DISTRIBUTION IN FOLD {i}:")
    print(f"{'='*60}")
    print(class_counts.to_string())
    print(f"\n{'='*60}")
    print(f"{n_minority} MOST MINORITY CLASSES:")
    print(f"{'='*60}")
    for idx, minority_class in enumerate(minority_classes, 1):
        count = class_counts[minority_class]
        percentage = (count / len(y_train)) * 100
        # str() keeps the ':20s' format valid for non-string room labels.
        print(f"{idx}. {str(minority_class):20s}: {count:5d} samples ({percentage:5.2f}%)")
    print(f"{'='*60}\n")

    # Balanced weights first, then the extra factor for the rarest classes.
    sample_weights = compute_sample_weight(class_weight='balanced', y=y_train_encoded)

    # Explicit NumPy mask: sample_weights is positional, so the pandas index
    # of y_train must not take part in the selection.
    minority_mask = y_train.isin(minority_classes).to_numpy()
    sample_weights[minority_mask] *= minority_weight

    print(f"✓ Applied {minority_weight:g}x weight multiplier to minority classes")
    print(f"  Minority classes: {minority_classes}\n")

    xgb_model = xgb.XGBClassifier(
        n_estimators=100,
        max_depth=10,
        learning_rate=0.1,
        min_child_weight=1,
        subsample=0.8,
        colsample_bytree=0.8,
        gamma=0,
        reg_alpha=0,
        reg_lambda=1,
        objective='multi:softmax',
        num_class=len(y_train.unique()),
        eval_metric='mlogloss',
        random_state=42,
        n_jobs=-1,
        tree_method='hist'
    )

    print("Training XGBoost...")
    xgb_model.fit(
        X_train,
        y_train_encoded,
        sample_weight=sample_weights
    )
    print("Training completed!")

    print("Making predictions...")
    y_pred_encoded = xgb_model.predict(X_test)
    y_pred = label_encoder.inverse_transform(y_pred_encoded)

    # Macro F1 averages over the labels present in y_test or y_pred.
    macro_f1 = f1_score(y_test, y_pred, average='macro')

    # Per-class F1 is reported for every label seen during training.
    per_class_f1 = f1_score(y_test, y_pred, average=None, labels=label_encoder.classes_)
    per_class_f1_dict = dict(zip(label_encoder.classes_, per_class_f1))

    print(f"Fold {i} - Macro F1 Score: {macro_f1:.4f}")

    return {
        'fold': i,
        'macro_f1': macro_f1,
        'per_class_f1': per_class_f1_dict,
        'classes': label_encoder.classes_,
        'minority_classes': minority_classes,
        'class_counts': class_counts.to_dict()
    }

In [5]:
# Run training + evaluation once per fold and collect the result dicts by fold number.
folds = dict(enumerate(
    [
        (train_df_1, test_df_1),
        (train_df_2, test_df_2),
        (train_df_3, test_df_3),
        (train_df_4, test_df_4),
    ],
    start=1,
))

results = {}
banner = "=" * 80

for fold_num, (train_df, test_df) in folds.items():
    print("\n" + banner)
    print(f"PROCESSING FOLD {fold_num}")
    print(banner + "\n")

    results[fold_num] = train_evaluate_fold(fold_num, train_df, test_df)

print(f"\n{banner}")
print("ALL FOLDS COMPLETED!")
print(banner)


PROCESSING FOLD 1

Loading the data frame from fold 1
Adding 25 beacon vector features for both set
Applying windowing for both sets
Total windows after aggregation: 19280
Removed 0 empty windows (all beacons = 0)
Features: 25 beacons × 5 statistics = 125 features
Total windows after aggregation: 2481
Removed 0 empty windows (all beacons = 0)
Features: 25 beacons × 5 statistics = 125 features

CLASS DISTRIBUTION IN FOLD 1:
room
nurse station    8487
kitchen          4588
cafeteria        4568
cleaning          583
523               342
520               260
513               195
506               162
515                39
518                18
508                17
517                17
505                 4

3 MOST MINORITY CLASSES:
1. 505                 :     4 samples ( 0.02%)
2. 508                 :    17 samples ( 0.09%)
3. 517                 :    17 samples ( 0.09%)

✓ Applied 3x weight multiplier to minority classes
  Minority classes: ['505', '508', '517']

Training XGBoost

In [6]:
import numpy as np

# Save results to text file
output_path = 'pipeline_minor_weighting.txt'

# Report on the folds that were actually evaluated, in fold order.
fold_ids = sorted(results)

# Compute the summary statistics BEFORE opening the file: open(..., 'w')
# truncates it, so a failure here would otherwise leave a half-written report.
macro_f1_scores = [results[fold_num]['macro_f1'] for fold_num in fold_ids]
mean_f1 = np.mean(macro_f1_scores)
std_f1 = np.std(macro_f1_scores)

# The report contains the non-ASCII character '±', so the encoding is pinned
# instead of depending on the platform default.
with open(output_path, 'w', encoding='utf-8') as f:
    f.write("="*80 + "\n")
    f.write("CROSS-VALIDATION RESULTS - WITH 3X MINORITY CLASS WEIGHTING\n")
    f.write("="*80 + "\n\n")

    # Overall macro F1 scores
    f.write("MACRO F1 SCORES PER FOLD:\n")
    f.write("-"*80 + "\n")
    for fold_num, fold_macro_f1 in zip(fold_ids, macro_f1_scores):
        f.write(f"Fold {fold_num}: {fold_macro_f1:.4f}\n")

    f.write(f"\nMean Macro F1: {mean_f1:.4f} ± {std_f1:.4f}\n")
    f.write(f"Min: {np.min(macro_f1_scores):.4f}, Max: {np.max(macro_f1_scores):.4f}\n")

    # Minority classes information
    f.write("\n" + "="*80 + "\n")
    f.write("MINORITY CLASSES (RECEIVED 3X WEIGHT MULTIPLIER)\n")
    f.write("="*80 + "\n\n")

    for fold_num in fold_ids:
        f.write(f"\nFold {fold_num}:\n")
        f.write("-"*80 + "\n")
        minority_classes = results[fold_num]['minority_classes']
        class_counts = results[fold_num]['class_counts']
        for minority_class in minority_classes:
            count = class_counts[minority_class]
            # str() keeps the ':20s' format valid for non-string labels.
            f.write(f"{str(minority_class):20s}: {count:5d} samples\n")

    # Per-class F1 scores for each fold
    f.write("\n" + "="*80 + "\n")
    f.write("PER-CLASS F1 SCORES\n")
    f.write("="*80 + "\n\n")

    for fold_num in fold_ids:
        f.write(f"\nFold {fold_num}:\n")
        f.write("-"*80 + "\n")
        per_class = results[fold_num]['per_class_f1']
        minority_classes = results[fold_num]['minority_classes']
        for class_name in sorted(per_class):
            marker = " [MINORITY - 3X WEIGHT]" if class_name in minority_classes else ""
            f.write(f"{str(class_name):20s}: {per_class[class_name]:.4f}{marker}\n")

print(f"✅ Results saved to {output_path}")

✅ Results saved to pipeline_minor_weighting.txt


In [7]:
# Console recap: per-fold macro F1 with the boosted classes, then mean ± std.
rule = "=" * 80
print(f"\n{rule}")
print("SUMMARY")
print(rule)

macro_f1_scores = []
for fold_num in (1, 2, 3, 4):
    fold_result = results[fold_num]
    macro_f1_scores.append(fold_result['macro_f1'])
    print(f"\nFold {fold_num}: Macro F1 = {fold_result['macro_f1']:.4f}")
    print(f"  Minority classes (3x weight): {fold_result['minority_classes']}")

print("\n" + rule)
print(f"Mean Macro F1: {np.mean(macro_f1_scores):.4f} ± {np.std(macro_f1_scores):.4f}")
print(rule)


SUMMARY

Fold 1: Macro F1 = 0.2803
  Minority classes (3x weight): ['505', '508', '517']

Fold 2: Macro F1 = 0.2504
  Minority classes (3x weight): ['502', '518', '517']

Fold 3: Macro F1 = 0.2652
  Minority classes (3x weight): ['516', '517', '522']

Fold 4: Macro F1 = 0.3394
  Minority classes (3x weight): ['516', '501', '508']

Mean Macro F1: 0.2838 ± 0.0338
