<a href="https://colab.research.google.com/github/nemda234/ABC-2026/blob/main/ABC2026_challenge_balance_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [10]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

#ĐỌC FILE + FILTER + SPLIT

df = pd.read_csv('/content/drive/MyDrive/data/ABC2026 Sozolab Challenge/Dataset/5f_label_loc_train.csv')

df = df[(df['floor'] == '5th') & (df['activity'] == 'Location')].copy()

df['started_at'] = pd.to_datetime(df['started_at'])
df = df.sort_values('started_at').reset_index(drop=True)

split_idx = int(len(df) * 0.8)
train = df.iloc[:split_idx].copy()
test  = df.iloc[split_idx:].copy()

print(f"Train: {len(train)} | Test: {len(test)}")



#HÀM BALANCE

def smart_balance(df, target_min=3, target_max=30):
    room_counts = df['room'].value_counts()
    balanced_rows = []

    for room, count in room_counts.items():
        room_df = df[df['room'] == room]

        if count < target_min:
            sampled = room_df.sample(n=target_min, replace=True, random_state=42)
        elif count > target_max:
            sampled = room_df.sample(n=target_max, replace=False, random_state=42)
        else:
            sampled = room_df

        balanced_rows.append(sampled)

    return (
        pd.concat(balanced_rows, ignore_index=True)
        .sample(frac=1, random_state=42)
        .reset_index(drop=True)
    )
#tạo trainset
train_balanced = smart_balance(train)

print("Imbalance ratio:")
print(" Original:", train['room'].value_counts().max() / train['room'].value_counts().min())
print(" Balanced:", train_balanced['room'].value_counts().max() / train_balanced['room'].value_counts().min())
#FEATURE
def prepare_features(df):
    df = df.copy()
    df['started_at'] = pd.to_datetime(df['started_at'])
    df['finished_at'] = pd.to_datetime(df['finished_at'])

    df['hour'] = df['started_at'].dt.hour
    df['minute'] = df['started_at'].dt.minute
    df['day_of_week'] = df['started_at'].dt.dayofweek
    df['duration'] = (df['finished_at'] - df['started_at']).dt.total_seconds()
    df['user_id_encoded'] = df['user_id'].astype('category').cat.codes

    return df[['user_id_encoded', 'hour', 'minute', 'day_of_week', 'duration']]
#LABEL ENCODING
le = LabelEncoder()
le.fit(train['room'])

X_train = prepare_features(train)
X_train_bal = prepare_features(train_balanced)
X_test = prepare_features(test)

y_train = le.transform(train['room'])
y_train_bal = le.transform(train_balanced['room'])
y_test = le.transform(test['room'])
#MODEL A — BASELINE
model_A = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

model_A.fit(X_train, y_train)
pred_A = model_A.predict(X_test)

#MODEL B — CLASS_WEIGHT ONLY

model_B = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1,
    class_weight='balanced'
)

model_B.fit(X_train, y_train)
pred_B = model_B.predict(X_test)

#MODEL C — BALANCED DATA ONLY

model_C = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
    n_jobs=-1
)

model_C.fit(X_train_bal, y_train_bal)
pred_C = model_C.predict(X_test)


results = {
    "Baseline": pred_A,
    "Class_weight": pred_B,
    "Balanced_data": pred_C
}

print(f"{'Model':<18} {'Macro-F1':<12} {'Weighted-F1':<12}")
print("-" * 45)

for name, pred in results.items():
    f1_macro = f1_score(y_test, pred, average='macro')
    f1_weighted = f1_score(y_test, pred, average='weighted')
    print(f"{name:<18} {f1_macro:<12.4f} {f1_weighted:<12.4f}")




Train: 334 | Test: 84
Imbalance ratio:
 Original: 96.0
 Balanced: 10.0
Model              Macro-F1     Weighted-F1 
---------------------------------------------
Baseline           0.0529       0.2597      
Class_weight       0.0563       0.3018      
Balanced_data      0.0566       0.2955      
