# Data check

In [None]:
import pandas as pd

# CSV 파일 불러오기
df = pd.read_csv('merged_df_original.csv')

# '캐시워크'부터 '우체국보험'까지의 열 이름 목록 추출
start_col = '캐시워크'
end_col = '우체국보험'
cols_to_sum = df.loc[:, start_col:end_col].columns

df['m_usagestats'] = df[cols_to_sum].sum(axis=1)

df.drop(columns=cols_to_sum, inplace=True)

merged_df_original = df.copy()

merged_df_original.rename(columns={'m_wtb_rssi_x': 'm_wtb_rssi', 'm_wtb_rssi_y': 'm_wtw_rssi'}, inplace=True)


print("✅ usage 합산 완료")


✅ usage 합산 완료


### Filling zero values for missing values

In [44]:
#대상 열 목록
cols_to_check = [
    'met_activity', 'm_wtb_rssi', 'm_wtw_rssi', 'heart_rate',
    'distance', 'latitude', 'longitude', 'altitude', 'speed', 'm_usagestats', 'w_light'
]

# 결측치 보간 함수 (앞뒤 평균)
def fill_nearest_avg(series):
    forward = series.ffill()  # 앞쪽 값으로 채우기
    backward = series.bfill()  # 뒤쪽 값으로 채우기
    filled = series.copy()
    
    # 앞뒤 값이 모두 있는 경우 평균으로
    for i in series[series.isnull()].index:
        f, b = forward[i], backward[i]
        if pd.notnull(f) and pd.notnull(b):
            filled[i] = (f + b) / 2
        elif pd.notnull(f):
            filled[i] = f
        elif pd.notnull(b):
            filled[i] = b
    return filled

# 'burned_calories' 열의 결측치를 0으로 채우기
merged_df_original['burned_calories'] = merged_df_original['burned_calories'].fillna(0)

# 각 열에 대해 결측치 처리
for col in cols_to_check:
    merged_df_original[col] = fill_nearest_avg(merged_df_original[col])

merged_df_original.head()


Unnamed: 0,subject_id,timestamp,met_activity,m_wtb_rssi,m_wtw_rssi,heart_rate,distance,burned_calories,latitude,longitude,...,id02,id03,id04,id05,id06,id07,id08,id09,id10,m_usagestats
0,id01,2024-06-26 12:00:00,6.16,0.102155,0.202476,121.781354,8.33,0.0,0.207832,0.169962,...,0,0,0,0,0,0,0,0,0,0.0
1,id01,2024-06-26 12:10:00,7.84,0.102155,0.091135,121.781354,0.0,0.0,0.207858,0.169967,...,0,0,0,0,0,0,0,0,0,0.0
2,id01,2024-06-26 12:20:00,7.7,0.098621,0.063361,121.781354,0.0,0.0,0.207863,0.169975,...,0,0,0,0,0,0,0,0,0,0.0
3,id01,2024-06-26 12:30:00,7.7,0.037712,0.005904,91.059545,108.830027,5.990001,0.207815,0.169852,...,0,0,0,0,0,0,0,0,0,0.0
4,id01,2024-06-26 12:40:00,13.23,0.02742,0.035869,89.13254,206.570137,12.010003,0.209973,0.16865,...,0,0,0,0,0,0,0,0,0,0.0


In [45]:
remaining_cols = [col for col in merged_df_original.columns if col not in cols_to_check]

df_original = merged_df_original.copy()
df_original = df_original[remaining_cols].fillna(0)
df_original.to_csv('merged_df_original_tozero.csv')
print("✅ 결측치를 0으로 채운 파일 저장 완료: merged_df_original_tozero.csv")

✅ 결측치를 0으로 채운 파일 저장 완료: merged_df_original_tozero.csv


In [46]:
df_original.head()

Unnamed: 0,subject_id,timestamp,burned_calories,Music,Vehicle,Motor vehicle (road),"Outside, urban or manmade","Outside, rural or natural",Car,Speech,...,id01,id02,id03,id04,id05,id06,id07,id08,id09,id10
0,id01,2024-06-26 12:00:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,id01,2024-06-26 12:10:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,id01,2024-06-26 12:20:00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,id01,2024-06-26 12:30:00,5.990001,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,id01,2024-06-26 12:40:00,12.010003,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


# Train & test set

In [47]:
trainset = pd.read_csv('../ch2025_metrics_train.csv')
testset = pd.read_csv('../ch2025_submission_sample.csv')

In [48]:
# Define metric columns
metrics = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']

# Split df_train and df_val for each metric
for metric in metrics:
    globals()[f'trainset_{metric.lower()}'] = trainset[['subject_id', 'sleep_date', 'lifelog_date', metric]].copy()

# Display head of all 12 dataframes
for metric in metrics:
    print(f"trainset_{metric.lower()} head:")
    print(globals()[f'trainset_{metric.lower()}'].head())


trainset_q1 head:
  subject_id  sleep_date lifelog_date  Q1
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   1
3       id01  2024-06-30   2024-06-29   1
4       id01  2024-07-01   2024-06-30   0
trainset_q2 head:
  subject_id  sleep_date lifelog_date  Q2
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   0
3       id01  2024-06-30   2024-06-29   0
4       id01  2024-07-01   2024-06-30   1
trainset_q3 head:
  subject_id  sleep_date lifelog_date  Q3
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   0
3       id01  2024-06-30   2024-06-29   1
4       id01  2024-07-01   2024-06-30   1
trainset_s1 head:
  subject_id  sleep_date lifelog_date  S1
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   1
3   

# GRU
- data : merged_df_original

In [None]:
import os, copy, random
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# Preprocess
TARGETS = ['Q1','Q2','Q3','S1','S2','S3']
df_original['timestamp'] = pd.to_datetime(df_original['timestamp'])
df_original['lifelog_date'] = df_original['timestamp'].dt.date.astype(str)

DROP_COLS = ['timestamp', 'subject_id', 'lifelog_date']
SENSOR_COLS = [c for c in df_original.columns if c not in DROP_COLS]
MAX_SEQ = 144

def build_sequences(df):
    seqs = {}
    for (sid, day), g in df.groupby(['subject_id', 'lifelog_date']):
        g = g.sort_values('timestamp')
        x = g[SENSOR_COLS].to_numpy(np.float32)
        if len(x) > MAX_SEQ: x = x[:MAX_SEQ]
        if len(x) < MAX_SEQ:
            x = np.concatenate([x, np.zeros((MAX_SEQ-len(x), x.shape[1]), np.float32)])
        seqs[(sid, day)] = x
    return seqs

SEQ_DICT = build_sequences(df_original)

def rows_to_xy(df):
    xs, ys, groups = [], [], []
    for _, r in df.iterrows():
        key = (r.subject_id, r.lifelog_date)
        if key not in SEQ_DICT:
            continue
        xs.append(SEQ_DICT[key])
        ys.append(r[TARGETS].to_list())
        groups.append(r.subject_id)
    return np.stack(xs), np.array(ys, np.int64), np.array(groups)

X_all, y_all, group_all = rows_to_xy(trainset)
X_test, _, _ = rows_to_xy(testset)


# Scale
scaler = StandardScaler().fit(X_all.reshape(-1, X_all.shape[-1]))
def scale(x):
    shp = x.shape
    return scaler.transform(x.reshape(-1, shp[-1])).reshape(shp)

X_all = scale(X_all)
X_test = scale(X_test)

# ------------------------
# Dataset / Model
# ------------------------
class SleepDS(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.X)
    def __getitem__(self, i):
        return self.X[i] if self.y is None else (self.X[i], self.y[i])

class SingleHeadGRU(nn.Module):
    def __init__(self, input_dim, out_dim, hidden=128, layers=2, drop=0.3):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden, layers, batch_first=True, dropout=drop)
        self.fc = nn.Linear(hidden, out_dim)
    def forward(self, x):
        _, h = self.gru(x)
        return self.fc(h[-1])

# ------------------------
# Train per target
# ------------------------
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
N_FOLD = 5
EPOCH_MAX = 50
BATCH_SIZE = 64

preds_dict = {}

for k, target in enumerate(TARGETS):
    print(f"\n=== {target} training ===")
    y_target = y_all[:, k]
    out_dim = 3 if target == 'S1' else 2
    criterion = nn.CrossEntropyLoss()
    best_epochs = []

    gkf = GroupKFold(n_splits=N_FOLD)
    for fold, (tr_idx, val_idx) in enumerate(gkf.split(X_all, y_target, group_all)):
        model = SingleHeadGRU(X_all.shape[-1], out_dim).to(DEVICE)
        opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)

        best_f1, best_ep, best_state = -1, 0, None
        for epoch in range(1, EPOCH_MAX+1):
            model.train()
            for xb, yb in DataLoader(SleepDS(X_all[tr_idx], y_target[tr_idx]), batch_size=BATCH_SIZE, shuffle=True):
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                pred = model(xb)
                loss = criterion(pred, yb)
                opt.zero_grad(); loss.backward(); opt.step()

            # validation
            model.eval()
            all_pred, all_true = [], []
            with torch.no_grad():
                for xb, yb in DataLoader(SleepDS(X_all[val_idx], y_target[val_idx]), batch_size=BATCH_SIZE):
                    pred = model(xb.to(DEVICE)).argmax(1).cpu().numpy()
                    all_pred.extend(pred)
                    all_true.extend(yb.numpy())
            f1 = f1_score(all_true, all_pred, average='macro')
            if f1 > best_f1:
                best_f1 = f1
                best_ep = epoch
                best_state = copy.deepcopy(model.state_dict())

        best_epochs.append(best_ep)
        print(f"Fold {fold+1} - Best F1: {best_f1:.4f} @ Epoch {best_ep}")

    # Final train on all data
    final_model = SingleHeadGRU(X_all.shape[-1], out_dim).to(DEVICE)
    opt = torch.optim.AdamW(final_model.parameters(), lr=3e-4, weight_decay=1e-2)
    loader = DataLoader(SleepDS(X_all, y_target), batch_size=BATCH_SIZE, shuffle=True)
    for epoch in range(1, int(round(np.mean(best_epochs)))+1):
        final_model.train()
        for xb, yb in loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            pred = final_model(xb)
            loss = criterion(pred, yb)
            opt.zero_grad(); loss.backward(); opt.step()

    # Predict
    final_model.eval()
    preds = []
    with torch.no_grad():
        for xb in DataLoader(SleepDS(X_test), batch_size=BATCH_SIZE):
            out = final_model(xb.to(DEVICE))
            preds.extend(out.argmax(1).cpu().numpy())
    preds_dict[target] = preds

# ------------------------
# Submission
# ------------------------
sub = testset[['subject_id','sleep_date','lifelog_date']].copy()
for t in TARGETS:
    sub[t] = preds_dict[t]

SAVE_PATH = '../results/sub_original_GRU.csv'
sub.to_csv(SAVE_PATH, index=False)
print('✅ submission saved ->', SAVE_PATH)



=== Q1 training ===
Fold 1 - Best F1: 0.5998 @ Epoch 8
Fold 2 - Best F1: 0.5436 @ Epoch 26
Fold 3 - Best F1: 0.5181 @ Epoch 5
Fold 4 - Best F1: 0.4436 @ Epoch 3
Fold 5 - Best F1: 0.5955 @ Epoch 31

=== Q2 training ===
Fold 1 - Best F1: 0.4665 @ Epoch 13
Fold 2 - Best F1: 0.6185 @ Epoch 29
Fold 3 - Best F1: 0.6087 @ Epoch 38
Fold 4 - Best F1: 0.5799 @ Epoch 42
Fold 5 - Best F1: 0.5232 @ Epoch 19

=== Q3 training ===
Fold 1 - Best F1: 0.5111 @ Epoch 24
Fold 2 - Best F1: 0.4877 @ Epoch 32
Fold 3 - Best F1: 0.4656 @ Epoch 16
Fold 4 - Best F1: 0.4535 @ Epoch 48
Fold 5 - Best F1: 0.5374 @ Epoch 18

=== S1 training ===
Fold 1 - Best F1: 0.4552 @ Epoch 18
Fold 2 - Best F1: 0.3842 @ Epoch 13
Fold 3 - Best F1: 0.3634 @ Epoch 1
Fold 4 - Best F1: 0.3763 @ Epoch 5
Fold 5 - Best F1: 0.3381 @ Epoch 3

=== S2 training ===
Fold 1 - Best F1: 0.4785 @ Epoch 12
Fold 2 - Best F1: 0.5505 @ Epoch 39
Fold 3 - Best F1: 0.5264 @ Epoch 3
Fold 4 - Best F1: 0.7637 @ Epoch 8
Fold 5 - Best F1: 0.5773 @ Epoch 32

==

In [18]:
print("SENSOR_COLS:", SENSOR_COLS)
print("df.columns:", df_original.columns.tolist())


SENSOR_COLS: ['met_activity', 'm_wtb_rssi_x', 'm_wtb_rssi_y', 'heart_rate', 'distance', 'burned_calories', 'latitude', 'longitude', 'altitude', 'speed', 'w_light', 'total_time', 'weighted_sum', 'id01', 'id02', 'id03', 'id04', 'id05', 'id06', 'id07', 'id08', 'id09', 'id10']
df.columns: ['subject_id', 'lifelog_date', 'met_activity', 'm_wtb_rssi_x', 'm_wtb_rssi_y', 'heart_rate', 'distance', 'burned_calories', 'latitude', 'longitude', 'altitude', 'speed', 'w_light', 'total_time', 'weighted_sum', 'id01', 'id02', 'id03', 'id04', 'id05', 'id06', 'id07', 'id08', 'id09', 'id10']
