# 전처리

In [13]:
import pandas as pd

# CSV 파일 불러오기
df = pd.read_csv('../data/merged_df_original.csv')

# '캐시워크'부터 '우체국보험'까지의 열 이름 목록 추출
start_col = '캐시워크'
end_col = '우체국보험'
cols_to_sum = df.loc[:, start_col:end_col].columns

df['m_usagestats'] = df[cols_to_sum].sum(axis=1)

df.drop(columns=cols_to_sum, inplace=True)

merged_df_cwj = df.copy()

merged_df_cwj.rename(columns={'m_wtb_rssi_x': 'm_wtb_rssi', 'm_wtb_rssi_y': 'm_wtw_rssi'}, inplace=True)

# CSV로 내보내기
merged_df_cwj.to_csv('../data/merged_df_cwj.csv', index=False)


print("✅ merged_df_cwj.csv 저장 완료")


✅ merged_df_cwj.csv 저장 완료


In [15]:
df = pd.read_csv('../data/merged_df_cwj.csv')

# 검사할 열 목록
cols_to_check = [
    'met_activity', 'm_wtb_rssi', 'm_wtw_rssi', 'heart_rate',
    'distance', 'latitude', 'longitude', 'altitude', 'speed', 'm_usagestats', 'w_light'
]

# 각 열의 결측치 개수 출력
missing_counts = df[cols_to_check].isnull().sum()
print("📊 결측치 개수:")
print(missing_counts)


📊 결측치 개수:
met_activity     2690
m_wtb_rssi      77337
m_wtw_rssi      22842
heart_rate      55846
distance        22852
latitude        16044
longitude       16044
altitude        16044
speed           16044
m_usagestats        0
w_light         19040
dtype: int64


### 결측치를 가장 가까운 이웃의 값(앞뒤 기준)으로 채움

In [16]:
#대상 열 목록
cols_to_check = [
    'met_activity', 'm_wtb_rssi', 'm_wtw_rssi', 'heart_rate',
    'distance', 'latitude', 'longitude', 'altitude', 'speed', 'm_usagestats', 'w_light'
]

# 결측치 보간 함수 (앞뒤 평균)
def fill_nearest_avg(series):
    forward = series.ffill()  # 앞쪽 값으로 채우기
    backward = series.bfill()  # 뒤쪽 값으로 채우기
    filled = series.copy()
    
    # 앞뒤 값이 모두 있는 경우 평균으로
    for i in series[series.isnull()].index:
        f, b = forward[i], backward[i]
        if pd.notnull(f) and pd.notnull(b):
            filled[i] = (f + b) / 2
        elif pd.notnull(f):
            filled[i] = f
        elif pd.notnull(b):
            filled[i] = b
    return filled

# 'burned_calories' 열의 결측치를 0으로 채우기
df['burned_calories'] = df['burned_calories'].fillna(0)

# 각 열에 대해 결측치 처리
for col in cols_to_check:
    df[col] = fill_nearest_avg(df[col])

# 확인용 출력 (선택사항)
print("✅ 결측치 보간 완료. 남은 결측치 수:")
print(df[cols_to_check].isnull().sum())

# 저장
df.to_csv('../data/merged_df_cwj_filled.csv', index=False)
print("📁 '../data/merged_df_cwj_filled.csv' 저장 완료")


✅ 결측치 보간 완료. 남은 결측치 수:
met_activity    0
m_wtb_rssi      0
m_wtw_rssi      0
heart_rate      0
distance        0
latitude        0
longitude       0
altitude        0
speed           0
m_usagestats    0
w_light         0
dtype: int64
📁 '../data/merged_df_cwj_filled.csv' 저장 완료


In [23]:

# 나머지 열들 선택
remaining_cols = [col for col in df.columns if col not in cols_to_check]

# 결측치 개수 확인
missing_counts = df[remaining_cols].isnull().sum()

# 결측치가 존재하는 열만 출력
missing_counts = missing_counts[missing_counts > 0]

print("📊 [cols_to_check 외] 결측치가 있는 열들:")
print(missing_counts)

# 결측치를 0으로 채워 저장
df_zero_filled = df.copy()
df_zero_filled[remaining_cols] = df_zero_filled[remaining_cols].fillna(0)
df_zero_filled.to_csv('../data/merged_df_cwj_tozero.csv', index=False)
print("✅ 결측치를 0으로 채운 파일 저장 완료: merged_df_cwj_tozero.csv")

# 결측치가 있는 행을 삭제하여 저장
df_dropna = df.copy()
df_dropna = df_dropna.dropna(subset=remaining_cols)
df_dropna.to_csv('../data/merged_df_cwj_delete.csv', index=False)
print("✅ 결측치 행 삭제 파일 저장 완료: merged_df_cwj_delete.csv")


📊 [cols_to_check 외] 결측치가 있는 열들:
Music                        3717
Vehicle                      3717
Motor vehicle (road)         3717
Outside, urban or manmade    3717
Outside, rural or natural    3717
                             ... 
Heavy metal                  3717
Double bass                  3717
Drum and bass                3717
String section               3717
Punk rock                    3717
Length: 517, dtype: int64
✅ 결측치를 0으로 채운 파일 저장 완료: merged_df_cwj_tozero.csv
✅ 결측치 행 삭제 파일 저장 완료: merged_df_cwj_delete.csv


In [26]:
# 결측치 개수 세기
missing_zero_filled = df_zero_filled.isnull().sum().sum()
missing_dropna = df_dropna.isnull().sum().sum()

print(f"df_zero_filled 결측치 총 개수: {missing_zero_filled}")
print(f"df_dropna 결측치 총 개수: {missing_dropna}")

print(df_zero_filled.shape, df_dropna.shape)

df_zero_filled 결측치 총 개수: 0
df_dropna 결측치 총 개수: 0
(99190, 541) (95473, 541)


# train, test 셋 준비

In [17]:
import pandas as pd


df_zero_filled = pd.read_csv('../data/merged_df_cwj_tozero.csv')
print("✅ df_zero_filled shape:", df_zero_filled.shape)
#df_zero_filled.head()

✅ df_zero_filled shape: (99190, 541)


In [18]:
from sklearn.model_selection import train_test_split

train_val_df = pd.read_csv('../data/ch2025_metrics_train.csv')

df_train, df_val = train_test_split(train_val_df, test_size=0.2, random_state=42, shuffle=True)

print("df_train.head():")
print(df_train.head(10))
print("\ndf_val.head():")
print(df_val.head(10))


df_val.shape, df_train.shape

df_train.head():
    subject_id  sleep_date lifelog_date  Q1  Q2  Q3  S1  S2  S3
24        id01  2024-07-24   2024-07-23   0   1   1   0   0   1
17        id01  2024-07-15   2024-07-14   0   1   0   0   1   1
66        id02  2024-08-19   2024-08-18   0   1   1   1   0   1
148       id04  2024-09-02   2024-09-01   0   0   1   1   1   0
249       id06  2024-07-03   2024-07-02   0   1   1   1   1   1
31        id01  2024-08-21   2024-08-20   0   0   1   0   0   1
84        id02  2024-09-24   2024-09-23   0   1   1   1   1   1
307       id07  2024-08-02   2024-08-01   0   0   0   0   0   0
406       id09  2024-08-24   2024-08-23   0   0   0   0   1   0
389       id09  2024-07-18   2024-07-17   1   0   0   1   1   1

df_val.head():
    subject_id  sleep_date lifelog_date  Q1  Q2  Q3  S1  S2  S3
407       id09  2024-08-25   2024-08-24   1   0   0   1   1   1
444       id10  2024-09-03   2024-09-02   0   0   0   0   0   0
117       id03  2024-09-07   2024-09-06   1   1   0   1   0   0
30     

((90, 9), (360, 9))

In [19]:
# Define metric columns
metrics = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']

# Split df_train and df_val for each metric
for metric in metrics:
    globals()[f'df_train_{metric.lower()}'] = df_train[['subject_id', 'sleep_date', 'lifelog_date', metric]].copy()
    globals()[f'df_val_{metric.lower()}'] = df_val[['subject_id', 'sleep_date', 'lifelog_date', metric]].copy()

# Display head of all 12 dataframes
for metric in metrics:
    print(f"df_train_{metric.lower()} head:")
    print(globals()[f'df_train_{metric.lower()}'].head())
    print(f"\ndf_val_{metric.lower()} head:")
    print(globals()[f'df_val_{metric.lower()}'].head())
    print("\n" + "-"*40 + "\n")

df_train_q1 head:
    subject_id  sleep_date lifelog_date  Q1
24        id01  2024-07-24   2024-07-23   0
17        id01  2024-07-15   2024-07-14   0
66        id02  2024-08-19   2024-08-18   0
148       id04  2024-09-02   2024-09-01   0
249       id06  2024-07-03   2024-07-02   0

df_val_q1 head:
    subject_id  sleep_date lifelog_date  Q1
407       id09  2024-08-25   2024-08-24   1
444       id10  2024-09-03   2024-09-02   0
117       id03  2024-09-07   2024-09-06   1
30        id01  2024-08-20   2024-08-19   0
415       id09  2024-09-03   2024-09-02   1

----------------------------------------

df_train_q2 head:
    subject_id  sleep_date lifelog_date  Q2
24        id01  2024-07-24   2024-07-23   1
17        id01  2024-07-15   2024-07-14   1
66        id02  2024-08-19   2024-08-18   1
148       id04  2024-09-02   2024-09-01   0
249       id06  2024-07-03   2024-07-02   1

df_val_q2 head:
    subject_id  sleep_date lifelog_date  Q2
407       id09  2024-08-25   2024-08-24   0
444    

# GRU

In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

df_merged = df_zero_filled.copy()
# timestamp → datetime 변환
df_merged["timestamp"] = pd.to_datetime(df_merged["timestamp"])

# lifelog_date = 날짜(문자열) ; 2024-06-26 같은 형식
df_merged["lifelog_date"] = df_merged["timestamp"].dt.date.astype(str)

# -------- 2. feature / sensor column 정의 -----------
drop_cols = ["timestamp",                # 시간은 예측에 불필요
             "subject_id",               # 매치용
             "lifelog_date"] + [c for c in df_merged.columns if c.startswith("id")]  # one-hot id

sensor_cols = [c for c in df_merged.columns if c not in drop_cols]
# 하루 당 최대 144 타임스텝으로 패딩
MAX_SEQ_LEN = 144


# -------- 3. 시퀀스 묶는 함수 -----------
def build_sequences(df):
    """(subject_id, lifelog_date) → ndarray(seq_len, n_feat)"""
    seq_dict = {}
    for (sid, day), g in df.groupby(['subject_id', 'lifelog_date']):
        # 10-분 간격 보장 안될 수도 있으니 timestamp 기준 정렬
        g = g.sort_values('timestamp')
        x = g[sensor_cols].to_numpy(dtype=np.float32)

        # 길이 조정
        if len(x) > MAX_SEQ_LEN:          # 잘라내기
            x = x[:MAX_SEQ_LEN]
        if len(x) < MAX_SEQ_LEN:          # 0-패딩
            pad = np.zeros((MAX_SEQ_LEN - len(x), x.shape[1]), np.float32)
            x = np.vstack([x, pad])

        seq_dict[(sid, day)] = x          # shape = (144, n_feat)
    return seq_dict

sequence_dict = build_sequences(df_merged)
print("# total sequences built :", len(sequence_dict))

# ---------- 3. Train / Val Tensor 준비 ----------
def rows_to_tensors(df_label):
    X, y = [], []
    for _, row in df_label.iterrows():
        key = (row['subject_id'], row['lifelog_date'])
        if key not in sequence_dict:        # 누락된 날짜 skip
            continue
        X.append(sequence_dict[key])
        y.append([row['Q1']])               # binary -> shape (1,)
    return np.stack(X), np.array(y, dtype=np.float32)

X_train, y_train = rows_to_tensors(df_train_q1)
X_val,   y_val   = rows_to_tensors(df_val_q1)

print("Train tensor shape :", X_train.shape, y_train.shape)
print("Val   tensor shape :", X_val.shape,   y_val.shape)

# ---------- 4. Feature 정규화 ----------
scaler = StandardScaler().fit(X_train.reshape(-1, len(sensor_cols)))
def scale(x):
    orig_shape = x.shape
    x = scaler.transform(x.reshape(-1, len(sensor_cols)))
    return x.reshape(orig_shape)

X_train = scale(X_train)
X_val   = scale(X_val)

# ---------- 5. PyTorch Dataset ----------
class SleepDS(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self):  return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = SleepDS(X_train, y_train)
val_ds   = SleepDS(X_val,   y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64, shuffle=False)

# ---------- 6. GRU 모델 ----------
class GRUModel(nn.Module):
    def __init__(self, n_features, hidden=64):
        super().__init__()
        self.gru = nn.GRU(input_size=n_features, hidden_size=hidden,
                          num_layers=1, batch_first=True)
        self.fc  = nn.Linear(hidden, 1)
    def forward(self, x):
        _, h = self.gru(x)          # h: (1,B,hidden)
        h = h.squeeze(0)            # (B,hidden)
        return torch.sigmoid(self.fc(h))    # (B,1)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = GRUModel(len(sensor_cols)).to(device)
criterion = nn.BCELoss()
optim     = torch.optim.Adam(model.parameters(), lr=1e-3)

# ---------- 7. 학습 루프 ----------
EPOCHS = 13
for epoch in range(1, EPOCHS+1):
    model.train()
    for xb, yb in train_loader:
        xb, yb = xb.to(device), yb.to(device)
        pred = model(xb)
        loss = criterion(pred, yb)
        optim.zero_grad()
        loss.backward()
        optim.step()

    # --- val ---
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = xb.to(device)
            prob = model(xb).cpu().numpy()
            preds.append(prob)
            trues.append(yb.numpy())
    preds = np.vstack(preds)
    trues = np.vstack(trues)
    y_hat = (preds >= 0.5).astype(int)
    f1 = f1_score(trues, y_hat, average='macro')
    print(f"Epoch {epoch:2d} | Val macro‑F1: {f1:.4f}")



# total sequences built : 700
Train tensor shape : (360, 144, 529) (360, 1)
Val   tensor shape : (90, 144, 529) (90, 1)
Epoch  1 | Val macro‑F1: 0.5553
Epoch  2 | Val macro‑F1: 0.4623
Epoch  3 | Val macro‑F1: 0.5950
Epoch  4 | Val macro‑F1: 0.5801
Epoch  5 | Val macro‑F1: 0.6192
Epoch  6 | Val macro‑F1: 0.6111
Epoch  7 | Val macro‑F1: 0.5218
Epoch  8 | Val macro‑F1: 0.5744
Epoch  9 | Val macro‑F1: 0.5982
Epoch 10 | Val macro‑F1: 0.6099
Epoch 11 | Val macro‑F1: 0.5347
Epoch 12 | Val macro‑F1: 0.5312
Epoch 13 | Val macro‑F1: 0.4985


In [9]:
metrics = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']

def rows_to_tensors(df_label, metric):
    X, y = [], []
    for _, row in df_label.iterrows():
        key = (row['subject_id'], row['lifelog_date'])
        if key not in sequence_dict:        # 누락된 날짜 skip
            continue
        X.append(sequence_dict[key])
        y.append([row[metric]])             # 원하는 metric 컬럼 사용
    return np.stack(X), np.array(y, dtype=np.float32)

'''
# metric별로 train/val tensor 생성 및 shape 출력
for metric in metrics:
    train_df = globals()[f'df_train_{metric.lower()}']
    val_df   = globals()[f'df_val_{metric.lower()}']
    X_train, y_train = rows_to_tensors(train_df, metric)
    X_val,   y_val   = rows_to_tensors(val_df, metric)
    print(f"[{metric}] Train tensor shape :", X_train.shape, y_train.shape)
    print(f"[{metric}] Val   tensor shape :", X_val.shape,   y_val.shape)
'''

'''
# ---------- 3. Train / Val Tensor 준비 ----------
def rows_to_tensors(df_label, metric):
    X, y = [], []
    for _, row in df_label.iterrows():
        key = (row['subject_id'], row['lifelog_date'])
        if key not in sequence_dict:        # 누락된 날짜 skip
            continue
        X.append(sequence_dict[key])
        y.append([row[metric]])               # binary -> shape (1,)
    return np.stack(X), np.array(y, dtype=np.float32)

X_train, y_train = rows_to_tensors(df_train_q2, metric)
X_val,   y_val   = rows_to_tensors(df_val_q2, metric)

print("Train tensor shape :", X_train.shape, y_train.shape)
print("Val   tensor shape :", X_val.shape,   y_val.shape)
'''

# ---------- 4. Feature 정규화 ----------
scaler = StandardScaler().fit(X_train.reshape(-1, len(sensor_cols)))
def scale(x):
    orig_shape = x.shape
    x = scaler.transform(x.reshape(-1, len(sensor_cols)))
    return x.reshape(orig_shape)

X_train = scale(X_train)
X_val   = scale(X_val)

# ---------- 5. PyTorch Dataset ----------
class SleepDS(Dataset):
    def __init__(self, X, y):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = torch.tensor(y, dtype=torch.float32)
    def __len__(self):  return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]

train_ds = SleepDS(X_train, y_train)
val_ds   = SleepDS(X_val,   y_val)

train_loader = DataLoader(train_ds, batch_size=64, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=64, shuffle=False)

# ---------- 6. GRU 모델 ----------
class GRUModel(nn.Module):
    def __init__(self, n_features, hidden=64):
        super().__init__()
        self.gru = nn.GRU(input_size=n_features, hidden_size=hidden,
                          num_layers=1, batch_first=True)
        self.fc  = nn.Linear(hidden, 1)
    def forward(self, x):
        _, h = self.gru(x)          # h: (1,B,hidden)
        h = h.squeeze(0)            # (B,hidden)
        return torch.sigmoid(self.fc(h))    # (B,1)

device = "cuda" if torch.cuda.is_available() else "cpu"
model  = GRUModel(len(sensor_cols)).to(device)
criterion = nn.BCELoss()
optim     = torch.optim.Adam(model.parameters(), lr=1e-3)

# ---------- 7. 학습 루프 ----------
EPOCHS = 20
for metric in metrics:
    train_df = globals()[f'df_train_{metric.lower()}']
    val_df   = globals()[f'df_val_{metric.lower()}']
    X_train, y_train = rows_to_tensors(train_df, metric)
    X_val,   y_val   = rows_to_tensors(val_df, metric)
    print(f"[{metric}] Train tensor shape :", X_train.shape, y_train.shape)
    print(f"[{metric}] Val   tensor shape :", X_val.shape,   y_val.shape)
    for epoch in range(1, EPOCHS+1):
        model.train()
        for xb, yb in train_loader:
            xb, yb = xb.to(device), yb.to(device)
            pred = model(xb)
            loss = criterion(pred, yb)
            optim.zero_grad()
            loss.backward()
            optim.step()

        # --- val ---
        model.eval()
        preds, trues = [], []
        with torch.no_grad():
            for xb, yb in val_loader:
                xb = xb.to(device)
                prob = model(xb).cpu().numpy()
                preds.append(prob)
                trues.append(yb.numpy())
        preds = np.vstack(preds)
        trues = np.vstack(trues)
        y_hat = (preds >= 0.5).astype(int)
        f1 = f1_score(trues, y_hat, average='macro')
        print(f"Epoch {epoch:2d} | Val macro‑F1: {f1:.4f}")



[Q1] Train tensor shape : (360, 144, 529) (360, 1)
[Q1] Val   tensor shape : (90, 144, 529) (90, 1)
Epoch  1 | Val macro‑F1: 0.5264
Epoch  2 | Val macro‑F1: 0.5342
Epoch  3 | Val macro‑F1: 0.5179
Epoch  4 | Val macro‑F1: 0.5104
Epoch  5 | Val macro‑F1: 0.5031
Epoch  6 | Val macro‑F1: 0.5031
Epoch  7 | Val macro‑F1: 0.5187
Epoch  8 | Val macro‑F1: 0.5654
Epoch  9 | Val macro‑F1: 0.5654
Epoch 10 | Val macro‑F1: 0.5036
Epoch 11 | Val macro‑F1: 0.5676
Epoch 12 | Val macro‑F1: 0.5759
Epoch 13 | Val macro‑F1: 0.5264
Epoch 14 | Val macro‑F1: 0.5636
Epoch 15 | Val macro‑F1: 0.5342
Epoch 16 | Val macro‑F1: 0.5569
Epoch 17 | Val macro‑F1: 0.5537
Epoch 18 | Val macro‑F1: 0.5569
Epoch 19 | Val macro‑F1: 0.5486
Epoch 20 | Val macro‑F1: 0.5750
[Q2] Train tensor shape : (360, 144, 529) (360, 1)
[Q2] Val   tensor shape : (90, 144, 529) (90, 1)
Epoch  1 | Val macro‑F1: 0.5328
Epoch  2 | Val macro‑F1: 0.5403
Epoch  3 | Val macro‑F1: 0.5403
Epoch  4 | Val macro‑F1: 0.5759
Epoch  5 | Val macro‑F1: 0.5503


In [8]:
import numpy as np

Q1_list = [0.5264, 0.5342, 0.5179, 0.5104, 0.5031, 0.5031, 0.5187, 0.5654, 0.5654, 0.5036, 0.5676, 0.5759, 0.5264, 0.5636, 0.5342, 0.5569, 0.5537, 0.5569, 0.5486, 0.5750]
Q2_list = [0.5328, 0.5403, 0.5403, 0.5759, 0.5503, 0.5588, 0.5593, 0.5767, 0.6063, 0.5537, 0.5909, 0.5673, 0.6112, 0.6021, 0.5759, 0.5817, 0.5944, 0.5930, 0.5507, 0.5593]
Q3_list = [0.5878, 0.5680, 0.5750, 0.5588, 0.5588, 0.5750, 0.5853, 0.5661, 0.5786, 0.5750, 0.5503, 0.6121, 0.6000, 0.6277, 0.5971, 0.6156, 0.6021, 0.5680, 0.5840, 0.5840]
S1_list = [0.5930, 0.6027, 0.6021, 0.5726, 0.5750, 0.5817, 0.5750, 0.5971, 0.5750, 0.6063, 0.5750, 0.6000, 0.6000, 0.6184, 0.5909, 0.6021, 0.6092, 0.6404, 0.5840, 0.6250]
S2_list = [0.6000, 0.6000, 0.5840, 0.6000, 0.6000, 0.5840, 0.5840, 0.5840, 0.5840, 0.5840, 0.5840, 0.5840, 0.5840, 0.5840, 0.6000, 0.5840, 0.5840, 0.5840, 0.5840, 0.6000]
S3_list = [0.6000, 0.6000, 0.5840, 0.6000, 0.6156, 0.6063, 0.6063, 0.5909, 0.5909, 0.5930, 0.5846, 0.5746, 0.5673, 0.5944, 0.6250, 0.5909, 0.5588, 0.5750, 0.5840, 0.5750]
# 각 인덱스별로 6개 리스트의 값을 평균내어 f1_list에 저장
f1_list = []
all_lists = [Q1_list, Q2_list, Q3_list, S1_list, S2_list, S3_list]

for i in range(len(Q1_list)):
    avg_value = np.mean([lst[i] for lst in all_lists])
    f1_list.append(avg_value)

print(f1_list)
max_value = max(f1_list)
max_index = f1_list.index(max_value)
print("Max value:", max_value)
print("Index:", max_index)

[np.float64(0.5733333333333334), np.float64(0.5742), np.float64(0.5672166666666667), np.float64(0.5696166666666667), np.float64(0.5671333333333334), np.float64(0.56815), np.float64(0.5714333333333333), np.float64(0.5800333333333333), np.float64(0.5833666666666667), np.float64(0.5692666666666667), np.float64(0.5754), np.float64(0.5856500000000001), np.float64(0.5814833333333334), np.float64(0.5983666666666667), np.float64(0.5871833333333333), np.float64(0.5885333333333334), np.float64(0.5837), np.float64(0.5862166666666666), np.float64(0.57255), np.float64(0.5863833333333334)]
Max value: 0.5983666666666667
Index: 13


# 실제 적용

In [26]:
import pandas as pd


df_zero_filled = pd.read_csv('../data/merged_df_cwj_tozero.csv')
print("✅ df_zero_filled shape:", df_zero_filled.shape)
#df_zero_filled.head()

✅ df_zero_filled shape: (99190, 541)


In [27]:
from sklearn.model_selection import train_test_split

trainset = pd.read_csv('../data/ch2025_metrics_train.csv')

testset = pd.read_csv('../data/ch2025_submission_sample.csv')

In [28]:
print(testset.head(10))
testset.shape

  subject_id  sleep_date lifelog_date  Q1  Q2  Q3  S1  S2  S3
0       id01  2024-07-31   2024-07-30   0   0   0   0   0   0
1       id01  2024-08-01   2024-07-31   0   0   0   0   0   0
2       id01  2024-08-02   2024-08-01   0   0   0   0   0   0
3       id01  2024-08-03   2024-08-02   0   0   0   0   0   0
4       id01  2024-08-04   2024-08-03   0   0   0   0   0   0
5       id01  2024-08-06   2024-08-05   0   0   0   0   0   0
6       id01  2024-08-07   2024-08-06   0   0   0   0   0   0
7       id01  2024-08-09   2024-08-08   0   0   0   0   0   0
8       id01  2024-08-10   2024-08-09   0   0   0   0   0   0
9       id01  2024-08-12   2024-08-11   0   0   0   0   0   0


(250, 9)

In [29]:
print(trainset.head(10))
trainset.shape

  subject_id  sleep_date lifelog_date  Q1  Q2  Q3  S1  S2  S3
0       id01  2024-06-27   2024-06-26   0   0   0   0   0   1
1       id01  2024-06-28   2024-06-27   0   0   0   0   1   1
2       id01  2024-06-29   2024-06-28   1   0   0   1   1   1
3       id01  2024-06-30   2024-06-29   1   0   1   2   0   0
4       id01  2024-07-01   2024-06-30   0   1   1   1   1   1
5       id01  2024-07-02   2024-07-01   0   1   1   0   1   1
6       id01  2024-07-03   2024-07-02   0   0   1   1   0   1
7       id01  2024-07-04   2024-07-03   0   0   0   1   0   1
8       id01  2024-07-05   2024-07-04   0   0   1   0   0   1
9       id01  2024-07-06   2024-07-05   1   1   1   2   1   1


(450, 9)

In [30]:
# Define metric columns
metrics = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']

# Split df_train and df_val for each metric
for metric in metrics:
    globals()[f'trainset_{metric.lower()}'] = trainset[['subject_id', 'sleep_date', 'lifelog_date', metric]].copy()

# Display head of all 12 dataframes
for metric in metrics:
    print(f"trainset_{metric.lower()} head:")
    print(globals()[f'trainset_{metric.lower()}'].head())


trainset_q1 head:
  subject_id  sleep_date lifelog_date  Q1
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   1
3       id01  2024-06-30   2024-06-29   1
4       id01  2024-07-01   2024-06-30   0
trainset_q2 head:
  subject_id  sleep_date lifelog_date  Q2
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   0
3       id01  2024-06-30   2024-06-29   0
4       id01  2024-07-01   2024-06-30   1
trainset_q3 head:
  subject_id  sleep_date lifelog_date  Q3
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   0
3       id01  2024-06-30   2024-06-29   1
4       id01  2024-07-01   2024-06-30   1
trainset_s1 head:
  subject_id  sleep_date lifelog_date  S1
0       id01  2024-06-27   2024-06-26   0
1       id01  2024-06-28   2024-06-27   0
2       id01  2024-06-29   2024-06-28   1
3   

In [None]:
# =========================================================
# 0. 라이브러리
# =========================================================
import os, copy, json, random, math
import numpy as np
import pandas as pd
from tqdm import trange
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# =========================================================
# 1. 데이터 로드
# =========================================================
df_zero_filled = pd.read_csv('../data/merged_df_cwj_tozero.csv')
trainset = pd.read_csv('../data/ch2025_metrics_train.csv')
testset  = pd.read_csv('../data/ch2025_submission_sample.csv')   # id/date 컬럼만 사용

# ---------------------------------------------------------
# 1-1. 시계열 전처리
# ---------------------------------------------------------
df_zero_filled['timestamp']   = pd.to_datetime(df_zero_filled['timestamp'])
df_zero_filled['lifelog_date'] = df_zero_filled['timestamp'].dt.date.astype(str)

DROP_COLS = ['timestamp', 'subject_id', 'lifelog_date']
SENSOR_COLS = [c for c in df_zero_filled.columns if c not in DROP_COLS]

MAX_SEQ = 144    # 10-min × 24 h 기준

def build_sequences(df):
    """Return dict key->np.ndarray(seq_len, n_feat)"""
    seqs = {}
    for (sid, day), g in df.groupby(['subject_id', 'lifelog_date']):
        g = g.sort_values('timestamp')
        x = g[SENSOR_COLS].to_numpy(np.float32)
        if len(x) > MAX_SEQ: x = x[:MAX_SEQ]
        if len(x) < MAX_SEQ:
            x = np.concatenate([x, np.zeros((MAX_SEQ-len(x), x.shape[1]), np.float32)])
        seqs[(sid, day)] = x
    return seqs

SEQ_DICT = build_sequences(df_zero_filled)

# ---------------------------------------------------------
# 1-2. 학습/추론용 텐서 변환
# ---------------------------------------------------------
TARGETS = ['Q1','Q2','Q3','S1','S2','S3']   # S1 = 3-class, others binary

def rows_to_xy(df):
    xs, ys, groups = [], [], []
    for _, r in df.iterrows():
        key = (r.subject_id, r.lifelog_date)
        if key not in SEQ_DICT:        # missing sequence
            continue
        xs.append( SEQ_DICT[key] )
        ys.append( r[TARGETS].to_list() )
        groups.append( r.subject_id )  # fold split anchor
    return np.stack(xs), np.array(ys, np.int64), np.array(groups)

X_all, y_all, group_all = rows_to_xy(trainset)
X_test, _, _ = rows_to_xy(testset)     # y_dummy ignored

# ---------------------------------------------------------
# 1-3. 스케일링 (training set 기준)
# ---------------------------------------------------------
scaler = StandardScaler().fit(X_all.reshape(-1, X_all.shape[-1]))
def scale(x):
    shp = x.shape
    return scaler.transform(x.reshape(-1, shp[-1])).reshape(shp)

X_all  = scale(X_all)
X_test = scale(X_test)

# =========================================================
# 2. Dataset / DataLoader
# =========================================================
class SleepDS(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.long)
    def __len__(self):  return len(self.X)
    def __getitem__(self, i):
        if self.y is None: return self.X[i]
        return self.X[i], self.y[i]

# =========================================================
# 3. GRU 모델 (멀티-헤드)
# =========================================================
class GRUNet(nn.Module):
    def __init__(self, n_feat, hid=128, n_layers=2, drop=0.3):
        super().__init__()
        self.gru = nn.GRU(n_feat, hid, n_layers, batch_first=True, dropout=drop)
        self.heads = nn.ModuleList([
            nn.Linear(hid, 2),   # Q1
            nn.Linear(hid, 2),   # Q2
            nn.Linear(hid, 2),   # Q3
            nn.Linear(hid, 3),   # S1 (3-class)
            nn.Linear(hid, 2),   # S2
            nn.Linear(hid, 2)    # S3
        ])
    def forward(self, x):
        _, h = self.gru(x)
        h = h[-1]                 # last layer output (B,H)
        return [head(h) for head in self.heads]   # list of logits

# loss per task
LOSS_FNS = [
    nn.CrossEntropyLoss(), nn.CrossEntropyLoss(), nn.CrossEntropyLoss(),
    nn.CrossEntropyLoss(), nn.CrossEntropyLoss(), nn.CrossEntropyLoss()
]

# =========================================================
# 4. k-fold validation (subject 그룹 유지)
# =========================================================
N_FOLD     = 5
EPOCH_MAX  = 50
BATCH_SIZE = 64
DEVICE     = 'cuda' if torch.cuda.is_available() else 'cpu'

best_epochs = []          # fold별 best epoch 저장

gkf = GroupKFold(n_splits=N_FOLD)
for fold, (tr_idx, val_idx) in enumerate(gkf.split(X_all, y_all, group_all), 1):
    print(f'\n=== Fold {fold}/{N_FOLD} ===')
    tr_loader = DataLoader(SleepDS(X_all[tr_idx], y_all[tr_idx]),
                           batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(SleepDS(X_all[val_idx], y_all[val_idx]),
                            batch_size=BATCH_SIZE, shuffle=False)

    model = GRUNet(n_feat=X_all.shape[-1]).to(DEVICE)
    opt   = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)

    best_f1, best_ep, best_state = -1, 0, None
    for epoch in range(1, EPOCH_MAX+1):
        # --- train ---
        model.train()
        for xb, yb in tr_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            logits = model(xb)
            loss = sum( LOSS_FNS[k](logits[k], yb[:,k]) for k in range(6) )
            opt.zero_grad(); loss.backward(); opt.step()

        # --- validation ---
        model.eval()
        y_true, y_pred = [[] for _ in range(6)], [[] for _ in range(6)]
        with torch.no_grad():
            for xb, yb in val_loader:
                logits = model(xb.to(DEVICE))
                for k in range(6):
                    preds = logits[k].argmax(1).cpu().numpy()
                    y_pred[k].extend(preds)
                    y_true[k].extend(yb[:,k].numpy())
        f1s = [f1_score(y_true[k], y_pred[k], average='macro') for k in range(6)]
        mac_f1 = np.mean(f1s)
        print(f'E{epoch:02d}  F1={mac_f1:.4f}', end='\r')

        if mac_f1 > best_f1:
            best_f1, best_ep = mac_f1, epoch
            best_state = copy.deepcopy(model.state_dict())
    print(f' -> best F1={best_f1:.4f} @ epoch {best_ep}')
    best_epochs.append(best_ep)

# ---------------------------------------------------------
# 4-1. 최적 epoch 결정 (평균 반올림)
# ---------------------------------------------------------
BEST_EPOCH = int(round(np.mean(best_epochs)))
print('\nSelected BEST_EPOCH =', BEST_EPOCH)

# =========================================================
# 5. 전체 trainset으로 재학습 (BEST_EPOCH), test 예측
# =========================================================
full_loader = DataLoader(SleepDS(X_all, y_all),
                         batch_size=BATCH_SIZE, shuffle=True)
test_loader = DataLoader(SleepDS(X_test),
                         batch_size=BATCH_SIZE, shuffle=False)

model_final = GRUNet(n_feat=X_all.shape[-1]).to(DEVICE)
opt = torch.optim.AdamW(model_final.parameters(), lr=3e-4, weight_decay=1e-2)

for epoch in trange(1, BEST_EPOCH+1, desc='Final-train'):
    model_final.train()
    for xb, yb in full_loader:
        xb, yb = xb.to(DEVICE), yb.to(DEVICE)
        logits = model_final(xb)
        loss = sum( LOSS_FNS[k](logits[k], yb[:,k]) for k in range(6) )
        opt.zero_grad(); loss.backward(); opt.step()

# --- inference ---
model_final.eval()
preds_all = [[] for _ in range(6)]
with torch.no_grad():
    for xb in test_loader:
        logits = model_final(xb.to(DEVICE))
        for k in range(6):
            preds_all[k].extend( logits[k].argmax(1).cpu().numpy() )

# =========================================================
# 6. submission 생성
# =========================================================
sub = testset[['subject_id','sleep_date','lifelog_date']].copy()
for k,t in enumerate(TARGETS):
    sub[t] = preds_all[k]

SAVE_PATH = '../data/submission.csv'
sub.to_csv(SAVE_PATH, index=False)
print('✅ submission saved ->', SAVE_PATH)

# 10m 45.7s



=== Fold 1/5 ===
 -> best F1=0.4911 @ epoch 42

=== Fold 2/5 ===
 -> best F1=0.5031 @ epoch 49

=== Fold 3/5 ===
 -> best F1=0.4711 @ epoch 3

=== Fold 4/5 ===
 -> best F1=0.4079 @ epoch 49

=== Fold 5/5 ===
 -> best F1=0.4749 @ epoch 46

Selected BEST_EPOCH = 38


Final-train: 100%|██████████| 38/38 [01:09<00:00,  1.82s/it]


✅ submission saved -> ../data/submission.csv


# GRU for each metric

In [None]:
# 🔁 GRU 모델을 지표별로 따로 학습/추론하는 구조로 수정한 코드
# 각 지표별로 best epoch 및 best model 추적 → 개별 모델로 testset 예측

import os, copy, random
import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

# ------------------------
# Load Data
# ------------------------
df_zero_filled = pd.read_csv('../data/merged_df_cwj_tozero.csv')
trainset = pd.read_csv('../data/ch2025_metrics_train.csv')
testset  = pd.read_csv('../data/ch2025_submission_sample.csv')

# Preprocess
TARGETS = ['Q1','Q2','Q3','S1','S2','S3']
df_zero_filled['timestamp'] = pd.to_datetime(df_zero_filled['timestamp'])
df_zero_filled['lifelog_date'] = df_zero_filled['timestamp'].dt.date.astype(str)

DROP_COLS = ['timestamp', 'subject_id', 'lifelog_date']
SENSOR_COLS = [c for c in df_zero_filled.columns if c not in DROP_COLS]
MAX_SEQ = 144

def build_sequences(df):
    seqs = {}
    for (sid, day), g in df.groupby(['subject_id', 'lifelog_date']):
        g = g.sort_values('timestamp')
        x = g[SENSOR_COLS].to_numpy(np.float32)
        if len(x) > MAX_SEQ: x = x[:MAX_SEQ]
        if len(x) < MAX_SEQ:
            x = np.concatenate([x, np.zeros((MAX_SEQ-len(x), x.shape[1]), np.float32)])
        seqs[(sid, day)] = x
    return seqs

SEQ_DICT = build_sequences(df_zero_filled)

def rows_to_xy(df):
    xs, ys, groups = [], [], []
    for _, r in df.iterrows():
        key = (r.subject_id, r.lifelog_date)
        if key not in SEQ_DICT:
            continue
        xs.append(SEQ_DICT[key])
        ys.append(r[TARGETS].to_list())
        groups.append(r.subject_id)
    return np.stack(xs), np.array(ys, np.int64), np.array(groups)

X_all, y_all, group_all = rows_to_xy(trainset)
X_test, _, _ = rows_to_xy(testset)

# Scale
scaler = StandardScaler().fit(X_all.reshape(-1, X_all.shape[-1]))
def scale(x):
    shp = x.shape
    return scaler.transform(x.reshape(-1, shp[-1])).reshape(shp)

X_all = scale(X_all)
X_test = scale(X_test)

# ------------------------
# Dataset / Model
# ------------------------
class SleepDS(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.X)
    def __getitem__(self, i):
        return self.X[i] if self.y is None else (self.X[i], self.y[i])

class SingleHeadGRU(nn.Module):
    def __init__(self, input_dim, out_dim, hidden=128, layers=2, drop=0.3):
        super().__init__()
        self.gru = nn.GRU(input_dim, hidden, layers, batch_first=True, dropout=drop)
        self.fc = nn.Linear(hidden, out_dim)
    def forward(self, x):
        _, h = self.gru(x)
        return self.fc(h[-1])

# ------------------------
# Train per target
# ------------------------
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
N_FOLD = 5
EPOCH_MAX = 50
BATCH_SIZE = 64

preds_dict = {}

for k, target in enumerate(TARGETS):
    print(f"\n=== {target} training ===")
    y_target = y_all[:, k]
    out_dim = 3 if target == 'S1' else 2
    criterion = nn.CrossEntropyLoss()
    best_epochs = []

    gkf = GroupKFold(n_splits=N_FOLD)
    for fold, (tr_idx, val_idx) in enumerate(gkf.split(X_all, y_target, group_all)):
        model = SingleHeadGRU(X_all.shape[-1], out_dim).to(DEVICE)
        opt = torch.optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-2)

        best_f1, best_ep, best_state = -1, 0, None
        for epoch in range(1, EPOCH_MAX+1):
            model.train()
            for xb, yb in DataLoader(SleepDS(X_all[tr_idx], y_target[tr_idx]), batch_size=BATCH_SIZE, shuffle=True):
                xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                pred = model(xb)
                loss = criterion(pred, yb)
                opt.zero_grad(); loss.backward(); opt.step()

            # validation
            model.eval()
            all_pred, all_true = [], []
            with torch.no_grad():
                for xb, yb in DataLoader(SleepDS(X_all[val_idx], y_target[val_idx]), batch_size=BATCH_SIZE):
                    pred = model(xb.to(DEVICE)).argmax(1).cpu().numpy()
                    all_pred.extend(pred)
                    all_true.extend(yb.numpy())
            f1 = f1_score(all_true, all_pred, average='macro')
            if f1 > best_f1:
                best_f1 = f1
                best_ep = epoch
                best_state = copy.deepcopy(model.state_dict())

        best_epochs.append(best_ep)
        print(f"Fold {fold+1} - Best F1: {best_f1:.4f} @ Epoch {best_ep}")

    # Final train on all data
    final_model = SingleHeadGRU(X_all.shape[-1], out_dim).to(DEVICE)
    opt = torch.optim.AdamW(final_model.parameters(), lr=3e-4, weight_decay=1e-2)
    loader = DataLoader(SleepDS(X_all, y_target), batch_size=BATCH_SIZE, shuffle=True)
    for epoch in range(1, int(round(np.mean(best_epochs)))+1):
        final_model.train()
        for xb, yb in loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            pred = final_model(xb)
            loss = criterion(pred, yb)
            opt.zero_grad(); loss.backward(); opt.step()

    # Predict
    final_model.eval()
    preds = []
    with torch.no_grad():
        for xb in DataLoader(SleepDS(X_test), batch_size=BATCH_SIZE):
            out = final_model(xb.to(DEVICE))
            preds.extend(out.argmax(1).cpu().numpy())
    preds_dict[target] = preds

# ------------------------
# Submission
# ------------------------
sub = testset[['subject_id','sleep_date','lifelog_date']].copy()
for t in TARGETS:
    sub[t] = preds_dict[t]

SAVE_PATH = '../data/submission2.csv'
sub.to_csv(SAVE_PATH, index=False)
print('✅ submission saved ->', SAVE_PATH)

# 85m 54.7s for 5 folds, 50 epochs each


=== Q1 training ===
Fold 1 - Best F1: 0.5968 @ Epoch 40
Fold 2 - Best F1: 0.5137 @ Epoch 15
Fold 3 - Best F1: 0.6220 @ Epoch 49
Fold 4 - Best F1: 0.5160 @ Epoch 48
Fold 5 - Best F1: 0.6735 @ Epoch 38

=== Q2 training ===
Fold 1 - Best F1: 0.6627 @ Epoch 31
Fold 2 - Best F1: 0.6104 @ Epoch 1
Fold 3 - Best F1: 0.5930 @ Epoch 4
Fold 4 - Best F1: 0.5135 @ Epoch 18
Fold 5 - Best F1: 0.5704 @ Epoch 33

=== Q3 training ===
Fold 1 - Best F1: 0.5507 @ Epoch 1
Fold 2 - Best F1: 0.4218 @ Epoch 44
Fold 3 - Best F1: 0.5312 @ Epoch 1
Fold 4 - Best F1: 0.5356 @ Epoch 50
Fold 5 - Best F1: 0.5885 @ Epoch 40

=== S1 training ===
Fold 1 - Best F1: 0.3190 @ Epoch 22
Fold 2 - Best F1: 0.3862 @ Epoch 50
Fold 3 - Best F1: 0.3525 @ Epoch 29
Fold 4 - Best F1: 0.4463 @ Epoch 25
Fold 5 - Best F1: 0.3423 @ Epoch 31

=== S2 training ===
Fold 1 - Best F1: 0.6495 @ Epoch 3
Fold 2 - Best F1: 0.5665 @ Epoch 24
Fold 3 - Best F1: 0.5767 @ Epoch 28
Fold 4 - Best F1: 0.5967 @ Epoch 28
Fold 5 - Best F1: 0.5178 @ Epoch 31


# GRU + OPTUNA
- n_trials = 20, FOLD = 5

In [33]:
# 🚀 GRU with Optuna Hyperparameter Tuning (per‑target)
# - Tune hidden size, layers, dropout, learning rate, weight decay, epochs
# - Use GroupKFold inside Optuna objective (validation F1 ↑)
# - Train final model with best params, predict test, save submission.csv
# Requirements: pip install optuna

import os, copy, random, warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
from sklearn.model_selection import GroupKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import StandardScaler

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import optuna                          # ✨ NEW

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)

df_zero_filled = pd.read_csv('../data/merged_df_cwj_tozero.csv')
trainset = pd.read_csv('../data/ch2025_metrics_train.csv')
testset  = pd.read_csv('../data/ch2025_submission_sample.csv')

TARGETS = ['Q1','Q2','Q3','S1','S2','S3']

df_zero_filled['timestamp']   = pd.to_datetime(df_zero_filled['timestamp'])
df_zero_filled['lifelog_date'] = df_zero_filled['timestamp'].dt.date.astype(str)

DROP_COLS   = ['timestamp','subject_id','lifelog_date']
SENSOR_COLS = [c for c in df_zero_filled.columns if c not in DROP_COLS]
MAX_SEQ     = 144   # 10‑min resolution

# ---------- utils ----------

def build_sequences(df):
    seqs = {}
    for (sid, day), g in df.groupby(['subject_id','lifelog_date']):
        g = g.sort_values('timestamp')
        x = g[SENSOR_COLS].astype('float32').to_numpy()
        if len(x) > MAX_SEQ: x = x[:MAX_SEQ]
        if len(x) < MAX_SEQ:
            x = np.concatenate([x, np.zeros((MAX_SEQ-len(x), x.shape[1]), np.float32)])
        seqs[(sid, day)] = x
    return seqs

SEQ_DICT = build_sequences(df_zero_filled)


def rows_to_xy(df):
    xs, ys, groups = [], [], []
    for _, r in df.iterrows():
        k = (r.subject_id, r.lifelog_date)
        if k not in SEQ_DICT:
            continue
        xs.append(SEQ_DICT[k])
        ys.append(r[TARGETS].to_list())
        groups.append(r.subject_id)
    return np.stack(xs), np.array(ys, np.int64), np.array(groups)

X_all, y_all, group_all = rows_to_xy(trainset)
X_test, _, _            = rows_to_xy(testset)

# scale
scaler = StandardScaler().fit(X_all.reshape(-1, X_all.shape[-1]))
X_all  = scaler.transform(X_all.reshape(-1, X_all.shape[-1])).reshape(X_all.shape)
X_test = scaler.transform(X_test.reshape(-1, X_test.shape[-1])).reshape(X_test.shape)

# ---------- dataset ----------
class SleepDS(Dataset):
    def __init__(self, X, y=None):
        self.X = torch.tensor(X, dtype=torch.float32)
        self.y = None if y is None else torch.tensor(y, dtype=torch.long)
    def __len__(self): return len(self.X)
    def __getitem__(self, idx):
        return self.X[idx] if self.y is None else (self.X[idx], self.y[idx])

# ---------- model ----------
class SingleHeadGRU(nn.Module):
    def __init__(self, inp_dim, out_dim, hidden, layers, drop):
        super().__init__()
        self.gru = nn.GRU(inp_dim, hidden, layers, batch_first=True, dropout=drop)
        self.fc  = nn.Linear(hidden, out_dim)
    def forward(self, x):
        _, h = self.gru(x)
        return self.fc(h[-1])

DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
BATCH_SIZE_DEFAULT = 64
N_FOLD = 5

# --------------------------------------------------
# Optuna tuning per target
# --------------------------------------------------
preds_dict = {}

for idx_target, target in enumerate(TARGETS):
    print(f"\n🎯 Optimizing {target}")
    y_target = y_all[:, idx_target]
    out_dim  = 3 if target == 'S1' else 2
    criterion = nn.CrossEntropyLoss()

    def objective(trial):
        # hyperparameters to tune
        hidden = trial.suggest_int('hidden', 64, 256, step=64)
        layers = trial.suggest_int('layers', 1, 3)
        drop   = trial.suggest_float('drop', 0.1, 0.5, step=0.1)
        lr     = trial.suggest_loguniform('lr', 1e-4, 1e-2)
        wd     = trial.suggest_loguniform('wd', 1e-5, 1e-2)
        epochs = trial.suggest_int('epochs', 10, 40, step=10)

        gkf = GroupKFold(n_splits=N_FOLD)
        f1_scores = []

        for tr_idx, val_idx in gkf.split(X_all, y_target, group_all):
            model = SingleHeadGRU(X_all.shape[-1], out_dim, hidden, layers, drop).to(DEVICE)
            opt   = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=wd)
            tr_loader  = DataLoader(SleepDS(X_all[tr_idx], y_target[tr_idx]), batch_size=BATCH_SIZE_DEFAULT, shuffle=True)
            val_loader = DataLoader(SleepDS(X_all[val_idx], y_target[val_idx]), batch_size=BATCH_SIZE_DEFAULT)

            # training loop
            for epoch in range(epochs):
                model.train()
                for xb, yb in tr_loader:
                    xb, yb = xb.to(DEVICE), yb.to(DEVICE)
                    loss = criterion(model(xb), yb)
                    opt.zero_grad(); loss.backward(); opt.step()

            # validation
            model.eval(); y_true, y_pred = [], []
            with torch.no_grad():
                for xb, yb in val_loader:
                    preds = model(xb.to(DEVICE)).argmax(1).cpu().numpy()
                    y_pred.extend(preds); y_true.extend(yb.numpy())
            f1_scores.append(f1_score(y_true, y_pred, average='macro'))

        return float(np.mean(f1_scores))

    study = optuna.create_study(direction='maximize')
    study.optimize(objective, n_trials=20, timeout=None)  # 🕑 adjust n_trials
    best_params = study.best_params
    print('Best params ->', best_params, 'Best F1 ->', study.best_value)

    # ---------- train final model with best params ----------
    hidden = best_params['hidden']
    layers = best_params['layers']
    drop   = best_params['drop']
    lr     = best_params['lr']
    wd     = best_params['wd']
    epochs = best_params['epochs']

    model_final = SingleHeadGRU(X_all.shape[-1], out_dim, hidden, layers, drop).to(DEVICE)
    opt_final   = torch.optim.AdamW(model_final.parameters(), lr=lr, weight_decay=wd)
    full_loader = DataLoader(SleepDS(X_all, y_target), batch_size=BATCH_SIZE_DEFAULT, shuffle=True)

    for _ in range(epochs):
        model_final.train()
        for xb, yb in full_loader:
            xb, yb = xb.to(DEVICE), yb.to(DEVICE)
            loss = criterion(model_final(xb), yb)
            opt_final.zero_grad(); loss.backward(); opt_final.step()

    # ---------- predict test ----------
    model_final.eval(); preds = []
    with torch.no_grad():
        for xb in DataLoader(SleepDS(X_test), batch_size=BATCH_SIZE_DEFAULT):
            preds.extend(model_final(xb.to(DEVICE)).argmax(1).cpu().numpy())
    preds_dict[target] = preds

# --------------------------------------------------
# Build submission
# --------------------------------------------------
sub = testset[['subject_id','sleep_date','lifelog_date']].copy()
for t in TARGETS:
    sub[t] = preds_dict[t]

SAVE_PATH = '../data/submission_optuna.csv'
sub.to_csv(SAVE_PATH, index=False)
print('✅ submission saved ->', SAVE_PATH)


[I 2025-06-05 14:58:00,343] A new study created in memory with name: no-name-20fccd60-f1f4-4777-9800-584b7dc6b52c



🎯 Optimizing Q1


[I 2025-06-05 15:03:36,217] Trial 0 finished with value: 0.4808357797731812 and parameters: {'hidden': 192, 'layers': 3, 'drop': 0.30000000000000004, 'lr': 0.0048856791494689585, 'wd': 0.0002509962492049059, 'epochs': 30}. Best is trial 0 with value: 0.4808357797731812.
[I 2025-06-05 15:08:40,293] Trial 1 finished with value: 0.5023608089565808 and parameters: {'hidden': 128, 'layers': 3, 'drop': 0.1, 'lr': 0.0008476317631758312, 'wd': 0.0006430327830613585, 'epochs': 30}. Best is trial 1 with value: 0.5023608089565808.
[I 2025-06-05 15:10:16,883] Trial 2 finished with value: 0.5307364925233908 and parameters: {'hidden': 64, 'layers': 1, 'drop': 0.30000000000000004, 'lr': 0.009010815268596269, 'wd': 1.0287136692431994e-05, 'epochs': 10}. Best is trial 2 with value: 0.5307364925233908.
[I 2025-06-05 15:13:37,058] Trial 3 finished with value: 0.49174575264657194 and parameters: {'hidden': 256, 'layers': 3, 'drop': 0.1, 'lr': 0.0022538037118753278, 'wd': 0.00022761365993591062, 'epochs': 

Best params -> {'hidden': 64, 'layers': 2, 'drop': 0.2, 'lr': 0.009939767427754764, 'wd': 1.0342806430569786e-05, 'epochs': 10} Best F1 -> 0.566690939321625


[I 2025-06-05 15:49:38,943] A new study created in memory with name: no-name-65a592bf-16e3-48e8-8755-905ec321c4d9



🎯 Optimizing Q2


[I 2025-06-05 15:51:05,760] Trial 0 finished with value: 0.4930097328179969 and parameters: {'hidden': 256, 'layers': 2, 'drop': 0.5, 'lr': 0.0004606657125303683, 'wd': 0.00020232758057082577, 'epochs': 10}. Best is trial 0 with value: 0.4930097328179969.
[I 2025-06-05 15:55:32,285] Trial 1 finished with value: 0.4958816499489801 and parameters: {'hidden': 192, 'layers': 3, 'drop': 0.4, 'lr': 0.000863341771774417, 'wd': 1.3192290066940933e-05, 'epochs': 30}. Best is trial 1 with value: 0.4958816499489801.
[I 2025-06-05 15:58:31,840] Trial 2 finished with value: 0.5062630339388019 and parameters: {'hidden': 128, 'layers': 3, 'drop': 0.2, 'lr': 0.00013624267312673752, 'wd': 0.0001845821644537221, 'epochs': 20}. Best is trial 2 with value: 0.5062630339388019.
[I 2025-06-05 16:05:07,421] Trial 3 finished with value: 0.44245446159071256 and parameters: {'hidden': 64, 'layers': 2, 'drop': 0.2, 'lr': 0.0009804415974283619, 'wd': 0.00020808405673901893, 'epochs': 40}. Best is trial 2 with valu

Best params -> {'hidden': 128, 'layers': 3, 'drop': 0.2, 'lr': 0.00013624267312673752, 'wd': 0.0001845821644537221, 'epochs': 20} Best F1 -> 0.5062630339388019


[I 2025-06-05 16:49:49,849] A new study created in memory with name: no-name-77720886-db5e-4068-9e69-155e9a6a04e1



🎯 Optimizing Q3


[I 2025-06-05 16:52:26,585] Trial 0 finished with value: 0.46799626004728223 and parameters: {'hidden': 128, 'layers': 1, 'drop': 0.4, 'lr': 0.009080997950698, 'wd': 8.537882259216108e-05, 'epochs': 20}. Best is trial 0 with value: 0.46799626004728223.
[I 2025-06-05 16:55:02,219] Trial 1 finished with value: 0.40356325599378584 and parameters: {'hidden': 256, 'layers': 1, 'drop': 0.2, 'lr': 0.0001994055087091531, 'wd': 0.002684431121309825, 'epochs': 20}. Best is trial 0 with value: 0.46799626004728223.
[I 2025-06-05 16:59:02,143] Trial 2 finished with value: 0.4505269941725816 and parameters: {'hidden': 64, 'layers': 3, 'drop': 0.2, 'lr': 0.0001740202704426324, 'wd': 0.006713460894859095, 'epochs': 40}. Best is trial 0 with value: 0.46799626004728223.
[I 2025-06-05 17:00:32,817] Trial 3 finished with value: 0.4090435392927427 and parameters: {'hidden': 192, 'layers': 3, 'drop': 0.4, 'lr': 0.0011070428509169143, 'wd': 0.00010452197025502075, 'epochs': 10}. Best is trial 0 with value: 0

Best params -> {'hidden': 256, 'layers': 2, 'drop': 0.1, 'lr': 0.002536242954103861, 'wd': 1.250607484415822e-05, 'epochs': 30} Best F1 -> 0.5131683451272998


[I 2025-06-05 18:16:07,445] A new study created in memory with name: no-name-954392a2-c89f-4248-9491-525ee539c9dc



🎯 Optimizing S1


[I 2025-06-05 18:19:52,753] Trial 0 finished with value: 0.34024242872965627 and parameters: {'hidden': 128, 'layers': 2, 'drop': 0.4, 'lr': 0.0008745733512930146, 'wd': 0.00843908855446404, 'epochs': 20}. Best is trial 0 with value: 0.34024242872965627.
[I 2025-06-05 18:25:46,911] Trial 1 finished with value: 0.3402171021387113 and parameters: {'hidden': 64, 'layers': 1, 'drop': 0.5, 'lr': 0.005374640860659699, 'wd': 0.0002544092942471286, 'epochs': 40}. Best is trial 0 with value: 0.34024242872965627.
[I 2025-06-05 18:31:19,633] Trial 2 finished with value: 0.2940094296146226 and parameters: {'hidden': 256, 'layers': 3, 'drop': 0.4, 'lr': 0.0002377692583660246, 'wd': 2.226396220035422e-05, 'epochs': 40}. Best is trial 0 with value: 0.34024242872965627.
[I 2025-06-05 18:33:56,965] Trial 3 finished with value: 0.3262898069501778 and parameters: {'hidden': 256, 'layers': 3, 'drop': 0.5, 'lr': 0.002535431772853788, 'wd': 0.000156957184280679, 'epochs': 20}. Best is trial 0 with value: 0.

Best params -> {'hidden': 192, 'layers': 2, 'drop': 0.1, 'lr': 0.0021564326902317305, 'wd': 2.087515700664565e-05, 'epochs': 10} Best F1 -> 0.3801351987973203


[I 2025-06-05 19:26:17,052] A new study created in memory with name: no-name-c15df6c9-0a1f-4d28-977c-fbfba3705a1b



🎯 Optimizing S2


[I 2025-06-05 19:30:50,000] Trial 0 finished with value: 0.5320869171639158 and parameters: {'hidden': 64, 'layers': 2, 'drop': 0.5, 'lr': 0.0027480829123465327, 'wd': 0.0003239780262680119, 'epochs': 30}. Best is trial 0 with value: 0.5320869171639158.
[I 2025-06-05 19:41:29,220] Trial 1 finished with value: 0.5241168663273127 and parameters: {'hidden': 128, 'layers': 3, 'drop': 0.5, 'lr': 0.00031164807491248435, 'wd': 6.395895715946433e-05, 'epochs': 40}. Best is trial 0 with value: 0.5320869171639158.
[I 2025-06-05 19:44:16,869] Trial 2 finished with value: 0.5039539355190009 and parameters: {'hidden': 64, 'layers': 1, 'drop': 0.30000000000000004, 'lr': 0.00048628492929025496, 'wd': 1.1949690969130588e-05, 'epochs': 20}. Best is trial 0 with value: 0.5320869171639158.
[I 2025-06-05 19:53:14,674] Trial 3 finished with value: 0.506178446745453 and parameters: {'hidden': 256, 'layers': 1, 'drop': 0.2, 'lr': 0.003866686069963752, 'wd': 0.00048645062418645624, 'epochs': 30}. Best is tria

Best params -> {'hidden': 64, 'layers': 2, 'drop': 0.4, 'lr': 0.009496203492354464, 'wd': 0.0014535062380554168, 'epochs': 10} Best F1 -> 0.5382710499714908


[I 2025-06-05 20:53:54,707] A new study created in memory with name: no-name-688ddabb-dcbb-4d0c-a398-1d9737eb2741



🎯 Optimizing S3


[I 2025-06-05 20:54:22,306] Trial 0 finished with value: 0.44799479877004983 and parameters: {'hidden': 256, 'layers': 1, 'drop': 0.5, 'lr': 0.0004727264082242207, 'wd': 0.0011531906565245898, 'epochs': 40}. Best is trial 0 with value: 0.44799479877004983.
[I 2025-06-05 20:54:42,299] Trial 1 finished with value: 0.4261338030618921 and parameters: {'hidden': 128, 'layers': 3, 'drop': 0.5, 'lr': 0.0013206701628361622, 'wd': 0.0003957140996103862, 'epochs': 40}. Best is trial 0 with value: 0.44799479877004983.
[I 2025-06-05 20:54:53,687] Trial 2 finished with value: 0.4282468332344698 and parameters: {'hidden': 192, 'layers': 3, 'drop': 0.2, 'lr': 0.0003014516478996842, 'wd': 0.005816372451915116, 'epochs': 10}. Best is trial 0 with value: 0.44799479877004983.
[I 2025-06-05 20:55:43,911] Trial 3 finished with value: 0.5125244467688084 and parameters: {'hidden': 256, 'layers': 3, 'drop': 0.1, 'lr': 0.00015771374640461902, 'wd': 0.004076354233663116, 'epochs': 40}. Best is trial 3 with valu

Best params -> {'hidden': 256, 'layers': 3, 'drop': 0.1, 'lr': 0.00015771374640461902, 'wd': 0.004076354233663116, 'epochs': 40} Best F1 -> 0.5125244467688084
✅ submission saved -> ../data/submission_optuna.csv
