# Baseline Sleep-Quality Prediction (id-wise)
이번 노트북은 `merge_df.csv`의 10분 단위 생체 로그 데이터를 일별로 요약하고, 각 `subject_id`별로 별도의 스태킹-배깅 앙상블 모델을 학습해 6개 지표(Q1–Q3, S1–S3)를 예측합니다.

**출력**: `Model test_1/baseline_submission_idwise_v2.csv`

In [None]:
import warnings
warnings.filterwarnings('ignore')

from pathlib import Path
import numpy as np
import pandas as pd

from sklearn.ensemble import StackingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from inspect import signature

from lightgbm import LGBMClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

# config
RANDOM_STATE = 42
TARGETS = ['Q1','Q2','Q3','S1','S2','S3']
NON_FEATURES = ['subject_id','sleep_date','lifelog_date'] + TARGETS
print('scikit-learn version:', pd.__version__)
print('LightGBM version:', LGBMClassifier()._get_tags())

scikit-learn version: 2.2.2
LightGBM version: {'array_api_support': False, 'non_deterministic': False, 'requires_positive_X': False, 'requires_positive_y': False, 'X_types': ['2darray', 'sparse', '1dlabels'], 'poor_score': False, 'no_validation': False, 'multioutput': False, 'allow_nan': True, 'stateless': False, 'multilabel': False, '_skip_test': False, '_xfail_checks': {'check_no_attributes_set_in_init': 'scikit-learn incorrectly asserts that private attributes cannot be set in __init__: (see https://github.com/microsoft/LightGBM/issues/2628)', 'check_sample_weight_equivalence': "In LightGBM, setting a sample's weight to 0 can produce a different result than omitting the sample. Such samples intentionally still affect count-based measures like 'min_data_in_leaf' (https://github.com/microsoft/LightGBM/issues/5626#issuecomment-1712706678) and the estimated distribution of features for Dataset construction (see https://github.com/microsoft/LightGBM/issues/5553).", 'check_sample_weight_e

In [7]:
# %%
# 2) 데이터 로드 및 확인
merge_df = pd.read_csv('merge_df.csv')
train_lbl = pd.read_csv('ch2025_metrics_train.csv')
test_tpl  = pd.read_csv('ch2025_submission_sample.csv')

print('merge_df:', merge_df.shape)
print('train_lbl:', train_lbl.shape)
print('test_tpl :', test_tpl.shape)

merge_df: (76753, 15)
train_lbl: (450, 9)
test_tpl : (250, 9)


In [None]:
# 날짜 column 형식
for df in (train_lbl, test_tpl):
    for col in ['lifelog_date','sleep_date']:
        if col in df.columns:
            df[col] = pd.to_datetime(df[col], errors='coerce').dt.date

print(train_lbl[['subject_id','sleep_date','lifelog_date']].head())

  subject_id  sleep_date lifelog_date
0       id01  2024-06-27   2024-06-26
1       id01  2024-06-28   2024-06-27
2       id01  2024-06-29   2024-06-28
3       id01  2024-06-30   2024-06-29
4       id01  2024-07-01   2024-06-30


In [None]:
# 일별 집계
def aggregate_lifelog(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()
    df['timestamp'] = pd.to_datetime(df['timestamp'], errors='coerce')
    df['lifelog_date'] = df['timestamp'].dt.date
    # 센서 컬럼 강제 numeric
    exclude = {'subject_id','timestamp','lifelog_date'}
    for col in df.columns.difference(exclude):
        df[col] = pd.to_numeric(df[col], errors='coerce')
    numeric_cols = df.select_dtypes('number').columns.difference(['subject_id'])
    agg = df.groupby(['subject_id','lifelog_date'])[numeric_cols] \
             .agg(['mean','std','min','max','sum'])
    agg.columns = [f'{c}_{stat}' for c,stat in agg.columns]
    return agg.reset_index()


lifelog_daily = aggregate_lifelog(merge_df)
print(lifelog_daily.shape)
lifelog_daily.head()

(709, 67)


Unnamed: 0,subject_id,lifelog_date,avg_altitude_mean,avg_altitude_std,avg_altitude_min,avg_altitude_max,avg_altitude_sum,avg_heart_rate_mean,avg_heart_rate_std,avg_heart_rate_min,...,wb_rssi_mean,wb_rssi_std,wb_rssi_min,wb_rssi_max,wb_rssi_sum,ww_rssi_mean,ww_rssi_std,ww_rssi_min,ww_rssi_max,ww_rssi_sum
0,id01,2024-06-26,89.777746,13.057159,61.135626,116.080379,6284.442204,85.659264,10.873441,68.420367,...,0.013478,0.021593,0.0,0.098621,0.943474,0.037841,0.039018,0.0,0.207389,2.648902
1,id01,2024-06-27,92.761131,13.898336,62.496463,139.125076,13357.602887,85.506998,9.485955,65.372542,...,0.00872,0.017648,0.0,0.099395,1.255674,0.035326,0.039301,0.0,0.164983,5.086907
2,id01,2024-06-28,91.184423,12.080225,73.600027,104.287955,13039.372422,79.135191,9.741568,63.520551,...,0.007572,0.01413,0.0,0.073282,1.082823,0.032136,0.041206,0.0,0.24219,4.595484
3,id01,2024-06-29,102.122124,8.322756,75.504083,142.228182,14705.585842,70.529159,13.9578,53.0,...,0.002705,0.006836,0.0,0.044107,0.389517,0.057289,0.050153,0.0,0.253416,8.249561
4,id01,2024-06-30,108.977957,12.321548,70.263226,130.47129,15692.825775,95.272954,8.469067,78.211111,...,0.00391,0.011104,0.0,0.062794,0.563095,0.037335,0.040901,0.0,0.187816,5.376227


In [None]:
# Train/Test
train = train_lbl.merge(lifelog_daily, on=['subject_id','lifelog_date'], how='left')
test  = test_tpl.merge(lifelog_daily, on=['subject_id','lifelog_date'], how='left')
print('After merge: train', train.shape, 'test', test.shape)

After merge: train (450, 74) test (250, 74)


In [None]:
#  피처 선택 및 결측 처리
cand_features = train.columns.difference(NON_FEATURES)
feature_cols = [c for c in cand_features if not train[c].isna().all()]
if not feature_cols:
    # 잠깐 오류고치려고 결측치 1로 채움(이거 바꿔야함)
    train['bias'] = 1.0
    test['bias']  = 1.0
    feature_cols = ['bias']
print('사용 피처 개수:', len(feature_cols))
feature_cols[:5]

사용 피처 개수: 65


['avg_altitude_max',
 'avg_altitude_mean',
 'avg_altitude_min',
 'avg_altitude_std',
 'avg_altitude_sum']

In [None]:
# 모델 정의 - 4개 stacking
def bagging_supports_estimator() -> bool:
    return 'estimator' in signature(BaggingClassifier.__init__).parameters

def make_ensemble(n_classes: int):
    base_learners = [
        ('lgb', LGBMClassifier(n_estimators=400, learning_rate=0.05,
             objective='multiclass' if n_classes>2 else 'binary',
             num_class=n_classes if n_classes>2 else None,
             random_state=RANDOM_STATE)),
        ('xgb', XGBClassifier(n_estimators=400, learning_rate=0.05,
             objective='multi:softprob' if n_classes>2 else 'binary:logistic',
             num_class=n_classes if n_classes>2 else None,
             eval_metric='mlogloss' if n_classes>2 else 'logloss',
             n_jobs=-1, random_state=RANDOM_STATE)),
        ('cat', CatBoostClassifier(iterations=400, learning_rate=0.05,
             loss_function='MultiClass' if n_classes>2 else 'Logloss',
             verbose=0, random_seed=RANDOM_STATE)),
        ('ada', AdaBoostClassifier(n_estimators=400, random_state=RANDOM_STATE))
    ]
    meta = LogisticRegression(max_iter=1000,
                              multi_class='multinomial' if n_classes>2 else 'auto',
                              n_jobs=-1)
    stack = StackingClassifier(estimators=base_learners,
                              final_estimator=meta,
                              passthrough=True, n_jobs=-1)
    pipe = Pipeline([('imputer', SimpleImputer(strategy='constant', fill_value=0)),
                     ('model', stack)])
    bag_kwargs = dict(n_estimators=10, max_samples=0.8, bootstrap=True,
                      n_jobs=-1, random_state=RANDOM_STATE)
    if bagging_supports_estimator():
        return BaggingClassifier(estimator=pipe, **bag_kwargs)
    else:
        return BaggingClassifier(base_estimator=pipe, **bag_kwargs)
print('Model factory ready')

Model factory ready


In [None]:
# id별 모델 학습 함수 정의
def fit_models_per_id(train_df: pd.DataFrame, feature_cols):
    models, global_cache = {}, {}
    for sid, grp in train_df.groupby('subject_id'):
        X_grp = grp[feature_cols]
        for tgt in TARGETS:
            y = grp[tgt]
            if len(grp)<5 or y.nunique()==1:
                if tgt not in global_cache:
                    gm = make_ensemble(train_df[tgt].nunique())
                    gm.fit(train_df[feature_cols], train_df[tgt])
                    global_cache[tgt] = gm
                models[(sid, tgt)] = global_cache[tgt]
            else:
                m = make_ensemble(y.nunique())
                m.fit(X_grp, y)
                models[(sid, tgt)] = m
    return models
print('fit_models_per_id ready')

fit_models_per_id ready


In [17]:
# 모델 학습 및 추론
models = fit_models_per_id(train, feature_cols)

# subject_id별로 한꺼번에 예측
for sid in test['subject_id'].unique():
    mask    = test['subject_id'] == sid
    X_sid   = test.loc[mask, feature_cols]
    for tgt in TARGETS:
        # predict → 1D array (shape=(n_rows,))
        preds = models[(sid, tgt)].predict(X_sid)
        test.loc[mask, tgt] = preds

test

ValueError: shape mismatch: value array of shape (7,3) could not be broadcast to indexing result of shape (7,2)

In [None]:
# 결과 저장
sub_cols = ['subject_id','sleep_date','lifelog_date'] + TARGETS
out = 'baseline_submission_idwise_v2.csv'
test[sub_cols].to_csv(out, index=False)
print('Saved:', out)