# date aggregation

In [2]:
import pandas as pd
import numpy as np
import os

In [3]:
# Load 'merge_df.csv' (path relative to the working directory) into merge_df.
# Later cells read its 'timestamp' and 'subject_id' columns.
merge_df = pd.read_csv('merge_df.csv')

In [4]:
# Build lifelog_date by keeping only the calendar-date part of each timestamp
parsed_timestamps = pd.to_datetime(merge_df['timestamp'])
merge_df['lifelog_date'] = parsed_timestamps.dt.date

# Columns aggregated from their raw values
sum_cols = [
    'm_usage_time',
    'met_activity',
    'distance',
    'burned_calories',
]

# Columns that are z-scored before being aggregated
zscore_cols = [
    'wb_rssi',
    'ww_rssi',
    'avg_heart_rate',
    'avg_latitude',
    'avg_longitude',
    'avg_altitude',
    'avg_speed',
    'avg_light',
    'avg_prob_ambience',
]

# 1. z-score transform: applied per subject_id
def zscore(x):
    """Standardize a Series to zero mean and unit population variance.

    Parameters
    ----------
    x : pd.Series
        Values to standardize (in this notebook, one column of one
        ``subject_id`` group, passed in by ``groupby(...).transform``).

    Returns
    -------
    pd.Series
        ``(x - x.mean()) / x.std(ddof=0)``, aligned to ``x.index``. When the
        standard deviation is exactly zero (a constant group) every entry is
        0.0, so the result is always a Series rather than a bare scalar.
    """
    std = x.std(ddof=0)  # population standard deviation, computed only once
    if std == 0:
        # Constant group: avoid dividing by zero while keeping the same
        # return type as the regular branch.
        return pd.Series(0.0, index=x.index, name=x.name)
    # A NaN std (e.g. a group with no valid values) is not equal to zero, so
    # it falls through here and the division yields NaN.
    return (x - x.mean()) / std

# Replace every z-score column with its per-subject standardized values.
# assign() works on a copy, so merge_df itself is left untouched.
subject_groups = merge_df.groupby('subject_id')
merge_df_z = merge_df.assign(
    **{col: subject_groups[col].transform(zscore) for col in zscore_cols}
)

# 2. Daily aggregation: every feature is summed
agg_dict = dict.fromkeys(sum_cols + zscore_cols, 'sum')

# One output row per (subject_id, lifelog_date) pair, keys restored as columns
daily_groups = merge_df_z.groupby(['subject_id', 'lifelog_date'])
daily_df = daily_groups.agg(agg_dict).reset_index()


In [5]:
# Preview the first 20 rows of the daily aggregate
daily_df.head(20)

Unnamed: 0,subject_id,lifelog_date,m_usage_time,met_activity,distance,burned_calories,wb_rssi,ww_rssi,avg_heart_rate,avg_latitude,avg_longitude,avg_altitude,avg_speed,avg_light,avg_prob_ambience
0,id01,2024-06-26,25418119.0,718.34,2773.860064,189.319059,22.196509,11.625936,-9.093807,-5.686434,1.679865,-15.872355,-0.295386,-3.385498,-161.416518
1,id01,2024-06-27,54102878.0,1091.12,2393.372559,288.990784,10.550762,14.162236,-20.301245,-12.946678,-11.236354,-18.251124,24.863787,14.365686,49.418628
2,id01,2024-06-28,45006301.0,1092.4,3219.468748,130.109459,2.067024,1.78341,-86.400161,-13.917672,-12.849713,-25.682183,8.092535,31.971784,44.033083
3,id01,2024-06-29,31896684.0,1062.63,0.0,0.0,-33.833208,99.317216,-177.096385,-18.912228,-30.81301,26.933686,-9.107308,-24.660567,49.616197
4,id01,2024-06-30,46370978.0,1066.7,2330.986329,104.809712,-24.938659,21.952234,81.933425,-18.972328,-23.00675,60.026278,-17.919958,-15.51777,49.42939
5,id01,2024-07-01,56707093.0,1117.55,2717.891418,121.579084,32.929868,-18.786671,-19.019719,-10.832735,-11.944249,-19.574743,-8.323015,0.771727,36.780894
6,id01,2024-07-02,41243885.0,1137.79,4915.903809,214.109766,-7.803225,-30.11335,24.195347,-11.269948,-12.345018,-32.112316,-4.216007,-10.023553,50.077414
7,id01,2024-07-03,53240886.0,1183.62,4860.677948,209.960146,12.986787,21.432858,39.684506,-11.792755,-4.540311,-26.16555,-10.819541,18.001728,48.153042
8,id01,2024-07-04,47954159.0,1088.03,716.051481,26.360079,14.850587,81.108904,33.718957,-14.431824,-16.218299,22.402143,36.0865,63.212255,47.703869
9,id01,2024-07-05,48280608.0,1141.18,4355.324707,195.379101,-25.296784,-26.178627,-32.401457,-11.557094,-12.148006,7.307549,-0.866701,19.186034,49.6664


# train, test set merge

In [8]:
# Load the training metrics and the submission sample; the submission sample
# is used as the test frame in the cells below.
train_df = pd.read_csv('../data/ch2025_metrics_train.csv')
test_df = pd.read_csv('../data/ch2025_submission_sample.csv')

In [14]:
# Show the (rows, columns) shape of each frame before merging
train_df.shape, test_df.shape, daily_df.shape


((450, 9), (250, 9), (709, 15))

In [15]:
merge_keys = ['subject_id', 'lifelog_date']

# Convert the date column of both frames to plain date objects
for frame in (train_df, test_df):
    frame['lifelog_date'] = pd.to_datetime(frame['lifelog_date']).dt.date

# Extract only the (subject, date) pairs that are needed
train_key = train_df[merge_keys].drop_duplicates()
test_key = test_df[merge_keys].drop_duplicates()

# Filter daily_df down to those pairs
daily_df_train = daily_df.merge(train_key, on=merge_keys, how='inner')
daily_df_test = daily_df.merge(test_key, on=merge_keys, how='inner')

# Attach the daily features; a left join keeps every train/test row
train_merged = train_df.merge(daily_df_train, on=merge_keys, how='left')
test_merged = test_df.merge(daily_df_test, on=merge_keys, how='left')


In [18]:
# Show the (rows, columns) shape of both merged frames
train_merged.shape, test_merged.shape


((450, 22), (250, 22))

# Logistic Regression model

In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Target columns; one independent classifier is fitted for each of them
y_cols = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']

# Every remaining column except the identifiers is used as a feature
non_feature_cols = set(y_cols) | {'subject_id', 'sleep_date', 'lifelog_date'}
x_cols = [col for col in train_merged.columns if col not in non_feature_cols]

# The feature matrices do not depend on the label, so build them once instead
# of once per loop iteration. Missing feature values are replaced with 0.
# NOTE(review): features are passed to the model unscaled — confirm whether
# standardization is wanted here.
X_train = train_merged[x_cols].fillna(0)
X_test = test_merged[x_cols].fillna(0)

test_preds = {}   # label -> predictions for the test rows
train_preds = {}  # label -> predictions for the training rows

for label in y_cols:
    y_train = train_merged[label]

    clf = LogisticRegression(max_iter=100000)
    clf.fit(X_train, y_train)
    test_pred = clf.predict(X_test)
    train_pred = clf.predict(X_train)

    test_preds[label] = test_pred
    train_preds[label] = train_pred

# Export

In [27]:
id_cols = ['subject_id', 'sleep_date', 'lifelog_date']

# Start from the identifier columns and attach one prediction column per label
result_df = test_merged[id_cols].assign(
    **{label: test_preds[label] for label in y_cols}
)

# Match the column order (based on submission_sample)
result_df = result_df[id_cols + ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3']]

# Store the labels as int type
result_df = result_df.astype({label: int for label in y_cols})

# NOTE(review): the output name has no file extension — confirm whether
# '.csv' was intended.
result_df.to_csv('result_logistic_cwj', index=False)
