# dwt 적용해서 feature 뽑기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import os

# Folder that holds the ch2025_*.parquet files (relative to this notebook)
DATA_DIR = "../data/"

# Names of the tables to load; each one is stored as ch2025_<name>.parquet
file_names = [
    "mACStatus", "mScreenStatus", "mUsageStats", "mActivity", "mBle", "mWifi",
    "wHr", "wPedo", "mGps", "mLight","wLight", "mAmbience"
]

# table name -> parquet path
data_files = dict(
    (name, os.path.join(DATA_DIR, f"ch2025_{name}.parquet"))
    for name in file_names
)

# Read each table once, keep it in `dfs`, and also publish it as a notebook-level
# variable named after the table (later cells use wLight, mGps, ... directly).
dfs = {}
for name, file_path in data_files.items():
    table = pd.read_parquet(file_path)
    dfs[name] = table
    globals()[name] = table
    print(f"Loaded {name} with shape {table.shape}")

Loaded mACStatus with shape (939896, 3)
Loaded mScreenStatus with shape (939653, 3)
Loaded mUsageStats with shape (45197, 3)
Loaded mActivity with shape (961062, 3)
Loaded mBle with shape (21830, 3)
Loaded mWifi with shape (76336, 3)
Loaded wHr with shape (382918, 3)
Loaded wPedo with shape (748100, 9)
Loaded mGps with shape (800611, 3)
Loaded mLight with shape (96258, 3)
Loaded wLight with shape (633741, 3)
Loaded mAmbience with shape (476577, 3)


### 일단 간단하게 wLight dwt 적용

In [3]:
!pip install PyWavelets



In [4]:
import pywt

# ensure timestamp is datetime and sort by subject and time
# NOTE: the first statement overwrites the column on the loaded frame in place;
# the second rebinds `wLight` to a new, sorted frame.
wLight['timestamp'] = pd.to_datetime(wLight['timestamp'])
wLight = wLight.sort_values(['subject_id', 'timestamp'])

# function to extract DWT features from a 1D array
def extract_dwt_features(signal, wavelet='db4', level=6):
    """Summarise a 1-D signal by statistics of its wavelet decomposition.

    Parameters
    ----------
    signal : 1-D array-like passed straight to ``pywt.wavedec``.
    wavelet : wavelet name forwarded to ``pywt.wavedec``.
    level : decomposition level forwarded to ``pywt.wavedec``.

    Returns
    -------
    dict
        For every coefficient array ``i`` returned by ``pywt.wavedec``
        (approximation + details), the keys ``level_{i}_mean``,
        ``level_{i}_std`` and ``level_{i}_energy`` (sum of squares).
    """
    bands = pywt.wavedec(signal, wavelet=wavelet, level=level)
    # (suffix, reducer) pairs; the order fixes the key order inside each band
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('energy', lambda band: np.sum(band**2)),
    )
    return {
        f'level_{i}_{suffix}': reduce_band(band)
        for i, band in enumerate(bands)
        for suffix, reduce_band in reducers
    }

# group by subject, fill gaps, extract features
feature_rows = []
for subj, grp in wLight.groupby('subject_id'):
    # Forward-fill missing readings; values before the first valid reading have
    # nothing to carry forward, so they become 0.
    # .ffill() replaces fillna(method='ffill'), which pandas has deprecated.
    values = grp['w_light'].ffill().fillna(0).values
    row = extract_dwt_features(values)
    row['subject_id'] = subj
    feature_rows.append(row)

# assemble into DataFrame (one row per subject)
features_df = pd.DataFrame(feature_rows)
print(features_df.head())
print(features_df.shape)


   level_0_mean  level_0_std  level_0_energy  level_1_mean  level_1_std  \
0   1896.855697  3468.180769    1.050090e+10    -77.191514  2563.281570   
1   1269.319542  3708.348697    2.020237e+10    -58.556863  2071.915189   
2   1249.935703  3211.219235    1.123306e+10    -44.914753  1711.498931   
3   1562.229941  3980.721287    1.777468e+10     59.418617  1897.019771   
4   2187.230936  5155.305936    2.552798e+10    -18.043144  2808.941594   

   level_1_energy  level_2_mean  level_2_std  level_2_energy  level_3_mean  \
0    4.419321e+09      0.685512  2250.379328    6.770846e+09      2.734507   
1    5.649584e+09     28.815372  1420.826885    5.299376e+09    -23.675415   
2    2.772959e+09     60.081667  1552.879161    4.552357e+09      8.183103   
3    3.501353e+09     68.298851  1990.576473    7.684194e+09     32.763466   
4    6.422849e+09     45.653551  2561.705812    1.064093e+10     26.037351   

   ...  level_4_mean  level_4_std  level_4_energy  level_5_mean  level_5_std  \


  values = grp['w_light'].fillna(method='ffill').fillna(0).values


In [5]:
import pywt

# 1) Convert timestamp to datetime and sort by subject, then by time
wLight['timestamp'] = pd.to_datetime(wLight['timestamp'])
wLight = wLight.sort_values(['subject_id', 'timestamp'])

print(wLight.head())
# 2) Add a date column (calendar date of each timestamp) to group by day
wLight['date'] = wLight['timestamp'].dt.date

# 3) DWT feature extraction function (defaults: db4 wavelet, 6 levels)
def extract_dwt_features(signal, wavelet='db4', level=6):
    """Return mean / std / energy of every ``pywt.wavedec`` coefficient array.

    ``signal``, ``wavelet`` and ``level`` are forwarded to ``pywt.wavedec``.
    The result maps ``level_{i}_mean``, ``level_{i}_std`` and
    ``level_{i}_energy`` (sum of squared coefficients) to a scalar, where
    ``i`` is the position of the coefficient array in the wavedec output.
    """
    feats = {}
    for i, band in enumerate(pywt.wavedec(signal, wavelet=wavelet, level=level)):
        prefix = f'level_{i}_'
        feats.update({
            prefix + 'mean': np.mean(band),
            prefix + 'std': np.std(band),
            prefix + 'energy': np.sum(band**2),
        })
    return feats

# 4) Extract features for every (subject_id, date) group
feature_rows = []
for (subj, day), grp in wLight.groupby(['subject_id', 'date']):
    # Index the group by timestamp, take w_light, average into 1-minute bins,
    # then forward-fill empty bins (a leading empty bin becomes 0).
    # '1min' is the current spelling of the deprecated '1T' alias.
    vals = (
        grp
        .set_index('timestamp')['w_light']
        .resample('1min').mean()
        .ffill().fillna(0)
        .values
    )

    feats = extract_dwt_features(vals)
    feats['subject_id'] = subj
    feats['date']       = day
    feature_rows.append(feats)

# 5) Assemble the DataFrame (one row per subject and day)
daily_features = pd.DataFrame(feature_rows)
print(daily_features.head())
print(daily_features.shape)


  subject_id           timestamp  w_light
0       id01 2024-06-26 12:17:00    633.0
1       id01 2024-06-26 12:18:00    483.0
2       id01 2024-06-26 12:19:00    541.0
3       id01 2024-06-26 12:20:00    547.0
4       id01 2024-06-26 12:21:00    547.0


  .resample('1T').mean()


   level_0_mean  level_0_std  level_0_energy  level_1_mean  level_1_std  \
0   2319.716888  1993.061284    1.590075e+08    220.529958   857.949949   
1   1487.508117  3064.055081    3.364323e+08   -380.241503  2156.959740   
2   2043.725484  4345.636720    6.687798e+08    496.916154  3143.140541   
3    443.503950   762.056177    2.176791e+07    -15.674754   227.526787   
4    643.657139  1392.467875    6.824458e+07     58.644312   399.099776   

   level_1_energy  level_2_mean  level_2_std  level_2_energy  level_3_mean  \
0    1.334010e+07   -136.793320   868.805762    2.165900e+07   -105.754601   
1    1.391147e+08    238.336705  1913.642816    1.896605e+08   -102.318272   
2    2.936615e+08    626.101028  3516.648909    6.506999e+08   -266.706257   
3    1.456396e+06     -5.185987    99.312189    4.944903e+05     -8.925586   
4    4.718874e+06      8.353223   630.773780    2.029521e+07    -14.666764   

   ...  level_4_std  level_4_energy  level_5_mean  level_5_std  \
0  ...   356.3

# DWT 적용!

In [None]:
import pandas as pd
import numpy as np

def _total_usage_time(apps):
    """Add up 'total_time' over the entries of one m_usage_stats record (missing key counts as 0)."""
    return sum(app.get('total_time', 0) for app in apps)


def _mean_or_nan(samples):
    """Average one heart_rate record; an empty record gives NaN."""
    return np.mean(samples) if len(samples) > 0 else np.nan


def _rssi_weight_sum(devices):
    """Sum exp(rssi / 10) over the entries of one scan record (missing rssi is read as 0)."""
    return sum(np.exp(dev.get('rssi', 0) / 10) for dev in devices)


# 1) mUsageStats: one total usage time per record
prep_mUsageStats = mUsageStats[['subject_id', 'timestamp']].copy()
prep_mUsageStats['total_usage_time'] = mUsageStats['m_usage_stats'].apply(_total_usage_time)

# 2) wHr: one average heart rate per record
prep_wHr = wHr[['subject_id', 'timestamp']].copy()
prep_wHr['avg_heart_rate'] = wHr['heart_rate'].apply(_mean_or_nan)

# 3) mBle: exponentially weighted RSSI sum per scan
prep_mBle = mBle[['subject_id', 'timestamp']].copy()
prep_mBle['wb_rssi'] = mBle['m_ble'].apply(_rssi_weight_sum)

# 4) mWifi: same weighting as mBle
prep_mWifi = mWifi[['subject_id', 'timestamp']].copy()
prep_mWifi['ww_rssi'] = mWifi['m_wifi'].apply(_rssi_weight_sum)

# 5) wLight (just copy)
prep_wLight = wLight.copy()

# 6) mGps
def avg_gps(arr):
    """Average the GPS fields of one m_gps record.

    Parameters
    ----------
    arr : sequence of dicts (or None)
        Each dict may carry 'altitude', 'latitude', 'longitude' and 'speed'.
        A missing key or a None value is treated as a missing reading.

    Returns
    -------
    pd.Series
        Index 'avg_alt', 'avg_lat', 'avg_long', 'avg_speed'. Each entry is the
        mean of the available readings, or NaN when the record is None/empty
        or the field has no usable value.
    """
    # output name -> key inside each GPS dict (order defines the Series index)
    fields = {
        'avg_alt': 'altitude',
        'avg_lat': 'latitude',
        'avg_long': 'longitude',
        'avg_speed': 'speed',
    }
    fixes = [] if arr is None else arr

    averages = {}
    for out_name, key in fields.items():
        # dtype=float turns None into NaN, so null field values are handled too
        readings = np.asarray([fix.get(key, np.nan) for fix in fixes], dtype=float)
        valid = readings[~np.isnan(readings)]
        # Guard the empty case explicitly instead of relying on np.nanmean,
        # which emits a RuntimeWarning when every reading is missing.
        averages[out_name] = valid.mean() if valid.size else np.nan
    return pd.Series(averages)

# Apply avg_gps to every m_gps record, then put the result next to the id columns
gps_avgs = mGps['m_gps'].apply(avg_gps)
gps_keys = mGps[['subject_id', 'timestamp']].copy()
prep_mGps = pd.concat([gps_keys, gps_avgs], axis=1)

# 7) wPedo: keep the id columns plus distance and speed
pedo_cols = ['subject_id', 'timestamp', 'distance', 'speed']
prep_wPedo = wPedo[pedo_cols].copy()

# 2m 30s


In [None]:
import pandas as pd
import numpy as np
import pywt

# Collect the seven prep_ DataFrames in one dict (insertion order is relied on below)
prep_dfs = dict(
    mUsageStats=prep_mUsageStats,
    wHr=prep_wHr,
    mBle=prep_mBle,
    mWifi=prep_mWifi,
    wLight=prep_wLight,
    mGps=prep_mGps,
    wPedo=prep_wPedo,
)

# 1) DWT feature extraction function
def extract_dwt_features(signal, wavelet='db4', level=6):
    """Return mean / std / energy of every ``pywt.wavedec`` coefficient array.

    ``signal``, ``wavelet`` and ``level`` are forwarded to ``pywt.wavedec``.
    Keys are ``lev{i}_mean``, ``lev{i}_std`` and ``lev{i}_energy`` (sum of
    squared coefficients), with ``i`` the position in the wavedec output.
    """
    bands = pywt.wavedec(signal, wavelet=wavelet, level=level)
    # (suffix, reducer) pairs; the order fixes the key order inside each band
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('energy', lambda band: np.sum(band**2)),
    )
    return {
        f'lev{i}_{suffix}': reduce_band(band)
        for i, band in enumerate(bands)
        for suffix, reduce_band in reducers
    }

# 2) DWT level for each DataFrame, in the same order as prep_dfs (tunable)
level_list = [3,3,3,3,3,3,3]
# zip() would silently skip trailing tables if the lengths differ, so check first
assert len(level_list) == len(prep_dfs), "level_list needs one entry per table in prep_dfs"

daily_features = {}

for (name, df), lvl in zip(prep_dfs.items(), level_list):
    # 3) timestamp -> datetime, sort, and add a date column
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(['subject_id', 'timestamp'])
    df['date'] = df['timestamp'].dt.date

    # 4) Keep numeric columns only, excluding subject_id
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [c for c in numeric_cols if c != 'subject_id']

    # 5) Extract DWT features per (subject, day) group and collect the rows
    feature_rows = []
    for (subj, day), grp in df.groupby(['subject_id', 'date']):
        row = {'subject_id': subj, 'date': day}
        # Resample every numeric column in one pass: 1-minute means, with
        # minutes that have no data set to 0. '1min' is the current spelling
        # of the deprecated '1T' alias.
        per_minute = (
            grp.set_index('timestamp')[numeric_cols]
            .resample('1min').mean()
            .fillna(0)
        )
        for col in numeric_cols:
            feats = extract_dwt_features(per_minute[col].values, level=lvl)
            # Prefix each feature name with its source column
            for k, v in feats.items():
                row[f'{col}_{k}'] = v
        feature_rows.append(row)

    # 6) Store the table's features as a DataFrame
    daily_features[name] = pd.DataFrame(feature_rows)

# 7) Sample check: daily DWT features of mUsageStats
print(daily_features['mUsageStats'].head())
print(daily_features['mUsageStats'].shape)

# 11.4s


  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values


  subject_id        date  total_usage_time_lev0_mean  \
0       id01  2024-06-26               142850.553034   
1       id01  2024-06-27               111931.971924   
2       id01  2024-06-28               109725.298833   
3       id01  2024-06-29                96803.554352   
4       id01  2024-06-30               152565.531960   

   total_usage_time_lev0_std  total_usage_time_lev0_energy  \
0              320174.917383                  1.044805e+13   
1              155658.006840                  6.542956e+12   
2              141429.307699                  4.678116e+12   
3              131253.540690                  3.351401e+12   
4              204423.231830                  8.588593e+12   

   total_usage_time_lev1_mean  total_usage_time_lev1_std  \
0                16868.787990              195746.453812   
1               -10115.412519              225115.986414   
2                -3088.248837              207698.429065   
3                 7762.613261              182339.

In [8]:
import pandas as pd
import numpy as np

# --- Build prep_mActivity ---
# 1. timestamp -> datetime, sort, add date column
mActivity['timestamp'] = pd.to_datetime(mActivity['timestamp'])
mActivity = mActivity.sort_values(['subject_id', 'timestamp'])
mActivity['date'] = mActivity['timestamp'].dt.date

# 2. Activity codes to report (0,1,2,3,4,5,7,8)
activity_codes = [0,1,2,3,4,5,7,8]

rows = []
for (subj, day), grp in mActivity.groupby(['subject_id','date']):
    activity = grp['m_activity']
    counts = activity.value_counts().reindex(activity_codes, fill_value=0)
    ratios = counts / len(grp)
    # A row "changes" when it differs from the previous row; the first row
    # always differs from its (missing) predecessor, hence the -1.
    changed = activity.shift() != activity
    row = {
        'subject_id': subj,
        'date':       day,
        'transition_count': int(changed.sum() - 1)
    }
    # Per-code count and share of the day's rows
    for code in activity_codes:
        row[f'act_{code}_count'] = int(counts[code])
        row[f'act_{code}_ratio'] = float(ratios[code])
    rows.append(row)

prep_mActivity = pd.DataFrame(rows)
print(prep_mActivity.head())
print(prep_mActivity.shape)


# --- Build prep_mAmbience ---
# 1. timestamp -> datetime, sort, add date column
mAmbience['timestamp'] = pd.to_datetime(mAmbience['timestamp'])
mAmbience = mAmbience.sort_values(['subject_id','timestamp'])
mAmbience['date'] = mAmbience['timestamp'].dt.date

# 2. Pool of every distinct label seen in any record, sorted
label_pool = sorted({
    label
    for lst in mAmbience['m_ambience']
    for label, _prob in lst
})

rows = []
for (subj, day), grp in mAmbience.groupby(['subject_id','date']):
    # Start every label at 0.0 so each row carries the full label set
    sums = dict.fromkeys(label_pool, 0.0)
    for lst in grp['m_ambience']:
        for label, prob in lst:
            sums[label] += float(prob)
    row = {'subject_id': subj, 'date': day}
    # One column per label: sum of its probabilities over the day
    row.update((f'{lbl}_prob_sum', sums[lbl]) for lbl in label_pool)
    rows.append(row)

prep_mAmbience = pd.DataFrame(rows)
print(prep_mAmbience.head())
print(prep_mAmbience.shape)


  subject_id        date  transition_count  act_0_count  act_0_ratio  \
0       id01  2024-06-26                32           89     0.125176   
1       id01  2024-06-27                39          211     0.146528   
2       id01  2024-06-28                40          161     0.111806   
3       id01  2024-06-29                27           95     0.065972   
4       id01  2024-06-30                26          199     0.138194   

   act_1_count  act_1_ratio  act_2_count  act_2_ratio  act_3_count  \
0            1     0.001406            0          0.0          478   
1            0     0.000000            0          0.0          880   
2            1     0.000694            0          0.0         1241   
3            0     0.000000            0          0.0         1320   
4            0     0.000000            0          0.0         1229   

   act_3_ratio  act_4_count  act_4_ratio  act_5_count  act_5_ratio  \
0     0.672293          112     0.157525            0          0.0   
1     

In [9]:
import pandas as pd
from functools import reduce

# assume daily_features is a dict of DataFrames for the 7 DWT‐processed tables:
# daily_features = {
#     'mUsageStats': dwt_mUsageStats,
#     'wHr':          dwt_wHr,
#     'mBle':         dwt_mBle,
#     'mWifi':        dwt_mWifi,
#     'wLight':       dwt_wLight,
#     'mGps':         dwt_mGps,
#     'wPedo':        dwt_wPedo,
# }

# and the two preprocessed frames:
# prep_mActivity, prep_mAmbience

# 1. gather every per-day frame: the DWT tables first, then the two prep frames
dfs = [*daily_features.values(), prep_mActivity, prep_mAmbience]

# 2. fold them together left to right with an outer join on ['subject_id', 'date']
merged_df = dfs[0]
for right in dfs[1:]:
    merged_df = pd.merge(merged_df, right, on=['subject_id', 'date'], how='outer')

# 3. inspect
print(merged_df.head())
print(merged_df.shape)


  subject_id        date  total_usage_time_lev0_mean  \
0       id01  2024-06-26               142850.553034   
1       id01  2024-06-27               111931.971924   
2       id01  2024-06-28               109725.298833   
3       id01  2024-06-29                96803.554352   
4       id01  2024-06-30               152565.531960   

   total_usage_time_lev0_std  total_usage_time_lev0_energy  \
0              320174.917383                  1.044805e+13   
1              155658.006840                  6.542956e+12   
2              141429.307699                  4.678116e+12   
3              131253.540690                  3.351401e+12   
4              204423.231830                  8.588593e+12   

   total_usage_time_lev1_mean  total_usage_time_lev1_std  \
0                16868.787990              195746.453812   
1               -10115.412519              225115.986414   
2                -3088.248837              207698.429065   
3                 7762.613261              182339.

In [10]:
# Write the merged feature table to CSV without the index column
merged_df.to_csv('merged_df_lv3.csv', index=False)

# Train and test

In [11]:
# Build both paths from DATA_DIR (set in the data-loading cell) instead of
# repeating the '../data/' literal, so the folder is configured in one place.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'ch2025_metrics_train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'ch2025_submission_sample.csv'))

In [12]:
# Display (rows, columns) of the two CSV tables just loaded
train_df.shape, test_df.shape

((450, 9), (250, 9))

In [13]:
# Display the first rows of train_df
train_df.head()

Unnamed: 0,subject_id,sleep_date,lifelog_date,Q1,Q2,Q3,S1,S2,S3
0,id01,2024-06-27,2024-06-26,0,0,0,0,0,1
1,id01,2024-06-28,2024-06-27,0,0,0,0,1,1
2,id01,2024-06-29,2024-06-28,1,0,0,1,1,1
3,id01,2024-06-30,2024-06-29,1,0,1,2,0,0
4,id01,2024-07-01,2024-06-30,0,1,1,1,1,1


In [14]:
# 1) Bring the date columns on both sides to datetime so the join keys compare equal
merged_df['date'] = pd.to_datetime(merged_df['date'])
train_df['lifelog_date'] = pd.to_datetime(train_df['lifelog_date'])

# 2) Left-join the labels onto the features: feature `date` <-> label `lifelog_date`
label_view = train_df[['subject_id','lifelog_date','Q1','Q2','Q3','S1','S2','S3']]
temp_train_df = merged_df.merge(
    label_view,
    how='left',
    left_on=['subject_id','date'],
    right_on=['subject_id','lifelog_date'],
)

print("▶ temp_train_df.shape:", temp_train_df.shape)
preview_cols = ['subject_id','date','lifelog_date','Q1','Q2','Q3','S1','S2','S3']
print(temp_train_df[preview_cols].head())


▶ temp_train_df.shape: (700, 675)
  subject_id       date lifelog_date   Q1   Q2   Q3   S1   S2   S3
0       id01 2024-06-26   2024-06-26  0.0  0.0  0.0  0.0  0.0  1.0
1       id01 2024-06-27   2024-06-27  0.0  0.0  0.0  0.0  1.0  1.0
2       id01 2024-06-28   2024-06-28  1.0  0.0  0.0  1.0  1.0  1.0
3       id01 2024-06-29   2024-06-29  1.0  0.0  1.0  2.0  0.0  0.0
4       id01 2024-06-30   2024-06-30  0.0  1.0  1.0  1.0  1.0  1.0


In [15]:
# Cell 2: drop rows in which Q1, Q2, Q3, S1, S2 and S3 are all missing
before_cnt = len(temp_train_df)

# True for rows that have no label at all (i.e. not a single non-NaN among Q1..S3)
has_any_label = temp_train_df[['Q1','Q2','Q3','S1','S2','S3']].notna().any(axis=1)
mask_all_na = ~has_any_label

# keep the labelled rows and renumber the index
temp_train_df = temp_train_df[~mask_all_na].reset_index(drop=True)

after_cnt = len(temp_train_df)
print(f"▶ dropped {before_cnt - after_cnt} rows where all Q/S are NaN")
print("▶ 남은 행 개수:", after_cnt)


▶ dropped 250 rows where all Q/S are NaN
▶ 남은 행 개수: 450


In [16]:
# Cell: count rows that have a missing value in any column
total_rows = len(temp_train_df)
mask_any_na_all = temp_train_df.isna().any(axis=1)  # True when at least one NaN in the row
num_any_na_all = mask_any_na_all.sum()

print(f"▶ 전체 컬럼 중 하나라도 NaN인 행 개수: {num_any_na_all} / 전체 {total_rows} 행")


▶ 전체 컬럼 중 하나라도 NaN인 행 개수: 88 / 전체 450 행


In [17]:
# 셀 3: temp_train_df를 70% / 30% 로 분할
from sklearn.model_selection import train_test_split

# Shuffled row-level split: 30% of temp_train_df is held out as validation.
# random_state pins the shuffle so the split is the same on every run.
# NOTE(review): rows of one subject can land on both sides of this split —
# confirm that is intended for the evaluation.
train_model_df, val_model_df = train_test_split(
    temp_train_df,
    test_size=0.3,
    random_state=42,
    shuffle=True
)

print("▶ train_model_df.shape:", train_model_df.shape)
print(train_model_df.head(), "\n")

print("▶ val_model_df.shape:", val_model_df.shape)
# print(val_model_df.head())


▶ train_model_df.shape: (315, 675)
    subject_id       date  total_usage_time_lev0_mean  \
409       id09 2024-08-27                61303.353314   
108       id03 2024-08-13                67429.961673   
229       id06 2024-06-09                         NaN   
420       id10 2024-07-09                62980.359905   
118       id03 2024-09-07                76791.552746   

     total_usage_time_lev0_std  total_usage_time_lev0_energy  \
409              131908.902287                  3.110235e+12   
108              114975.589831                  3.286744e+12   
229                        NaN                           NaN   
420              104034.585500                  1.937453e+12   
118              110201.306013                  3.283511e+12   

     total_usage_time_lev1_mean  total_usage_time_lev1_std  \
409                 6411.598572              177334.267646   
108                -1992.012752              123111.969439   
229                         NaN                    

# 모델 적용 - 결측치 제거

## 0. 데이터 준비

In [18]:
import pandas as pd
import numpy as np
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

# 이미 train_model_df, val_model_df 가 로드되어 있다고 가정합니다.
# 이들에는 subject_id, date, lifelog_date, (feature들…), Q1,Q2,Q3,S1,S2,S3 컬럼이 있습니다.

# 1) Split into features X and targets y
drop_cols = ['subject_id','date','lifelog_date']
target_cols = ['Q1','Q2','Q3','S1','S2','S3']

# everything that is neither an identifier/date column nor a target is a feature
non_feature_cols = drop_cols + target_cols
X_train, y_train = train_model_df.drop(columns=non_feature_cols), train_model_df[target_cols]
X_val, y_val = val_model_df.drop(columns=non_feature_cols), val_model_df[target_cols]

print("▶ X_train.shape:", X_train.shape, "y_train.shape:", y_train.shape)
print("▶ X_val.shape:  ", X_val.shape,   "y_val.shape:  ", y_val.shape)


▶ X_train.shape: (315, 666) y_train.shape: (315, 6)
▶ X_val.shape:   (135, 666) y_val.shape:   (135, 6)


## 1. Logistic Regression

In [19]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
import numpy as np

# 1) Target columns and non-feature columns
target_cols = ['Q1','Q2','Q3','S1','S2','S3']
drop_cols   = ['subject_id','date','lifelog_date']  # adjust to the actual column names

# 2) Separate features and targets; keep numeric feature columns only
#    (guards against a date-like column slipping through)
non_feature_cols = drop_cols + target_cols
X_train = train_model_df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])
X_val   = val_model_df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])
y_train = train_model_df[target_cols]
y_val   = val_model_df[target_cols]

# 3) Drop rows that have a NaN in any feature
mask_tr = ~X_train.isna().any(axis=1)
mask_va = ~X_val.isna().any(axis=1)
print(f"▶ train drop: {mask_tr.size-mask_tr.sum()}, 남은 train: {mask_tr.sum()}")
print(f"▶ val   drop: {mask_va.size-mask_va.sum()}, 남은 val  : {mask_va.sum()}")

X_train, y_train = X_train[mask_tr], y_train[mask_tr]
X_val,   y_val   = X_val[mask_va],   y_val[mask_va]

# 4) Fit: standardise, then one logistic regression per target
base_lr = LogisticRegression(max_iter=1000, random_state=42)
lr = make_pipeline(StandardScaler(), MultiOutputClassifier(base_lr))
lr.fit(X_train, y_train)

# 5) Predict and score each target with macro F1
y_pred = pd.DataFrame(lr.predict(X_val), index=y_val.index, columns=target_cols)
f1s = {}
for c in target_cols:
    f1s[c] = f1_score(y_val[c], y_pred[c], average='macro')
print("▶ LR per-target F1:", f1s)
print("▶ LR mean F1     :", np.mean(list(f1s.values())))


▶ train drop: 65, 남은 train: 250
▶ val   drop: 23, 남은 val  : 112
▶ LR per-target F1: {'Q1': 0.5351213282247764, 'Q2': 0.5221766079046929, 'Q3': 0.5470171890798787, 'S1': 0.36224475918478366, 'S2': 0.5143288084464556, 'S3': 0.5772806775292276}
▶ LR mean F1     : 0.5096948950616359


## 2. XGBoost

In [None]:
# ─────────────────────────────────────────────────────────────
# 1) 필요한 라이브러리 임포트
from xgboost import XGBClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# ─────────────────────────────────────────────────────────────
# 2) Target / drop column definitions (same as the Logistic Regression cell)
target_cols = ['Q1','Q2','Q3','S1','S2','S3']
drop_cols   = ['subject_id','date','lifelog_date']  # check the actual column names and adjust if needed

# ─────────────────────────────────────────────────────────────
# 3) Separate features and targets
X_train = train_model_df.drop(columns=drop_cols + target_cols)
y_train = train_model_df[target_cols]
X_val   = val_model_df  .drop(columns=drop_cols + target_cols)
y_val   = val_model_df  [target_cols]

# ─────────────────────────────────────────────────────────────
# 4) Keep numeric columns only (removes datetime-like columns entirely)
X_train = X_train.select_dtypes(include=[np.number])
X_val   = X_val  .select_dtypes(include=[np.number])

# ─────────────────────────────────────────────────────────────
# 5) Drop rows that have a NaN in any feature
mask_tr = X_train.notna().all(axis=1)
mask_va = X_val  .notna().all(axis=1)

print(f"▶ train 에서 drop: {mask_tr.size-mask_tr.sum()}, 남은 train: {mask_tr.sum()}")
print(f"▶ val   에서 drop: {mask_va.size-mask_va.sum()}, 남은 val  : {mask_va.sum()}")

X_train, y_train = X_train.loc[mask_tr], y_train.loc[mask_tr]
X_val,   y_val   = X_val  .loc[mask_va],   y_val  .loc[mask_va]

# ─────────────────────────────────────────────────────────────
# 6) Build and fit the XGBoost pipeline
# use_label_encoder is no longer passed: the installed XGBoost ignores it and
# prints "Parameters: { "use_label_encoder" } are not used." on every fit.
xgb_pipe = make_pipeline(
    StandardScaler(),
    MultiOutputClassifier(
        XGBClassifier(
            eval_metric='logloss',
            random_state=42
        )
    )
)
xgb_pipe.fit(X_train, y_train)

# ─────────────────────────────────────────────────────────────
# 7) Predict and compute macro F1 per target
y_pred = pd.DataFrame(
    xgb_pipe.predict(X_val),
    index=y_val.index,
    columns=target_cols
)

f1s = {c: f1_score(y_val[c], y_pred[c], average='macro')
       for c in target_cols}
print("▶ XGB per-target F1:", f1s)
print("▶ XGB mean F1     :", np.mean(list(f1s.values())))


# 2m 17.5s

▶ train 에서 drop: 65, 남은 train: 250
▶ val   에서 drop: 23, 남은 val  : 112


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


▶ XGB per-target F1: {'Q1': 0.6066411238825031, 'Q2': 0.4725274725274725, 'Q3': 0.5292397660818713, 'S1': 0.4747222222222222, 'S2': 0.60625, 'S3': 0.567479674796748}
▶ XGB mean F1     : 0.5428100432518028


## 3. Random forest

In [21]:
# ─────────────────────────────────────────────────────────────
# 1) 필요한 라이브러리 임포트
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# ─────────────────────────────────────────────────────────────
# 2) Target / drop column definitions (same as the two previous model cells)
target_cols = ['Q1','Q2','Q3','S1','S2','S3']
drop_cols   = ['subject_id','date','lifelog_date']  # check the actual column names and adjust

# 3) + 4) Separate features and targets, keeping numeric feature columns only
non_feature_cols = drop_cols + target_cols
X_train = train_model_df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])
X_val   = val_model_df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])
y_train = train_model_df[target_cols]
y_val   = val_model_df[target_cols]

# 5) Drop rows that have a NaN in any feature
mask_tr = ~X_train.isna().any(axis=1)
mask_va = ~X_val.isna().any(axis=1)

print(f"▶ train 에서 drop: {mask_tr.size-mask_tr.sum()}, 남은 train: {mask_tr.sum()}")
print(f"▶ val   에서 drop: {mask_va.size-mask_va.sum()}, 남은 val  : {mask_va.sum()}")

X_train, y_train = X_train[mask_tr], y_train[mask_tr]
X_val,   y_val   = X_val[mask_va],   y_val[mask_va]

# 6) Build and fit the Random Forest pipeline (one forest per target)
forest = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_pipe = make_pipeline(StandardScaler(), MultiOutputClassifier(forest))
rf_pipe.fit(X_train, y_train)

# 7) Predict and compute macro F1 per target
y_pred = pd.DataFrame(rf_pipe.predict(X_val), index=y_val.index, columns=target_cols)

f1s = {}
for c in target_cols:
    f1s[c] = f1_score(y_val[c], y_pred[c], average='macro')
print("▶ RF  per-target F1:", f1s)
print("▶ RF  mean F1     :", np.mean(list(f1s.values())))


▶ train 에서 drop: 65, 남은 train: 250
▶ val   에서 drop: 23, 남은 val  : 112
▶ RF  per-target F1: {'Q1': 0.5153846153846154, 'Q2': 0.4873075322513525, 'Q3': 0.5472739820565907, 'S1': 0.2955437543292542, 'S2': 0.4740608228980322, 'S3': 0.5175963861750184}
▶ RF  mean F1     : 0.4728611821824773


# 모델 적용 - 결측치 imputation

## 1. Logistic regression

In [22]:
# ─────────────────────────────────────────────────────────────
# 1) 라이브러리 임포트
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

# ─────────────────────────────────────────────────────────────
# 2) Separate features and targets
target_cols = ['Q1','Q2','Q3','S1','S2','S3']
drop_cols   = ['subject_id','date','lifelog_date']

non_feature_cols = drop_cols + target_cols
X_train, y_train = train_model_df.drop(columns=non_feature_cols), train_model_df[target_cols]
X_val,   y_val   = val_model_df.drop(columns=non_feature_cols),   val_model_df[target_cols]

# 3) Keep numeric columns only; the list comes from train and is applied to both sides
num_feats = X_train.select_dtypes(include=[np.number]).columns
X_train, X_val = X_train[num_feats], X_val[num_feats]

# 4) Pipeline: mean imputation -> scaling -> one logistic regression per target
base_lr = LogisticRegression(max_iter=1000, random_state=42)
lr_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MultiOutputClassifier(base_lr),
)

# 5) Fit
lr_pipe.fit(X_train, y_train)

# 6) Predict and compute macro F1 per target
y_pred = pd.DataFrame(lr_pipe.predict(X_val), index=y_val.index, columns=target_cols)

f1s = {}
for c in target_cols:
    f1s[c] = f1_score(y_val[c], y_pred[c], average='macro')
print("▶ LR  per-target F1:", f1s)
print("▶ LR  mean F1     :", np.mean(list(f1s.values())))


▶ LR  per-target F1: {'Q1': 0.5258999122036874, 'Q2': 0.5358209796516543, 'Q3': 0.5151760889712698, 'S1': 0.3782287015147536, 'S2': 0.5106518282988871, 'S3': 0.5796574987026466}
▶ LR  mean F1     : 0.5075725015571498


## 2. XGBoost

In [None]:
# ─────────────────────────────────────────────────────────────
# 1) 라이브러리 임포트
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

# ─────────────────────────────────────────────────────────────
# 2) Separate features and targets (same as the Logistic Regression cell)
target_cols = ['Q1','Q2','Q3','S1','S2','S3']
drop_cols   = ['subject_id','date','lifelog_date']

X_train = train_model_df.drop(columns=drop_cols + target_cols)
y_train = train_model_df[target_cols]
X_val   = val_model_df  .drop(columns=drop_cols + target_cols)
y_val   = val_model_df  [target_cols]

# ─────────────────────────────────────────────────────────────
# 3) Keep numeric columns only (list taken from train, applied to both)
num_feats = X_train.select_dtypes(include=[np.number]).columns
X_train = X_train[num_feats]
X_val   = X_val  [num_feats]

# ─────────────────────────────────────────────────────────────
# 4) Pipeline definition (imputer -> scaler -> classifier)
# use_label_encoder is no longer passed: the installed XGBoost ignores it and
# prints "Parameters: { "use_label_encoder" } are not used." on every fit.
xgb_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MultiOutputClassifier(
        XGBClassifier(
            eval_metric='logloss',
            n_estimators=100,
            random_state=42,
            n_jobs=-1
        )
    )
)

# ─────────────────────────────────────────────────────────────
# 5) Fit
xgb_pipe.fit(X_train, y_train)

# ─────────────────────────────────────────────────────────────
# 6) Predict and compute macro F1 per target
y_pred = pd.DataFrame(
    xgb_pipe.predict(X_val),
    index=y_val.index,
    columns=target_cols
)

f1s = {c: f1_score(y_val[c], y_pred[c], average='macro')
       for c in target_cols}
print("▶ XGB per-target F1:", f1s)
print("▶ XGB mean F1     :", np.mean(list(f1s.values())))

# 2m 20s

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


▶ XGB per-target F1: {'Q1': 0.6367181064308858, 'Q2': 0.4925666199158485, 'Q3': 0.5870588235294117, 'S1': 0.43591499072885465, 'S2': 0.571842250413679, 'S3': 0.6275486171859229}
▶ XGB mean F1     : 0.558608234700767


## 2.1 XGBoost - 하이퍼파라미터 튜닝 (다시 해봐야됨)

In [30]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell 1) Optuna로 XGBoost 하이퍼파라미터 튜닝
# ──────────────────────────────────────────────────────────────────────────────
import optuna
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score

# train/val 데이터는 미리 준비되어 있다고 가정합니다.
# X_train, y_train, X_val, y_val 가 위 예시와 동일하게 정의돼 있어야 합니다.

def objective(trial):
    """Optuna objective: mean validation macro-F1 of a multi-output XGBoost pipeline.

    Draws one XGBoost hyperparameter configuration from ``trial``, fits an
    impute -> scale -> ``MultiOutputClassifier(XGBClassifier)`` pipeline on the
    notebook-level ``X_train`` / ``y_train`` and scores it on the notebook-level
    ``X_val`` / ``y_val``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample the hyperparameters.

    Returns
    -------
    float
        Macro-F1 computed separately for every column of ``y_val`` and then
        averaged; the study is expected to maximise this value.
    """
    # Hyperparameter search space.
    # `suggest_float` replaces the deprecated `suggest_uniform` /
    # `suggest_loguniform`; `log=True` keeps the log-scaled sampling of the
    # original log-uniform parameters. Parameter names are unchanged, so the
    # keys of `study.best_params` stay the same.
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 50, 500),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 10),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "gamma": trial.suggest_float("gamma", 1e-8, 10.0, log=True),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 1.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 1.0, log=True),
        "random_state": 42,
        # `use_label_encoder` is intentionally not passed: XGBoost reports it
        # as an unused parameter on every fit.
        "eval_metric": "logloss",
        "n_jobs": -1,
    }

    # Pipeline: mean imputation -> standardisation -> one XGB model per target
    pipe = make_pipeline(
        SimpleImputer(strategy="mean"),
        StandardScaler(),
        MultiOutputClassifier(XGBClassifier(**params))
    )

    # Fit on the training split, predict the validation split
    pipe.fit(X_train, y_train)
    y_pred = pipe.predict(X_val)

    # Macro-F1 per target (column i of the prediction array corresponds to
    # column i of y_val), then the mean over targets
    f1s = [
        f1_score(y_val[col], y_pred[:, i], average="macro")
        for i, col in enumerate(y_val.columns)
    ]
    return np.mean(f1s)

# Run the Optuna study; the objective returns mean macro-F1, so maximise it
study = optuna.create_study(direction="maximize")
study.optimize(
    objective,
    n_trials=30,   # upper bound on the number of trials
    timeout=600,   # time budget for the study, in seconds
)

# Report the best score and the hyperparameters that produced it
for label, value in (
    ("▶ Best mean F1:", study.best_value),
    ("▶ Best params :", study.best_params),
):
    print(label, value)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-05-27 23:18:30,736] A new study created in memory with name: no-name-569abb44-30ba-438b-b40c-34f3abca8526
  "learning_rate": trial.suggest_loguniform("learning_rate", 1e-3, 0.3),
  "subsample": trial.suggest_uniform("subsample", 0.5, 1.0),
  "colsample_bytree": trial.suggest_uniform("colsample_bytree", 0.5, 1.0),
  "gamma": trial.suggest_loguniform("gamma", 1e-8, 10.0),
  "reg_alpha": trial.suggest_loguniform("reg_alpha", 1e-8, 1.0),
  "reg_lambda": trial.suggest_loguniform("reg_lambda", 1e-8, 1.0),
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
[W 2025-05-27 23:27:19,763] Trial 0 failed with parameter

KeyboardInterrupt: 

In [None]:
# ──────────────────────────────────────────────────────────────────────────────
# Cell 2) 최적 파라미터로 XGBoost 파이프라인 재학습 & 평가
# ──────────────────────────────────────────────────────────────────────────────
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputClassifier
from xgboost import XGBClassifier
from sklearn.metrics import f1_score
import numpy as np

# Best hyperparameters found by the Optuna study (requires the tuning cell to
# have completed at least one trial)
best = study.best_params

# XGB classifier configured with the tuned hyperparameters.
# `use_label_encoder` is intentionally not passed: XGBoost reports it as an
# unused parameter and emits a warning on every fit.
xgb_final = XGBClassifier(
    n_estimators=best["n_estimators"],
    learning_rate=best["learning_rate"],
    max_depth=best["max_depth"],
    subsample=best["subsample"],
    colsample_bytree=best["colsample_bytree"],
    gamma=best["gamma"],
    reg_alpha=best["reg_alpha"],
    reg_lambda=best["reg_lambda"],
    eval_metric="logloss",
    random_state=42,
    n_jobs=-1
)

# Pipeline: mean imputation -> standardisation -> one XGB model per target
final_pipe = make_pipeline(
    SimpleImputer(strategy="mean"),
    StandardScaler(),
    MultiOutputClassifier(xgb_final)
)

# Fit on the training split
final_pipe.fit(X_train, y_train)
# Predict the validation split
y_val_pred = final_pipe.predict(X_val)

# Per-target macro-F1 (column i of the prediction array corresponds to
# column i of y_val) and the mean over targets
f1s_final = {
    col: f1_score(y_val[col], y_val_pred[:, i], average="macro")
    for i, col in enumerate(y_val.columns)
}

print("▶ Final per-target F1:", f1s_final)
print("▶ Final mean F1     :", np.mean(list(f1s_final.values())))


## 3. Random Forest

In [28]:
# ─────────────────────────────────────────────────────────────
# 1) 라이브러리 임포트
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

# ─────────────────────────────────────────────────────────────
# 2) Split features / targets (same as cell 1)
target_cols = ['Q1','Q2','Q3','S1','S2','S3']
drop_cols   = ['subject_id','date','lifelog_date']

# Identifier and label columns are both excluded from the feature matrix
non_feature_cols = drop_cols + target_cols
y_train = train_model_df[target_cols]
y_val   = val_model_df[target_cols]

# ─────────────────────────────────────────────────────────────
# 3) Keep numeric feature columns only; the validation frame is restricted
#    to exactly the columns selected on the training frame
X_train   = train_model_df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])
num_feats = X_train.columns
X_val     = val_model_df.drop(columns=non_feature_cols)[num_feats]

# ─────────────────────────────────────────────────────────────
# 4) Pipeline (imputer → scaler → classifier), one forest per target
rf_base = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)
rf_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MultiOutputClassifier(rf_base),
)

# ─────────────────────────────────────────────────────────────
# 5) Fit
rf_pipe.fit(X_train, y_train)

# ─────────────────────────────────────────────────────────────
# 6) Predict the validation split and compute macro-F1 per target
val_predictions = rf_pipe.predict(X_val)
y_pred = pd.DataFrame(val_predictions, index=y_val.index, columns=target_cols)

f1s = {}
for c in target_cols:
    f1s[c] = f1_score(y_val[c], y_pred[c], average='macro')

print("▶ RF  per-target F1:", f1s)
print("▶ RF  mean F1     :", np.mean(list(f1s.values())))


▶ RF  per-target F1: {'Q1': 0.5480489545030459, 'Q2': 0.52668484612388, 'Q3': 0.505555900428332, 'S1': 0.3229457110054125, 'S2': 0.5238273921200751, 'S3': 0.5904126213592233}
▶ RF  mean F1     : 0.5029125709233281


# 모델 적용 - 결측치 평균 채움 
- LightGBM
- CatBoost

## LightGBM

In [42]:
# 셀 1) LightGBM 적용
# ─────────────────────────────────────────────────────────────
import numpy as np
import pandas as pd
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from lightgbm import LGBMClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import f1_score

# 1) Target columns and the identifier columns to drop from the features
target_cols = ['Q1','Q2','Q3','S1','S2','S3']
drop_cols   = ['subject_id','date','lifelog_date']

# 2) Split features / targets
non_feature_cols = drop_cols + target_cols
y_train = train_model_df[target_cols]
y_val   = val_model_df[target_cols]

# 3) Keep numeric feature columns only; the validation frame is restricted
#    to exactly the columns selected on the training frame
X_train   = train_model_df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])
num_feats = X_train.columns
X_val     = val_model_df.drop(columns=non_feature_cols)[num_feats]

# 4) Pipeline: impute → scale → LGBM, one booster per target
lgbm_base = LGBMClassifier(n_estimators=100, random_state=42, n_jobs=-1)
lgbm_pipe = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
    MultiOutputClassifier(lgbm_base),
)

# 5) Fit
lgbm_pipe.fit(X_train, y_train)

# 6) Predict the validation split and compute macro-F1 per target
val_predictions = lgbm_pipe.predict(X_val)
y_pred = pd.DataFrame(val_predictions, index=y_val.index, columns=target_cols)

f1s = {}
for c in target_cols:
    f1s[c] = f1_score(y_val[c], y_pred[c], average='macro')

print("▶ LightGBM per-target F1:", f1s)
print("▶ LightGBM mean F1     :", np.mean(list(f1s.values())))


[LightGBM] [Info] Number of positive: 157, number of negative: 158
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.788067 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26913
[LightGBM] [Info] Number of data points in the train set: 315, number of used features: 585
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.498413 -> initscore=-0.006349
[LightGBM] [Info] Start training from score -0.006349
[LightGBM] [Info] Number of positive: 176, number of negative: 139
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 1.600632 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 26913
[LightGBM] [Info] Number of data points in the train set: 315, number of used features: 585
[LightGBM] [Info] [binary:



## CatBoost

In [29]:
# ──────────────────────────────────────────────────────────────────────────────
# CatBoost 를 MultiOutputClassifier 없이 타깃별로 직접 돌려주는 예제
# ──────────────────────────────────────────────────────────────────────────────
from catboost import CatBoostClassifier
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd

# NOTE: X_train, X_val, y_train, y_val and target_cols are not defined in this
# cell; they must already exist in the kernel from an earlier model cell.

# 1) Preprocessors: mean imputation followed by standardisation
imputer = SimpleImputer(strategy="mean")
scaler  = StandardScaler()

# 2) Fit both preprocessors on the training features, then apply the fitted
#    transforms to the validation features
X_train_num = scaler.fit_transform(imputer.fit_transform(X_train))
X_val_num   = scaler.transform(imputer.transform(X_val))

# 3) Train one CatBoost model per target and collect its validation predictions
#    (adjust these parameters as needed)
catboost_params = dict(
    iterations=200,
    learning_rate=0.1,
    depth=6,
    random_state=42,
    verbose=0,
)

y_pred = pd.DataFrame(index=y_val.index, columns=target_cols)

for col in target_cols:
    model = CatBoostClassifier(**catboost_params)
    model.fit(X_train_num, y_train[col])
    y_pred[col] = model.predict(X_val_num)

# 4) Per-target and mean macro-F1
f1s = {}
for c in target_cols:
    f1s[c] = f1_score(y_val[c], y_pred[c], average="macro")

print("▶ CatBoost per-target F1:", f1s)
print("▶ CatBoost mean F1     :", np.mean(list(f1s.values())))


▶ CatBoost per-target F1: {'Q1': 0.6517754239613632, 'Q2': 0.555052790346908, 'Q3': 0.5092664322745963, 'S1': 0.339274629777423, 'S2': 0.5619727449707982, 'S3': 0.5603287841191067}
▶ CatBoost mean F1     : 0.529611800908366


# Ensemble (아직 안함)

# Deep learning

In [31]:
# 1) 라이브러리 임포트
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import f1_score

# (assumes train_model_df, val_model_df, target_cols and drop_cols were already
#  defined by earlier cells)

# 2) Split features / targets
non_feature_cols = drop_cols + target_cols
y_train = train_model_df[target_cols]
y_val   = val_model_df[target_cols]

# 3) Keep numeric feature columns only; the validation frame is restricted
#    to exactly the columns selected on the training frame
X_train   = train_model_df.drop(columns=non_feature_cols).select_dtypes(include=[np.number])
num_feats = X_train.columns
X_val     = val_model_df.drop(columns=non_feature_cols)[num_feats]

# 4) Mean-impute missing values, then standardise; both transformers are
#    fitted on the training features and reused on the validation features
imp    = SimpleImputer(strategy="mean")
scaler = StandardScaler()

X_train_num = scaler.fit_transform(imp.fit_transform(X_train))
X_val_num   = scaler.transform(imp.transform(X_val))

# Shape summary of the arrays handed to the network
for label, arr in (
    ("▶ X_train_num shape:", X_train_num),
    ("▶ X_val_num   shape:", X_val_num),
    ("▶ y_train     shape:", y_train),
    ("▶ y_val       shape:", y_val),
):
    print(label, arr.shape)


▶ X_train_num shape: (315, 666)
▶ X_val_num   shape: (135, 666)
▶ y_train     shape: (315, 6)
▶ y_val       shape: (135, 6)


In [33]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Seed the Python, NumPy and TensorFlow RNGs so weight initialisation,
# dropout masks and batch shuffling are repeatable across runs
tf.keras.utils.set_random_seed(42)

# 1) Model definition
n_feats  = X_train_num.shape[1]
n_targets = len(target_cols)

# One sigmoid output per target, trained with binary cross-entropy.
# NOTE(review): this assumes every target column is a 0/1 label — confirm,
# since a target with more than two classes would need a different head/loss.
mlp = Sequential([
    # Explicit Input object instead of the legacy `input_shape=` layer argument
    tf.keras.Input(shape=(n_feats,)),
    Dense(256, activation="relu"),
    Dropout(0.3),
    Dense(128, activation="relu"),
    Dropout(0.3),
    Dense(n_targets, activation="sigmoid")
])

mlp.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss="binary_crossentropy"
)

mlp.summary()

# 2) Training with an early-stopping callback.
# NOTE(review): early stopping monitors the same validation split that is
# scored below, so the reported F1 is likely optimistic.
es = tf.keras.callbacks.EarlyStopping(
    monitor="val_loss", patience=5, restore_best_weights=True
)

history = mlp.fit(
    X_train_num, y_train.values,
    validation_data=(X_val_num, y_val.values),
    epochs=50,
    batch_size=32,
    callbacks=[es],
    verbose=2
)

# 3) Predict & compute F1
y_pred_proba = mlp.predict(X_val_num)
# Binarise the per-target probabilities at a 0.5 threshold
y_pred = (y_pred_proba > 0.5).astype(int)
y_pred = pd.DataFrame(y_pred, index=y_val.index, columns=target_cols)

f1s = {c: f1_score(y_val[c], y_pred[c], average="macro")
       for c in target_cols}

print("▶ MLP per-target F1:", f1s)
print("▶ MLP mean F1     :", np.mean(list(f1s.values())))


2025-05-27 23:29:26.768333: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-05-27 23:29:26.780322: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1748356166.791195 1108030 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1748356166.794361 1108030 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1748356166.803811 1108030 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Epoch 1/50


I0000 00:00:1748356170.367996 1127465 service.cc:152] XLA service 0x8ece4009c10 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1748356170.368046 1127465 service.cc:160]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
I0000 00:00:1748356170.368053 1127465 service.cc:160]   StreamExecutor device (1): NVIDIA RTX A6000, Compute Capability 8.6
I0000 00:00:1748356170.368057 1127465 service.cc:160]   StreamExecutor device (2): NVIDIA RTX A6000, Compute Capability 8.6
I0000 00:00:1748356170.368061 1127465 service.cc:160]   StreamExecutor device (3): NVIDIA RTX A6000, Compute Capability 8.6
2025-05-27 23:29:30.427112: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
E0000 00:00:1748356170.628141 1127465 cuda_dnn.cc:522] Loaded runtime CuDNN library: 9.1.0 but source was compiled with: 9.3.0.  CuDNN library needs to

FailedPreconditionError: Graph execution error:

Detected at node StatefulPartitionedCall defined at (most recent call last):
  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/runpy.py", line 196, in _run_module_as_main

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/runpy.py", line 86, in _run_code

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel_launcher.py", line 18, in <module>

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/traitlets/config/application.py", line 1075, in launch_instance

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/kernelapp.py", line 739, in start

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/tornado/platform/asyncio.py", line 205, in start

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/asyncio/base_events.py", line 603, in run_forever

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/asyncio/base_events.py", line 1909, in _run_once

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/asyncio/events.py", line 80, in _run

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 545, in dispatch_queue

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 534, in process_one

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 437, in dispatch_shell

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 362, in execute_request

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/kernelbase.py", line 778, in execute_request

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/ipkernel.py", line 449, in do_execute

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/ipykernel/zmqshell.py", line 549, in run_cell

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3077, in run_cell

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3132, in _run_cell

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/IPython/core/async_helpers.py", line 128, in _pseudo_sync_runner

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3336, in run_cell_async

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3519, in run_ast_nodes

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/IPython/core/interactiveshell.py", line 3579, in run_code

  File "/tmp/ipykernel_1108030/4286890921.py", line 30, in <module>

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/keras/src/utils/traceback_utils.py", line 117, in error_handler

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 377, in fit

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 220, in function

  File "/home/wonjun/.conda/envs/wonjun_base/lib/python3.10/site-packages/keras/src/backend/tensorflow/trainer.py", line 133, in multi_step_on_iterator

DNN library initialization failed. Look at the errors above for more details.
	 [[{{node StatefulPartitionedCall}}]] [Op:__inference_multi_step_on_iterator_1464]