# dwt 적용해서 feature 뽑기

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
import os

# Directory that holds the parquet data files
DATA_DIR = "../data/"

# Names of the data tables to load
file_names = [
    "mACStatus", "mScreenStatus", "mUsageStats", "mActivity", "mBle", "mWifi",
    "wHr", "wPedo", "mGps", "mLight","wLight", "mAmbience"
]

# Map each table name to the parquet file it is read from
data_files = {}
for name in file_names:
    data_files[name] = os.path.join(DATA_DIR, f"ch2025_{name}.parquet")

# Read every table, keeping it both in `dfs` and as a module-level variable
# of the same name (later cells refer to e.g. `wLight` directly).
dfs = {}
for name, file_path in data_files.items():
    frame = pd.read_parquet(file_path)
    dfs[name] = frame
    globals()[name] = frame
    print(f"Loaded {name} with shape {frame.shape}")

Loaded mACStatus with shape (939896, 3)
Loaded mScreenStatus with shape (939653, 3)
Loaded mUsageStats with shape (45197, 3)
Loaded mActivity with shape (961062, 3)
Loaded mBle with shape (21830, 3)
Loaded mWifi with shape (76336, 3)
Loaded wHr with shape (382918, 3)
Loaded wPedo with shape (748100, 9)
Loaded mGps with shape (800611, 3)
Loaded mLight with shape (96258, 3)
Loaded wLight with shape (633741, 3)
Loaded mAmbience with shape (476577, 3)


### 일단 간단하게 wLight dwt 적용

In [9]:
!pip install PyWavelets

Collecting PyWavelets
  Downloading pywavelets-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.0 kB)
Downloading pywavelets-1.8.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m27.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PyWavelets
Successfully installed PyWavelets-1.8.0


In [13]:
import pywt

# Parse the timestamp column as datetimes (written back onto the loaded frame),
# then order the rows by subject and, within each subject, by time.
wLight['timestamp'] = wLight['timestamp'].pipe(pd.to_datetime)
wLight = wLight.sort_values(by=['subject_id', 'timestamp'])

# Summarise a 1D signal through statistics of its wavelet decomposition
def extract_dwt_features(signal, wavelet='db4', level=6):
    """Return mean/std/energy of every coefficient array of a DWT of `signal`.

    Parameters
    ----------
    signal : 1D array-like passed straight to ``pywt.wavedec``.
    wavelet : wavelet name forwarded to ``pywt.wavedec``.
    level : decomposition level forwarded to ``pywt.wavedec``.

    Returns
    -------
    dict mapping ``level_<i>_mean``, ``level_<i>_std`` and ``level_<i>_energy``
    to scalars, where ``i`` is the position of the coefficient array in the
    list returned by ``pywt.wavedec`` (approximation + details).
    """
    bands = pywt.wavedec(signal, wavelet=wavelet, level=level)
    stats = {}
    for band_no, band in enumerate(bands):
        prefix = f'level_{band_no}'
        stats.update({
            f'{prefix}_mean': np.mean(band),
            f'{prefix}_std': np.std(band),
            # energy = sum of squared coefficients
            f'{prefix}_energy': np.sum(band**2),
        })
    return stats

# Per subject: fill gaps in the light signal, then extract DWT features
feature_rows = []
for subj, grp in wLight.groupby('subject_id'):
    # Series.ffill() replaces the deprecated fillna(method='ffill') spelling.
    # Any leading NaN that has no earlier value to carry forward becomes 0.
    values = grp['w_light'].ffill().fillna(0).values
    row = extract_dwt_features(values)
    row['subject_id'] = subj
    feature_rows.append(row)

# Assemble one row per subject into a DataFrame
features_df = pd.DataFrame(feature_rows)
print(features_df.head())
print(features_df.shape)


   level_0_mean  level_0_std  level_0_energy  level_1_mean  level_1_std  \
0   1896.855697  3468.180769    1.050090e+10    -77.191514  2563.281570   
1   1269.319542  3708.348697    2.020237e+10    -58.556863  2071.915189   
2   1249.935703  3211.219235    1.123306e+10    -44.914753  1711.498931   
3   1562.229941  3980.721287    1.777468e+10     59.418617  1897.019771   
4   2187.230936  5155.305936    2.552798e+10    -18.043144  2808.941594   

   level_1_energy  level_2_mean  level_2_std  level_2_energy  level_3_mean  \
0    4.419321e+09      0.685512  2250.379328    6.770846e+09      2.734507   
1    5.649584e+09     28.815372  1420.826885    5.299376e+09    -23.675415   
2    2.772959e+09     60.081667  1552.879161    4.552357e+09      8.183103   
3    3.501353e+09     68.298851  1990.576473    7.684194e+09     32.763466   
4    6.422849e+09     45.653551  2561.705812    1.064093e+10     26.037351   

   ...  level_4_mean  level_4_std  level_4_energy  level_5_mean  level_5_std  \


  values = grp['w_light'].fillna(method='ffill').fillna(0).values


In [None]:
import pywt

# 1) Convert timestamp to datetime and sort by subject, then time
wLight['timestamp'] = wLight['timestamp'].pipe(pd.to_datetime)
wLight = wLight.sort_values(by=['subject_id', 'timestamp'])

print(wLight.head())
# 2) Add a calendar-date column derived from the timestamp
wLight = wLight.assign(date=wLight['timestamp'].dt.date)

# 3) DWT feature extraction (defaults: db4 wavelet, decomposition level 6)
def extract_dwt_features(signal, wavelet='db4', level=6):
    """Decompose `signal` with ``pywt.wavedec`` and summarise each coefficient array.

    For the i-th array returned by ``pywt.wavedec`` the result holds
    ``level_<i>_mean``, ``level_<i>_std`` and ``level_<i>_energy`` (sum of
    squared coefficients), in that order.
    """
    reducers = (
        ('mean', np.mean),
        ('std', np.std),
        ('energy', lambda band: np.sum(band**2)),
    )
    return {
        f'level_{band_no}_{stat}': reduce_fn(band)
        for band_no, band in enumerate(pywt.wavedec(signal, wavelet=wavelet, level=level))
        for stat, reduce_fn in reducers
    }

# 4) Extract features for every (subject_id, date) group
feature_rows = []
for (subj, day), grp in wLight.groupby(['subject_id', 'date']):
    # Index the group by timestamp, take the w_light column, and average it
    # into 1-minute bins. '1min' is used because the '1T' alias is deprecated
    # and raises a FutureWarning on recent pandas. Empty bins are
    # forward-filled; anything still missing becomes 0.
    vals = (
        grp
        .set_index('timestamp')['w_light']
        .resample('1min').mean()
        .ffill().fillna(0)
        .values
    )

    feats = extract_dwt_features(vals)
    feats['subject_id'] = subj
    feats['date']       = day
    feature_rows.append(feats)

# 5) Assemble one row per (subject, day) into a DataFrame
daily_features = pd.DataFrame(feature_rows)
print(daily_features.head())
print(daily_features.shape)


  subject_id           timestamp  w_light        date
0       id01 2024-06-26 12:17:00    633.0  2024-06-26
1       id01 2024-06-26 12:18:00    483.0  2024-06-26
2       id01 2024-06-26 12:19:00    541.0  2024-06-26
3       id01 2024-06-26 12:20:00    547.0  2024-06-26
4       id01 2024-06-26 12:21:00    547.0  2024-06-26


  .resample('1T').mean()


   level_0_mean  level_0_std  level_0_energy  level_1_mean  level_1_std  \
0   2319.716888  1993.061284    1.590075e+08    220.529958   857.949949   
1   1487.508117  3064.055081    3.364323e+08   -380.241503  2156.959740   
2   2043.725484  4345.636720    6.687798e+08    496.916154  3143.140541   
3    443.503950   762.056177    2.176791e+07    -15.674754   227.526787   
4    643.657139  1392.467875    6.824458e+07     58.644312   399.099776   

   level_1_energy  level_2_mean  level_2_std  level_2_energy  level_3_mean  \
0    1.334010e+07   -136.793320   868.805762    2.165900e+07   -105.754601   
1    1.391147e+08    238.336705  1913.642816    1.896605e+08   -102.318272   
2    2.936615e+08    626.101028  3516.648909    6.506999e+08   -266.706257   
3    1.456396e+06     -5.185987    99.312189    4.944903e+05     -8.925586   
4    4.718874e+06      8.353223   630.773780    2.029521e+07    -14.666764   

   ...  level_4_std  level_4_energy  level_5_mean  level_5_std  \
0  ...   356.3

# DWT 적용!

In [22]:
import pandas as pd
import numpy as np

def _total_usage_time(records):
    """Sum the 'total_time' entries (0 when absent) over one list of records."""
    return sum(rec.get('total_time', 0) for rec in records)


def _mean_or_nan(samples):
    """Mean of a non-empty sequence, NaN for an empty one."""
    if len(samples) > 0:
        return np.mean(samples)
    return np.nan


def _rssi_weight_sum(devices):
    """Sum exp(rssi / 10) over one list of records (rssi defaults to 0 when absent)."""
    return sum(np.exp(dev.get('rssi', 0) / 10) for dev in devices)


# 1) mUsageStats: one total usage time per row
prep_mUsageStats = mUsageStats[['subject_id', 'timestamp']].copy()
prep_mUsageStats['total_usage_time'] = mUsageStats['m_usage_stats'].apply(_total_usage_time)

# 2) wHr: average of the heart-rate samples stored in each row
prep_wHr = wHr[['subject_id', 'timestamp']].copy()
prep_wHr['avg_heart_rate'] = wHr['heart_rate'].apply(_mean_or_nan)

# 3) mBle: exponentially weighted RSSI sum per row
prep_mBle = mBle[['subject_id', 'timestamp']].copy()
prep_mBle['wb_rssi'] = mBle['m_ble'].apply(_rssi_weight_sum)

# 4) mWifi: same weighting as mBle
prep_mWifi = mWifi[['subject_id', 'timestamp']].copy()
prep_mWifi['ww_rssi'] = mWifi['m_wifi'].apply(_rssi_weight_sum)

# 5) wLight is used as-is (copied so later edits do not touch the source frame)
prep_wLight = wLight.copy()

# 6) mGps
def avg_gps(arr):
    """Average the GPS fields over one sequence of GPS record dicts.

    Parameters
    ----------
    arr : sequence of dicts that may carry 'altitude', 'latitude',
        'longitude' and 'speed'. May be empty or None.

    Returns
    -------
    pd.Series indexed by 'avg_alt', 'avg_lat', 'avg_long', 'avg_speed'.
    A field is NaN when `arr` is empty/None or when no record supplies a
    usable value for it; missing keys and None values are ignored.
    """
    field_map = {
        'avg_alt': 'altitude',
        'avg_lat': 'latitude',
        'avg_long': 'longitude',
        'avg_speed': 'speed',
    }
    if arr is None or len(arr) == 0:
        return pd.Series({out_name: np.nan for out_name in field_map})

    means = {}
    for out_name, key in field_map.items():
        # dtype=float turns None entries into NaN so they are skipped as well.
        vals = np.array([d.get(key, np.nan) for d in arr], dtype=float)
        # np.nanmean warns ("Mean of empty slice") on all-NaN input, so that
        # case is answered directly instead of being passed through.
        means[out_name] = np.nan if np.isnan(vals).all() else np.nanmean(vals)
    return pd.Series(means)

# Per-row GPS averages, placed side by side with the id/timestamp columns
gps_avgs = mGps['m_gps'].apply(avg_gps)
prep_mGps = pd.concat(
    [mGps[['subject_id', 'timestamp']].copy(), gps_avgs],
    axis=1,
)

# 7) wPedo: keep only the identifying columns plus distance and speed
prep_wPedo = wPedo.loc[:, ['subject_id', 'timestamp', 'distance', 'speed']].copy()


In [27]:
import pandas as pd
import numpy as np
import pywt

# Gather the seven prep_ dataframes in one dict, keyed by table name.
# Insertion order matters: the DWT level list below is matched by position.
prep_dfs = dict(
    mUsageStats=prep_mUsageStats,
    wHr=prep_wHr,
    mBle=prep_mBle,
    mWifi=prep_mWifi,
    wLight=prep_wLight,
    mGps=prep_mGps,
    wPedo=prep_wPedo,
)

# 1) DWT feature extraction function
def extract_dwt_features(signal, wavelet='db4', level=6):
    """Summarise each coefficient array of ``pywt.wavedec(signal, ...)``.

    Returns a dict whose keys are ``lev<i>_mean``, ``lev<i>_std`` and
    ``lev<i>_energy`` (sum of squared coefficients) for the i-th array in the
    list returned by ``pywt.wavedec``, in that order.
    """
    coeffs = pywt.wavedec(signal, wavelet=wavelet, level=level)
    names, values = [], []
    for i in range(len(coeffs)):
        band = coeffs[i]
        names += [f'lev{i}_mean', f'lev{i}_std', f'lev{i}_energy']
        values += [np.mean(band), np.std(band), np.sum(band**2)]
    return dict(zip(names, values))

# 2) DWT level for each dataframe, matched to prep_dfs by position (tunable)
level_list = [3,3,3,3,3,3,3]
# zip() stops at the shorter input, so a length mismatch would silently drop
# tables from the result; fail loudly instead.
if len(level_list) != len(prep_dfs):
    raise ValueError(
        f"level_list has {len(level_list)} entries but prep_dfs has {len(prep_dfs)}"
    )

daily_features = {}

for (name, df), lvl in zip(prep_dfs.items(), level_list):
    # 3) Convert timestamp to datetime (written back onto the prep frame),
    #    sort, and add a calendar-date column on the sorted copy
    df['timestamp'] = pd.to_datetime(df['timestamp'])
    df = df.sort_values(['subject_id', 'timestamp'])
    df['date'] = df['timestamp'].dt.date

    # 4) Keep numeric columns only, excluding subject_id
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()
    numeric_cols = [c for c in numeric_cols if c != 'subject_id']

    # 5) Extract DWT features for every (subject, day) group
    feature_rows = []
    for (subj, day), grp in df.groupby(['subject_id', 'date']):
        row = {'subject_id': subj, 'date': day}
        for col in numeric_cols:
            # Mean per 1-minute bin; bins without data are filled with 0.
            # '1min' is used because the '1T' alias is deprecated and raises a
            # FutureWarning on recent pandas.
            ts = grp.set_index('timestamp')[col].resample('1min').mean().fillna(0).values
            feats = extract_dwt_features(ts, level=lvl)
            # Prefix every feature name with its source column
            for k, v in feats.items():
                row[f'{col}_{k}'] = v
        feature_rows.append(row)

    # 6) Store the per-table result as a DataFrame
    daily_features[name] = pd.DataFrame(feature_rows)

# 7) Sample check: daily DWT features of mUsageStats
print(daily_features['mUsageStats'].head())
print(daily_features['mUsageStats'].shape)


  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values
  ts = grp.set_index('timestamp')[col].resample('1T').mean().fillna(0).values


  subject_id        date  total_usage_time_lev0_mean  \
0       id01  2024-06-26               142850.553034   
1       id01  2024-06-27               111931.971924   
2       id01  2024-06-28               109725.298833   
3       id01  2024-06-29                96803.554352   
4       id01  2024-06-30               152565.531960   

   total_usage_time_lev0_std  total_usage_time_lev0_energy  \
0              320174.917383                  1.044805e+13   
1              155658.006840                  6.542956e+12   
2              141429.307699                  4.678116e+12   
3              131253.540690                  3.351401e+12   
4              204423.231830                  8.588593e+12   

   total_usage_time_lev1_mean  total_usage_time_lev1_std  \
0                16868.787990              195746.453812   
1               -10115.412519              225115.986414   
2                -3088.248837              207698.429065   
3                 7762.613261              182339.

In [31]:
import pandas as pd
import numpy as np

# --- Build prep_mActivity ---
# 1. Parse timestamps, sort by subject/time, add a calendar-date column
mActivity['timestamp'] = pd.to_datetime(mActivity['timestamp'])
mActivity = mActivity.sort_values(['subject_id', 'timestamp'])
mActivity['date'] = mActivity['timestamp'].dt.date

# 2. Activity codes that get their own count/ratio columns (0,1,2,3,4,5,7,8)
activity_codes = [0,1,2,3,4,5,7,8]

rows = []
for (subj, day), grp in mActivity.groupby(['subject_id','date']):
    activity = grp['m_activity']
    n_samples = len(grp)
    code_counts = activity.value_counts().reindex(activity_codes, fill_value=0)
    code_ratios = code_counts / n_samples
    # Transition count: how often consecutive activity values differ. The very
    # first row always differs from its (missing) predecessor, hence the -1.
    changed = activity.shift() != activity
    n_transitions = changed.sum() - 1
    record = {
        'subject_id': subj,
        'date':       day,
        'transition_count': int(n_transitions)
    }
    # One count and one ratio column per activity code
    for code in activity_codes:
        record[f'act_{code}_count'] = int(code_counts[code])
        record[f'act_{code}_ratio'] = float(code_ratios[code])
    rows.append(record)

prep_mActivity = pd.DataFrame(rows)
print(prep_mActivity.head())
print(prep_mActivity.shape)


# --- Build prep_mAmbience ---
# 1. Parse timestamps, sort by subject/time, add a calendar-date column
mAmbience['timestamp'] = pd.to_datetime(mAmbience['timestamp'])
mAmbience = mAmbience.sort_values(['subject_id','timestamp'])
mAmbience['date'] = mAmbience['timestamp'].dt.date

# 2. Sorted pool of every distinct label that occurs anywhere in the data
label_pool = sorted({
    label
    for entries in mAmbience['m_ambience']
    for label, prob in entries
})

rows = []
for (subj, day), grp in mAmbience.groupby(['subject_id','date']):
    # Running probability total per label, starting from zero for each group
    prob_totals = dict.fromkeys(label_pool, 0.0)
    for entries in grp['m_ambience']:
        for label, prob in entries:
            prob_totals[label] = prob_totals[label] + float(prob)
    record = {'subject_id': subj, 'date': day}
    # One probability-sum column per label, in label_pool order
    record.update((f'{lbl}_prob_sum', prob_totals[lbl]) for lbl in label_pool)
    rows.append(record)

prep_mAmbience = pd.DataFrame(rows)
print(prep_mAmbience.head())
print(prep_mAmbience.shape)


  subject_id        date  transition_count  act_0_count  act_0_ratio  \
0       id01  2024-06-26                32           89     0.125176   
1       id01  2024-06-27                39          211     0.146528   
2       id01  2024-06-28                40          161     0.111806   
3       id01  2024-06-29                27           95     0.065972   
4       id01  2024-06-30                26          199     0.138194   

   act_1_count  act_1_ratio  act_2_count  act_2_ratio  act_3_count  \
0            1     0.001406            0          0.0          478   
1            0     0.000000            0          0.0          880   
2            1     0.000694            0          0.0         1241   
3            0     0.000000            0          0.0         1320   
4            0     0.000000            0          0.0         1229   

   act_3_ratio  act_4_count  act_4_ratio  act_5_count  act_5_ratio  \
0     0.672293          112     0.157525            0          0.0   
1     

In [32]:
import pandas as pd
from functools import reduce

# assume daily_features is a dict of DataFrames for the 7 DWT‐processed tables:
# daily_features = {
#     'mUsageStats': dwt_mUsageStats,
#     'wHr':          dwt_wHr,
#     'mBle':         dwt_mBle,
#     'mWifi':        dwt_mWifi,
#     'wLight':       dwt_wLight,
#     'mGps':         dwt_mGps,
#     'wPedo':        dwt_wPedo,
# }

# and the two preprocessed frames:
# prep_mActivity, prep_mAmbience

# 1. gather every per-day feature table into one list
dfs = [*daily_features.values(), prep_mActivity, prep_mAmbience]

# 2. fold the tables together, left to right, with outer joins on
#    ['subject_id', 'date'] so no subject/day present in any table is dropped
join_keys = ['subject_id', 'date']
merged_df = dfs[0]
for frame in dfs[1:]:
    merged_df = pd.merge(merged_df, frame, on=join_keys, how='outer')

# 3. inspect
print(merged_df.head())
print(merged_df.shape)


  subject_id        date  total_usage_time_lev0_mean  \
0       id01  2024-06-26               142850.553034   
1       id01  2024-06-27               111931.971924   
2       id01  2024-06-28               109725.298833   
3       id01  2024-06-29                96803.554352   
4       id01  2024-06-30               152565.531960   

   total_usage_time_lev0_std  total_usage_time_lev0_energy  \
0              320174.917383                  1.044805e+13   
1              155658.006840                  6.542956e+12   
2              141429.307699                  4.678116e+12   
3              131253.540690                  3.351401e+12   
4              204423.231830                  8.588593e+12   

   total_usage_time_lev1_mean  total_usage_time_lev1_std  \
0                16868.787990              195746.453812   
1               -10115.412519              225115.986414   
2                -3088.248837              207698.429065   
3                 7762.613261              182339.

# Train and test

In [33]:
# Build the paths from DATA_DIR (set in the data-loading cell) so that moving
# the data folder only requires changing that one constant. The resulting
# paths are the same as the previously hard-coded '../data/...' strings.
train_df = pd.read_csv(os.path.join(DATA_DIR, 'ch2025_metrics_train.csv'))
test_df = pd.read_csv(os.path.join(DATA_DIR, 'ch2025_submission_sample.csv'))

In [36]:
# (rows, columns) of the training metrics table and of the submission sample
train_df.shape, test_df.shape

((450, 9), (250, 9))