# Data preparation

In [2]:
import os
import numpy as np
import pandas as pd

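## Variables to dataframe
- `df_{file}`: raw DataFrame loaded from `ch2025_{file}.parquet`
- `df_prep_{file}`: preprocessed DataFrame derived from `df_{file}`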

In [13]:
# Directory that holds the per-sensor parquet files.
challenge2025_dataset_path = "ETRI_lifelog_dataset/ch2025_data_items/"

# Sensor tables to load; each one is stored as ch2025_{name}.parquet.
file_names = [
    "mACStatus",
    "mScreenStatus",
    "mUsageStats",
    "mActivity",
    "mBle",
    "mWifi"
]

# Read every table into df_dict (keyed by sensor name) and report its shape.
df_dict = {}
for name in file_names:
    file_path = os.path.join(challenge2025_dataset_path, f"ch2025_{name}.parquet")
    df_dict[name] = pd.read_parquet(file_path)
    print(name)
    print(f"df_{name}: {df_dict[name].shape}")

# Bind the df_{file} names explicitly. The cells below read df_mUsageStats,
# df_mActivity, df_mBle and df_mWifi directly, so without these assignments a
# fresh-kernel "Run All" would fail with NameError.
df_mACStatus = df_dict["mACStatus"]
df_mScreenStatus = df_dict["mScreenStatus"]
df_mUsageStats = df_dict["mUsageStats"]
df_mActivity = df_dict["mActivity"]
df_mBle = df_dict["mBle"]
df_mWifi = df_dict["mWifi"]
    

mACStatus
df_mACStatus: (939896, 3)
mScreenStatus
df_mScreenStatus: (939653, 3)
mUsageStats
df_mUsageStats: (45197, 3)
mActivity
df_mActivity: (961062, 3)
mBle
df_mBle: (21830, 3)
mWifi
df_mWifi: (76336, 3)


## Train + Test

In [21]:
# Training metrics and the submission sample (used here as the test set).
df_train = pd.read_csv('ETRI_lifelog_dataset/ch2025_metrics_train.csv')
df_test = pd.read_csv('ETRI_lifelog_dataset/ch2025_submission_sample.csv')

# Stack train on top of test. ignore_index=True gives the combined frame a
# fresh 0..n-1 index; without it both halves keep their own 0-based labels and
# a label lookup such as df_total.loc[0] would match one row from each half.
df_total = pd.concat([df_train, df_test], axis=0, ignore_index=True)

print("df_total.shape:", df_total.shape)
df_total.head()

df_total.shape: (700, 9)


Unnamed: 0,subject_id,sleep_date,lifelog_date,Q1,Q2,Q3,S1,S2,S3
0,id01,2024-06-27,2024-06-26,0,0,0,0,0,1
1,id01,2024-06-28,2024-06-27,0,0,0,0,1,1
2,id01,2024-06-29,2024-06-28,1,0,0,1,1,1
3,id01,2024-06-30,2024-06-29,1,0,1,2,0,0
4,id01,2024-07-01,2024-06-30,0,1,1,1,1,1


## df_merge

In [None]:
import pandas as pd

# Convert lifelog_date to datetime in case it is still stored as strings.
df_total['lifelog_date'] = pd.to_datetime(df_total['lifelog_date'])

# Adding (1 day - 10 minutes) to a subject's last date moves the end of the
# range to 23:50 of that day, so the last 10-minute slot is included.
last_slot_offset = pd.Timedelta(days=1) - pd.Timedelta(minutes=10)

# Earliest and latest lifelog_date of every subject.
date_bounds = df_total.groupby('subject_id')['lifelog_date'].agg(['min', 'max'])

# One frame per subject: a 10-minute timestamp grid spanning its date range.
result_list = [
    pd.DataFrame({
        'subject_id': sid,
        'timestamp': pd.date_range(start=first_date, end=last_date + last_slot_offset, freq='10min'),
    })
    for sid, first_date, last_date in date_bounds.itertuples(name=None)
]

# Stack the per-subject grids into a single frame.
df_merge = pd.concat(result_list, ignore_index=True)

print(df_merge)
print(df_merge.shape)

       subject_id           timestamp
0            id01 2024-06-26 00:00:00
1            id01 2024-06-26 00:10:00
2            id01 2024-06-26 00:20:00
3            id01 2024-06-26 00:30:00
4            id01 2024-06-26 00:40:00
...           ...                 ...
122827       id10 2024-09-26 23:10:00
122828       id10 2024-09-26 23:20:00
122829       id10 2024-09-26 23:30:00
122830       id10 2024-09-26 23:40:00
122831       id10 2024-09-26 23:50:00

[122832 rows x 2 columns]
(122832, 2)


### df_prep_mUsageStats

In [35]:
# Look at one raw m_usage_stats entry (row label 1) to see its nested layout.
df_mUsageStats.loc[1, 'm_usage_stats']

array([{'app_name': '통화', 'total_time': 26419},
       {'app_name': '토스', 'total_time': 119896},
       {'app_name': '전화', 'total_time': 59284},
       {'app_name': '카카오톡', 'total_time': 6744},
       {'app_name': 'NAVER', 'total_time': 67042},
       {'app_name': '\xa0✝️성경일독Q', 'total_time': 1504},
       {'app_name': 'One UI 홈', 'total_time': 209417}], dtype=object)

In [42]:
# Keep only the key columns ('subject_id', 'timestamp') in an independent copy.
df_prep_mUsageStats = df_mUsageStats.loc[:, ['subject_id', 'timestamp']].copy()

# Helper that adds up the total_time values of one row's usage records.
def sum_usage_time(usage_stats):
    """Return the sum of ``total_time`` over the records in ``usage_stats``.

    ``usage_stats`` is an iterable of dict-like records. A record that has no
    ``total_time`` key counts as 0.
    """
    return sum(usage.get('total_time', 0) for usage in usage_stats)

# Total usage time per record, computed element-wise from m_usage_stats.
usage_totals = df_mUsageStats['m_usage_stats'].map(sum_usage_time)
df_prep_mUsageStats['m_usage_time'] = usage_totals

# Show the first three rows.
print(df_prep_mUsageStats.head(3))


  subject_id           timestamp  m_usage_time
0       id01 2024-06-26 13:00:00          7955
1       id01 2024-06-26 13:10:00        490306
2       id01 2024-06-26 13:20:00        599985


### df_prep_mActivity

In [53]:
import pandas as pd

# 1-2. Take the key columns and add a met_activity column initialised to 0.0.
df_prep_mActivity = df_mActivity[['subject_id', 'timestamp']].assign(met_activity=0.0)

# 3. MET value assigned to each m_activity code. Code 6 has no entry.
# NOTE(review): the codes look like Android activity-recognition constants - confirm.
activity_to_met = {
    0: 1.3,
    1: 7.2,
    2: 2.3,
    3: 1.1,
    4: 1.0,
    5: 1.3,
    7: 3.4,
    8: 8.0,
}

def get_time_weight(ts):
    """Return the time-of-day weight for ``ts`` based on its ``hour`` attribute.

    Hours 0-7 give 0.3, hours 8-17 give 0.7, and anything else gives 1.0.
    """
    hour = ts.hour
    # (first hour, hour after last, weight) for each weighted band.
    bands = ((0, 8, 0.3), (8, 18, 0.7))
    for first_hour, end_hour, weight in bands:
        if first_hour <= hour < end_hour:
            return weight
    return 1.0

# Make sure timestamp is a datetime column so each value exposes .hour.
df_prep_mActivity['timestamp'] = pd.to_datetime(df_prep_mActivity['timestamp'])

# 4. Compute the weighted MET value for every row.
def calc_weighted_met(row):
    """Return MET x time-of-day weight for a single row of df_prep_mActivity.

    The activity code is read from df_mActivity at the row's index label and
    translated through activity_to_met; codes without an entry use 1.0.
    Kept as the row-wise reference; the column-wise computation below gives the
    same values without one Python call and one .loc lookup per row.
    """
    met = activity_to_met.get(df_mActivity.loc[row.name, 'm_activity'], 1.0)  # 1.0 when the code is unmapped
    weight = get_time_weight(row['timestamp'])
    return met * weight

# Column-wise equivalent of df_prep_mActivity.apply(calc_weighted_met, axis=1).
# Series.map with a dict yields NaN for unmapped codes, which fillna turns into
# the same 1.0 default. Both Series share df_mActivity's index, so the product
# lines up row by row.
met_per_row = df_mActivity['m_activity'].map(activity_to_met).fillna(1.0)
weight_per_row = df_prep_mActivity['timestamp'].map(get_time_weight)
df_prep_mActivity['met_activity'] = met_per_row * weight_per_row

# Check the result (last 10 rows).
print(df_prep_mActivity.tail(10))


       subject_id           timestamp  met_activity
961052       id10 2024-09-26 23:50:00           1.1
961053       id10 2024-09-26 23:51:00           1.1
961054       id10 2024-09-26 23:52:00           1.1
961055       id10 2024-09-26 23:53:00           1.1
961056       id10 2024-09-26 23:54:00           1.1
961057       id10 2024-09-26 23:55:00           1.1
961058       id10 2024-09-26 23:56:00           1.1
961059       id10 2024-09-26 23:57:00           1.1
961060       id10 2024-09-26 23:58:00           1.1
961061       id10 2024-09-26 23:59:00           1.1


### df_prep_mBle

In [46]:
# Look at one raw m_ble entry (row label 2) to see its nested layout.
df_mBle.loc[2, 'm_ble']

array([{'address': '04:F5:AE:39:95:E0', 'device_class': '0', 'rssi': -44},
       {'address': '0D:BE:52:E6:13:00', 'device_class': '0', 'rssi': -80},
       {'address': '0F:13:09:75:36:FE', 'device_class': '0', 'rssi': -76},
       {'address': '15:61:31:49:2F:F5', 'device_class': '0', 'rssi': -90},
       {'address': '2B:70:D0:E0:3C:84', 'device_class': '0', 'rssi': -83},
       {'address': '2F:EF:C3:70:A0:97', 'device_class': '0', 'rssi': -41},
       {'address': '30:EF:FE:9E:E4:AD', 'device_class': '0', 'rssi': -70},
       {'address': '38:54:47:EA:74:E1', 'device_class': '0', 'rssi': -61},
       {'address': '38:C8:8D:5C:AD:83', 'device_class': '0', 'rssi': -71},
       {'address': '40:BC:AF:DD:04:C5', 'device_class': '0', 'rssi': -92},
       {'address': '45:33:4C:24:C4:C9', 'device_class': '0', 'rssi': -83},
       {'address': '47:F1:F3:8D:95:20', 'device_class': '0', 'rssi': -88},
       {'address': '54:15:89:95:27:44', 'device_class': '1064', 'rssi': -75},
       {'address': '54

In [None]:
# Keep only the key columns ('subject_id', 'timestamp') in an independent copy.
df_prep_mBle = df_mBle.loc[:, ['subject_id', 'timestamp']].copy()

# Helper that condenses one BLE scan into a single signal-strength score.
def sum_ble_rssi(ble_stats):
    """Return the sum of ``exp(rssi / 10)`` over the records in ``ble_stats``.

    Parameters
    ----------
    ble_stats : iterable of dict-like
        Records of one scan; each may carry an ``rssi`` value.

    Returns
    -------
    float
        The summed score. Records without an ``rssi`` value add nothing, so an
        empty scan (or one with no readings at all) gives 0.0.
    """
    sum_rssi = 0.0
    for ble in ble_stats:
        rssi = ble.get('rssi')
        if rssi is None:
            # Skip records with no reading. Substituting rssi = 0 would add
            # exp(0) = 1.0, far more than any real reading contributes
            # (e.g. exp(-44 / 10) is about 0.012).
            continue
        sum_rssi += np.exp(rssi / 10)
    return sum_rssi

# BLE score per scan, computed element-wise from m_ble.
ble_scores = df_mBle['m_ble'].map(sum_ble_rssi)
df_prep_mBle['m_wtb_rssi'] = ble_scores

# Show the first three rows and the overall shape.
print(df_prep_mBle.head(3))
print(df_prep_mBle.shape)


  subject_id           timestamp  m_wtb_rssi
0       id01 2024-06-26 12:13:00    0.102155
1       id01 2024-06-26 12:23:00    0.098621
2       id01 2024-06-26 12:33:00    0.037712
(21830, 3)


In [51]:

# Keep only the key columns ('subject_id', 'timestamp') in an independent copy.
df_prep_mWifi = df_mWifi.loc[:, ['subject_id', 'timestamp']].copy()

# Helper that condenses one Wi-Fi scan into a single signal-strength score.
def sum_wifi_rssi(wifi_stats):
    """Return the sum of ``exp(rssi / 10)`` over the records in ``wifi_stats``.

    Parameters
    ----------
    wifi_stats : iterable of dict-like
        Records of one scan; each may carry an ``rssi`` value.

    Returns
    -------
    float
        The summed score. Records without an ``rssi`` value add nothing, so an
        empty scan (or one with no readings at all) gives 0.0.
    """
    sum_rssi = 0.0
    for wifi in wifi_stats:
        rssi = wifi.get('rssi')
        if rssi is None:
            # Skip records with no reading. Substituting rssi = 0 would add
            # exp(0) = 1.0, which would swamp the contributions of real
            # (negative) readings.
            continue
        sum_rssi += np.exp(rssi / 10)
    return sum_rssi

# Wi-Fi score per scan, computed element-wise from m_wifi.
wifi_scores = df_mWifi['m_wifi'].map(sum_wifi_rssi)
df_prep_mWifi['m_wtw_rssi'] = wifi_scores

# Show the first three rows and the overall shape.
print(df_prep_mWifi.head(3))
print(df_prep_mWifi.shape)


  subject_id           timestamp  m_wtw_rssi
0       id01 2024-06-26 12:03:00    0.202476
1       id01 2024-06-26 12:13:00    0.091135
2       id01 2024-06-26 12:23:00    0.063361
(76336, 3)
