### ETRI Lifelog Data Preprocessing

<br><br>

### Development Environment

In [19]:
import os
import tqdm
import glob
import pyarrow
import numpy as np
import pandas as pd
from termcolor import colored
from functools import reduce
from dateutil import parser

### 0.0 Train Dataset

#### 0.1 Train Label Dataframe

In [20]:
# Load the training labels and split them into one date-sorted frame per subject.
path = '/workspace/human_understanding_2024/train_label.csv'
train_label_df = pd.read_csv(path)
train_label_df = train_label_df.sort_values("subject_id")
train_users = sorted(set(train_label_df['subject_id']))

user_label_dfs = []
for train_user in train_users:
    user_label_df = (
        train_label_df[train_label_df['subject_id'] == train_user]
        .drop(columns='Unnamed: 0')
        .sort_values("date")
        .reset_index(drop=True)
    )
    # Positional rename -- assumes the remaining CSV columns are ordered
    # subject id, date, Q1..Q3, S1..S4 (TODO confirm against the file).
    user_label_df.columns = ['userId', 'date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']
    user_label_dfs.append(user_label_df)

# Show only the head and tail of the combined frame.
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_colwidth', None)
all_user_label_df = pd.concat(user_label_dfs, axis=0).reset_index(drop=True)
display(all_user_label_df)

Unnamed: 0,userId,date,Q1,Q2,Q3,S1,S2,S3,S4
0,user01,2020-08-30,1,0,0,1,1,0,0
1,user01,2020-08-31,0,0,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...
506,user30,2020-09-24,1,0,1,0,1,1,1
507,user30,2020-09-25,0,0,1,0,1,1,0


#### 0.2 Train Sleep Time Dataframe

In [21]:
# Load the sleep records and derive, per user, the time window attached to each date.
path = '/workspace/human_understanding_2025/user_sleep_2020.csv'
user_sleep_df = pd.read_csv(path)
user_sleep_df = user_sleep_df[['userId', 'date', 'startDt', 'endDt', 'lastUpdate']]
train_sleep_users = sorted(set(user_sleep_df['userId']))

user_Dt_dfs = []
for train_sleep_user in train_sleep_users:
    user_Dt_df = (
        user_sleep_df[user_sleep_df['userId'] == train_sleep_user]
        .sort_values("date")
        .reset_index(drop=True)
    )
    last_updates = list(user_Dt_df['lastUpdate'])
    # A window starts at the previous record's lastUpdate; the first record has
    # no predecessor, so its own startDt is used instead.
    user_Dt_df['startPosition'] = list(user_Dt_df['startDt'])[0:1] + last_updates[:-1]
    # Every window ends at the record's own lastUpdate.
    user_Dt_df['endPosition'] = last_updates
    user_Dt_dfs.append(user_Dt_df)

print(colored("Sleep Time", attrs=['bold']))
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_colwidth', None)
all_user_Dt_df = pd.concat(user_Dt_dfs, axis=0).reset_index(drop=True)
display(all_user_Dt_df)

[1mSleep Time[0m


Unnamed: 0,userId,date,startDt,endDt,lastUpdate,startPosition,endPosition
0,user01,2020-08-31,1598802240,1598830980,1598838373,1598802240,1598838373
1,user01,2020-09-01,1598897280,1598922060,1598923633,1598838373,1598923633
...,...,...,...,...,...,...,...
613,user30,2020-09-25,1600965120,1600987200,1600987323,1600900872,1600987323
614,user30,2020-09-26,1601058000,1601079120,1601086511,1600987323,1601086511


#### 0.3 Time, Feature Dataframe from Folder Name

In [22]:
user_ids = []; user_Dts = []; user_features = []; user_missing_features = []
components = []

directory = ['user01-06/', 'user07-10/',
             'user11-12/', 'user21-25/', 'user26-30/']

# Walk <group>/<user>/<recording folder>/<feature>/ and count the CSV files of each feature.
# The loop variable is not called `dir`, which would shadow the builtin for the
# rest of the kernel session.
for group_dir in directory:
    p = "/workspace/human_understanding_2025/" + group_dir
    for user in glob.glob(p + "*" + os.path.sep):
        for component in glob.glob(user + "*" + os.path.sep):
            # The glob pattern ends with a separator, so the path ends with one too:
            # the last split element is empty, [-2] is the recording folder and
            # [-3] is the user folder.
            path_parts = component.split("/")
            user_ids.append(path_parts[-3])
            user_Dts.append(path_parts[-2])

            # Map feature folder name -> number of CSV files it contains.
            user_feature = {}
            for feature in glob.glob(component + "*" + os.path.sep):
                user_feature[feature.split("/")[-2]] = len(glob.glob(feature + "*.csv"))

            # Features whose folder exists but holds no CSV file at all.
            user_missing_feature = [name for name, csv_num in user_feature.items() if csv_num == 0]

            user_features.append(user_feature)
            components.append(component)
            # An empty list is recorded as the sentinel string "all right".
            user_missing_features.append(user_missing_feature or "all right")

# Tabulate the scan results: one row per recording folder.
user_folder_df = pd.DataFrame({"userId": user_ids,
                               "Dt": user_Dts}).sort_values("userId")

user_feature_df = pd.DataFrame({"userId": user_ids,
                                "Dt": user_Dts,
                                "missing feature": user_missing_features,
                                "feature": user_features,
                                "path": components}).sort_values("userId")

# Split both tables into per-user frames ordered by recording folder name (Dt).
user_folder_dfs = []; user_feature_dfs = []
train_startDt_users = sorted(set(user_ids))
for train_startDt_user in train_startDt_users:
    folder_user_df = (
        user_folder_df[user_folder_df['userId'] == train_startDt_user]
        .sort_values("Dt")
        .reset_index(drop=True)
    )
    user_folder_dfs.append(folder_user_df)

    feature_user_df = (
        user_feature_df[user_feature_df['userId'] == train_startDt_user]
        .sort_values("Dt")
        .reset_index(drop=True)
    )
    user_feature_dfs.append(feature_user_df)

# Display options are applied only when at least one user folder was found.
if train_startDt_users:
    pd.set_option('display.max_rows', 4)
    pd.set_option('display.max_colwidth', None)

# Show only the head and tail of each table, with untruncated cell contents.
pd.set_option('display.max_rows', 4)
pd.set_option('display.max_colwidth', None)

print(colored("Time from Folder Name", attrs=['bold']))
all_user_folder_df = pd.concat(user_folder_dfs, axis=0).reset_index(drop=True)
display(all_user_folder_df)
print()

print(colored("Feature from Folder Name", attrs=['bold']))
all_user_feature_df = pd.concat(user_feature_dfs, axis=0).reset_index(drop=True)
display(all_user_feature_df)
print()

# Keep only recording folders with at least one feature lacking CSV files.
# "missing feature" holds either a list of names or the sentinel "all right".
# Selecting with a mask (instead of looping with `user_feature_df` as the loop
# variable) leaves `user_feature_df`, `user_ids`, `user_Dts` and
# `user_missing_features` untouched.
has_missing = (
    all_user_feature_df['missing feature']
    .map(lambda missing: missing != "all right")
    .astype(bool)
)
all_user_missing_feature_df = (
    all_user_feature_df.loc[has_missing, ['userId', 'Dt', 'missing feature']]
    .reset_index(drop=True)
)

print(colored("Missing Feature from Folder Name", attrs=['bold']))
display(all_user_missing_feature_df)
print()

[1mTime from Folder Name[0m


Unnamed: 0,userId,Dt
0,user01,1598759880
1,user01,1598832660
...,...,...
569,user30,1601079300
570,user30,1601165700



[1mFeature from Folder Name[0m


Unnamed: 0,userId,Dt,missing feature,feature,path
0,user01,1598759880,"[e4Bvp, e4Acc, e4Eda, e4Hr, e4Temp]","{'mGps': 463, 'mAcc': 437, 'e4Bvp': 0, 'e4Acc': 0, 'mGyr': 405, 'mMag': 433, 'e4Eda': 0, 'e4Hr': 0, 'e4Temp': 0}",/workspace/human_understanding_2025/user01-06/user01/1598759880/
1,user01,1598832660,all right,"{'mGps': 778, 'mAcc': 728, 'e4Bvp': 119, 'e4Acc': 119, 'mGyr': 723, 'mMag': 723, 'e4Eda': 119, 'e4Hr': 119, 'e4Temp': 119}",/workspace/human_understanding_2025/user01-06/user01/1598832660/
...,...,...,...,...,...
569,user30,1601079300,all right,"{'mGps': 772, 'mAcc': 772, 'e4Bvp': 770, 'e4Acc': 770, 'mGyr': 772, 'mMag': 772, 'e4Eda': 770, 'e4Hr': 770, 'e4Temp': 770}",/workspace/human_understanding_2025/user26-30/user30/1601079300/
570,user30,1601165700,all right,"{'mGps': 753, 'mAcc': 755, 'e4Bvp': 742, 'e4Acc': 742, 'mGyr': 755, 'mMag': 755, 'e4Eda': 742, 'e4Hr': 742, 'e4Temp': 580}",/workspace/human_understanding_2025/user26-30/user30/1601165700/



[1mMissing Feature from Folder Name[0m


Unnamed: 0,userId,Dt,missing feature
0,user01,1598759880,"[e4Bvp, e4Acc, e4Eda, e4Hr, e4Temp]"
1,user01,1600873320,"[mGps, e4Eda]"
...,...,...,...
95,user28,1600117200,"[e4Bvp, e4Acc, e4Eda, e4Hr, e4Temp]"
96,user29,1598762940,"[e4Bvp, e4Acc, e4Eda, e4Hr, e4Temp]"





#### 0.4 Matched Train Input CSV Path Dataframe

In [None]:
# Accumulators for this cell; several are filled only by later steps of the cell.
all_user_ids = []; all_user_Dts = []; all_user_dates = []
all_user_startPositions = []; all_user_endPositions = []
user_match_folder_dfs = []

user_features = []; user_missing_features = []
user_merge_label_Dt_dfs = []

user_merge_input_dfs = []

user_path_dfs = []

user_label_dfs_for_csv = []

# NOTE(review): the three per-user lists are paired purely by position, so they
# presumably hold the same users in the same order -- confirm.
for user_label_df, user_Dt_df, user_feature_df in zip(user_label_dfs, user_Dt_dfs, user_feature_dfs):
    # Attach the sleep-derived time window to every labelled date.
    user_merge_label_Dt_df = pd.merge(user_label_df, user_Dt_df, left_on='date', right_on='date', how='inner')
    user_merge_label_Dt_dfs.append(user_merge_label_Dt_df)

    user_ids = []; user_Dts = []; user_dates = []
    user_startPositions = []; user_endPositions = []

    labelled_windows = zip(user_merge_label_Dt_df['date'],
                           user_merge_label_Dt_df['startPosition'],
                           user_merge_label_Dt_df['endPosition'])
    for date, window_start, window_end in labelled_windows:
        startPosition = int(window_start)
        endPosition = int(window_end)

        # Keep every recording folder whose name, read as an integer, lies
        # inside this date's window (both ends inclusive).
        for user_id, user_Dt in zip(user_feature_df['userId'], user_feature_df['Dt']):
            if startPosition <= int(user_Dt) <= endPosition:
                user_ids.append(user_id)
                user_dates.append(date)
                user_Dts.append(user_Dt)
                user_startPositions.append(startPosition)
                user_endPositions.append(endPosition)

    user_match_folder_df = pd.DataFrame({"userId": user_ids, "date": user_dates,
                                         "Dt": user_Dts, "startPosition": user_startPositions,
                                         "endPosition": user_endPositions})
    user_match_folder_dfs.append(user_match_folder_df)


# For every user: join matched folders with their feature info, collect the CSV
# paths of the features used downstream, and keep the labels of dates with data.
for user_match_folder_df, user_merge_label_Dt_df, user_feature_df, user_label_df in zip(user_match_folder_dfs, user_merge_label_Dt_dfs, user_feature_dfs, user_label_dfs):
    # Both frames carry `date` and `endPosition`, so pandas suffixes them with _x / _y.
    user_merge_input_df = pd.merge(user_match_folder_df, user_merge_label_Dt_df, left_on='startPosition', right_on='startPosition', how='inner')
    user_merge_input_df = user_merge_input_df.sort_values("date_x")
    # Presumably removed so that the next merge can rename the left-hand `userId`
    # to `userId_x` without colliding with this existing column.
    del user_merge_input_df['userId_x']

    user_merge_input_df = pd.merge(user_merge_input_df, user_feature_df, left_on='Dt', right_on='Dt', how='inner')
    user_merge_input_df = user_merge_input_df[['userId_x', 'date_x', 'Dt', 'startPosition', 'endPosition_x', 'missing feature', 'feature', 'path']]
    user_merge_input_df.columns = ['userId', 'date', 'Dt', 'startPosition', 'endPosition', 'missing feature', 'feature', 'path']
    user_merge_input_df = user_merge_input_df.sort_values("date")
    user_merge_input_dfs.append(user_merge_input_df)

    dates = sorted(list(user_merge_input_df['date'].value_counts().keys()))
    # Assumes one distinct userId per frame; otherwise this list would be longer than `dates`.
    user_ids = list(user_merge_input_df['userId'].value_counts().keys()) * len(dates)
    user_folders = []; counts = []

    feature_names = ['e4Hr', 'mGps', 'mAcc']
    # feature name -> one entry per date, each entry a list (per folder) of CSV path lists.
    csv_lists_by_feature = {feature_name: [] for feature_name in feature_names}

    for date in dates:
        user_split_date_df = user_merge_input_df.loc[user_merge_input_df.date == date]
        user_folder = sorted(user_split_date_df['path'])
        counts.append(len(user_folder))
        user_folders.append(user_folder)

        for feature_name in feature_names:
            # glob.glob returns matches in arbitrary order. The CSV contents are
            # later concatenated and averaged in consecutive chunks, so the files
            # must be read in a deterministic order. File names appear to be
            # equal-width Unix timestamps (e.g. 1598874420.csv), so sorting the
            # paths gives chronological order -- confirm for the full dataset.
            csv_lists_by_feature[feature_name].append(
                [sorted(glob.glob(user_folder_name + feature_name + "/*.csv"))
                 for user_folder_name in user_folder]
            )

    user_path_df = pd.DataFrame({"userId": user_ids, "date": dates, "count": counts, "folder": user_folders,
                                 "e4Hr": csv_lists_by_feature['e4Hr'],
                                 "mGps": csv_lists_by_feature['mGps'],
                                 'mAcc': csv_lists_by_feature['mAcc']})
    user_path_dfs.append(user_path_df)

    # Keep only the labels of dates that have at least one matched folder.
    user_label_df = pd.merge(user_path_df, user_label_df, left_on='date', right_on='date', how='inner')
    user_label_df = user_label_df.sort_values("date")
    user_label_df = user_label_df[["userId_x", "date", "Q1", "Q2", "Q3", "S1", "S2", "S3", "S4"]]
    user_label_df.columns = ['userId', "date", "Q1", "Q2", "Q3", "S1", "S2", "S3", "S4"]
    user_label_dfs_for_csv.append(user_label_df)

# Stack the per-user results into single frames.
all_user_match_folder_df = pd.concat(user_match_folder_dfs, axis=0).reset_index(drop=True)
all_user_label_Dt_df = pd.concat(user_merge_label_Dt_dfs, axis=0).reset_index(drop=True)
all_user_input_info_df = pd.concat(user_merge_input_dfs, axis=0).reset_index(drop=True)
all_user_path_df = pd.concat(user_path_dfs, axis=0).reset_index(drop=True)
all_user_label_df = pd.concat(user_label_dfs_for_csv, axis=0).reset_index(drop=True)

# DataFrame.to_csv does not create missing parent directories, so make sure the
# (relative) output folder exists before writing.
os.makedirs("workspace/data", exist_ok=True)
all_user_input_info_df.to_csv("workspace/data/train_input_info.csv", index=False)
all_user_label_df.to_csv("workspace/data/train_label.csv", index=False)

pd.set_option('display.max_rows', 4)
pd.set_option('display.max_colwidth', None)
print(colored("Matched Folder", attrs=['bold']))
display(all_user_match_folder_df)
print()

print(colored("Train Input Information", attrs=['bold']))
display(all_user_input_info_df)
print()

# The path lists are long, so truncate cell contents for this table only.
pd.set_option('display.max_colwidth', 100)
print(colored("Train Input Feature CSV File Path", attrs=['bold']))
display(all_user_path_df)
print()

print(colored("Train Feature CSV File", attrs=['bold']))
print(colored("Example: e4Hr", attrs=['bold']))
pd.set_option('display.max_colwidth', None)
# Preview the first e4Hr CSV of the first folder of the first row, if there is
# one, without iterating over every path.
csv_e4Hr = all_user_path_df['e4Hr']
first_row_folders = csv_e4Hr.iloc[0] if len(csv_e4Hr) > 0 else []
first_folder_csvs = first_row_folders[0] if len(first_row_folders) > 0 else []
for csv in first_folder_csvs[:1]:
    user_feature_csv_example_df = pd.read_csv(csv)
    display(user_feature_csv_example_df)
    print()

print(colored("Train Label", attrs=['bold']))
display(all_user_label_df)

[1mMatched Folder[0m


Unnamed: 0,userId,date,Dt,startPosition,endPosition
0,user01,2020-08-31,1598832660,1598802240,1598838373
1,user01,2020-09-05,1599269580,1599185829,1599273253
...,...,...,...,...,...
466,user30,2020-09-24,1600816380,1600814550,1600896726
467,user30,2020-09-25,1600902660,1600900872,1600987323



[1mTrain Input Information[0m


Unnamed: 0,userId,date,Dt,startPosition,endPosition,missing feature,feature,path
0,user01,2020-08-31,1598832660,1598802240,1598838373,all right,"{'mGps': 778, 'mAcc': 728, 'e4Bvp': 119, 'e4Acc': 119, 'mGyr': 723, 'mMag': 723, 'e4Eda': 119, 'e4Hr': 119, 'e4Temp': 119}",/workspace/human_understanding_2025/user01-06/user01/1598832660/
1,user01,2020-09-05,1599269580,1599185829,1599273253,all right,"{'mGps': 698, 'mAcc': 677, 'e4Bvp': 689, 'e4Acc': 689, 'mGyr': 642, 'mMag': 642, 'e4Eda': 689, 'e4Hr': 688, 'e4Temp': 689}",/workspace/human_understanding_2025/user01-06/user01/1599269580/
...,...,...,...,...,...,...,...,...
466,user30,2020-09-24,1600816380,1600814550,1600896726,all right,"{'mGps': 840, 'mAcc': 841, 'e4Bvp': 829, 'e4Acc': 829, 'mGyr': 841, 'mMag': 841, 'e4Eda': 829, 'e4Hr': 828, 'e4Temp': 829}",/workspace/human_understanding_2025/user26-30/user30/1600816380/
467,user30,2020-09-25,1600902660,1600900872,1600987323,all right,"{'mGps': 753, 'mAcc': 740, 'e4Bvp': 743, 'e4Acc': 743, 'mGyr': 725, 'mMag': 754, 'e4Eda': 743, 'e4Hr': 743, 'e4Temp': 743}",/workspace/human_understanding_2025/user26-30/user30/1600902660/



[1mTrain Input Feature CSV File Path[0m


Unnamed: 0,userId,date,count,folder,e4Hr,mGps,mAcc
0,user01,2020-08-31,1,[/workspace/human_understanding_2025/user01-06/user01/1598832660/],"[[/workspace/human_understanding_2025/user01-06/user01/1598832660/e4Hr/1598874420.csv, /workspac...","[[/workspace/human_understanding_2025/user01-06/user01/1598832660/mGps/1598846940.csv, /workspac...","[[/workspace/human_understanding_2025/user01-06/user01/1598832660/mAcc/1598846940.csv, /workspac..."
1,user01,2020-09-05,1,[/workspace/human_understanding_2025/user01-06/user01/1599269580/],"[[/workspace/human_understanding_2025/user01-06/user01/1599269580/e4Hr/1599279120.csv, /workspac...","[[/workspace/human_understanding_2025/user01-06/user01/1599269580/mGps/1599279120.csv, /workspac...","[[/workspace/human_understanding_2025/user01-06/user01/1599269580/mAcc/1599279120.csv, /workspac..."
...,...,...,...,...,...,...,...
388,user30,2020-09-24,1,[/workspace/human_understanding_2025/user26-30/user30/1600816380/],"[[/workspace/human_understanding_2025/user26-30/user30/1600816380/e4Hr/1600820280.csv, /workspac...","[[/workspace/human_understanding_2025/user26-30/user30/1600816380/mGps/1600820280.csv, /workspac...","[[/workspace/human_understanding_2025/user26-30/user30/1600816380/mAcc/1600820280.csv, /workspac..."
389,user30,2020-09-25,1,[/workspace/human_understanding_2025/user26-30/user30/1600902660/],"[[/workspace/human_understanding_2025/user26-30/user30/1600902660/e4Hr/1600918980.csv, /workspac...","[[/workspace/human_understanding_2025/user26-30/user30/1600902660/mGps/1600918980.csv, /workspac...","[[/workspace/human_understanding_2025/user26-30/user30/1600902660/mAcc/1600918980.csv, /workspac..."



[1mTrain Feature CSV File[0m
[1mExample: e4Hr[0m


Unnamed: 0,timestamp,hr
0,0.0,86.53
1,1.0,86.75
...,...,...
58,58.0,103.02
59,59.0,103.05



[1mTrain Label[0m


Unnamed: 0,userId,date,Q1,Q2,Q3,S1,S2,S3,S4
0,user01,2020-08-31,0,0,0,0,1,1,1
1,user01,2020-09-05,1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...
388,user30,2020-09-24,1,0,1,0,1,1,1
389,user30,2020-09-25,0,0,1,0,1,1,0


### 0.5.0 wHr

In [26]:
def make_average_list(data, length):
    """Average `data` over consecutive chunks of `length` items.

    The last chunk may hold fewer than `length` items and is averaged on its
    own. An empty `data` gives an empty list.
    """
    chunk_starts = range(0, len(data), length)
    return [np.mean(data[start:start + length]) for start in chunk_starts]

# Build one down-sampled heart-rate sequence per (user, date) row of all_user_path_df.
wHr_lists = []; remove_index = []
csv_e4Hr = all_user_path_df['e4Hr']

with tqdm.tqdm(csv_e4Hr) as pbar:
    for idx, csv_nested_list in enumerate(pbar):
        # Collect the per-file chunk means with extend(); flattening afterwards
        # with sum(list_of_lists, []) copies the growing list on every step.
        wHr_date_values = []
        for csv_list in csv_nested_list:
            for csv in csv_list:
                hr_df = pd.read_csv(csv)
                # Rows with hr == 0 are discarded before averaging.
                hr_values = list(hr_df.loc[hr_df['hr'] != 0, 'hr'])
                wHr_date_values.extend(make_average_list(hr_values, 50))

        # Second pass: average the per-file chunk means again in chunks of 50.
        wHr_date_lists = make_average_list(wHr_date_values, 50)
        if len(wHr_date_lists) == 0:
            # No usable samples for this row: remember its position so it can be dropped.
            remove_index.append(idx)
        else:
            wHr_lists.append(wHr_date_lists)

# remove_index holds positions; they equal the index labels because
# all_user_path_df was built with reset_index(drop=True).
all_user_wHr_df = (
    all_user_path_df[['userId', 'date']]
    .drop(index=remove_index)
    .reset_index(drop=True)
)
all_user_wHr_df['hr'] = wHr_lists
all_user_wHr_df.to_csv("workspace/data/train_wHr_input.csv", index=False)

100%|██████████| 390/390 [02:43<00:00,  2.38it/s]


In [27]:
# Re-load the saved heart-rate inputs and labels and join them on their shared columns.
all_user_wHr_df = pd.read_csv("workspace/data/train_wHr_input.csv")
all_user_label_df = pd.read_csv("workspace/data/train_label.csv")
all_user_wHr_df = pd.merge(all_user_wHr_df, all_user_label_df)

all_user_wHr_df.to_csv("workspace/data/train_wHr_data.csv", index=False)

# Preview: head and tail only, long list cells truncated to 100 characters.
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 4)
print(colored("Train wHr", attrs=['bold']))
display(all_user_wHr_df)
print()

[1mTrain wHr[0m


Unnamed: 0,userId,date,hr,Q1,Q2,Q3,S1,S2,S3,S4
0,user01,2020-08-31,"[86.500984, 87.74435600000002, 86.43617274074074, 89.721256, 90.36156666666668]",0,0,0,0,1,1,1
1,user01,2020-09-05,"[87.484928, 85.21238755555554, 86.728824, 89.728356, 90.956332, 87.33840533333333, 84.615324, 84...",1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...
377,user30,2020-09-24,"[78.25824399999999, 80.19544, 77.76200399999999, 81.45235600000001, 78.67375200000001, 81.138328...",1,0,1,0,1,1,1
378,user30,2020-09-25,"[80.138268, 79.45635936507936, 81.50123199999999, 78.75182799999999, 76.837956, 80.69712, 80.258...",0,0,1,0,1,1,0





### 0.5.1 mGps

In [28]:
def make_average_list(data, length):
    """Average `data` over consecutive chunks of `length` items.

    The last chunk may hold fewer than `length` items and is averaged on its
    own. An empty `data` gives an empty list.
    """
    chunk_starts = range(0, len(data), length)
    return [np.mean(data[start:start + length]) for start in chunk_starts]

# Build one down-sampled latitude / longitude sequence per (user, date) row of all_user_path_df.
lat_lists = []; lon_lists = []; remove_index = []
csv_mGps = all_user_path_df['mGps']

with tqdm.tqdm(csv_mGps) as pbar:
    for idx, csv_nested_list in enumerate(pbar):
        # Gather the raw samples with extend(); flattening afterwards with
        # sum(list_of_lists, []) copies the growing list on every step.
        lat_values = []; lon_values = []
        for csv_list in csv_nested_list:
            for csv in csv_list:
                gps_df = pd.read_csv(csv)
                # Keep only rows where both coordinates are non-zero.
                gps_df = gps_df[(gps_df['lat'] != 0) & (gps_df['lon'] != 0)]
                lat_values.extend(gps_df['lat'].tolist())
                lon_values.extend(gps_df['lon'].tolist())

        lat_date_lists = make_average_list(lat_values, 50)
        lon_date_lists = make_average_list(lon_values, 50)

        if len(lat_date_lists) == 0 or len(lon_date_lists) == 0:
            # No usable samples for this row: remember its position so it can be dropped.
            remove_index.append(idx)
        else:
            lat_lists.append(lat_date_lists)
            lon_lists.append(lon_date_lists)

# remove_index holds positions; they equal the index labels because
# all_user_path_df was built with reset_index(drop=True).
all_user_mGps_df = (
    all_user_path_df[['userId', 'date']]
    .drop(index=remove_index)
    .reset_index(drop=True)
)
all_user_mGps_df['lat'] = lat_lists
all_user_mGps_df['lon'] = lon_lists
all_user_mGps_df.to_csv("workspace/data/train_mGps_input.csv", index=False)

100%|██████████| 390/390 [04:02<00:00,  1.61it/s]


In [29]:
# Re-load the saved GPS inputs and labels and join them on their shared columns.
all_user_mGps_df = pd.read_csv("workspace/data/train_mGps_input.csv")
all_user_label_df = pd.read_csv("workspace/data/train_label.csv")
all_user_mGps_df = all_user_mGps_df.merge(all_user_label_df)

all_user_mGps_df.to_csv("workspace/data/train_mGps_data.csv", index=False)

pd.set_option('display.max_rows', 4)
pd.set_option('display.max_colwidth', 100)
# The heading previously read "Train wHr", copied from the heart-rate cell.
print(colored("Train mGps", attrs=['bold']))
display(all_user_mGps_df)
print()

[1mTrain wHr[0m


Unnamed: 0,userId,date,lat,lon,Q1,Q2,Q3,S1,S2,S3,S4
0,user01,2020-08-31,"[37.428727606, 37.373258362, 37.45968430600001, 37.441553966, 37.439966222, 37.483896382000005, ...","[127.06283952599999, 127.035457894, 127.037780458, 126.981853632, 126.98031775800001, 127.013245...",0,0,0,0,1,1,1
1,user01,2020-09-05,"[37.483967086, 37.486892482, 37.484844412, 37.482705612, 37.48434709400001, 37.483589436, 37.485...","[127.01222557400003, 127.000313692, 126.996643608, 126.974013362, 127.00607272999997, 127.012433...",1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...
358,user30,2020-09-24,"[37.48705599, 37.48724637, 37.486589011999996, 37.487062484, 37.48705531, 37.487036232, 37.48686...","[126.89534209399999, 126.866566006, 126.88410588199999, 126.886156966, 126.89534959400002, 126.8...",1,0,1,0,1,1,1
359,user30,2020-09-25,"[37.487034374, 37.485593026, 37.487086408, 37.487099118, 37.48448434, 37.486945552, 37.487054703...","[126.876101378, 126.88212281200002, 126.895381506, 126.895346558, 126.871686934, 126.86689568999...",0,0,1,0,1,1,0





### 0.5.2 mAcc

In [31]:
def make_average_list(data, length):
    """Average `data` over consecutive chunks of `length` items.

    The last chunk may hold fewer than `length` items and is averaged on its
    own. An empty `data` gives an empty list.
    """
    chunk_starts = range(0, len(data), length)
    return [np.mean(data[start:start + length]) for start in chunk_starts]

def divide_df(l):
    """Split a frame into one sub-frame per user.

    Args:
        l: DataFrame with a 'userId' column.

    Returns:
        (nested_df, nested_lens): the per-user frames, ordered by sorted
        userId and each re-indexed from 0, and the row count of each frame.
    """
    nested_df = []
    for user in sorted(set(l['userId'])):
        # reset_index(drop=True) discards the old index outright. Materialising
        # it as a column and deleting 'index' afterwards fails when the index is
        # named or the frame already has an 'index' column.
        nested_df.append(l[l['userId'] == user].reset_index(drop=True))
    nested_lens = [len(user_df) for user_df in nested_df]
    return nested_df, nested_lens

# Per-user views of the labels, the (userId, date) keys and the mAcc CSV paths.
all_user_label_df = pd.read_csv("workspace/data/train_label.csv")
all_user_label_nested_df, nested_lens = divide_df(all_user_label_df)

all_user_mAcc_df = all_user_path_df[['userId', 'date']]
all_user_mAcc_nested_df, nested_lens = divide_df(all_user_mAcc_df)

csv_nested_mAcc, nested_lens = divide_df(all_user_path_df)
csv_nested_mAcc_path = [list(csv_mAcc['mAcc']) for csv_mAcc in csv_nested_mAcc]

# One output file per user. The loop variables have their own names so the
# global all_user_label_df / all_user_mAcc_df are not overwritten with a single
# user's rows, and idx stays an integer throughout.
for idx, (csv_mAcc_path, label_user_df, mAcc_user_df) in enumerate(zip(csv_nested_mAcc_path, all_user_label_nested_df, all_user_mAcc_nested_df)):
    x_lists = []; y_lists = []; z_lists = []; remove_index = []
    with tqdm.tqdm(csv_mAcc_path) as pbar:
        pbar.set_description(f'{idx + 1}/{len(all_user_mAcc_nested_df)}')
        for remove_idx, csv_nested_list in enumerate(pbar):
            # Gather the raw samples with extend(); flattening afterwards with
            # sum(list_of_lists, []) copies the growing list on every step.
            x_values = []; y_values = []; z_values = []
            for csv_list in csv_nested_list:
                for csv in csv_list:
                    acc_df = pd.read_csv(csv)
                    # Keep only rows where all three axes are non-zero.
                    acc_df = acc_df[(acc_df['x'] != 0) & (acc_df['y'] != 0) & (acc_df['z'] != 0)]
                    x_values.extend(acc_df['x'].tolist())
                    y_values.extend(acc_df['y'].tolist())
                    z_values.extend(acc_df['z'].tolist())

            x_date_lists = make_average_list(x_values, 50)
            y_date_lists = make_average_list(y_values, 50)
            z_date_lists = make_average_list(z_values, 50)

            if len(x_date_lists) == 0 or len(y_date_lists) == 0 or len(z_date_lists) == 0:
                # No usable samples for this date: remember its position within the user's frame.
                remove_index.append(remove_idx)
            else:
                x_lists.append(x_date_lists)
                y_lists.append(y_date_lists)
                z_lists.append(z_date_lists)

    # Per-user frames are re-indexed from 0 by divide_df, so positions equal index labels.
    mAcc_user_df = mAcc_user_df.drop(index=remove_index).reset_index(drop=True)
    mAcc_user_df['x'] = x_lists
    mAcc_user_df['y'] = y_lists
    mAcc_user_df['z'] = z_lists
    mAcc_user_df = mAcc_user_df.merge(label_user_df)
    # Files are numbered from 01 in iteration order, zero-padded to two digits.
    mAcc_user_df.to_csv(f"workspace/data/train_mAcc_data_{idx + 1:02d}.csv", index=False)

1/22: 100%|██████████| 10/10 [01:41<00:00, 10.12s/it]
2/22: 100%|██████████| 10/10 [03:08<00:00, 18.83s/it]
3/22: 100%|██████████| 7/7 [11:31<00:00, 98.72s/it] 
4/22: 100%|██████████| 24/24 [04:30<00:00, 11.28s/it]
5/22: 100%|██████████| 26/26 [00:23<00:00,  1.13it/s]
6/22: 100%|██████████| 26/26 [05:01<00:00, 11.59s/it]
7/22: 100%|██████████| 20/20 [07:13<00:00, 21.66s/it]
8/22: 100%|██████████| 10/10 [10:15<00:00, 61.58s/it]
9/22: 100%|██████████| 19/19 [04:28<00:00, 14.16s/it]
10/22: 100%|██████████| 19/19 [04:50<00:00, 15.30s/it]
11/22: 100%|██████████| 10/10 [15:30<00:00, 93.06s/it]
12/22: 100%|██████████| 18/18 [06:04<00:00, 20.24s/it]
13/22: 100%|██████████| 18/18 [03:52<00:00, 12.94s/it]
14/22: 100%|██████████| 16/16 [01:07<00:00,  4.21s/it]
15/22: 100%|██████████| 26/26 [05:29<00:00, 12.66s/it]
16/22: 100%|██████████| 27/27 [05:25<00:00, 12.06s/it]
17/22: 100%|██████████| 22/22 [00:00<00:00, 52.97it/s]
18/22: 100%|██████████| 20/20 [05:22<00:00, 16.12s/it]
19/22: 100%|████████

In [32]:
# Preview the first saved accelerometer file (head and tail, long cells truncated).
pd.set_option('display.max_colwidth', 100)
pd.set_option('display.max_rows', 4)
all_user_mAcc_df = pd.read_csv("workspace/data/train_mAcc_data_01.csv")
print(colored("Train mAcc", attrs=['bold']))
display(all_user_mAcc_df)
print()

[1mTrain mAcc[0m


Unnamed: 0,userId,date,x,y,z,Q1,Q2,Q3,S1,S2,S3,S4
0,user01,2020-08-31,"[0.027773587904000002, 0.031508656592, 0.029593237011999996, 0.028779183128, 0.027821473508, 0.0...","[0.1482056118, 0.1452845969, 0.14499728316000002, 0.14878023720000003, 0.1552926648, 0.141166443...","[9.72238382, 9.7270767, 9.72482599, 9.723437259999999, 9.725256889999999, 9.7290878, 9.72808216,...",0,0,0,0,1,1,1
1,user01,2020-09-05,"[-2.215039396, -2.22849521, -2.230362774, -2.3605634220000002, -2.307027428, -2.275997646, -2.28...","[-9.41017044, -9.413666000000001, -9.39398513, -9.344663109999999, -9.37176624, -9.37574079, -9....","[1.3032516680000001, 1.25627598, 1.3568355300000001, 1.492830336, 1.3840344999999998, 1.39720301...",1,0,1,1,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...
8,user01,2020-09-24,"[0.22300275619999999, 0.22012962780000003, 0.22429566579999996, 0.22405623859999999, 0.222954872...","[8.629349619999998, 8.62762565, 8.62958903, 8.63069033, 8.628248150000001, 8.62920587, 8.6280087...","[4.366630546, 4.36361375, 4.365433417999999, 4.365912302, 4.3645236, 4.367827694, 4.363470104, 4...",0,0,1,0,1,1,1
9,user01,2020-09-26,"[-0.509070712, -0.6214579671999999, -0.5076341472000001, 0.6235649328799999, 0.37829541972, -0.8...","[-9.811929739999998, -9.79770775, -9.81580837, -9.812216990000001, -9.76394851, -9.6632931199999...","[-1.0088516412, -1.0296818282, -0.8390017803599998, -0.7917387918799998, -1.5652811028, -1.93534...",0,0,1,0,1,0,1





### 1.0.0 Validation & Test Dataset

### 1.1.0 Validation & Test Label

In [None]:
# Load the validation and test label files and give both the same column names.
val_label_path = "workspace/human_understanding_2024/val_label.csv"
test_label_path = "workspace/human_understanding_2024/answer_sample.csv"

df_val_label = pd.read_csv(val_label_path)
df_test_label = pd.read_csv(test_label_path)

titled_label_frames = [("Validation Label", df_val_label),
                       ("Test Label", df_test_label)]
for title, label_frame in titled_label_frames:
    print(colored(title, attrs=['bold']))
    pd.set_option('display.max_rows', 4)
    # Positional rename: both files are expected to have exactly nine columns.
    label_frame.columns = ['userId', 'date', 'Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']
    display(label_frame)
    print()

### 1.2.0 wHr

스마트워치(갤럭시 워치)에서 측정된 심박 데이터. 1초 간격으로 측정됨.

In [None]:
def make_average_list(data, length):
    """Return the mean of each consecutive, non-overlapping window of `data`.

    Args:
        data: Sliceable sequence of numbers (this notebook passes plain lists).
        length: Window size. The final window is shorter when len(data) is not
            a multiple of it.

    Returns:
        A list holding one np.mean result per window, in order; empty when
        `data` is empty.
    """
    window_starts = range(0, len(data), length)
    return [np.mean(data[start:start + length]) for start in window_starts]

# Absolute data root, matching train_label.csv and the /workspace/data outputs; the
# earlier relative "workspace/..." paths resolved there only when the kernel ran from "/".
val_dataset_path = "/workspace/human_understanding_2024/val_dataset"
test_dataset_path = "/workspace/human_understanding_2024/test_dataset"

valid_file_name = "ch2024_val__w_heart_rate.parquet.gzip"
test_file_name = "ch2024_test_w_heart_rate.parquet.gzip"

val_label_path = "/workspace/human_understanding_2024/val_label.csv"
test_label_path = "/workspace/human_understanding_2024/answer_sample.csv"

HR_WINDOW = 50  # number of consecutive samples averaged into one value
LABEL_COLUMNS = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']


def _load_heart_rate(parquet_path):
    """Load one heart-rate parquet file as (userId, date, timestamp, hr) rows.

    Rows whose heart_rate is 0 are discarded, and the timestamp string is split
    at its single space into a date part and a time part.
    """
    samples = pd.read_parquet(parquet_path)
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    samples = samples[samples['heart_rate'] != 0].copy()
    samples["timestamp"] = samples["timestamp"].astype(str)
    samples[['date', 'timestamp']] = samples['timestamp'].str.split(' ', expand=True)
    samples = samples[['subject_id', "date", "timestamp", "heart_rate"]]
    samples.columns = ['userId', "date", "timestamp", "hr"]
    return samples


def _daily_heart_rate(samples, description):
    """Collapse samples into one row per (userId, date) with windowed hr means.

    Users and dates are visited in sorted order; within a day the samples keep
    the order they have in `samples`.
    """
    records = []
    with tqdm.tqdm(sorted(set(samples['userId']))) as pbar:
        pbar.set_description(description)
        for user in pbar:
            user_samples = samples[samples['userId'] == user]
            for date, day_samples in user_samples.groupby('date', sort=True):
                windows = make_average_list(list(day_samples['hr']), HR_WINDOW)
                if windows:
                    records.append({"userId": user, "date": date, "hr": windows})
    return pd.DataFrame(records, columns=["userId", "date", "hr"])


def _attach_labels(features, labels):
    """Add the Q*/S* label columns to `features`, matched on (user, date).

    Matching by key instead of row position keeps every feature row paired with
    its own label even when the two tables differ in length or order. Feature
    rows without a label are dropped (and reported); keys are compared as
    strings so int/str id differences between the files do not matter.
    Defined inside this cell so the cell can run on its own.
    """
    key_cols = ['_user_key', '_date_key']
    keyed_features = features.assign(
        _user_key=features['userId'].astype(str),
        _date_key=features['date'].astype(str),
    )
    keyed_labels = labels.assign(
        _user_key=labels['subject_id'].astype(str),
        _date_key=labels['date'].astype(str),
    )[key_cols + LABEL_COLUMNS]
    # one_to_one: fail loudly if either side repeats a (user, date) pair.
    merged = keyed_features.merge(keyed_labels, on=key_cols, how='inner', validate='one_to_one')
    unlabeled_days = len(features) - len(merged)
    if unlabeled_days:
        print(f"Dropped {unlabeled_days} day(s) that have no matching label row")
    return merged.drop(columns=key_cols)


df_val_label = pd.read_csv(val_label_path)
df_test_label = pd.read_csv(test_label_path)

valid_hrate_df = _load_heart_rate(os.path.join(val_dataset_path, valid_file_name))
test_hrate_df = _load_heart_rate(os.path.join(test_dataset_path, test_file_name))

valid_daily_hrate_df = _daily_heart_rate(valid_hrate_df, 'Validation Data')
test_daily_hrate_df = _daily_heart_rate(test_hrate_df, 'Test Data')

print()

user_valid_hrate_df = _attach_labels(valid_daily_hrate_df, df_val_label)
user_valid_hrate_df.to_csv("/workspace/data/valid_wHr_data.csv", index=False)

user_test_hrate_df = _attach_labels(test_daily_hrate_df, df_test_label)
user_test_hrate_df.to_csv("/workspace/data/test_wHr_data.csv", index=False)

pd.set_option('display.max_rows', 4)

# Per-sample frames: show cell contents untruncated.
pd.set_option('display.max_colwidth', None)

print(colored("Valid wHr", attrs=['bold']))
display(valid_hrate_df)
print()

print(colored("Test wHr", attrs=['bold']))
display(test_hrate_df)
print()

# Per-day frames hold long lists, so cap the rendered cell width.
pd.set_option('display.max_colwidth', 100)

print(colored("Valid wHr", attrs=['bold']))
display(user_valid_hrate_df)
print()

print(colored("Test wHr", attrs=['bold']))
display(user_test_hrate_df)
print()

### 1.3.0 mGps

스마트폰에서 산출된 GPS 좌표 정보(단, 위도 및 경도는 상대 좌표로 변환됨). 5초 간격(1분당 약 12회)으로 측정됨.

In [None]:
def make_average_list(data, length):
    """Return the mean of each consecutive, non-overlapping window of `data`.

    Args:
        data: Sliceable sequence of numbers (this notebook passes plain lists).
        length: Window size. The final window is shorter when len(data) is not
            a multiple of it.

    Returns:
        A list holding one np.mean result per window, in order; empty when
        `data` is empty.
    """
    window_starts = range(0, len(data), length)
    return [np.mean(data[start:start + length]) for start in window_starts]

# Absolute data root, matching train_label.csv and the /workspace/data outputs; the
# earlier relative "workspace/..." paths resolved there only when the kernel ran from "/".
val_dataset_path = "/workspace/human_understanding_2024/val_dataset"
test_dataset_path = "/workspace/human_understanding_2024/test_dataset"

valid_file_name = "ch2024_val__m_gps.parquet.gzip"
test_file_name = "ch2024_test_m_gps.parquet.gzip"

val_label_path = "/workspace/human_understanding_2024/val_label.csv"
test_label_path = "/workspace/human_understanding_2024/answer_sample.csv"

GPS_WINDOW = 50  # number of consecutive samples averaged into one value
LABEL_COLUMNS = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']


def _load_gps(parquet_path):
    """Load one GPS parquet file as (userId, date, timestamp, lat, lon) rows.

    Rows with a zero latitude or a zero longitude are discarded, and the
    timestamp string is split at its single space into date and time parts.
    """
    samples = pd.read_parquet(parquet_path)
    # .copy() so the column assignments below write to an independent frame
    # instead of a filtered view (avoids SettingWithCopyWarning).
    samples = samples[(samples['latitude'] != 0) & (samples['longitude'] != 0)].copy()
    samples["timestamp"] = samples["timestamp"].astype(str)
    samples[['date', 'timestamp']] = samples['timestamp'].str.split(' ', expand=True)
    samples = samples[['subject_id', "date", "timestamp", "latitude", "longitude"]]
    samples.columns = ['userId', "date", "timestamp", "lat", "lon"]
    return samples


def _daily_gps(samples, description):
    """Collapse samples into one row per (userId, date) with windowed lat/lon means.

    Users and dates are visited in sorted order. Each day's samples are windowed
    in the order they appear in `samples`: grouping replaces the earlier
    sort_values("date"), whose default sort is not guaranteed stable and could
    therefore shuffle samples inside a day before averaging.
    NOTE(review): presumably the file order is chronological -- confirm.
    """
    records = []
    with tqdm.tqdm(sorted(set(samples['userId']))) as pbar:
        pbar.set_description(description)
        for user in pbar:
            user_samples = samples[samples['userId'] == user]
            for date, day_samples in user_samples.groupby('date', sort=True):
                lat_windows = make_average_list(list(day_samples['lat']), GPS_WINDOW)
                lon_windows = make_average_list(list(day_samples['lon']), GPS_WINDOW)
                if lat_windows and lon_windows:
                    records.append({"userId": user, "date": date,
                                    "lat": lat_windows, "lon": lon_windows})
    return pd.DataFrame(records, columns=["userId", "date", "lat", "lon"])


def _attach_labels(features, labels):
    """Add the Q*/S* label columns to `features`, matched on (user, date).

    Replaces the positional matching that compared zip-truncated name lists
    inside a bare `except:`: that approach never inspected label rows beyond
    the shorter table and hid any unrelated error. Feature rows without a label
    are dropped (and reported); label rows without data are simply not used.
    Keys are compared as strings so int/str id differences do not matter.
    Defined inside this cell so the cell can run on its own.
    """
    key_cols = ['_user_key', '_date_key']
    keyed_features = features.assign(
        _user_key=features['userId'].astype(str),
        _date_key=features['date'].astype(str),
    )
    keyed_labels = labels.assign(
        _user_key=labels['subject_id'].astype(str),
        _date_key=labels['date'].astype(str),
    )[key_cols + LABEL_COLUMNS]
    # one_to_one: fail loudly if either side repeats a (user, date) pair.
    merged = keyed_features.merge(keyed_labels, on=key_cols, how='inner', validate='one_to_one')
    unlabeled_days = len(features) - len(merged)
    if unlabeled_days:
        print(f"Dropped {unlabeled_days} day(s) that have no matching label row")
    return merged.drop(columns=key_cols)


df_val_label = pd.read_csv(val_label_path)
df_test_label = pd.read_csv(test_label_path)

valid_mGps_df = _load_gps(os.path.join(val_dataset_path, valid_file_name))
test_mGps_df = _load_gps(os.path.join(test_dataset_path, test_file_name))

valid_daily_mGps_df = _daily_gps(valid_mGps_df, 'Validation Data')
test_daily_mGps_df = _daily_gps(test_mGps_df, 'Test Data')

print()

user_valid_mGps_df = _attach_labels(valid_daily_mGps_df, df_val_label)
user_valid_mGps_df.to_csv("/workspace/data/valid_mGps_data.csv", index=False)

user_test_mGps_df = _attach_labels(test_daily_mGps_df, df_test_label)
user_test_mGps_df.to_csv("/workspace/data/test_mGps_data.csv", index=False)

pd.set_option('display.max_rows', 4)

# Per-sample frames: show cell contents untruncated.
pd.set_option('display.max_colwidth', None)

print(colored("Valid mGps", attrs=['bold']))
display(valid_mGps_df)
print()

print(colored("Test mGps", attrs=['bold']))
display(test_mGps_df)
print()

# Per-day frames hold long lists, so cap the rendered cell width.
pd.set_option('display.max_colwidth', 100)

print(colored("Valid mGps", attrs=['bold']))
display(user_valid_mGps_df)
print()

print(colored("Test mGps", attrs=['bold']))
display(user_test_mGps_df)
print()

### 1.4.0 mAcc

스마트폰의 가속도 센서 데이터. 1초당 약 50회씩(50Hz) 측정됨.

In [None]:
def make_average_list(data, length):
    """Return the mean of each consecutive, non-overlapping window of `data`.

    Args:
        data: Sliceable sequence of numbers (this notebook passes plain lists).
        length: Window size. The final window is shorter when len(data) is not
            a multiple of it.

    Returns:
        A list holding one np.mean result per window, in order; empty when
        `data` is empty.
    """
    window_starts = range(0, len(data), length)
    return [np.mean(data[start:start + length]) for start in window_starts]

def divide_date_df(l):
    """Split a label frame into one frame per subject.

    Args:
        l: DataFrame with a 'subject_id' column.

    Returns:
        (frames, lengths): `frames` holds one DataFrame per distinct subject_id,
        ordered by sorted subject_id, each with a fresh 0..n-1 index and the
        same columns as `l`; `lengths` holds the row count of each frame.
    """
    # reset_index(drop=True) discards the old index directly. The previous
    # reset_index() + del ['index'] raised KeyError when the index was named
    # and removed a genuine column when the frame already had one called 'index'.
    per_subject = [
        l[l['subject_id'] == userid].reset_index(drop=True)
        for userid in sorted(set(l['subject_id']))
    ]
    return per_subject, [len(frame) for frame in per_subject]

# Absolute data root, matching train_label.csv and the /workspace/data outputs; the
# earlier relative "workspace/..." paths resolved there only when the kernel ran from "/".
val_dataset_path = "/workspace/human_understanding_2024/val_dataset"
test_dataset_path = "/workspace/human_understanding_2024/test_dataset"

valid_file_names = ["ch2024_val__m_acc_part_1.parquet.gzip",
                    "ch2024_val__m_acc_part_2.parquet.gzip",
                    "ch2024_val__m_acc_part_3.parquet.gzip",
                    "ch2024_val__m_acc_part_4.parquet.gzip"]
test_file_names = ["ch2024_test__m_acc_part_5.parquet.gzip",
                   "ch2024_test__m_acc_part_6.parquet.gzip",
                   "ch2024_test__m_acc_part_7.parquet.gzip",
                   "ch2024_test__m_acc_part_8.parquet.gzip"]

val_label_path = "/workspace/human_understanding_2024/val_label.csv"
test_label_path = "/workspace/human_understanding_2024/answer_sample.csv"

ACC_WINDOW = 50  # number of consecutive samples averaged into one value
LABEL_COLUMNS = ['Q1', 'Q2', 'Q3', 'S1', 'S2', 'S3', 'S4']


def _load_acc(parquet_path):
    """Load one accelerometer part file.

    Returns:
        (raw, samples): `raw` is the file as read; `samples` keeps only rows
        where x, y and z are all non-zero, with the timestamp reduced to its
        calendar date and the columns renamed to userId/date/x/y/z.
    """
    raw = pd.read_parquet(parquet_path)
    samples = raw[(raw['x'] != 0) & (raw['y'] != 0) & (raw['z'] != 0)].copy()
    samples['timestamp'] = pd.to_datetime(samples['timestamp'], format='%Y-%m-%d %H:%M:%S.%f').dt.date
    # Positional rename: assumes the file's columns are ordered id, timestamp, x, y, z.
    samples.columns = ['userId', "date", "x", "y", "z"]
    return raw, samples


def _daily_acc(samples, description):
    """Collapse one part file into one row per date with windowed x/y/z means.

    Every row is attributed to the userId of the file's first sample, i.e. the
    file is treated as belonging to a single user. Dates are visited in sorted
    order and each day's samples keep the order they have in `samples`.
    """
    user = samples["userId"].iloc[0]
    # A single groupby pass instead of re-filtering the whole frame for every date.
    by_day = samples.groupby('date', sort=True)
    records = []
    with tqdm.tqdm(by_day, total=by_day.ngroups) as pbar:
        pbar.set_description(description)
        for date, day_samples in pbar:
            x_windows = make_average_list(list(day_samples['x']), ACC_WINDOW)
            y_windows = make_average_list(list(day_samples['y']), ACC_WINDOW)
            z_windows = make_average_list(list(day_samples['z']), ACC_WINDOW)
            if x_windows and y_windows and z_windows:
                records.append({"userId": user, "date": date,
                                "x": x_windows, "y": y_windows, "z": z_windows})
    return pd.DataFrame(records, columns=["userId", "date", "x", "y", "z"])


def _attach_labels(features, labels):
    """Add the Q*/S* label columns to `features`, matched on (user, date).

    Replaces the positional matching that compared zip-truncated name lists
    inside a bare `except:`: that approach never inspected label rows beyond
    the shorter table and hid any unrelated error. Feature rows without a label
    are dropped (and reported); label rows without data are simply not used.
    Keys are compared as strings so int/str ids and date objects vs. date
    strings line up. Defined inside this cell so the cell can run on its own.
    """
    key_cols = ['_user_key', '_date_key']
    keyed_features = features.assign(
        _user_key=features['userId'].astype(str),
        _date_key=features['date'].astype(str),
    )
    keyed_labels = labels.assign(
        _user_key=labels['subject_id'].astype(str),
        _date_key=labels['date'].astype(str),
    )[key_cols + LABEL_COLUMNS]
    # one_to_one: fail loudly if either side repeats a (user, date) pair.
    merged = keyed_features.merge(keyed_labels, on=key_cols, how='inner', validate='one_to_one')
    unlabeled_days = len(features) - len(merged)
    if unlabeled_days:
        print(f"Dropped {unlabeled_days} day(s) that have no matching label row")
    return merged.drop(columns=key_cols)


df_val_label = pd.read_csv(val_label_path)
df_test_label = pd.read_csv(test_label_path)

df_val_labels, nested_val_lens = divide_date_df(df_val_label)
df_test_labels, nested_test_lens = divide_date_df(df_test_label)

# Labelled per-day frames, one entry per processed part file.
valid_mAcc_dfs = []
test_mAcc_dfs = []

# NOTE(review): pairing by position assumes the i-th part file holds the i-th
# subject in sorted subject_id order -- confirm against the dataset layout.
# Loop names are distinct from df_val_label / df_test_label so the full label
# frames are not overwritten while iterating.
parts = zip(valid_file_names, test_file_names, df_val_labels, df_test_labels)
for part_idx, (valid_file_name, test_file_name, val_user_labels, test_user_labels) in enumerate(parts, start=1):
    user_valid_mAcc_df, valid_samples = _load_acc(os.path.join(val_dataset_path, valid_file_name))
    user_test_mAcc_df, test_samples = _load_acc(os.path.join(test_dataset_path, test_file_name))

    valid_daily_mAcc_df = _daily_acc(valid_samples, f"Valid {part_idx}/{len(valid_file_names)}")
    valid_mAcc_df = _attach_labels(valid_daily_mAcc_df, val_user_labels)
    # :02d zero-pads every index below 10; the old "< 9" test left index 9 unpadded.
    valid_mAcc_df.to_csv(f"/workspace/data/valid_mAcc_data_{part_idx:02d}.csv", index=False)
    valid_mAcc_dfs.append(valid_mAcc_df)

    test_daily_mAcc_df = _daily_acc(test_samples, f"Test {part_idx}/{len(test_file_names)}")
    test_mAcc_df = _attach_labels(test_daily_mAcc_df, test_user_labels)
    test_mAcc_df.to_csv(f"/workspace/data/test_mAcc_data_{part_idx:02d}.csv", index=False)
    test_mAcc_dfs.append(test_mAcc_df)

# The frames shown below are those of the last part processed in the loop.
pd.set_option('display.max_rows', 4)

# Raw per-sample frames: show cell contents untruncated.
pd.set_option('display.max_colwidth', None)

print(colored("Valid mAcc", attrs=['bold']))
display(user_valid_mAcc_df)
print()

print(colored("Test mAcc", attrs=['bold']))
display(user_test_mAcc_df)
print()

# Per-day frames hold long lists, so cap the rendered cell width.
pd.set_option('display.max_colwidth', 100)

print(colored("Valid mAcc", attrs=['bold']))
display(valid_mAcc_df)
print()

print(colored("Test mAcc", attrs=['bold']))
display(test_mAcc_df)
print()

## Reference

<b>AIFactory Competition</b>
<br>[제3회 ETRI 휴먼이해 인공지능 논문경진대회](https://aifactory.space/task/2790/overview)

<b>ETRI</b>
<br>[ETRI Lifelog Dataset 2020 (English)](https://nanum.etri.re.kr/share/schung1/ETRILifelogDataset2020?lang=En_us)
<br>[ETRI Lifelog Dataset 2020 (Korean)](https://nanum.etri.re.kr/share/schung1/ETRILifelogDataset2020?lang=ko_KR)

<br><b>Paper</b>
<br>[Human Understanding AI Paper Challenge 2024 - Dataset Design](https://arxiv.org/abs/2403.16509)
<br>[라이프로그 기반 일상생활 활동유형에 대한 탐색적 연구]()
<br>[Real-world multimodal lifelog dataset for human behavior study](https://kiss.kstudy.com/Detail/Ar?key=3860737)
<br>[An empirical study on finding experience sampling
parameters to explain sleep quality based on
dimension reduction](https://www.researchgate.net/publication/338365671_An_empirical_study_on_finding_experience_sampling_parameters_to_explain_sleep_quality_based_on_dimension_reduction)
<br>[Assessing Sleep Quality Using Mobile EMAs: Opportunities, Practical Consideration, and Challenges](https://ieeexplore.ieee.org/document/9667514)
<br>[Finding Points-of-Interest (PoIs) from Life-logging and Location Trace Data](https://ieeexplore.ieee.org/document/8940021)
<br>[Sensor Data Acquisition and Multimodal Sensor Fusion for Human Activity Recognition Using Deep Learning](https://www.mdpi.com/1424-8220/19/7/1716)
<br>[SPER: Stay-Point Extraction considering Revisits in a Single Trajectory](https://ieeexplore.ieee.org/document/9621139)
<br>[Population Based Training of Neural Networks](https://arxiv.org/abs/1711.09846)