In [1]:
import pandas as pd

# Load Train data
train_df = pd.read_csv('../data/TrainingDataPD25/TrainActivities.csv')
train_df.head()

Unnamed: 0,ID,Activity Type ID,Activity Type,Started,Finished,Updated,Subject
0,1130251,2806,1 (FACING camera) Sit and stand,2024/09/02 6:16,2024/09/02 6:16,2024/09/02 6:16,U22
1,1130254,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024/09/02 6:17,2024/09/02 6:17,2024/09/02 6:17,U22
2,1130257,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024/09/02 6:18,2024/09/02 6:18,2024/09/02 6:18,U22
3,1130261,2806,1 (FACING camera) Sit and stand,2024/09/02 6:20,2024/09/02 6:20,2024/09/02 6:20,U22
4,1130292,2806,1 (FACING camera) Sit and stand,2024/09/02 6:42,2024/09/02 6:42,2024/09/02 6:42,U2


In [2]:
# convert Started, Finished and Updated columns to datetime format
train_df['Started'] = pd.to_datetime(train_df['Started'])
train_df['Finished'] = pd.to_datetime(train_df['Finished'])
train_df['Updated'] = pd.to_datetime(train_df['Updated'])

In [3]:
# convert the timezone to JST
train_df['Started'] = train_df['Started'].dt.tz_localize('UTC').dt.tz_convert('Asia/Tokyo')
train_df['Finished'] = train_df['Finished'].dt.tz_localize('UTC').dt.tz_convert('Asia/Tokyo')
train_df['Updated'] = train_df['Updated'].dt.tz_localize('UTC').dt.tz_convert('Asia/Tokyo')

In [4]:
# cleaning train_df
train_df = train_df.dropna()

In [5]:
# combine all dataframes of each user 
import os
import glob

base_dir = '/Users/tptn/MachineLearning/tremor_challenge2/data/TrainingDataPD25/users_timeXYZ/users'

headers = ['id', 'datetime', 'x', 'y', 'z']
df_list = []
for user in os.listdir(base_dir):
    path = os.path.join(base_dir, user)

    if os.path.isdir(path):
        files = glob.glob(os.path.join(path, '*.csv'))

        for f in files:
            df = pd.read_csv(f, header=None, names=headers)
            df['datetime'] = pd.to_datetime(df['datetime'], format='%Y-%m-%dT%H:%M:%S.%f%z')
            if df['datetime'].dt.tz != 'UTC+09:00': # convert timezone if it not in JST
                df['datetime'] = df['datetime'].dt.tz_convert('Asia/Tokyo')
            df_list.append(df)
            
users_data = pd.concat(df_list, ignore_index=True)

In [6]:
users_data.shape
users_data.dtypes
# there are no null values in the dataframes

id                               int64
datetime    datetime64[ns, Asia/Tokyo]
x                              float64
y                              float64
z                              float64
dtype: object

In [7]:
# train_df['Activity Type'].value_counts().index
# train_df['Activity Type'].unique()

In [8]:
ACT_LABELS = list(train_df['Activity Type'].unique())
# print(ACT_LABELS)
USERS = list(users_data['id'].unique())
# print(USERS)

In [9]:
# Split accelerometer data per 1 sample (=Activity at one time)
"""
Make Segment.
A segment is a group of continuous sensor data for which feature calculation (statistic calculation) is performed in human activity recognition using sensor data.
"""
seg_label_list = [] # segment's (1 sample's) label list (label mean "activity_type_id", "user_id" etc.).
seg_list = [] # segment's (1 sample's) accelerometer data list.

for i in list(train_df.index):
    started_at = train_df.loc[i, 'Started']
    finished_at = train_df.loc[i, 'Finished']
    seg = users_data[(users_data["datetime"] >=started_at) & (users_data["datetime"] <= finished_at)]
    seg_label = train_df.loc[i, "Activity Type"]
    if (len(seg)!=0):
        seg_list.append(seg)
        seg_label_list.append(seg_label)

seg_list[0].head()

Unnamed: 0,id,datetime,x,y,z
1343454,2807,2024-09-06 06:25:28.643000+09:00,2.452,-0.986,9.768
1343455,2807,2024-09-06 06:25:28.802000+09:00,2.433,-0.991,9.773
1343456,2807,2024-09-06 06:25:28.965000+09:00,2.449,-0.989,9.764
1343457,2807,2024-09-06 06:25:29.122000+09:00,2.428,-0.986,9.756
1343458,2807,2024-09-06 06:25:29.282000+09:00,2.425,-0.989,9.771


# Feature Extraction

In [10]:
# Extract start and end date of the whole training dataset
date_start = train_df['Started'].min()
date_finish = train_df['Finished'].max()
print(date_start, date_finish)

2024-09-02 15:16:00+09:00 2024-09-11 14:11:00+09:00


From the output above it looks like that the training dataset is conducted only 9 days from 2024-09-02 to 2024-09-11.

In [11]:
"""
Create hourly data frames from start date to end date
Weekday will be used as feature
"""
import datetime
time = date_start
time_list = []
while time <= date_finish:
    for hour in range (0, 24):
        year_month_date_hour = str(time.date()) + '-' + str(hour).zfill(2)
        time_list.append([year_month_date_hour, hour, time, time.weekday()])
    time = time + datetime.timedelta(days=1)

cols = ['year-month-date', 'hour', 'start', 'weekday']
df_y = pd.DataFrame(time_list, columns=cols)
del time_list, time

In [12]:
df_y.head()

Unnamed: 0,year-month-date,hour,start,weekday
0,2024-09-02-00,0,2024-09-02 15:16:00+09:00,0
1,2024-09-02-01,1,2024-09-02 15:16:00+09:00,0
2,2024-09-02-02,2,2024-09-02 15:16:00+09:00,0
3,2024-09-02-03,3,2024-09-02 15:16:00+09:00,0
4,2024-09-02-04,4,2024-09-02 15:16:00+09:00,0


In [None]:
# add year-month-date-hour column to train_df
train_df['year-month-date-hour'] = train_df['Started'].dt.date.astype(str) + '-' + train_df['Started'].dt.hour.astype(str)

In [None]:
train_cp = train_df.copy()
feat = train_cp.groupby([])

KeyError: "['id'] not in index"