# Load data & Time conversion

- The acceleration data contains timestamps in multiple timezones, so we convert them all to 'UTC+09:00' for consistency.
- The activity data was recorded in the 'UTC+09:00' timezone, but the timestamps carry no UTC offset. So we localize those datetimes to the 'UTC+09:00' timezone.

In [9]:
# load activity data
import pandas as pd

# Relative path to the activity-label CSV (despite the name, this is a file, not a directory).
act_dir = '../data/TrainingDataPD25/TrainActivities.csv'

def read_act_df(base_dir, time_cols=('Started', 'Finished', 'Updated')):
    """Load the activity CSV and make its time columns timezone-aware.

    Rows with a missing value in any of `time_cols` are dropped. The remaining
    values are parsed with the format '%Y/%m/%d %H:%M' and localized to
    'UTC+09:00' (the timestamps in the file carry no offset).

    Returns:
        DataFrame: the activity data with tz-aware `time_cols`.
    Args:
        base_dir: path to the activity CSV file (a file path, despite the name).
        time_cols: names of the columns holding timestamps.
    Raises:
        ValueError: if a value in one of `time_cols` does not match the format.
    """
    time_cols = list(time_cols)

    # .copy() so the column assignments below work on an independent frame
    # rather than on the result of dropna().
    act_df = pd.read_csv(base_dir).dropna(subset=time_cols).copy()

    for col in time_cols:
        # errors='coerce' turns unparseable values into NaT so that the check
        # below is reachable and reports which column is at fault.
        parsed = pd.to_datetime(act_df[col], format='%Y/%m/%d %H:%M', errors='coerce')
        if parsed.isna().any():
            raise ValueError(f"Column '{col}' contains NaN or unparseable datetime values.")
        act_df[col] = parsed.dt.tz_localize('UTC+09:00')

    return act_df

# Load the activity labels and preview the first 10 rows.
act_df = read_act_df(act_dir)
act_df.head(10)

Unnamed: 0,ID,Activity Type ID,Activity Type,Started,Finished,Updated,Subject
0,1130251,2806,1 (FACING camera) Sit and stand,2024-09-02 06:16:00+09:00,2024-09-02 06:16:00+09:00,2024-09-02 06:16:00+09:00,U22
1,1130254,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:17:00+09:00,2024-09-02 06:17:00+09:00,2024-09-02 06:17:00+09:00,U22
2,1130257,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:18:00+09:00,2024-09-02 06:18:00+09:00,2024-09-02 06:18:00+09:00,U22
3,1130261,2806,1 (FACING camera) Sit and stand,2024-09-02 06:20:00+09:00,2024-09-02 06:20:00+09:00,2024-09-02 06:20:00+09:00,U22
4,1130292,2806,1 (FACING camera) Sit and stand,2024-09-02 06:42:00+09:00,2024-09-02 06:42:00+09:00,2024-09-02 06:42:00+09:00,U2
5,1130293,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-02 06:42:00+09:00,2024-09-02 06:42:00+09:00,2024-09-02 06:42:00+09:00,U2
6,1130294,2808,3 Stand up from chair - both hands with SHAKING,2024-09-02 06:42:00+09:00,2024-09-02 06:42:00+09:00,2024-09-02 06:42:00+09:00,U2
7,1130306,2806,1 (FACING camera) Sit and stand,2024-09-02 06:44:00+09:00,2024-09-02 06:44:00+09:00,2024-09-02 06:44:00+09:00,U1
8,1136872,2806,1 (FACING camera) Sit and stand,2024-09-03 19:56:00+09:00,2024-09-03 19:56:00+09:00,2024-09-03 19:56:00+09:00,U22
9,1136878,2807,2 (FACING camera) both hands SHAKING (sitting ...,2024-09-03 19:56:00+09:00,2024-09-03 19:57:00+09:00,2024-09-03 19:57:00+09:00,U22


In [10]:
#load acceleration data
import os
import glob

# Folder whose sub-folders hold the acceleration CSV files.
# NOTE(review): this is an absolute, machine-specific path, while act_dir is relative — consider making them consistent.
user_dir = '/Users/tptn/MachineLearning/tremor_challenge2/data/TrainingDataPD25/users_timeXYZ/users'
# Column names assigned to the acceleration CSVs, which are read without a header row.
headers = ['id', 'datetime', 'x', 'y', 'z']

def read_users_df(base_dir, headers):
    """Read every CSV file found in the sub-folders of `base_dir`.

    Each file is read without a header row, its 'datetime' column is parsed
    with the format '%Y-%m-%dT%H:%M:%S.%f%z' and converted to 'UTC+09:00',
    and all files are concatenated into one DataFrame.

    Return:
        users_df: a DataFrame containing the rows of all csv files
    Args:
        base_dir: base folder; only its direct sub-folders are searched
        headers: column names for the csv files (must include 'datetime')
    Raises:
        ValueError: if no csv file is found under `base_dir`
    """
    df_list = []  # one DataFrame per csv file

    # sorted() makes the row order reproducible; os.listdir and glob return
    # entries in arbitrary order.
    for entry in sorted(os.listdir(base_dir)):
        path = os.path.join(base_dir, entry)
        if not os.path.isdir(path):
            continue

        for csv_path in sorted(glob.glob(os.path.join(path, '*.csv'))):
            df = pd.read_csv(csv_path, header=None, names=headers)
            # utc=True normalizes rows with different offsets to one tz-aware
            # dtype, so files that mix timezones are handled as well.
            parsed = pd.to_datetime(
                df['datetime'], format='%Y-%m-%dT%H:%M:%S.%f%z', utc=True
            )
            df['datetime'] = parsed.dt.tz_convert('UTC+09:00')
            df_list.append(df)

    if not df_list:
        raise ValueError(f"No csv files found in the sub-folders of '{base_dir}'.")

    users_df = pd.concat(df_list, ignore_index=True)
    return users_df

# Load all acceleration files into one DataFrame and preview the first 10 rows.
users_df = read_users_df(user_dir, headers)
users_df.head(10)

Unnamed: 0,id,datetime,x,y,z
0,2810,2024-09-10 06:25:37.059000+09:00,7.322,6.469,-0.548
1,2810,2024-09-10 06:25:37.075000+09:00,7.336,6.364,-0.586
2,2810,2024-09-10 06:25:37.084000+09:00,7.381,6.271,-0.612
3,2810,2024-09-10 06:25:37.093000+09:00,7.36,6.194,-0.624
4,2810,2024-09-10 06:25:37.102000+09:00,7.319,6.098,-0.619
5,2810,2024-09-10 06:25:37.154000+09:00,7.398,6.036,-0.648
6,2810,2024-09-10 06:25:37.160000+09:00,7.384,5.967,-0.672
7,2810,2024-09-10 06:25:37.164000+09:00,7.336,5.921,-0.641
8,2810,2024-09-10 06:25:37.171000+09:00,7.494,5.959,-0.698
9,2810,2024-09-10 06:25:37.176000+09:00,7.527,6.002,-0.713


# Preparing the data

## Drop duplicate rows

In [16]:
def drop_dups(df, dup_cols=['Activity Type', 'Started', 'Finished', 'Subject']):
    """
    Remove duplicated rows, then order the result by the 'Started' column.

    Rows are duplicates when they agree on every column in `dup_cols`;
    the last occurrence of each duplicate group is the one that is kept.
    The returned frame has a fresh 0..n-1 index.

    Returns
    -------
        DataFrame

    Parameters
    ----------
    df: pandas.DataFrame
        Original DataFrame (not modified)
    dup_cols: list of str
        Columns compared when looking for duplicates
    """
    deduped = df.drop_duplicates(subset=dup_cols, keep='last')
    ordered = deduped.sort_values(by=['Started'])
    return ordered.reset_index(drop=True)

# De-duplicate the activity labels and show the resulting (rows, columns) shape.
drop_df = drop_dups(act_df)
drop_df.shape
# act_df.shape

(203, 7)

## Add duration between activity's start and finish 
- In TrainActivities.csv, most activities have the same start and finish time, which means they have no duration.
- To match the acceleration data with an activity type by its time interval, a duration must be added first.

In [22]:
def add_duration(df, start_col='Started', end_col='Finished', end_padding='1min'):
    """Give every row a non-empty [start, end] interval.

    Both columns are floored to whole seconds. The end column is then moved
    forward by `end_padding` minus one microsecond, so with the default an
    activity that ends at 06:16:00 covers up to 06:16:59.999999.
    No new column is added; the end column is overwritten in a copy.

    Return:
        DataFrame: a copy of `df` with adjusted start and end columns

    Args:
        df (DataFrame): original dataframe (left unchanged)
        start_col (str): name of the datetime column holding the start time
        end_col (str): name of the datetime column holding the end time
        end_padding: amount added to the end time, anything accepted by
            pd.Timedelta (default '1min', the original behaviour)
    """
    padding = pd.Timedelta(end_padding)
    df_cp = df.copy()

    df_cp[start_col] = df_cp[start_col].dt.floor('s')
    # Subtract 1 microsecond so the interval stops just before the next period begins.
    df_cp[end_col] = df_cp[end_col].dt.floor('s') + padding - pd.Timedelta(microseconds=1)

    return df_cp

# Extend each activity's end time and preview the adjusted interval columns.
duration_added_df = add_duration(drop_df)
duration_added_df[['Started', 'Finished']].head(10)

Unnamed: 0,Started,Finished
0,2024-09-02 06:16:00+09:00,2024-09-02 06:16:59.999999+09:00
1,2024-09-02 06:17:00+09:00,2024-09-02 06:17:59.999999+09:00
2,2024-09-02 06:18:00+09:00,2024-09-02 06:18:59.999999+09:00
3,2024-09-02 06:20:00+09:00,2024-09-02 06:20:59.999999+09:00
4,2024-09-02 06:42:00+09:00,2024-09-02 06:42:59.999999+09:00
5,2024-09-02 06:42:00+09:00,2024-09-02 06:42:59.999999+09:00
6,2024-09-02 06:42:00+09:00,2024-09-02 06:42:59.999999+09:00
7,2024-09-02 06:44:00+09:00,2024-09-02 06:44:59.999999+09:00
8,2024-09-03 19:56:00+09:00,2024-09-03 19:56:59.999999+09:00
9,2024-09-03 19:56:00+09:00,2024-09-03 19:57:59.999999+09:00


# Segmentation

In [38]:
def seg_df_per_start_and_duration(users_df, act_df):
    """
    Segment the acceleration data into one DataFrame per activity time interval.

    For every distinct ('Started', 'Finished') pair in `act_df`, the rows of
    `users_df` whose 'datetime' lies in [Started, Finished] (both ends
    inclusive) form one segment. Intervals without any acceleration rows are
    skipped. The three returned lists are aligned by position.

    NOTE: the acceleration rows are selected by time only; they are not
    filtered by subject.

    Returns
    -------
    seg_list: list of DataFrame
        One DataFrame of acceleration rows per non-empty time interval.
    seg_label_list: list of list
        For each segment, the sorted unique 'Activity Type ID' values of all
        activities sharing exactly that interval.
    seg_subject_list: list of str
        For each segment, the string form of the set of 'Subject' values
        sharing that interval (e.g. "{'U22'}").

    Parameters
    ----------
    users_df: pandas.DataFrame
        A DataFrame containing all acceleration data with a 'datetime' column.
    act_df: pandas.DataFrame
        A DataFrame containing activity label data with 'Started', 'Finished', 'Activity Type ID', and 'Subject'.
    """

    seg_list = []
    seg_label_list = []
    seg_subject_list = []

    timestamps = users_df['datetime']  # looked up once instead of twice per interval

    # Get all unique time intervals (start, end), in order of first appearance
    time_intervals = act_df[['Started', 'Finished']].drop_duplicates()

    for started_at, finished_at in time_intervals.itertuples(index=False, name=None):
        # Extract the segment from sensor data
        seg = users_df[(timestamps >= started_at) & (timestamps <= finished_at)]
        if seg.empty:
            continue  # nothing recorded in this interval; skip the label lookup too

        # Get all activities and subjects sharing exactly this time interval
        same_interval = act_df[(act_df['Started'] == started_at) & (act_df['Finished'] == finished_at)]
        # sorted() gives a reproducible label order; plain set iteration order is unspecified
        seg_labels = sorted(set(same_interval['Activity Type ID'].tolist()))
        seg_subjects = str(set(same_interval['Subject'].tolist()))

        seg_list.append(seg)
        seg_label_list.append(seg_labels)
        seg_subject_list.append(seg_subjects)

    return seg_list, seg_label_list, seg_subject_list

# Build the per-interval segments from the acceleration data and the duration-adjusted activities.
seg_list, seg_label_list, seg_subject_list = seg_df_per_start_and_duration(users_df, duration_added_df)

# set(seg_subject_list) # there is no more than one subject per segment!
# print(len(seg_list))

## Feature extraction

In [28]:
import numpy as np
from scipy.stats import iqr, skew, kurtosis

def zero_crossing_rate(signal):
    """Calculate the zero crossing rate of a 1D signal.

    A crossing is counted wherever two consecutive samples have strictly
    opposite signs (a product below zero), so samples that are exactly zero
    never count. The count is divided by the number of samples.

    Args:
        signal: 1D array-like of numbers (numpy array, list or pandas Series)
    Returns:
        float: crossings / len(signal); 0.0 for an empty signal
    """
    # np.asarray makes the slices positional: with a pandas Series the
    # product below would be aligned by index label instead of by neighbour.
    signal = np.asarray(signal)
    if signal.size == 0:
        return 0.0  # avoid dividing by zero
    return ((signal[:-1] * signal[1:]) < 0).sum() / len(signal)

def safe_corr(a, b):
    """Correlation coefficient of a and b, or 0.0 when it is not defined.

    0.0 is returned when `a` has fewer than two samples or when either input
    has zero standard deviation; otherwise the off-diagonal entry of
    np.corrcoef(a, b) is returned.
    """
    if len(a) < 2:
        return 0.0
    if np.std(a) == 0 or np.std(b) == 0:
        return 0.0  # constant input: correlation undefined
    corr_matrix = np.corrcoef(a, b)
    return corr_matrix[0, 1]

def get_feats(df, columns=['x', 'y', 'z']):
    """Compute a flat feature vector from one window of accelerometer data.

    Layout of the result:
      * for each column in `columns`, in order, 12 statistics:
        std, mean, max, min, median, iqr, var, skew, kurtosis,
        peak-to-peak, rms, zero crossing rate
      * the signal magnitude area (mean over rows of the summed absolute
        values of `columns`)
      * the correlations x-y, y-z and z-x (always taken from the 'x', 'y'
        and 'z' columns, regardless of `columns`)

    Returns:
        feats: List of feature values in the order described above
    Args:
        df (DataFrame): DataFrame containing the acceleration data
        columns (list): names of the axis columns to summarize
    """
    feats = []

    # Per-axis time domain statistics
    for axis in columns:
        values = df[axis].values
        feats += [
            np.std(values, ddof=0),
            np.mean(values),
            np.max(values),
            np.min(values),
            np.median(values),
            iqr(values),
            np.var(values),
            skew(values),
            kurtosis(values),
            np.ptp(values),  # max - min
            np.sqrt(np.mean(values**2)),  # root mean square
            zero_crossing_rate(values),
        ]

    # Signal Magnitude Area: per-row sum of absolute values, averaged over rows
    abs_sum_per_row = np.abs(df[columns]).sum(axis=1)
    feats.append(abs_sum_per_row.sum() / len(df))

    # Pairwise correlation between the three axes
    axis_pairs = (('x', 'y'), ('y', 'z'), ('z', 'x'))
    feats.extend(safe_corr(df[first], df[second]) for first, second in axis_pairs)

    return feats

## Window overlapping

In [29]:
def windowSeg(df, wsize, stride):
    """Segment using sliding window method

    Returns:
        List: list of (list of features per segment (DataFrame)).
    Args:
        df (DataFrame): DataFramme contains accelerometer data
        wsize: window size [seconds]
        stride: how much the window slide
    """
    features_list = []
    start_time = df['datetime'].min()
    end_time = df['datetime'].max()
    current_start = start_time

    while current_start + pd.Timedelta(seconds=wsize) <= end_time:
        current_end = current_start + pd.Timedelta(seconds=wsize)
        window = df[(df['datetime'] >= current_start) & (df['datetime'] < current_end)]
        if not window.empty:
            feats = get_feats(window)
            features_list.append(feats)
        current_start += pd.Timedelta(seconds=stride)

    return features_list

## Segment and extract features

In [40]:
def process_segments(seg_list, seg_label_list, seg_subject_list, wsize=1, stride=0.5):
    """Turn segments into one table with a row of features per window.

    Every segment is cut into windows by windowSeg(segment, wsize, stride).
    Each resulting feature row gets the label and the subject of the segment
    it came from.

    Returns:
        DataFrame: 40 feature columns ('<stat>_<axis>' for the axes x, y, z,
        then 'sma', 'corr_xy', 'corr_yz', 'corr_zx') plus 'label' and 'subject'.
    Args:
        seg_list: list of acceleration DataFrames
        seg_label_list: label of each segment, aligned with seg_list
        seg_subject_list: subject of each segment, aligned with seg_list
        wsize: window size passed to windowSeg
        stride: stride passed to windowSeg
    """
    feature_rows, label_rows, subject_rows = [], [], []

    for segment, seg_label, seg_subject in zip(seg_list, seg_label_list, seg_subject_list):
        window_feats = windowSeg(segment, wsize, stride)
        n_windows = len(window_feats)
        feature_rows += window_feats
        # repeat the segment's label/subject once per window
        label_rows += [seg_label] * n_windows
        subject_rows += [seg_subject] * n_windows

    # Column names, in the same order in which the feature values are produced
    stat_names = ('std', 'mean', 'max', 'min', 'median', 'iqr', 'var', 'skew', 'kurt', 'ptp', 'rms', 'zcr')
    col_names = []
    for axis in ('x', 'y', 'z'):
        for stat in stat_names:
            col_names.append(f'{stat}_{axis}')
    col_names.extend(['sma', 'corr_xy', 'corr_yz', 'corr_zx'])

    features_df = pd.DataFrame(feature_rows, columns=col_names)
    features_df['label'] = label_rows
    features_df['subject'] = subject_rows

    return features_df

WINDOW_SIZE = 1 # window size = 1s
# Despite the name, this is the overlap length in seconds (50% of the window size), not a ratio.
OVERLAP_RATE = 0.5 * WINDOW_SIZE # overlap 50% window size

# The stride is the part of the window that does not overlap with the next one.
all_in_one_df = process_segments(seg_list, seg_label_list, seg_subject_list, wsize=WINDOW_SIZE, stride=WINDOW_SIZE - OVERLAP_RATE)

In [None]:
# Number of windows per label, then the number of distinct labels.
# NOTE(review): seg_df_per_start_and_duration builds each label as a list — confirm value_counts works on these values.
all_in_one_df['label'].value_counts()
len(all_in_one_df['label'].value_counts().index)

21

# Split the data

In [49]:
# Feature matrix: every column except 'label' and 'subject'.
X = all_in_one_df.drop(columns=['label', 'subject']).values
# Target: the 'label' column.
y = all_in_one_df['label'].values

In [55]:
def split_by_position(X, y, start_ratio=0.0, train_ratio=0.7):
    """
    Positional train/test split (no shuffling).

    The training set is one contiguous slice of the data; everything before
    and after that slice is joined, in that order, to form the test set.

    Parameters:
    - X, y: features and labels, sliced along the first axis
    - start_ratio: float in [0, 1), relative position where the training slice begins
    - train_ratio: float in (0, 1], length of the training slice as a share of
      the whole data; the slice is cut short if it would run past the end

    Returns:
    - X_train, X_test, y_train, y_test
    """
    assert 0 <= start_ratio < 1, "start_ratio must be in [0, 1)"
    assert 0 < train_ratio <= 1, "train_ratio must be in (0, 1]"

    n_samples = len(X)
    lo = int(start_ratio * n_samples)
    # Clamp the upper bound so the slice never extends past the data
    hi = min(lo + int(train_ratio * n_samples), n_samples)
    train_slice = slice(lo, hi)

    X_train, y_train = X[train_slice], y[train_slice]

    # Test set = head before the slice + tail after it
    X_test = np.concatenate((X[:lo], X[hi:]), axis=0)
    y_test = np.concatenate((y[:lo], y[hi:]), axis=0)

    return X_train, X_test, y_train, y_test

# Train on the slice that starts 5% into the data and spans 70% of it; the remaining rows form the test set.
X_train, X_test, y_train, y_test = split_by_position(X, y, start_ratio=0.05, train_ratio=0.7)

In [56]:
# Report the shapes of the train and test arrays.
print(f"X_train shape: {X_train.shape}, y_train shape: {y_train.shape}")
print(f"X_test shape: {X_test.shape}, y_test shape: {y_test.shape}")

X_train shape: (20414, 40), y_train shape: (20414,)
X_test shape: (8749, 40), y_test shape: (8749,)


# Handle imbalanced train data

## Random Over Sampler

In [60]:
%pip install scikit-multilearn

Collecting scikit-multilearn
  Downloading scikit_multilearn-0.2.0-py3-none-any.whl.metadata (6.0 kB)
Downloading scikit_multilearn-0.2.0-py3-none-any.whl (89 kB)
Installing collected packages: scikit-multilearn
Successfully installed scikit-multilearn-0.2.0

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
