In [None]:
# This is the template for the submission. If you want, you can develop your algorithm in a regular Python script and copy the code here for submission.

# Team members (e-mail, legi):
# chozhang@student.ethz.ch, 22-945-562
# minghli@student.ethz.ch, 22-953-293
# changli@student.ethz.ch, 22-944-474


In [1]:
from typing import *
import os
import sys
curr_environ = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost')
if curr_environ != 'Localhost':
    sys.path.append('/kaggle/input/mobile-health-2023-path-detection')
    input_dir = '/kaggle/input/mobile-health-2023-path-detection'
else:
    input_dir = os.path.abspath('')


In [2]:
import numpy as np
import pandas as pd

# progress bar
from tqdm import tqdm
from time import time

# utilty
from Lilygo.Recording import Recording
from Lilygo.Dataset import Dataset
from os import listdir
from os.path import isfile, join

# plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches




In [3]:
# for signal processing and calculations
from scipy import signal
from scipy.integrate import simpson
from scipy import stats

from sklearn import tree

# for tuning parameters
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# for skeleton
from sklearn.base import BaseEstimator


In [4]:
### signal processing functions ###
def parse(signal, ds_freq: float = 20.0, zero_mean: bool = False):
    """downsampling the signal to specific frequency ds_freq, and make the data
     with zero mean if zero_mean is True"""
    ori_time_seq = np.array(signal.timestamps)
    ori_value_seq = np.array(signal.values)
    if zero_mean:
        ori_value_seq = ori_value_seq - np.mean(ori_value_seq)
    dt = 1./ds_freq
    time_seq = np.arange(start=np.min(ori_time_seq),
                         stop=np.max(ori_time_seq),
                         step=dt)
    value_seq = np.interp(time_seq, ori_time_seq, ori_value_seq)
    return value_seq


def bp_filter(value_seq, fp: float = 3, fs: float = 20.0):
    """apply band pass filter to the sequence. fp is the threshold frequency,
     and fs is the sampling frequency."""
    sos = signal.butter(N=4, Wn=[0.5, fp],
                        btype='bandpass', fs=fs, output='sos')
    filtered = signal.sosfilt(sos, value_seq)
    return filtered


def get_envelop(value_seq,
                fs: float = 20,
                half_window_size: float = 0.5,
                _min: float = 20.,
                _max: float = 500.):
    """
    get the envelop as the adaptive local norm of the signal, currently the mode
     of vector (no negative values). The envelop is calculated by the maximum in
     a window, half_window_size is the seconds of time. _min and _max for clip.
    """
    half_win = int(fs*half_window_size)
    seq = np.concatenate(
        [np.zeros((half_win,)), value_seq, np.zeros((half_win,))])
    envelop = np.array([np.max(seq[k-half_win:k+half_win+1])
                        for k in range(half_win, half_win+len(value_seq))])
    return np.clip(envelop, _min, _max)


Building on prior work [23], raw accelerometer data were converted to signal magnitude vector values ( $SM=\sqrt{acc^2_x+acc^2_y+acc^2_z}$), thus removing the dependence of the resulting signal from the orientation of the sensor. Mean and standard deviation of the SM were considered jointly with a time-frequency analysis of SM in each 10-s window. The analysis of power spectral density aimed at characterizing the following:

1. The total power in the frequencies between 0.3 Hz and 15 Hz;
2. The first and second dominant frequencies and their powers in the same frequency band;
3. The dominant frequency in the 0.6–2.5 Hz band and its power;
4. The ratio between the power of the first dominant frequency and the total power (0.3–15 Hz);
5. The ratio between the dominant frequency of the current window and the previous window;
6. The ratio (R1) between the power at frequencies lower than 3 Hz and the total power (0.3–15 Hz);
7. The ratio (R2) between the power at frequencies lower than 3 Hz and the total power (0.3–15 Hz);
8. The ratio (R3) between the power at frequencies in the 1.5–2.5 Hz range and the total power (0.3–15 Hz).

In [5]:
def encode_feat_vec(sm):
    """ return a list containing feature vectors of all the 10s non-overlapping windows"""
    WINDOW = 2000  # 10s window
    feat_vecs = []
    df1s = []
    df2s = []
    dfb2s = []
    sf = 50.0
    for i in range(0, len(sm) - WINDOW, WINDOW):
        data = sm[i:i+WINDOW]
        low1 = 0.3
        high1 = 15
        low2 = 0.6
        high2 = 2.5
        nperseg = 2 / low1 * sf
        freqs, psd = signal.welch(data, sf, nperseg=nperseg)
        BAND1 = np.logical_and(freqs >= low1, freqs <= high1)
        BAND2 = np.logical_and(freqs >= low2, freqs <= high2)
        freq_res = freqs[1] - freqs[0]
        # compuate total power
        total_power = simpson(psd[BAND1], dx=freq_res)

        # compute dominant frequencies for BAND1, BAND2
        idx_p1 = psd[BAND1].argsort()[-1]
        df1 = low1 + freq_res * idx_p1
        pdf1 = simpson(psd[BAND1][idx_p1:idx_p1+2], dx=freq_res)
        df1s.append(df1)

        idx_p2 = psd[BAND1].argsort()[-2]
        df2 = low1 + freq_res * idx_p2
        pdf2 = simpson(psd[BAND1][idx_p2:idx_p2+2], dx=freq_res)
        df2s.append(df2)

        idx_b2 = psd[BAND2].argsort()[-1]
        dfb2 = low2 + freq_res * idx_b2
        pdfb2 = simpson(psd[BAND2][idx_b2:idx_b2+2], dx=freq_res)
        dfb2s.append(dfb2)

        ratio_pdf1_tot = pdf1 / total_power

        if i != 0:
            ratio_curr_prev_df1 = df1s[-1] / df1s[-2]
            ratio_curr_prev_df2 = df2s[-1] / df2s[-2]
            ratio_curr_prev_dfb2 = dfb2s[-1] / dfb2s[-2]
        else:
            ratio_curr_prev_df1, ratio_curr_prev_df2, ratio_curr_prev_dfb2 = \
                1.0, 1.0, 1.0

        R1 = simpson(psd[freqs <= 3], dx=freq_res) / total_power
        R2 = simpson(psd[freqs > 3], dx=freq_res) / total_power

        feat_vec = [
            np.mean(data),
            np.std(data),
            df1, pdf1, df2, pdf2, dfb2, pdfb2,
            ratio_pdf1_tot,
            ratio_curr_prev_df1, ratio_curr_prev_df2, ratio_curr_prev_dfb2,
            R1, R2]
        feat_vecs.append(feat_vec)
    return feat_vecs


In [6]:
def compute_sm(ax, ay, az, normalize=False):
    sm = np.sqrt(np.sum(np.square([ax, ay, az]), axis=0))
    if normalize:
        return sm - np.mean(sm)
    else:
        return sm


In [7]:
# do not run this often
train_dir = '/kaggle/input/mobile-health-2023-path-detection/data/train/'
all_train_f = [os.path.join(train_dir, f) for f in os.listdir(train_dir)]
all_train_f.sort()


In [8]:
print(len(all_train_f))

263


In [9]:
def parse_accelerometer(trace: Recording, use_zc=True):
    if use_zc:
        ax = parse(trace.data['ax'])
        ay = parse(trace.data['ay'])
        az = parse(trace.data['az'])
    else:
        ax = trace.data['ax'].values
        ay = trace.data['ay'].values
        az = trace.data['az'].values
    return ax, ay, az


In [None]:
# do not run this often
X_train = []
y_train = []
X_0 = []
y_0 = []
sample_pool = np.random.choice(all_train_f, size=70)
test_f = []
for f in tqdm(all_train_f):
    t = Recording(f, no_labels=False, mute=True)
    if 0 in t.labels['activities']:
        ax, ay, az = parse_accelerometer(t, use_zc=False)
        dummy_t = {}
        dummy_t['ax'] = ax
        dummy_t['ay'] = ay
        dummy_t['az'] = az
        X_0.append(dummy_t)
        y_0.append(True)
    elif f in sample_pool:
        ax, ay, az = parse_accelerometer(t, use_zc=False)
        dummy_t = {}
        dummy_t['ax'] = ax
        dummy_t['ay'] = ay
        dummy_t['az'] = az
        X_0.append(dummy_t)
        y_0.append(False)
    if len(t.labels['activities']) == 1:
        ax, ay, az = parse_accelerometer(t, use_zc=False)
        sm = compute_sm(ax, ay, az, normalize=True)
        fv = encode_feat_vec(sm)
        X_train.extend(fv)
        y_train.extend(t.labels['activities'] * len(fv))
    else:
        test_f.append(f)

In [None]:
# do not run this often
with open('/kaggle/working/train.npy', 'wb') as f:
    np.save(f, np.array(X_train))
    np.save(f, np.array(y_train))

In [11]:
with open('/kaggle/input/activity-recognition-matchy/train.npy', 'rb') as f:
    X_train = np.load(f)
    y_train = np.load(f)

In [None]:
# d2 = StandStillDetector(std_thresh = 0.17, cont_thresh = 3)
# d2.score(X_0, y_0)

In [None]:
d = StandStillDetector()
parameters = {
    "std_thresh": np.arange(0.15, 0.23, step=0.02),
    "cont_thresh": np.arange(3, 5, step=1)
}
param_tuner = GridSearchCV(d, parameters, cv=5, verbose=3)
param_tuner.fit(X_0, y_0)

print(param_tuner.best_score_)
print(param_tuner.best_params_)

In [55]:
class StandStillDetector(BaseEstimator):
    def __init__(self, mean_thresh = -0.1, std_thresh = 0.1, cont_thresh = 5):
        # no params, for now
        self.mean_thresh = mean_thresh
        self.std_thresh  = std_thresh
        self.cont_thresh = cont_thresh
    
    def fit(self, X, y):
        # no learning actually
        return self

    def score(self, X: Sequence[Dict], y, sample_weight=None):
        from sklearn.metrics import accuracy_score
        # assume X is traces and y is labels
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def predict_fv(self, feat_vec):
        clf_as_still = False
        prev_still = False
        cont = 0
        for fv in feat_vec:
            if fv[1] < self.std_thresh and prev_still == False:
                prev_still = True
                cont = 1
                if cont >= self.cont_thresh:
                    clf_as_still = True
                    break
            elif fv[1] < self.std_thresh and prev_still == True:
                cont += 1
                if cont >= self.cont_thresh:
                    clf_as_still = True
                    break
            else:
                prev_still = False
                cont = 0
        return clf_as_still

    def predict(self, traces):
        # assume array
        if hasattr(traces, '__len__'):
            res = np.zeros(len(traces), dtype=int)
            _traces = traces
        else:
            res = np.zeros(1, dtype=int)
            _traces = [traces]
        i = 0
        for trace in _traces:
            ax, ay, az = trace['ax'], trace['ay'], trace['az']
            sm = compute_sm(ax, ay, az, normalize=True)
            feat_vec = encode_feat_vec(sm)
            clf_as_still = False
            prev_still = False
            cont = 0
            for fv in feat_vec:
                if fv[1] < self.std_thresh and prev_still == False:
                    prev_still = True
                    cont = 1
                    if cont >= self.cont_thresh:
                        clf_as_still = True
                        break
                elif fv[1] < self.std_thresh and prev_still == True:
                    cont += 1
                    if cont >= self.cont_thresh:
                        clf_as_still = True
                        break
                else:
                    prev_still = False
                    cont = 0
            res[i] = clf_as_still
            i += 1
        if len(res) == 1:
            return res[0]
        else:
            return res

In [None]:
# test in batch
scores = []
for i in tqdm(range(0, len(test_f) + 1 - 20, 20)):
    batch_f = test_f[i:i+20]
    traces = [Recording(f, no_labels=False, mute=True) for f in batch_f]
    labels = [t.labels['board_loc'] for t in traces]
    scores.append(watchloc_detector.score(traces, labels))

print(scores)


---

## Activity

Activities contained in the data trace and performed for more than 60 s uninterrupted. 

Output as a list of integers: e.g., `[0, 3]` (`0`: standing still, `1`: walk, `2`: run, `3`: cycle). 

These do not need to be in the right order and they do not need to occur multiple times.

In [36]:
class ActivityRecognizer(BaseEstimator):
    def __init__(self, count_tresh=2):
        self.clf = tree.DecisionTreeClassifier()
        self.count_thresh = count_tresh

    def fit(self, X, y): # X is an array of windows
        selector = np.array([fv[1] > 0.1 for fv in X])
        _X = X[selector]
        if isinstance(y, int):
            _y = [y] * len(_X)
        else:
            _y = y[selector]
        self.clf.fit(_X, _y)

    def score(self, X, y, sample_weight=None):
        y_predict = self.predict(X)
        tot_score = 0
        for true, pred in zip(y, y_predict):
            # walk 1, run 2, cycle 3
            # 0.05, 0.05, 0.1
            true_v = [1 in true, 2 in true, 3 in true]
            pred_v = [1 in pred, 2 in pred, 3 in pred]
            score = int(true_v[0] == pred_v[0]) + \
                    int(true_v[1] == pred_v[1]) + \
                    int(true_v[2] == pred_v[2])
            score /= 4
            tot_score += score
        return tot_score / len(y_predict)

    def predict_fv(self, X):
        win_res = self.clf.predict(X)
        values, counts = np.unique(win_res, return_counts=True)
        return [values[j] 
                for j, count in enumerate(counts) if count >= self.count_thresh]

    def predict(self, traces):
        # assume array
        if hasattr(traces, '__len__'):
            res = [0] * len(traces)
            _traces = traces
        else:
            res = [0]
            _traces = [traces]
        i = 0
        for trace in _traces:
            if isinstance(trace, Recording):
                ax, ay, az = parse_accelerometer(trace, use_zc=False)
                sm = compute_sm(ax, ay, az, normalize=True)
                feat_vec = encode_feat_vec(sm)
            else:
                feat_vec = trace
            # will get an activity prediction for each window
            win_res = self.clf.predict(feat_vec) 
            values, counts = np.unique(win_res, return_counts=True)
            res[i] = [values[j] 
                      for j, count in enumerate(counts) if count >= self.count_thresh]
            i += 1
        return res
#         if len(res) == 1:
#             return res[0]
#         else:
#             return res
        

In [66]:
act = ActivityRecognizer(count_tresh=2)
act.fit(X_train, y_train)

In [24]:
import pickle
with open('/kaggle/input/activity-recognition-matchy/test_f.pkl', 'rb') as f:
    test_f = pickle.load(f)
print(len(test_f))

97


In [38]:
a_trace = Recording(test_f[0], no_labels=False, mute=True)
act.score([a_trace], [a_trace.labels['activities']])

0.5

In [64]:
# test in batch
scores = []
for i in tqdm(range(0, len(test_f) + 1 - 20, 20)):
    batch_f = test_f[i:i+20]
    traces = [Recording(f, no_labels=False, mute=True) for f in batch_f]
    labels = [t.labels['activities'] for t in traces]
    scores.append(act.score(traces, labels))

print(scores)



  0%|          | 0/4 [00:00<?, ?it/s][A
 25%|██▌       | 1/4 [01:00<03:02, 60.98s/it][A
 50%|█████     | 2/4 [02:03<02:04, 62.10s/it][A
 75%|███████▌  | 3/4 [02:59<00:58, 58.97s/it][A
100%|██████████| 4/4 [03:53<00:00, 58.47s/it][A

[0.5875, 0.55, 0.475, 0.525]





In [65]:
print(np.mean(scores))

0.534375


---

# Prediction

In [44]:
# Get the path of all traces
dir_traces = '/kaggle/input/mobile-health-2023-path-detection/data/test'
filenames = [join(dir_traces, f)
             for f in listdir(dir_traces) if isfile(join(dir_traces, f))]
filenames.sort()


In [56]:
# initialize predictors
# step_counter = StepCounter()
stand_detector = StandStillDetector(std_thresh=0.17, cont_thresh=3)

In [67]:
# Loop through all traces and calculate the step count for each trace
solution_file = []
t = []
pbar = tqdm(total=len(filenames), position=0, leave=False)
for filename in filenames:
    trace = Recording(filename, no_labels=True, mute=True)
    categorization_results = {'watch_loc': 114514, 'path_idx': 114514,
                              'step_count': 0, 'stand': 0, 'walk': 0, 'run': 0, 'cycle': 0}
    ax, ay, az = parse_accelerometer(trace)
    sm = compute_sm(ax, ay, az)
    fv = encode_feat_vec(
        compute_sm(*parse_accelerometer(trace, use_zc=False), normalize=True))

    #
    # Your algorithm goes here
    # You can access the variable 'watch_loc' in the dictionary 'categorization_results' for example with
    # categorization_results['watch_loc'] = 1
    # Make sure, you do not use the gps data and are tolerant for missing data (see task set).
    # Your program must not crash when single smartphone data traces are missing.
    #
    st = time()
#     categorization_results['watch_loc'] = watchloc_detector.predict_fv(fv)
#     categorization_results['step_count'] = step_counter.predict(sm)
    categorization_results['stand'] = int(stand_detector.predict_fv(fv))
    res = act.predict_fv(fv)
    if 1 in res:
        categorization_results['walk'] = 1
    if 2 in res:
        categorization_results['run'] = 1
    if 3 in res:
        categorization_results['cycle'] = 1
    et = time()
    t.append(et - st)

    # Append your calculated results and the id of each trace and category to the solution file
    trace_id = ''.join([*filename][-8:-5])
    for counter_label, category in enumerate(categorization_results):
        solution_file.append(
            [trace_id + f'_{counter_label+1}', categorization_results[category]])
    pbar.update(1)
    pbar.set_description(f'Average infer time: {np.mean(t):.3f}s')


Average infer time: 0.001s: 100%|██████████| 376/376 [22:12<00:00,  2.55s/it]

In [72]:
from joblib import dump, load
dump(act.clf, '/kaggle/working/activity_recognition.joblib')

['/kaggle/working/activity_recognition.joblib']

In [68]:
# Write the detected step counts into a .csv file to then upload the .csv file to Kaggle
# When cross-checking the .csv file on your computer, we recommend using the text editor and NOT excel so that the results are displayed correctly
# IMPORTANT: Do NOT change the name of the columns ('Id' and 'Category') of the .csv file
submission_file_df = pd.DataFrame(np.asarray(solution_file), columns=['Id', 'Category'])
submission_file_df.to_csv('/kaggle/working/submission.csv', header=['Id', 'Category'], index=False)

In [69]:
submission_file_df.head()

Unnamed: 0,Id,Category
0,001_1,114514
1,001_2,114514
2,001_3,0
3,001_4,0
4,001_5,1
