In [13]:
# This is the template for the submission. If you want, you can develop your algorithm in a regular Python script and copy the code here for submission.

# Team members (e-mail, legi):
# chozhang@student.ethz.ch, 22-945-562
# minghli@student.ethz.ch, 22-953-293
# changli@student.ethz.ch, 22-944-474


We assume that all the models and features are located in `/kaggle/working`! Please change the path if you want to run the notebook using a different location!

In [14]:
from typing import *
import os
import sys
# Kaggle sets KAGGLE_KERNEL_RUN_TYPE; when it is absent we treat the run as local.
curr_environ = os.environ.get('KAGGLE_KERNEL_RUN_TYPE', 'Localhost')
if curr_environ == 'Localhost':
    # Local run: use the current working directory as the input root.
    input_dir = os.path.abspath('')
else:
    # Kaggle run: the competition dataset is the input root and is also put on
    # the import path.
    input_dir = '/kaggle/input/mobile-health-2023-path-detection'
    sys.path.append('/kaggle/input/mobile-health-2023-path-detection')


In [15]:
import numpy as np
import pandas as pd

from Lilygo.Recording import Recording
from Lilygo.Dataset import Dataset
from os import listdir
from os.path import isfile, join



In [16]:
# progress bar
from tqdm import tqdm
from time import time

# plotting
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

# for signal processing and calculations
from scipy import signal
from scipy.integrate import simpson
from scipy import stats

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier

# load models
from joblib import load

# for tuning parameters
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# for skeleton
from sklearn.base import BaseEstimator


---

# Utils

In [17]:
### signal processing functions ###
def parse(signal, ds_freq:float=20.0, zero_mean:bool=False):
    """Resample one sensor channel onto a uniform time grid.

    Args:
        signal: object exposing ``timestamps`` and ``values`` sequences.
        ds_freq: target rate; the grid step is ``1 / ds_freq`` in the unit of
            the timestamps.
        zero_mean: when True, the mean of the raw values is subtracted before
            interpolation.

    Returns:
        ``(time_seq, value_seq)``: the grid running from the smallest timestamp
        up to (not including) the largest one, and the values linearly
        interpolated onto that grid.
    """
    raw_t = np.array(signal.timestamps)
    raw_v = np.array(signal.values)
    if zero_mean:
        raw_v = raw_v - np.mean(raw_v)
    grid = np.arange(start=np.min(raw_t), stop=np.max(raw_t), step=1./ds_freq)
    return grid, np.interp(grid, raw_t, raw_v)


def bp_filter(value_seq, fp: float = 3, fs: float = 20.0):
    """Band-pass a sequence with a 4th-order Butterworth filter.

    The pass band runs from 0.5 up to ``fp``; ``fs`` is the sampling frequency
    of ``value_seq`` (both in the same unit). The filter is applied causally
    as second-order sections.
    """
    band = [0.5, fp]
    sections = signal.butter(N=4, Wn=band, btype='bandpass', fs=fs, output='sos')
    return signal.sosfilt(sections, value_seq)


def get_envelop(value_seq,
                fs: float = 20,
                half_window_size: float = 0.5,
                _min: float = 20.,
                _max: float = 500.):
    """
    Clipped running maximum of ``value_seq``, used as an adaptive local norm.

    For every sample the maximum over a centred window of
    ``2 * int(fs * half_window_size) + 1`` samples is taken; the sequence is
    zero-padded at both ends, so the input is expected to be non-negative.
    The result is clipped to ``[_min, _max]``.
    """
    radius = int(fs*half_window_size)
    pad = np.zeros((radius,))
    padded = np.concatenate([pad, value_seq, pad])
    span = 2*radius + 1
    # padded[i:i+span] is the window centred on value_seq[i]
    running_max = np.array([np.max(padded[i:i+span])
                            for i in range(len(value_seq))])
    return np.clip(running_max, _min, _max)

def lp_filter(value_seq, alpha=0.95):
    """First-order exponential low-pass filter.

    The state starts at the first sample and is updated as
    ``state = state * alpha + sample * (1 - alpha)`` for every sample
    (including the first), so a larger ``alpha`` smooths more.

    Args:
        value_seq: sized, indexable sequence of numbers.
        alpha: weight of the previous state.

    Returns:
        ``np.ndarray`` with one smoothed value per input sample; an empty
        array when ``value_seq`` is empty.
    """
    # An empty input used to raise IndexError on value_seq[0].
    if len(value_seq) == 0:
        return np.array([])
    state = value_seq[0]
    smoothed = []
    for sample in value_seq:
        state = state * alpha + sample * (1-alpha)
        smoothed.append(state)
    return np.array(smoothed)

def split_fea(t, v, n_split, fn = lambda x: np.percentile(x, 50)):
    """Summarise ``v`` over ``n_split`` equal-width slices of the ``t`` range.

    The range of ``t`` is widened by 1e-8 on both sides and cut into
    ``n_split`` intervals; ``fn`` (median by default) is applied to the values
    of ``v`` whose ``t`` lies strictly inside each interval.

    Returns:
        list with one ``fn`` result per interval, in time order.
    """
    eps = 1e-8
    edges = np.linspace(np.min(t)-eps, np.max(t)+eps, n_split+1, endpoint=True)
    return [fn(v[(t>lo) & (t<hi)]) for lo, hi in zip(edges[:-1], edges[1:])]

In [18]:
def compute_sm(ax, ay, az, normalize=False):
    """Per-sample Euclidean magnitude of a three-axis signal.

    Args:
        ax, ay, az: equally long component sequences.
        normalize: when True, the mean magnitude is subtracted from the result.

    Returns:
        Array of ``sqrt(ax**2 + ay**2 + az**2)``, optionally mean-removed.
    """
    squared = np.square([ax, ay, az])
    magnitude = np.sqrt(squared.sum(axis=0))
    return magnitude - np.mean(magnitude) if normalize else magnitude


In [19]:
def parse_accelerometer(trace: Recording, use_zc=True):
    """Return the three accelerometer channels of a recording.

    Args:
        trace: recording whose ``data`` mapping holds 'ax', 'ay' and 'az'.
        use_zc: when True each channel is resampled with ``parse`` (its default
            rate); when False the raw ``values`` of each channel are returned.

    Returns:
        ``(ax, ay, az)``.
    """
    channels = ('ax', 'ay', 'az')
    if use_zc:
        # Each channel is resampled on its own time range.
        ax, ay, az = (parse(trace.data[name])[1] for name in channels)
    else:
        ax, ay, az = (trace.data[name].values for name in channels)
    return ax, ay, az


# Step Count

In [20]:
class StepCounter(BaseEstimator):
    """Peak-counting step estimator for an acceleration-magnitude signal.

    The signal is divided by a clipped local-maximum envelope, band-pass
    filtered, half-wave rectified, and every remaining peak of at least
    ``acc_height`` is counted as one step.

    Parameters:
        acc_min, acc_max: clip bounds handed to ``get_envelop``.
        acc_height: minimum peak height handed to ``signal.find_peaks``.
        half_window_size: envelope half window handed to ``get_envelop``.
        width: stored and forwarded, but not used by the current algorithm.
    """

    def __init__(self, acc_min=2.,  acc_max=3., acc_height=0.25,
                 half_window_size=0.5, width=0.5):
        self.acc_min = acc_min
        self.acc_max = acc_max
        self.acc_height = acc_height
        self.half_window_size = half_window_size
        self.width = width

    def fit(self, data, labels):
        """No-op that only exists to satisfy the estimator interface."""
        return self

    def score(self, X, y_true, sample_weight=None, normalize=True) -> float:
        '''
        Negative absolute step-count error (higher is better).

        Args:
            X: a sequence of magnitude signals, or one 1-D signal.
            y_true: true step count(s), one per signal.
            sample_weight: optional per-signal weights.
            normalize: when True return the (weighted) mean score, otherwise
                the (weighted) sum.
        '''
        y_true = np.atleast_1d(y_true)
        # predict() handles one signal at a time, so a batch is scored signal
        # by signal. A lone 1-D signal (first element is a scalar) is one sample.
        if len(X) > 0 and np.ndim(X[0]) == 0:
            signals = [X]
        else:
            signals = X
        y_predicted = np.array([self.predict(s) for s in signals])
        scores = -np.abs(y_predicted - y_true).astype(float)
        if normalize:
            return np.average(scores, weights=sample_weight)
        elif sample_weight is not None:
            return np.dot(scores, sample_weight)
        else:
            return scores.sum()

    def predict(self, sm):
        """Return the step count (int) for one magnitude signal ``sm``."""
        acc_step_counts = self._count_steps(sm,
                                            _max=self.acc_max,
                                            _min=self.acc_min,
                                            _height=self.acc_height,
                                            half_window_size=self.half_window_size,
                                            width=self.width)
        return int(acc_step_counts)

    def _count_steps(self, sm, _max, _min, _height,
                     half_window_size=0.5, width=0.5):
        """Count peaks of the envelope-normalised, band-passed signal.

        ``sm`` is left untouched: the normalisation builds a new float array
        instead of dividing the caller's array in place.
        """
        envelope = get_envelop(sm,
                               half_window_size=half_window_size,
                               _min=_min,
                               _max=_max)  # an adaptive local norm
        g_v = np.asarray(sm, dtype=float) / envelope
        # band pass
        filtered_gv = bp_filter(g_v)
        # keep the positive half only; the x4 compensates for the amplitude
        # loss of the filter (per the original author's note)
        filtered_gv = filtered_gv * (filtered_gv > 0) * 4

        # distance: at least 0.2 s between peaks at the 20 Hz rate that
        # get_envelop / bp_filter assume by default
        peaks, _ = signal.find_peaks(filtered_gv,
                                     height=_height,
                                     distance=20 * 0.2)
        return len(peaks)  # every peak is one step


# Smart Watch Location

In [21]:
def encode_feat_vec(sm, window=2000, sf=50.0):
    """Encode a magnitude signal as one feature vector per non-overlapping window.

    Args:
        sm: 1-D magnitude signal.
        window: window length in samples (default 2000).
        sf: sampling frequency assumed by the Welch PSD (default 50.0).
            NOTE(review): 2000 samples are 40 s at 50 Hz and 10 s at 200 Hz;
            confirm the real rate of the signals passed in.

    Returns:
        List of 14-element feature vectors, in window order:
        ``[mean, std, df1, pdf1, df2, pdf2, dfb2, pdfb2, pdf1/total_power,
        df1/prev_df1, df2/prev_df2, dfb2/prev_dfb2, R1, R2]``, where the
        ``prev`` ratios are 1.0 for the first window. The list is empty when
        ``sm`` is shorter than one window.
    """
    low1, high1 = 0.3, 15
    low2, high2 = 0.6, 2.5
    # welch() needs an integer segment length; truncation matches what SciPy
    # does with a float value.
    nperseg = int(2 / low1 * sf)

    feat_vecs = []
    prev_dom = None  # (df1, df2, dfb2) of the previous window
    # "+ 1" keeps the last full window when len(sm) is an exact multiple of
    # `window` (it used to be dropped).
    for start in range(0, len(sm) - window + 1, window):
        data = sm[start:start+window]
        freqs, psd = signal.welch(data, sf, nperseg=nperseg)
        freq_res = freqs[1] - freqs[0]
        psd_b1 = psd[np.logical_and(freqs >= low1, freqs <= high1)]
        psd_b2 = psd[np.logical_and(freqs >= low2, freqs <= high2)]

        # total power in band 1
        total_power = simpson(psd_b1, dx=freq_res)

        # Dominant (df1) and second-dominant (df2) bins of band 1 and the
        # dominant bin of band 2 (dfb2), each with the power of a two-bin
        # slice starting at it. The frequency is approximated as band start
        # + bin index * resolution; kept as is so that feature values stay
        # identical to the ones the stored models were built on.
        order_b1 = psd_b1.argsort()
        idx_p1 = order_b1[-1]
        df1 = low1 + freq_res * idx_p1
        pdf1 = simpson(psd_b1[idx_p1:idx_p1+2], dx=freq_res)

        idx_p2 = order_b1[-2]
        df2 = low1 + freq_res * idx_p2
        pdf2 = simpson(psd_b1[idx_p2:idx_p2+2], dx=freq_res)

        idx_b2 = psd_b2.argsort()[-1]
        dfb2 = low2 + freq_res * idx_b2
        pdfb2 = simpson(psd_b2[idx_b2:idx_b2+2], dx=freq_res)

        ratio_pdf1_tot = pdf1 / total_power

        # change of the dominant frequencies relative to the previous window
        if prev_dom is None:
            ratio_curr_prev_df1, ratio_curr_prev_df2, ratio_curr_prev_dfb2 = \
                1.0, 1.0, 1.0
        else:
            ratio_curr_prev_df1 = df1 / prev_dom[0]
            ratio_curr_prev_df2 = df2 / prev_dom[1]
            ratio_curr_prev_dfb2 = dfb2 / prev_dom[2]
        prev_dom = (df1, df2, dfb2)

        # power at or below / above 3 (same unit as sf), relative to band 1
        R1 = simpson(psd[freqs <= 3], dx=freq_res) / total_power
        R2 = simpson(psd[freqs > 3], dx=freq_res) / total_power

        feat_vecs.append([
            np.mean(data),
            np.std(data),
            df1, pdf1, df2, pdf2, dfb2, pdfb2,
            ratio_pdf1_tot,
            ratio_curr_prev_df1, ratio_curr_prev_df2, ratio_curr_prev_dfb2,
            R1, R2])
    return feat_vecs


In [22]:
# decision tree version
class WatchLocDetector(BaseEstimator):
    """Watch-location classifier: per-window decision tree plus majority vote."""

    def __init__(self):
        self.clf = tree.DecisionTreeClassifier()

    @staticmethod
    def _majority(win_res):
        """Most frequent label in ``win_res``; ties go to the smallest label.

        Replaces ``stats.mode(win_res).mode[0]``, which fails on SciPy
        versions where ``mode`` returns a scalar for 1-D input.
        """
        values, counts = np.unique(win_res, return_counts=True)
        return values[np.argmax(counts)]

    def fit(self, X, y):
        """Fit the tree on the windows whose std feature (index 1) exceeds 0.1.

        Args:
            X: 2-D array-like of window feature vectors.
            y: one label per window, or a single label shared by all windows.

        Returns:
            self
        """
        X = np.asarray(X)
        selector = np.array([fv[1] > 0.1 for fv in X], dtype=bool)
        _X = X[selector]
        if np.ndim(y) == 0:
            # was `len(_data)`, an undefined name
            _y = [y] * len(_X)
        else:
            _y = np.asarray(y)[selector]
        self.clf.fit(_X, _y)
        return self

    def score(self, X, y, sample_weight=None):
        """Accuracy of ``predict(X)`` against ``y``."""
        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def predict_fv(self, X):
        """Predict the location for one trace given its window features ``X``."""
        # NOTE(review): X is wrapped in a list here, whereas predict() passes
        # the window matrix straight to self.clf. Kept as is because the
        # object assigned to self.clf by the caller is not visible from here;
        # confirm what it expects.
        win_res = self.clf.predict([X])
        return self._majority(win_res)

    def predict(self, traces):
        """Predict the location for one trace or a sequence of traces.

        Each trace is either a ``Recording`` (features are extracted from its
        resampled accelerometer channels) or an already encoded list of window
        feature vectors.

        Returns:
            int array with one label per trace, or a bare label when there is
            exactly one trace.
        """
        _traces = traces if hasattr(traces, '__len__') else [traces]
        res = np.zeros(len(_traces), dtype=int)
        for i, trace in enumerate(_traces):
            if isinstance(trace, Recording):
                ax, ay, az = parse_accelerometer(trace)
                sm = compute_sm(ax, ay, az, normalize=True)
                feat_vec = encode_feat_vec(sm)
            else:
                feat_vec = trace
            res[i] = self._majority(self.clf.predict(feat_vec))
        if len(res) == 1:
            return res[0]
        else:
            return res


# Activity Recognition

In [23]:
class StandStillDetector(BaseEstimator):
    """Flags a trace as "standing still" when enough consecutive windows are calm.

    Parameters:
        mean_thresh: stored but not used by the current rule.
        std_thresh: a window is calm when its std feature (index 1) is below it.
        cont_thresh: number of consecutive calm windows required.
    """

    def __init__(self, mean_thresh = -0.1, std_thresh = 0.17, cont_thresh = 3):
        self.mean_thresh = mean_thresh
        self.std_thresh  = std_thresh
        self.cont_thresh = cont_thresh

    def fit(self, X, y):
        """No-op that only exists to satisfy the estimator interface."""
        return self

    def score(self, X: Sequence[Dict], y, sample_weight=None):
        """Accuracy of ``predict(X)`` against ``y``; X holds traces, y labels."""
        from sklearn.metrics import accuracy_score
        return accuracy_score(y, self.predict(X), sample_weight=sample_weight)

    def predict_fv(self, feat_vec):
        """Return True when ``feat_vec`` has ``cont_thresh`` calm windows in a row."""
        run = 0  # length of the current streak of calm windows
        for fv in feat_vec:
            if fv[1] < self.std_thresh:
                run += 1
                if run >= self.cont_thresh:
                    return True
            else:
                run = 0
        return False

    def predict(self, traces):
        """Predict for one trace or a sequence of traces.

        A trace is a mapping with 'ax', 'ay' and 'az' entries.

        Returns:
            int array (1 = still) with one entry per trace, or a bare value
            when there is exactly one trace.
        """
        # A single dict has __len__ too; without this check its keys would be
        # iterated as if they were traces.
        if isinstance(traces, dict) or not hasattr(traces, '__len__'):
            _traces = [traces]
        else:
            _traces = traces
        res = np.zeros(len(_traces), dtype=int)
        for i, trace in enumerate(_traces):
            sm = compute_sm(trace['ax'], trace['ay'], trace['az'], normalize=True)
            res[i] = self.predict_fv(encode_feat_vec(sm))
        if len(res) == 1:
            return res[0]
        else:
            return res

In [24]:
class ActivityRecognizer(BaseEstimator):
    """Per-window decision tree; an activity is reported for a trace when at
    least ``count_thresh`` of its windows are classified as that activity.

    Labels used by ``score``: 1 = walk, 2 = run, 3 = cycle.
    """

    def __init__(self, count_tresh=2):
        # NOTE(review): the parameter is spelled `count_tresh` while the
        # attribute is `count_thresh`; BaseEstimator.get_params()/clone()
        # expect matching names. The spelling is kept for existing callers.
        self.clf = tree.DecisionTreeClassifier()
        self.count_thresh = count_tresh

    def fit(self, X, y): # X is an array of windows
        """Fit the tree on the windows whose std feature (index 1) exceeds 0.1.

        ``y`` is one label per window or a single int shared by all windows.
        Returns self.
        """
        X = np.asarray(X)
        selector = np.array([fv[1] > 0.1 for fv in X], dtype=bool)
        _X = X[selector]
        if np.ndim(y) == 0:
            _y = [y] * len(_X)
        else:
            _y = np.asarray(y)[selector]
        self.clf.fit(_X, _y)
        return self

    def score(self, X, y, sample_weight=None):
        """Mean per-trace agreement between predicted and true activity sets.

        Walk, run and cycle are weighted 1 : 1 : 2 (the 0.05 / 0.05 / 0.1
        weights noted by the original author), so a perfect trace scores 1.0.
        Previously the three terms were weighted equally but still divided
        by 4, capping the score at 0.75.
        """
        y_predict = self.predict(X)
        weights = (1, 1, 2)  # walk (1), run (2), cycle (3)
        per_trace = [
            sum(w * int((label in true) == (label in pred))
                for label, w in zip((1, 2, 3), weights)) / sum(weights)
            for true, pred in zip(y, y_predict)]
        return np.average(per_trace, weights=sample_weight)

    def predict_fv(self, X):
        """Return the activities found in one trace's window features ``X``."""
        win_res = self.clf.predict(X)  # one prediction per window
        values, counts = np.unique(win_res, return_counts=True)
        return [values[j]
                for j, count in enumerate(counts) if count >= self.count_thresh]

    def predict(self, traces):
        """Predict the activity list for one trace or a sequence of traces.

        Each trace is either a ``Recording`` (features are extracted from its
        raw accelerometer channels) or an already encoded list of window
        feature vectors. Always returns a list with one activity list per
        trace.
        """
        _traces = traces if hasattr(traces, '__len__') else [traces]
        res = []
        for trace in _traces:
            if isinstance(trace, Recording):
                ax, ay, az = parse_accelerometer(trace, use_zc=False)
                sm = compute_sm(ax, ay, az, normalize=True)
                feat_vec = encode_feat_vec(sm)
            else:
                feat_vec = trace
            res.append(self.predict_fv(feat_vec))
        return res


## Path Detection

In [25]:
from scipy.spatial.transform import Rotation

def madgwick_update(q, gyro, accel, mag, beta, dt):
    """One orientation-correction step driven by accelerometer and magnetometer.

    Args:
        q: current quaternion (4 components); it is copied, not modified.
        gyro: accepted for interface reasons but not used by this update.
        accel, mag: 3-component measurement vectors.
        beta, dt: gain and time step; only their product is used.

    Returns:
        New unit-norm quaternion as a length-4 array.
    """
    q = np.array([q[0], q[1], q[2], q[3]])
    q0, q1, q2, q3 = q

    # residuals between the directions predicted from q and the measurements
    accel_residual = np.array([2 * (q1 * q3 - q0 * q2) - accel[0],
                               2 * (q0 * q1 + q2 * q3) - accel[1],
                               2 * (0.5 - q1**2 - q2**2) - accel[2]])
    mag_residual = np.array([2 * (q1 * q2 + q0 * q3) - mag[0],
                             2 * (q0 * q1 - q2 * q3) - mag[1],
                             2 * (q0**2 + q3**2 - 0.5) - mag[2]])

    # 3x4 matrices paired with the residuals above
    accel_jac = np.array([[-2 * q2, 2 * q3, -2 * q0, 2 * q1],
                          [2 * q1,  2 * q0,  2 * q3, 2 * q2],
                          [0,       -4 * q1, -4 * q2, 0]])
    mag_jac = np.array([[2 * q3,  2 * q2,  2 * q1, 2 * q0],
                        [2 * q0,  -2 * q1, -2 * q2, 2 * q3],
                        [-4 * q0, -4 * q3, 0,       0]])

    correction = accel_jac.T @ accel_residual + mag_jac.T @ mag_residual
    q += (beta * dt) * correction
    q /= np.linalg.norm(q)
    return q

class PathDetector(BaseEstimator):
    """Random-forest path classifier on altitude and heading features.

    ``extract`` turns a recording into 25 features (5 altitude medians,
    10 sin(yaw) medians, 10 cos(yaw) medians); ``fit`` / ``predict`` work on
    such feature vectors.
    """

    def __init__(self):
        pass

    def extract(self,trace):
        """Return the feature list (altitude + orientation) for one recording."""
        data = trace.data
        fea_alti = self.get_fea_alti(data['altitude'])
        fea_ori = self.get_fea_ori(data['ax'],data['ay'],data['az'],data['gx'],data['gy'],data['gz'],data['mx'],data['my'],data['mz'])
        fea = fea_alti + fea_ori
        return fea

    # required
    def fit(self, data, labels):
        """Fit a fresh random forest, print its training accuracy, return self."""
        self.clf = RandomForestClassifier(min_samples_leaf=10, max_depth=15, max_features=12)
        self.clf.fit(data, labels)
        print('fit score', self.clf.score(data,labels))
        return self

    # required
    def predict(self, feature):
        """Predict the path label for a single feature vector (needs ``fit`` first)."""
        fea = [feature]
        res = self.clf.predict(fea)[0]
        return res

    def get_fea_alti(self, alti):
        """Five median altitude features over equal slices of the recording.

        The altitude is resampled with ``parse``, its first 600 samples are
        dropped, it is low-pass filtered, shifted so its minimum is 0, capped
        at 60 and scaled to [0, 1]; time is scaled to [0, 1] as well.
        """
        t, alti = parse(alti)
        t = t[600:]
        alti = alti[600:]
        alti = lp_filter(alti, .998)
        alti = alti - np.min(alti)
        alti[alti>60] = 60
        alti /= 60.0
        t = (t-np.min(t)) / (np.max(t)-np.min(t))
        fea_alti = split_fea(t, alti, 5, lambda x: np.nanpercentile(x,50))
        return fea_alti

    def get_fea_ori(self, ax, ay, az, gx, gy, gz, mx, my, mz):
        """Twenty heading features: medians of smoothed sin/cos of the yaw.

        All nine channels are resampled with ``parse``, accelerometer and
        magnetometer samples are scaled to unit length, and an orientation is
        tracked with ``madgwick_update``. The yaw relative to sample 400 is
        turned into sin/cos, low-pass filtered and summarised over 10 slices.
        """
        deg2rad = np.pi/180
        # NOTE(review): `t` ends up being the magnetometer time grid; this
        # assumes all channels resample to the same length - confirm.
        t, ax = parse(ax)
        t, ay = parse(ay)
        t, az = parse(az)
        t, gx = parse(gx)
        t, gy = parse(gy)
        t, gz = parse(gz)
        t, mx = parse(mx)
        t, my = parse(my)
        t, mz = parse(mz)
        gx, gy, gz = gx * deg2rad, gy * deg2rad, gz * deg2rad
        acc = np.array([ax,ay,az])
        gyro = np.array([gx,gy,gz])
        mag = np.array([mx,my,mz])

        # scale every sample to unit length (epsilon avoids division by zero)
        acc /= (1e-8 + np.linalg.norm(acc,ord=2,axis=0,keepdims=True))
        mag /= (1e-8 + np.linalg.norm(mag,ord=2,axis=0,keepdims=True))

        dt = t[1] - t[0]
        beta = 0.1  # Madgwick filter gain

        # initial orientation: align the first three acceleration samples
        # with the +z axis
        ref_acc_vector = np.array([[0, 0, 1],[0, 0, 1],[0, 0, 1]])  # a - g for a = 0
        rotation_matrix = Rotation.align_vectors(ref_acc_vector, [acc[:,0],acc[:,1],acc[:,2]])[0].as_matrix()
        q = Rotation.from_matrix(rotation_matrix).as_quat()

        yaws = []
        for i in range(len(t)):
            q = madgwick_update(q, gyro[:,i], acc[:,i], mag[:,i], beta, dt)
            try:
                yaw = Rotation.from_quat(q).as_euler('xyz')[-1]
            except Exception:
                # best effort: an unusable quaternion counts as zero yaw
                # (narrowed from a bare except so Ctrl-C still works)
                yaw = 0
            yaws.append(yaw)
        t, yaws = np.array(t[400:]), np.array(yaws[400:])
        yaws -= yaws[0]  # sometime can jump 2pi
        sinyaws, cosyaws = np.sin(yaws), np.cos(yaws)
        sinyaws, cosyaws = lp_filter(sinyaws, .97), lp_filter(cosyaws, .97)
        fea_sinyaw = split_fea(t, sinyaws, 10)
        fea_cosyaw = split_fea(t, cosyaws, 10)
        fea_ori = fea_sinyaw + fea_cosyaw
        return fea_ori

# Shared PathDetector instance; it is fitted and queried in later cells.
path_detector = PathDetector()

In [26]:
# Precomputed path-detection features for the test traces, read from the
# hard-coded /kaggle/working location. The prediction loop indexes this array
# by trace position, so its rows presumably follow the sorted file order -
# TODO confirm.
pathdec_fea_test = np.load('/kaggle/working/group24_features_pathindex.npy')

In [36]:
# Training features and labels for the path detector (hard-coded location).
features = np.load('/kaggle/working/group24_model_pathindex_trainfeatures.npy')
labels = np.load('/kaggle/working/group24_model_pathindex_trainlabels.npy')

# Optional 5-fold cross-validation check, disabled. Enabling it would also
# require importing `cross_val_score` from sklearn.model_selection.
# clf = RandomForestClassifier(min_samples_leaf=10, max_depth=15, max_features=12)
# scores = cross_val_score(clf, features, labels, cv=5, scoring='accuracy')
# print(scores, np.mean(scores), np.std(scores))

# Fits the random forest in place and prints its accuracy on the training data.
path_detector.fit(features, labels)

fit score 0.8517110266159695


---

In [28]:
# Get the path of all traces
dir_traces = '/kaggle/input/mobile-health-2023-path-detection/data/test'
filenames = [join(dir_traces, f) for f in listdir(dir_traces) if isfile(join(dir_traces, f))]
filenames.sort()

In [37]:
# Rule-based estimators need no trained state.
step_counter = StepCounter(half_window_size=0.75)
standstill_detector = StandStillDetector()
# The learned estimators get their classifier replaced by a stored model.
# NOTE: joblib.load unpickles arbitrary objects - only load files you created
# or otherwise trust.
watchloc_detector = WatchLocDetector()
watchloc_detector.clf = load('/kaggle/working/group24_model_watchloc.joblib')
activity_recognizer = ActivityRecognizer()
activity_recognizer.clf = load('/kaggle/working/group24_model_activity.joblib')
# The path detector is fitted from the stored training features in an earlier
# cell instead of being loaded:
# path_detector.clf = load('/kaggle/working/group24_model_pathindex.joblib')

In [None]:
# Loop through all traces and calculate the step count for each trace
solution_file = []
for i, filename in tqdm(enumerate(filenames)):
    trace = Recording(filename, no_labels=True, mute=True)
    categorization_results = {'watch_loc': 0, 'path_idx': 114514,
                              'step_count': 0, 'stand': 0, 'walk': 0, 'run': 0, 'cycle': 0}
    ax, ay, az = parse_accelerometer(trace)
    sm = compute_sm(ax, ay, az)
    fv = encode_feat_vec(compute_sm(
        *parse_accelerometer(trace, use_zc=False), normalize=True))

    #
    # Your algorithm goes here
    # You can access the variable 'watch_loc' in the dictionary 'categorization_results' for example with
    # categorization_results['watch_loc'] = 1
    # Make sure, you do not use the gps data and are tolerant for missing data (see task set).
    # Your program must not crash when single smartphone data traces are missing.
    #
    categorization_results['watch_loc'] = watchloc_detector.predict_fv(fv)
    categorization_results['step_count'] = step_counter.predict(sm)
    categorization_results['stand'] = int(standstill_detector.predict_fv(fv))
    res = activity_recognizer.predict_fv(fv)
    if 1 in res:
        categorization_results['walk'] = 1
    if 2 in res:
        categorization_results['run'] = 1
    if 3 in res:
        categorization_results['cycle'] = 1
    categorization_results['path_idx'] = path_detector.predict(pathdec_fea_test[i])

    # Append your calculated results and the id of each trace and category to the solution file
    trace_id = ''.join([*filename][-8:-5])
    for counter_label, category in enumerate(categorization_results):
        solution_file.append(
            [trace_id + f'_{counter_label+1}', categorization_results[category]])


25it [01:09,  2.68s/it]

In [None]:
# Write the detected step counts into a .csv file to then upload the .csv file to Kaggle
# When cross-checking the .csv file on your computer, we recommend using the text editor and NOT excel so that the results are displayed correctly
# IMPORTANT: Do NOT change the name of the columns ('Id' and 'Category') of the .csv file
submission_file_df = pd.DataFrame(np.asarray(solution_file), columns=['Id', 'Category'])
submission_file_df.to_csv('/kaggle/working/submission.csv', header=['Id', 'Category'], index=False)