In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os

In [2]:
# Utils

from statistics import mean
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os, re
import xml.etree.ElementTree as ET
from datetime import datetime


def time_diff(df):
    res = []
    for i in range(len(df["Time (s)"]) - 1):
        val1 = df["Time (s)"][i]
        val2 = df["Time (s)"][i + 1]
        
        res.append(val2 - val1)
    return mean(res)

def read_phyphox(parent_dir):
    acc = pd.read_csv(os.path.join(parent_dir, "Phyphox", "Accelerometer.csv"))
    gyro = pd.read_csv(os.path.join(parent_dir, "Phyphox", "Gyroscope.csv"))
    loc = pd.read_csv(os.path.join(parent_dir, "Phyphox", "Location.csv"))
        
    data_len = min(len(acc), len(gyro))
    acc = acc[0:data_len]
    gyro = gyro[0:data_len]
        
    time_step = mean([time_diff(acc), time_diff(gyro)])
    
    for i in range(data_len):
        time = time_step * i
        acc["Time (s)"][i] = time
        gyro["Time (s)"][i] = time
    
    acc.set_index('Time (s)', inplace=True)
    gyro.set_index('Time (s)', inplace=True)
    loc.set_index('Time (s)', inplace=True)
        
            
    merged = acc.join(gyro, how="outer")
    merged = pd.concat([merged, loc]).sort_index().interpolate()
    
    # Rename columns
    merged.index.names = ["time"]
    merged.rename(inplace=True, columns={
        "Acceleration x (m/s^2)": "acceleration_x",
        "Acceleration y (m/s^2)": "acceleration_y",
        "Acceleration z (m/s^2)": "acceleration_z",
        "Gyroscope x (rad/s)": "gyroscope_x",
        "Gyroscope y (rad/s)": "gyroscope_y",
        "Gyroscope z (rad/s)": "gyroscope_z",
        "Latitude (°)": "latitude",
        "Longitude (°)": "longitude",
        "Height (m)": "height",
        "Velocity (m/s)": "velocity",
        "Direction (°)": "direction",
        "Horizontal Accuracy (m)": "h_accuracy",
        "Vertical Accuracy (m)": "v_accuracy",

    })
    merged = merged.dropna()
    
    time_df = pd.read_csv(os.path.join(parent_dir, "Phyphox", "meta", "time.csv"))
    start_time = time_df.loc[time_df["event"] == "START"]["system time"][0]
    
    merged.reset_index(inplace=True)
    merged['time'] = pd.to_datetime(merged['time'] + start_time, unit='s')
    return merged



def readtcx(parent_dir):
    # Function to parse heart rate data from .tcx files my Garmin produces
    # Returns DataFrame with 'time' and 'hr' columns
    
    heartrate_data = []    
    # iterating over all files
    for files in os.listdir(parent_dir):
        if files.endswith("tcx"):
            print(files)
            with open(os.path.join(parent_dir, files)) as xml_file:
                xml_str = xml_file.read()
                xml_str = re.sub(' xmlns="[^"]+"', '', xml_str, count=1)
                root = ET.fromstring(xml_str)
                activities = root.findall('.//Activity')
                for activity in activities:
                    tracking_points = activity.findall('.//Trackpoint')
                    for tracking_point in list(tracking_points):
                        children = list(tracking_point)
                        time = datetime.strptime(children[0].text, '%Y-%m-%dT%H:%M:%S.%fZ')
                        hr = list(tracking_point.find('HeartRateBpm'))[0].text
                        heartrate_data.append([time, float(hr)])
        else:
            continue
    
    df = pd.DataFrame(heartrate_data, columns=['start_time', 'heart_rate']).dropna()
    return df


def read_samsung_health(parent_dir):
    files = os.scandir(os.path.join(parent_dir, "SamsungHealth"))
    filename = next(filter(lambda file: "com.samsung.health.exercise.live_data.json" in file.name, files)).name
        
    df = pd.read_json(os.path.join(parent_dir, "SamsungHealth", filename))
    df = df[["start_time", "heart_rate"]].dropna()

    return df

def read_combined(parent_dir, activity_type, file_type):
    pp = read_phyphox(parent_dir)

    # if samsung data or tcx format for heart_rate
    if file_type == "json":
        sh = read_samsung_health(parent_dir)
    else:
        sh = readtcx(parent_dir)

    pp["heart_rate"] = np.nan
    
    # Finds the time intervals of the heart rate measurements and update the phyphox heart rate accordingly
    # This is done as samsung measurements are not very fine grained and only give use relatively large
    # time intervals
    for i in range(len(sh) - 1):
        row1 = sh.iloc[i]
        row2 = sh.iloc[i + 1]
        pp.loc[(pp["time"] >= row1["start_time"]) & (pp["time"] < row2["start_time"]), "heart_rate"] = row1["heart_rate"]
    df = pp.dropna()

    # add an extra column to add the activity type
    df["label"] = activity_type
    return df


In [3]:
def read_datasets(data_dir):
    res_dfs = []
    temp_df = pd.DataFrame()


    for f, act_type, file_type in data_dir:
        temp_df = read_combined(f, act_type, file_type)
        temp_df = temp_df.reset_index().drop(columns="index", errors="ignore")
        res_dfs.append(temp_df.copy())
    
    return res_dfs

In [4]:
import Python3Code.Chapter3.OutlierDetection as od

def clean_chauvenet(df, cols=["acceleration_x", "acceleration_y", "acceleration_z", "gyroscope_x", "gyroscope_y", "gyroscope_z", "latitude", "longitude", "height", "velocity", "heart_rate"]):
    n_outliers = 0
    distribution_od = od.DistributionBasedOutlierDetection()
    for col in cols:
        df[col + "_outlier"] = False
        df = distribution_od.chauvenet(df, col, 2)
        n_outliers += len(df.loc[df[col + "_outlier"] == True])    
        df.loc[df[col + "_outlier"] == True, col] = np.nan
        df.drop(columns=col + "_outlier", inplace=True)
    
    print(f"Chauvenet found {n_outliers} outliers")
    df.interpolate(inplace=True)
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df.drop(columns="index", errors="ignore", inplace=True)
    return df

In [60]:
data_dir = [#("Bike 07-06-23 8:52","biking","json"),
                ("Bike 07-06-23 14:27","biking","json"),
                ("Tram 08-06-23 11:05","public_transport","json"),
                ("Walk 08-06-23 10:41","walking","json"),
                ("walking1","walking","tcx"),
                ("walking2","walking","tcx"),
                ("sitting1","sitting","tcx"),
                ("sitting2","sitting","tcx")]

datasets = read_datasets(data_dir)

for df in datasets:
    clean_chauvenet(df)

datasets[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = activity_type


walking1.tcx
walking2.tcx
sitting.tcx
sitting2.tcx
Chauvenet found 255 outliers
Chauvenet found 363 outliers
Chauvenet found 64 outliers
Chauvenet found 23 outliers
Chauvenet found 67 outliers
Chauvenet found 191 outliers
Chauvenet found 390 outliers


Unnamed: 0,time,acceleration_x,acceleration_y,acceleration_z,gyroscope_x,gyroscope_y,gyroscope_z,latitude,longitude,height,velocity,direction,h_accuracy,v_accuracy,heart_rate,label
0,2023-06-07 11:56:26.677337885,3.812580,-0.154950,2.682000,0.413188,0.031212,0.012788,52.334749,4.864622,-1.688446,1.369676,169.266003,3.24,6.14,104.0,biking
1,2023-06-07 11:56:26.927304745,4.799370,-3.727950,-1.735950,0.258087,0.165137,0.646800,52.334749,4.864626,-1.748443,1.292982,164.042001,3.18,6.68,104.0,biking
2,2023-06-07 11:56:27.177271843,5.786160,-3.867000,-0.112050,-0.114813,0.458150,0.409819,52.334749,4.864630,-1.808440,1.216289,158.817999,3.12,7.22,104.0,biking
3,2023-06-07 11:56:27.427238703,6.772950,-6.589950,1.590000,-0.573788,-0.223850,0.172837,52.334750,4.864634,-1.868437,1.139595,153.593997,3.06,7.76,104.0,biking
4,2023-06-07 11:56:27.548669100,6.752475,-7.456950,0.851025,-0.328350,-0.071844,0.131312,52.334750,4.864638,-1.928434,1.062901,148.369995,3.00,8.30,104.0,biking
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7483,2023-06-07 12:21:23.229163408,-1.780950,-9.523951,-2.586000,1.098350,-0.448388,-0.719262,52.278317,4.825148,-53.089947,0.000000,293.380005,2.00,6.92,151.0,biking
7484,2023-06-07 12:21:23.479130268,1.044000,-9.225000,1.839000,0.159362,0.112613,0.077688,52.278317,4.825149,-53.069946,0.000000,293.380005,2.00,7.08,151.0,biking
7485,2023-06-07 12:21:23.729097128,-1.207050,-9.472051,-1.357050,0.081400,-0.102437,-0.004262,52.278317,4.825149,-53.049946,0.000000,293.380005,2.00,7.24,151.0,biking
7486,2023-06-07 12:21:23.875818014,-0.524025,-9.304050,-0.554025,0.733425,-0.715412,-0.340313,52.278317,4.825150,-53.029946,0.000000,293.380005,2.00,7.40,151.0,biking


In [61]:
from Python3Code.Chapter4.FrequencyAbstraction import FourierTransformation
import copy

def extract_frequency(df, periodic_predictor_cols, fs=4, window_size=40):
    print("Extracting frequency features")
    FreqAbs = FourierTransformation()
    data_table = FreqAbs.abstract_frequency(df, periodic_predictor_cols, window_size, fs)
    return data_table

In [62]:
from Python3Code.Chapter4.TemporalAbstraction import NumericalAbstraction

def extract_temporal(df, feature_map, window_size=40):
    print("Extracting temporal features")
    NumAbs = NumericalAbstraction()
    
    if "time" in df:
        df.set_index("time", inplace=True)
        
    for (func, cols) in feature_map.items():
        df = NumAbs.abstract_numerical(df, cols, 40, func)
    
    df.reset_index(inplace=True)
    df.drop(columns="index", errors="ignore", inplace=True)
    return df

In [63]:
def extract_features(df, temporal_features, frequency_features=['acceleration_x', 'acceleration_y', 'acceleration_z', 'gyroscope_x', 'gyroscope_y', 'gyroscope_z']):
    extract_frequency(df, periodic_predictor_cols=frequency_features)
    extract_temporal(df, temporal_features)
    
    df.replace([np.inf, -np.inf], np.nan, inplace=True)
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df.drop(columns="index", errors="ignore", inplace=True)
    return df

In [64]:
temporal_features = {
    "std": ["acceleration_x", "acceleration_y", "acceleration_z", "gyroscope_x", "gyroscope_y", "gyroscope_z"],
    "mean": ["acceleration_x", "acceleration_y", "acceleration_z", "gyroscope_x", "gyroscope_y", "gyroscope_z", "latitude", "longitude", "height", "velocity", "heart_rate"],
    "median": ["height", "heart_rate"],
    "slope": ["latitude", "longitude", "velocity"],
    "max": ["velocity", "heart_rate"],
    "min": ["velocity", "heart_rate"],
}

for df in datasets:
    extract_features(df, temporal_features)

datasets[0]

Extracting frequency features
Extracting temporal features
Extracting frequency features
Extracting temporal features
Extracting frequency features
Extracting temporal features
Extracting frequency features
Extracting temporal features
Extracting frequency features
Extracting temporal features
Extracting frequency features
Extracting temporal features
Extracting frequency features
Extracting temporal features


Unnamed: 0,time,acceleration_x,acceleration_y,acceleration_z,gyroscope_x,gyroscope_y,gyroscope_z,latitude,longitude,height,...,heart_rate_temp_mean_ws_40,height_temp_median_ws_40,heart_rate_temp_median_ws_40,latitude_temp_slope_ws_40,longitude_temp_slope_ws_40,velocity_temp_slope_ws_40,velocity_temp_max_ws_40,heart_rate_temp_max_ws_40,velocity_temp_min_ws_40,heart_rate_temp_min_ws_40
0,2023-06-07 11:56:34.873654604,-0.935025,-2.463000,-10.538476,1.091131,0.181294,-0.155169,52.334942,4.864727,-1.028114,...,104.000000,-1.228140,104.0,0.000006,0.000003,-0.003138,1.369676,104.0,0.0,104.0
1,2023-06-07 11:56:34.926245689,0.382950,-3.565950,-11.755951,1.390125,0.212300,-0.102163,52.334941,4.864729,-0.988114,...,104.000000,-1.208141,104.0,0.000006,0.000003,-0.002224,1.369676,104.0,0.0,104.0
2,2023-06-07 11:56:35.176212549,0.964950,-4.150050,-8.461050,1.177137,-0.272662,-0.115225,52.334940,4.864731,-0.948115,...,103.976744,-1.188142,104.0,0.000006,0.000003,-0.001273,1.369676,104.0,0.0,103.0
3,2023-06-07 11:56:35.426179409,0.558000,-7.615951,-6.384000,-1.319862,-1.076625,-0.187275,52.334939,4.864734,-0.908115,...,103.954545,-1.188139,104.0,0.000006,0.000003,-0.000296,1.369676,104.0,0.0,103.0
4,2023-06-07 11:56:35.676146269,-0.591000,-6.667050,-7.323000,-0.738787,-0.381700,0.122787,52.334938,4.864736,-0.868115,...,103.933333,-1.188135,104.0,0.000005,0.000003,0.000696,1.369676,104.0,0.0,103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7443,2023-06-07 12:21:23.229163408,-1.780950,-9.523951,-2.586000,1.098350,-0.448388,-0.719262,52.278317,4.825148,-53.089947,...,150.970149,-50.329390,151.0,0.000002,-0.000011,-0.019119,5.077735,151.0,0.0,150.0
7444,2023-06-07 12:21:23.479130268,1.044000,-9.225000,1.839000,0.159362,0.112613,0.077688,52.278317,4.825149,-53.069946,...,150.975124,-50.329430,151.0,0.000002,-0.000011,-0.019538,5.077735,151.0,0.0,150.0
7445,2023-06-07 12:21:23.729097128,-1.207050,-9.472051,-1.357050,0.081400,-0.102437,-0.004262,52.278317,4.825149,-53.049946,...,150.980100,-50.349490,151.0,0.000002,-0.000011,-0.019945,5.077735,151.0,0.0,150.0
7446,2023-06-07 12:21:23.875818014,-0.524025,-9.304050,-0.554025,0.733425,-0.715412,-0.340313,52.278317,4.825150,-53.029946,...,150.985075,-50.369391,151.0,0.000002,-0.000010,-0.020342,5.077735,151.0,0.0,150.0


In [65]:
features = [
    # Freq
    "acceleration_x_max_freq", "acceleration_x_freq_weighted", "acceleration_x_pse",
    "acceleration_y_max_freq", "acceleration_y_freq_weighted", "acceleration_y_pse",
    "acceleration_z_max_freq", "acceleration_z_freq_weighted", "acceleration_z_pse",
    "gyroscope_x_max_freq", "gyroscope_x_freq_weighted", "gyroscope_x_pse",
    "gyroscope_y_max_freq", "gyroscope_y_freq_weighted", "gyroscope_y_pse",
    "gyroscope_z_max_freq", "gyroscope_z_freq_weighted", "gyroscope_z_pse",
    
    # Temp
    
    # std
    "acceleration_x_temp_std_ws_40", "acceleration_y_temp_std_ws_40", "acceleration_z_temp_std_ws_40",
    "gyroscope_x_temp_std_ws_40", "gyroscope_y_temp_std_ws_40", "gyroscope_z_temp_std_ws_40",
    
    # mean
    "acceleration_x_temp_mean_ws_40", "acceleration_y_temp_mean_ws_40", "acceleration_z_temp_mean_ws_40",
    "gyroscope_x_temp_mean_ws_40", "gyroscope_y_temp_mean_ws_40", "gyroscope_z_temp_mean_ws_40",
    "latitude_temp_mean_ws_40", "longitude_temp_mean_ws_40", "height_temp_mean_ws_40",
    "velocity_temp_mean_ws_40", "heart_rate_temp_mean_ws_40",
    
    # median
    "height_temp_median_ws_40", "heart_rate_temp_median_ws_40",
    
    # slope
    "latitude_temp_slope_ws_40", "longitude_temp_slope_ws_40", "velocity_temp_slope_ws_40",
    
    # max
    "velocity_temp_max_ws_40", "heart_rate_temp_max_ws_40"
    
    # min
    "velocity_temp_min_ws_40", "heart_rate_temp_min_ws_40"
]

print(len(features))

43


In [66]:
def train_test_split(df, train_percentage=0.8):
    return np.split(df, [int(train_percentage * len(df))])

train_datasets = []
test_datasets = []

train_len = 0
test_len = 0

for df in datasets:
    train, test = train_test_split(df)
    train_len += len(train)
    test_len += len(test)
    train_datasets.append(train)
    test_datasets.append(test)

print(f"train len: {train_len}")
print(f"test  len: {test_len}")

train len: 23937
test  len: 5988


In [67]:
def fold_merge(datasets, kfolds=10):
    datasets_folds = []
    for df in datasets:
        datasets_folds.append(np.array_split(df, kfolds))
    
    ordered_folds = []
    for i in range(kfolds):
        folds = []
        for j in range(len(datasets_folds)):
            folds.append(datasets_folds[j][i])
        
        ordered_folds.append(pd.concat(folds, axis=0))
        ordered_folds[i].set_index("time", inplace=True)
    
    #df = pd.concat(ordered_folds, axis=0)
    #df.reset_index(inplace=True)
    return ordered_folds

folds = fold_merge(train_datasets)

for f in folds:
    print(df.isnull().values.any())

folds[0]

False
False
False
False
False
False
False
False
False
False


Unnamed: 0_level_0,acceleration_x,acceleration_y,acceleration_z,gyroscope_x,gyroscope_y,gyroscope_z,latitude,longitude,height,velocity,...,heart_rate_temp_mean_ws_40,height_temp_median_ws_40,heart_rate_temp_median_ws_40,latitude_temp_slope_ws_40,longitude_temp_slope_ws_40,velocity_temp_slope_ws_40,velocity_temp_max_ws_40,heart_rate_temp_max_ws_40,velocity_temp_min_ws_40,heart_rate_temp_min_ws_40
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-06-07 11:56:34.873654604,-0.935025,-2.463000,-10.538476,1.091131,0.181294,-0.155169,52.334942,4.864727,-1.028114,0.852895,...,104.000000,-1.228140,104.0,5.955314e-06,2.747380e-06,-0.003138,1.369676,104.0,0.0,104.0
2023-06-07 11:56:34.926245689,0.382950,-3.565950,-11.755951,1.390125,0.212300,-0.102163,52.334941,4.864729,-0.988114,0.900970,...,104.000000,-1.208141,104.0,5.849553e-06,2.711685e-06,-0.002224,1.369676,104.0,0.0,104.0
2023-06-07 11:56:35.176212549,0.964950,-4.150050,-8.461050,1.177137,-0.272662,-0.115225,52.334940,4.864731,-0.948115,0.949045,...,103.976744,-1.188142,104.0,5.727230e-06,2.679707e-06,-0.001273,1.369676,104.0,0.0,103.0
2023-06-07 11:56:35.426179409,0.558000,-7.615951,-6.384000,-1.319862,-1.076625,-0.187275,52.334939,4.864734,-0.908115,0.997120,...,103.954545,-1.188139,104.0,5.603492e-06,2.651000e-06,-0.000296,1.369676,104.0,0.0,103.0
2023-06-07 11:56:35.676146269,-0.591000,-6.667050,-7.323000,-0.738787,-0.381700,0.122787,52.334938,4.864736,-0.868115,1.045195,...,103.933333,-1.188135,104.0,5.479292e-06,2.625180e-06,0.000696,1.369676,104.0,0.0,103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-11 11:21:16.950255632,-9.102896,-3.635894,0.406182,-0.035582,0.030924,0.003665,52.321522,4.871867,-21.697816,0.000000,...,61.771429,-4.175054,59.0,-8.418642e-08,-2.996931e-07,0.000000,0.000000,73.0,0.0,55.0
2023-06-11 11:21:17.008990049,-9.102896,-3.635894,0.406182,-0.020616,0.061237,-0.006414,52.321523,4.871867,-21.377478,0.000000,...,61.806818,-4.309688,59.0,-8.309121e-08,-2.940520e-07,0.000000,0.000000,73.0,0.0,55.0
2023-06-11 11:21:17.256322861,-9.086145,-3.613760,-0.037089,-0.011148,0.027030,-0.005956,52.321523,4.871868,-21.057140,0.000000,...,61.875000,-4.578956,59.0,-8.226832e-08,-2.864277e-07,0.000000,0.000000,73.0,0.0,55.0
2023-06-11 11:21:17.503655434,-9.109476,-3.594618,-0.111864,-0.005345,-0.005192,-0.000000,52.321523,4.871869,-20.736802,0.000000,...,61.943182,-4.848223,59.0,-8.142863e-08,-2.785472e-07,0.000000,0.000000,73.0,0.0,55.0


In [68]:
# Obtain a dataframe X containing only the selected features and Y containing only the labels
def separate_XY(df, cols, label_col="label"):
    return df[df.columns.intersection(cols)], df[label_col]

In [77]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import sys

def cross_validation(model, folds, features):
    for fold_nr in range(len(folds) - 1):
        df_train = pd.concat(folds[0:fold_nr + 1])
        X_train, y_train = separate_XY(df_train, features)
        
        model.fit(X_train, y_train)
        
        df_val = folds[fold_nr + 1]
        X_val, y_val = separate_XY(df_val, features)
        
        y_train_pred, y_val_pred = model.predict(X_train), model.predict(X_val)
        #print(y_train_pred)
        #print(y_train_pred, y_val_pred)
        
        #cm = confusion_matrix(y_val, y_val_pred)
        #print(cm)

        train_f1, val_f1 = f1_score(y_train, y_train_pred, average="macro"), f1_score(y_val, y_val_pred, average="macro")
        print(f"Fold {fold_nr}, train F1-score:{train_f1}, validation F1-score: {val_f1}")



from sklearn.naive_bayes import GaussianNB

np.set_printoptions(threshold=sys.maxsize)

classifier = GaussianNB()
cross_validation(classifier, folds, features)

Fold 0, train F1-score:0.10682085731370058, validation F1-score: 0.1007997334221926
Fold 1, train F1-score:0.10449470206160122, validation F1-score: 0.10070023341113704
Fold 2, train F1-score:0.10323529342007587, validation F1-score: 0.10060060060060061
Fold 3, train F1-score:0.10257867946973508, validation F1-score: 0.10060060060060061
Fold 4, train F1-score:0.10218405400178716, validation F1-score: 0.10060060060060061
Fold 5, train F1-score:0.10236918502562371, validation F1-score: 0.10127602742895532
Fold 6, train F1-score:0.10221324646745081, validation F1-score: 0.10127602742895532
Fold 7, train F1-score:0.10218033020851469, validation F1-score: 0.10056799198128967
Fold 8, train F1-score:0.10215132846731727, validation F1-score: 0.10063523905048478
