In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os

In [2]:
# Utils

from statistics import mean
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import os, re
import xml.etree.ElementTree as ET
from datetime import datetime


def time_diff(df):
    res = []
    for i in range(len(df["Time (s)"]) - 1):
        val1 = df["Time (s)"][i]
        val2 = df["Time (s)"][i + 1]
        
        res.append(val2 - val1)
    return mean(res)

def read_phyphox(parent_dir):
    acc = pd.read_csv(os.path.join(parent_dir, "Phyphox", "Accelerometer.csv"))
    gyro = pd.read_csv(os.path.join(parent_dir, "Phyphox", "Gyroscope.csv"))
    loc = pd.read_csv(os.path.join(parent_dir, "Phyphox", "Location.csv"))
        
    data_len = min(len(acc), len(gyro))
    acc = acc[0:data_len]
    gyro = gyro[0:data_len]
        
    time_step = mean([time_diff(acc), time_diff(gyro)])
    
    for i in range(data_len):
        time = time_step * i
        acc["Time (s)"][i] = time
        gyro["Time (s)"][i] = time
    
    acc.set_index('Time (s)', inplace=True)
    gyro.set_index('Time (s)', inplace=True)
    loc.set_index('Time (s)', inplace=True)
        
            
    merged = acc.join(gyro, how="outer")
    merged = pd.concat([merged, loc]).sort_index().interpolate()
    
    # Rename columns
    merged.index.names = ["time"]
    merged.rename(inplace=True, columns={
        "Acceleration x (m/s^2)": "acceleration_x",
        "Acceleration y (m/s^2)": "acceleration_y",
        "Acceleration z (m/s^2)": "acceleration_z",
        "Gyroscope x (rad/s)": "gyroscope_x",
        "Gyroscope y (rad/s)": "gyroscope_y",
        "Gyroscope z (rad/s)": "gyroscope_z",
        "Latitude (°)": "latitude",
        "Longitude (°)": "longitude",
        "Height (m)": "height",
        "Velocity (m/s)": "velocity",
        "Direction (°)": "direction",
        "Horizontal Accuracy (m)": "h_accuracy",
        "Vertical Accuracy (m)": "v_accuracy",

    })
    merged = merged.dropna()
    
    time_df = pd.read_csv(os.path.join(parent_dir, "Phyphox", "meta", "time.csv"))
    start_time = time_df.loc[time_df["event"] == "START"]["system time"][0]
    
    merged.reset_index(inplace=True)
    merged['time'] = pd.to_datetime(merged['time'] + start_time, unit='s')
    return merged



def readtcx(parent_dir):
    # Function to parse heart rate data from .tcx files my Garmin produces
    # Returns DataFrame with 'time' and 'hr' columns
    
    heartrate_data = []    
    # iterating over all files
    for files in os.listdir(parent_dir):
        if files.endswith("tcx"):
            print(files)
            with open(os.path.join(parent_dir, files)) as xml_file:
                xml_str = xml_file.read()
                xml_str = re.sub(' xmlns="[^"]+"', '', xml_str, count=1)
                root = ET.fromstring(xml_str)
                activities = root.findall('.//Activity')
                for activity in activities:
                    tracking_points = activity.findall('.//Trackpoint')
                    for tracking_point in list(tracking_points):
                        children = list(tracking_point)
                        time = datetime.strptime(children[0].text, '%Y-%m-%dT%H:%M:%S.%fZ')
                        hr = list(tracking_point.find('HeartRateBpm'))[0].text
                        heartrate_data.append([time, float(hr)])
        else:
            continue
    
    df = pd.DataFrame(heartrate_data, columns=['start_time', 'heart_rate']).dropna()
    return df


def read_samsung_health(parent_dir):
    files = os.scandir(os.path.join(parent_dir, "SamsungHealth"))
    filename = next(filter(lambda file: "com.samsung.health.exercise.live_data.json" in file.name, files)).name
        
    df = pd.read_json(os.path.join(parent_dir, "SamsungHealth", filename))
    df = df[["start_time", "heart_rate"]].dropna()

    return df

def read_combined(parent_dir, activity_type, file_type):
    pp = read_phyphox(parent_dir)

    # if samsung data or tcx format for heart_rate
    if file_type == "json":
        sh = read_samsung_health(parent_dir)
    else:
        sh = readtcx(parent_dir)

    pp["heart_rate"] = np.nan
    
    # Finds the time intervals of the heart rate measurements and update the phyphox heart rate accordingly
    # This is done as samsung measurements are not very fine grained and only give use relatively large
    # time intervals
    for i in range(len(sh) - 1):
        row1 = sh.iloc[i]
        row2 = sh.iloc[i + 1]
        pp.loc[(pp["time"] >= row1["start_time"]) & (pp["time"] < row2["start_time"]), "heart_rate"] = row1["heart_rate"]
    df = pp.dropna()

    # add an extra column to add the activity type
    df["label"] = activity_type
    return df


In [3]:
def read_datasets(data_dir):
    res_dfs = []
    temp_df = pd.DataFrame()


    for f, act_type, file_type in data_dir:
        temp_df = read_combined(f, act_type, file_type)
        temp_df = temp_df.reset_index().drop(columns="index", errors="ignore")
        res_dfs.append(temp_df.copy())
    
    return res_dfs

In [4]:
import Python3Code.Chapter3.OutlierDetection as od

def clean_chauvenet(df, cols=["acceleration_x", "acceleration_y", "acceleration_z", "gyroscope_x", "gyroscope_y", "gyroscope_z", "latitude", "longitude", "height", "velocity", "heart_rate"]):
    n_outliers = 0
    distribution_od = od.DistributionBasedOutlierDetection()
    for col in cols:
        df[col + "_outlier"] = False
        df = distribution_od.chauvenet(df, col, 2)
        n_outliers += len(df.loc[df[col + "_outlier"] == True])    
        df.loc[df[col + "_outlier"] == True, col] = np.nan
        df.drop(columns=col + "_outlier", inplace=True)
    
    print(f"Chauvenet found {n_outliers} outliers")
    df.interpolate(inplace=True)
    df.dropna(inplace=True)
    df.reset_index(inplace=True)
    df.drop(columns="index", errors="ignore", inplace=True)
    return df

In [138]:
data_dir = [#("Bike 07-06-23 8:52","biking","json"),
                ("Bike 07-06-23 14:27","biking","json"),
                ("Tram 08-06-23 11:05","public_transport","json"),
                ("Walk 08-06-23 10:41","walking","json"),
                ("walking1","walking","tcx"),
                ("walking2","walking","tcx"),
                ("sitting1","sitting","tcx"),
                ("sitting2","sitting","tcx")]

orig_datasets = read_datasets(data_dir)

for df in orig_datasets:
    clean_chauvenet(df)

orig_datasets[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["label"] = activity_type


walking1.tcx
walking2.tcx
sitting.tcx
sitting2.tcx
Chauvenet found 255 outliers
Chauvenet found 363 outliers
Chauvenet found 64 outliers
Chauvenet found 23 outliers
Chauvenet found 67 outliers
Chauvenet found 191 outliers
Chauvenet found 390 outliers


Unnamed: 0,time,acceleration_x,acceleration_y,acceleration_z,gyroscope_x,gyroscope_y,gyroscope_z,latitude,longitude,height,velocity,direction,h_accuracy,v_accuracy,heart_rate,label
0,2023-06-07 11:56:26.677337885,3.812580,-0.154950,2.682000,0.413188,0.031212,0.012788,52.334749,4.864622,-1.688446,1.369676,169.266003,3.24,6.14,104.0,biking
1,2023-06-07 11:56:26.927304745,4.799370,-3.727950,-1.735950,0.258087,0.165137,0.646800,52.334749,4.864626,-1.748443,1.292982,164.042001,3.18,6.68,104.0,biking
2,2023-06-07 11:56:27.177271843,5.786160,-3.867000,-0.112050,-0.114813,0.458150,0.409819,52.334749,4.864630,-1.808440,1.216289,158.817999,3.12,7.22,104.0,biking
3,2023-06-07 11:56:27.427238703,6.772950,-6.589950,1.590000,-0.573788,-0.223850,0.172837,52.334750,4.864634,-1.868437,1.139595,153.593997,3.06,7.76,104.0,biking
4,2023-06-07 11:56:27.548669100,6.752475,-7.456950,0.851025,-0.328350,-0.071844,0.131312,52.334750,4.864638,-1.928434,1.062901,148.369995,3.00,8.30,104.0,biking
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7483,2023-06-07 12:21:23.229163408,-1.780950,-9.523951,-2.586000,1.098350,-0.448388,-0.719262,52.278317,4.825148,-53.089947,0.000000,293.380005,2.00,6.92,151.0,biking
7484,2023-06-07 12:21:23.479130268,1.044000,-9.225000,1.839000,0.159362,0.112613,0.077688,52.278317,4.825149,-53.069946,0.000000,293.380005,2.00,7.08,151.0,biking
7485,2023-06-07 12:21:23.729097128,-1.207050,-9.472051,-1.357050,0.081400,-0.102437,-0.004262,52.278317,4.825149,-53.049946,0.000000,293.380005,2.00,7.24,151.0,biking
7486,2023-06-07 12:21:23.875818014,-0.524025,-9.304050,-0.554025,0.733425,-0.715412,-0.340313,52.278317,4.825150,-53.029946,0.000000,293.380005,2.00,7.40,151.0,biking


In [61]:
from Python3Code.Chapter4.FrequencyAbstraction import FourierTransformation
import copy

def extract_frequency(df, periodic_predictor_cols, fs=4, window_size=40):
    print("Extracting frequency features")
    FreqAbs = FourierTransformation()
    data_table = FreqAbs.abstract_frequency(df, periodic_predictor_cols, window_size, fs)
    return data_table

In [62]:
from Python3Code.Chapter4.TemporalAbstraction import NumericalAbstraction

def extract_temporal(df, feature_map, window_size=40):
    print("Extracting temporal features")
    NumAbs = NumericalAbstraction()
    
    if "time" in df:
        df.set_index("time", inplace=True)
        
    for (func, cols) in feature_map.items():
        df = NumAbs.abstract_numerical(df, cols, 40, func)
    
    df.reset_index(inplace=True)
    df.drop(columns="index", errors="ignore", inplace=True)
    return df

In [367]:
def extract_features(df, temporal_features, window_size=40, remove_overlap=0.5, frequency_features=['acceleration_x', 'acceleration_y', 'acceleration_z', 'gyroscope_x', 'gyroscope_y', 'gyroscope_z']):
    df_copy = df.copy()
    extract_frequency(df_copy, window_size=window_size, periodic_predictor_cols=frequency_features)
    extract_temporal(df_copy, temporal_features, window_size=window_size / 4)
    
    keep = []
    
    item_count = 0
    for i in range(len(df_copy)):
        if item_count == 0:
            keep.append(df_copy.iloc[[i]])
        
        item_count = (item_count + 1) % (window_size * remove_overlap + 1)
    
    print("Len before:", len(df))
    print("Kept:", len(keep))
    
    keep = pd.concat(keep)
    
    keep.replace([np.inf, -np.inf], np.nan, inplace=True)
    keep.dropna(inplace=True)
    keep.reset_index(inplace=True)
    keep.drop(columns="index", errors="ignore", inplace=True)
    return keep

In [379]:
temporal_features = {
    "std": ["acceleration_x", "acceleration_y", "acceleration_z", "gyroscope_x", "gyroscope_y", "gyroscope_z"],
    "mean": ["acceleration_x", "acceleration_y", "acceleration_z", "gyroscope_x", "gyroscope_y", "gyroscope_z", "latitude", "longitude", "height", "velocity", "heart_rate"],
    "median": ["height", "heart_rate"],
    "slope": ["latitude", "longitude", "velocity"],
    "max": ["velocity", "heart_rate"],
    "min": ["velocity", "heart_rate"],
}

datasets = []

for df in orig_datasets:
    datasets.append(extract_features(df, temporal_features, remove_overlap=0.5))

datasets[0]

Extracting frequency features
Extracting temporal features
Len before: 7488
Kept: 357
Extracting frequency features
Extracting temporal features
Len before: 5252
Kept: 251
Extracting frequency features
Extracting temporal features
Len before: 2606
Kept: 125
Extracting frequency features
Extracting temporal features
Len before: 2632
Kept: 126
Extracting frequency features
Extracting temporal features
Len before: 2425
Kept: 116
Extracting frequency features
Extracting temporal features
Len before: 3414
Kept: 163
Extracting frequency features
Extracting temporal features
Len before: 6390
Kept: 305


Unnamed: 0,time,acceleration_x,acceleration_y,acceleration_z,gyroscope_x,gyroscope_y,gyroscope_z,latitude,longitude,height,...,heart_rate_temp_mean_ws_40,height_temp_median_ws_40,heart_rate_temp_median_ws_40,latitude_temp_slope_ws_40,longitude_temp_slope_ws_40,velocity_temp_slope_ws_40,velocity_temp_max_ws_40,heart_rate_temp_max_ws_40,velocity_temp_min_ws_40,heart_rate_temp_min_ws_40
0,2023-06-07 11:56:35.176212549,0.964950,-4.150050,-8.461050,1.177137,-0.272662,-0.115225,52.334940,4.864731,-0.948115,...,103.976744,-1.188142,104.0,5.727230e-06,0.000003,-0.001273,1.369676,104.0,0.000000,103.0
1,2023-06-07 11:56:39.425649881,1.293000,-8.317050,-8.142000,-1.055863,-0.355713,0.094875,52.334931,4.864691,-0.330751,...,103.656250,-1.028181,104.0,3.466938e-06,0.000002,0.003458,2.278778,104.0,0.000000,103.0
2,2023-06-07 11:56:43.675087214,-0.220050,-9.175051,-9.289950,-1.698950,-0.096662,0.111787,52.334905,4.864359,-0.337644,...,103.494118,-0.868176,103.0,1.880287e-06,-0.000002,0.030417,3.681808,104.0,0.000000,103.0
3,2023-06-07 11:56:47.880674839,-1.794450,-4.917450,-11.930476,1.034137,0.272319,0.096250,52.334875,4.863957,-0.344538,...,104.188679,-0.515624,104.0,9.703517e-07,-0.000006,0.033919,4.213001,109.0,0.000000,103.0
4,2023-06-07 11:56:51.923994780,-0.603000,-1.120050,-4.573950,2.410100,0.310337,0.203775,52.334862,4.863550,-0.351431,...,104.984252,-0.348805,104.0,3.970291e-07,-0.000009,0.041268,5.503822,109.0,0.000000,103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
350,2023-06-07 12:21:04.981579781,-1.362000,-4.207050,-10.180950,-0.625625,0.012512,-0.049362,52.278416,4.825751,-49.809470,...,151.686567,-50.309376,151.0,3.533474e-06,-0.000013,0.001999,5.108618,155.0,2.958109,150.0
351,2023-06-07 12:21:09.231016874,-0.250050,-8.026951,-7.180050,-1.609575,-0.338525,-0.572275,52.278478,4.825521,-50.669511,...,151.268657,-50.229484,151.0,3.531478e-06,-0.000013,-0.003888,5.108618,155.0,3.568569,150.0
352,2023-06-07 12:21:13.480454206,0.747000,-5.484000,-6.004050,-1.525563,-0.573512,-0.009350,52.278438,4.825361,-53.729658,...,150.850746,-50.229484,151.0,3.329401e-06,-0.000013,-0.007837,5.108618,155.0,2.970462,150.0
353,2023-06-07 12:21:17.729891539,-0.280050,-4.957050,-10.020000,-0.186587,0.280638,0.752675,52.278329,4.825254,-54.109870,...,150.830846,-50.229484,151.0,2.716908e-06,-0.000012,-0.009624,5.108618,151.0,2.820266,150.0


In [414]:
features = [
    # Freq
    "acceleration_x_max_freq", "acceleration_x_freq_weighted", #"acceleration_x_pse",
    "acceleration_y_max_freq", "acceleration_y_freq_weighted", #"acceleration_y_pse",
    "acceleration_z_max_freq", "acceleration_z_freq_weighted", #"acceleration_z_pse",
    "gyroscope_x_max_freq", "gyroscope_x_freq_weighted", #"gyroscope_x_pse",
    "gyroscope_y_max_freq", "gyroscope_y_freq_weighted", #"gyroscope_y_pse",
    "gyroscope_z_max_freq", "gyroscope_z_freq_weighted", #"gyroscope_z_pse",
    
    # Temp
    
    # std
    "acceleration_x_temp_std_ws_40", "acceleration_y_temp_std_ws_40", "acceleration_z_temp_std_ws_40",
    "gyroscope_x_temp_std_ws_40", "gyroscope_y_temp_std_ws_40", "gyroscope_z_temp_std_ws_40",
    
    # mean
    #"acceleration_x_temp_mean_ws_40", "acceleration_y_temp_mean_ws_40", "acceleration_z_temp_mean_ws_40",
    #"gyroscope_x_temp_mean_ws_40", "gyroscope_y_temp_mean_ws_40", "gyroscope_z_temp_mean_ws_40",
    #"latitude_temp_mean_ws_40", "longitude_temp_mean_ws_40", "height_temp_mean_ws_40",
    #"velocity_temp_mean_ws_40", "heart_rate_temp_mean_ws_40",
    
    # median
    #"height_temp_median_ws_40", "heart_rate_temp_median_ws_40",
    
    # slope
    "latitude_temp_slope_ws_40", "longitude_temp_slope_ws_40", #"velocity_temp_slope_ws_40",
    
    # max
    "velocity_temp_max_ws_40", "heart_rate_temp_max_ws_40"
    
    # min
    "velocity_temp_min_ws_40", "heart_rate_temp_min_ws_40"
]

#features = [
#    "heart_rate", "heart_rate_temp_max_ws_40", "heart_rate_temp_min_ws_40",
#    
#    
#]

features = ['heart_rate',
 'acceleration_x_max_freq',
 'acceleration_z_max_freq',
 'gyroscope_x_temp_std_ws_40',
 'acceleration_y_temp_mean_ws_40',
 'heart_rate_temp_mean_ws_40',
 'heart_rate_temp_median_ws_40',
 'velocity_temp_max_ws_40',
 'heart_rate_temp_max_ws_40',
 'heart_rate_temp_min_ws_40']

print(sum(map(lambda x: len(x), datasets)))
print(len(features))

1429
10


In [399]:
def train_test_split(df, train_percentage=0.8):
    return np.split(df, [int(train_percentage * len(df))])

train_datasets = []
test_datasets = []

train_len = 0
test_len = 0

for df in datasets:
    train, test = train_test_split(df, train_percentage=0.8)
    train_len += len(train)
    test_len += len(test)
    train_datasets.append(train)
    test_datasets.append(test)

print(f"train len: {train_len}")
print(f"test  len: {test_len}")

train len: 1141
test  len: 288


In [400]:
def fold_merge(datasets, kfolds=10):
    datasets_folds = []
    for df in datasets:
        datasets_folds.append(np.array_split(df, kfolds))
    
    ordered_folds = []
    for i in range(kfolds):
        folds = []
        for j in range(len(datasets_folds)):
            folds.append(datasets_folds[j][i])
        
        ordered_folds.append(pd.concat(folds, axis=0))
        ordered_folds[i].set_index("time", inplace=True)
    
    #df = pd.concat(ordered_folds, axis=0)
    #df.reset_index(inplace=True)
    return ordered_folds

folds = fold_merge(train_datasets)

for f in folds:
    print(df.isnull().values.any())

folds[0]

False
False
False
False
False
False
False
False
False
False


Unnamed: 0_level_0,acceleration_x,acceleration_y,acceleration_z,gyroscope_x,gyroscope_y,gyroscope_z,latitude,longitude,height,velocity,...,heart_rate_temp_mean_ws_40,height_temp_median_ws_40,heart_rate_temp_median_ws_40,latitude_temp_slope_ws_40,longitude_temp_slope_ws_40,velocity_temp_slope_ws_40,velocity_temp_max_ws_40,heart_rate_temp_max_ws_40,velocity_temp_min_ws_40,heart_rate_temp_min_ws_40
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2023-06-07 11:56:35.176212549,0.964950,-4.150050,-8.461050,1.177137,-0.272662,-0.115225,52.334940,4.864731,-0.948115,0.949045,...,103.976744,-1.188142,104.0,5.727230e-06,2.679707e-06,-0.001273,1.369676,104.0,0.0,103.0
2023-06-07 11:56:39.425649881,1.293000,-8.317050,-8.142000,-1.055863,-0.355713,0.094875,52.334931,4.864691,-0.330751,2.278778,...,103.656250,-1.028181,104.0,3.466938e-06,1.543498e-06,0.003458,2.278778,104.0,0.0,103.0
2023-06-07 11:56:43.675087214,-0.220050,-9.175051,-9.289950,-1.698950,-0.096662,0.111787,52.334905,4.864359,-0.337644,2.330559,...,103.494118,-0.868176,103.0,1.880287e-06,-1.639949e-06,0.030417,3.681808,104.0,0.0,103.0
2023-06-07 11:56:47.880674839,-1.794450,-4.917450,-11.930476,1.034137,0.272319,0.096250,52.334875,4.863957,-0.344538,4.213001,...,104.188679,-0.515624,104.0,9.703517e-07,-5.622530e-06,0.033919,4.213001,109.0,0.0,103.0
2023-06-07 11:56:51.923994780,-0.603000,-1.120050,-4.573950,2.410100,0.310337,0.203775,52.334862,4.863550,-0.351431,5.503822,...,104.984252,-0.348805,104.0,3.970291e-07,-8.642763e-06,0.041268,5.503822,109.0,0.0,103.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-06-11 11:20:59.695702553,-7.222735,-2.188834,-6.489335,-0.146451,-0.104149,0.436145,52.321516,4.871838,-6.329196,0.000000,...,55.750000,6.057112,56.0,-7.205649e-08,-4.012118e-07,0.000000,0.000000,59.0,0.0,54.0
2023-06-11 11:21:04.889688730,-7.808977,-2.323431,-5.926424,0.180658,-0.215171,-0.094987,52.321519,4.871839,-11.983818,0.000000,...,56.658537,6.057111,56.0,-1.044004e-07,-5.199655e-07,0.000000,0.000000,67.0,0.0,54.0
2023-06-11 11:21:08.947036505,-8.993723,-3.825226,0.838385,-0.013591,0.024128,-0.008476,52.321514,4.871837,-35.661262,0.000000,...,58.648810,4.845414,56.0,-1.081591e-07,-5.494147e-07,0.000000,0.000000,73.0,0.0,55.0
2023-06-11 11:21:13.051667213,-9.004789,-3.833302,0.797408,0.006719,-0.010537,0.001222,52.321517,4.871882,-35.532811,0.000000,...,60.412791,-0.270672,57.0,-1.089154e-07,-4.184581e-07,0.000000,0.000000,73.0,0.0,55.0


In [401]:
# Obtain a dataframe X containing only the selected features and Y containing only the labels
def separate_XY(df, cols, label_col="label"):
    return df[df.columns.intersection(cols)], df[label_col]

In [413]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
import sys
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from statistics import mean

def cross_validation(model, folds, features):
    val_scores = []
    for fold_nr in range(len(folds) - 1):
        df_train = pd.concat(folds[0:fold_nr + 1])
        X_train, y_train = separate_XY(df_train, features)
        
        model.fit(X_train, y_train)
        
        df_val = folds[fold_nr + 1]
        X_val, y_val = separate_XY(df_val, features)
        
        y_train_pred, y_val_pred = model.predict(X_train), model.predict(X_val)
        #print(y_train_pred)
        #print(y_train_pred, y_val_pred)
        
        #cm = confusion_matrix(y_val, y_val_pred)
        #print(cm)

        train_f1, val_f1 = f1_score(y_train, y_train_pred, average="macro"), f1_score(y_val, y_val_pred, average="macro")
        print(f"Fold {fold_nr}, train F1-score:{train_f1}, validation F1-score: {val_f1}")
        val_scores.append(val_f1)
    
    print(f"{len(folds)}-fold cross validation mean F1-score: {mean(val_scores)}")
    
    



np.set_printoptions(threshold=sys.maxsize)

#classifier = GaussianNB()
classifier = RandomForestClassifier(n_estimators=10, min_samples_leaf=5, criterion='gini')
cross_validation(classifier, folds, features)


# Test
X_train, y_train = separate_XY(pd.concat(train_datasets), features)

classifier.fit(X_train, y_train)

X_test, y_test = separate_XY(pd.concat(test_datasets), features)
y_test_pred = classifier.predict(X_test)
test_f1 = f1_score(y_test, y_test_pred, average="macro")
print(f"Test F1-score: {test_f1}")



Fold 0, train F1-score:1.0, validation F1-score: 1.0
Fold 1, train F1-score:1.0, validation F1-score: 0.9902564102564102
Fold 2, train F1-score:1.0, validation F1-score: 1.0
Fold 3, train F1-score:1.0, validation F1-score: 1.0
Fold 4, train F1-score:1.0, validation F1-score: 1.0
Fold 5, train F1-score:1.0, validation F1-score: 0.9484629294755877
Fold 6, train F1-score:1.0, validation F1-score: 1.0
Fold 7, train F1-score:1.0, validation F1-score: 1.0
Fold 8, train F1-score:0.99881824243377, validation F1-score: 1.0
10-fold cross validation mean F1-score: 0.9931910377479998
Test F1-score: 0.9922783873214909


In [389]:
from sklearn.model_selection import StratifiedKFold

def train(fold_nr, classifier, X_train, y_train, X_val, y_val):
    """
    Train the model and return validation score
    """
    model = classifier
    model.fit(X_train, y_train)
    
    y_train_pred, y_val_pred = model.predict(X_train), model.predict(X_val)
    train_f1, val_f1 = f1_score(y_train, y_train_pred, average="macro"), f1_score(y_val, y_val_pred, average="macro")
    
    print(f"Fold {fold_nr}, train F1-score:{train_f1}, validation F1-score: {val_f1}")
    return train_f1, val_f1

df = pd.concat(datasets)
X, y = separate_XY(df, features)

skf = StratifiedKFold(n_splits=10, shuffle=False)
for fold_nr, (train_index, val_index) in enumerate(skf.split(X, y)):
    train_score, val_score = train(fold_nr, 
                                     classifier, 
                                     X_train=X.iloc[train_index], 
                                     y_train=y.iloc[train_index], 
                                     X_val=X.iloc[val_index], 
                                     y_val=y.iloc[val_index])

Fold 0, train F1-score:1.0, validation F1-score: 0.9784511725688196
Fold 1, train F1-score:0.9984469653123433, validation F1-score: 1.0
Fold 2, train F1-score:0.999223510744248, validation F1-score: 1.0
Fold 3, train F1-score:1.0, validation F1-score: 1.0
Fold 4, train F1-score:1.0, validation F1-score: 1.0
Fold 5, train F1-score:1.0, validation F1-score: 1.0
Fold 6, train F1-score:1.0, validation F1-score: 1.0
Fold 7, train F1-score:1.0, validation F1-score: 1.0
Fold 8, train F1-score:0.9992247391156976, validation F1-score: 1.0
Fold 9, train F1-score:1.0, validation F1-score: 0.9555555555555556


In [411]:
from sklearn.feature_selection import VarianceThreshold
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif

df = pd.concat(train_datasets)

all_features = df.columns.values.tolist()
all_features.remove("label")
all_features.remove("time")
all_features.remove("v_accuracy")
all_features.remove("h_accuracy")
all_features.remove("direction")




X, y = separate_XY(df, all_features)

sel = SelectKBest(f_classif, k=10)
sel.fit_transform(X, y)
mask = sel.get_support() #list of booleans
new_features = [] # The list of your K best features

for bool_val, feature in zip(mask, all_features):
    if bool_val:
        new_features.append(feature)
        
new_features

['heart_rate',
 'acceleration_x_max_freq',
 'acceleration_z_max_freq',
 'gyroscope_x_temp_std_ws_40',
 'acceleration_y_temp_mean_ws_40',
 'heart_rate_temp_mean_ws_40',
 'heart_rate_temp_median_ws_40',
 'velocity_temp_max_ws_40',
 'heart_rate_temp_max_ws_40',
 'heart_rate_temp_min_ws_40']