# Crossvalidation experiments with MotionSense as base and KuHar as target

This notebook will perform crossvalidation experiments using the MotionSense dataset at 20 Hz as training dataset. It will contain the following steps:

1. Quick load train, test and validation CSV subsets from the balanced MotionSense dataset at 20 Hz using `PandasDatasetsIO` helper
2. Quick load train, test and validation CSV subsets from other relevant datasets using `PandasDatasetsIO` helper
3. Subclassing the `Dataset` interface using `PandasMultiModalDataset`
4. Apply the fourier transform on KuHar
5. Train SVM, KNN and Random Forest classification models on the MotionSense dataset in the frequency domain
6. Evaluate SVM, KNN and Random Forest classification models on KuHar in the frequency domain

The experiments will evaluate the performance of SVM, KNN and RF models trained on a balanced MotionSense dataset and tested on KuHar in the frequency domains.

## Common imports and definitions

In [1]:
from pathlib import Path  # For defining dataset Paths
import sys                # For include librep package

# This must be done if librep is not installed via pip,
# as this directory (examples) is appart from librep package root
sys.path.append("..")

In [2]:
# Third party imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from umap import UMAP
from sklearn.manifold import TSNE


# Librep imports
from librep.utils.dataset import PandasDatasetsIO          # For quick load train, test and validation CSVs
from librep.datasets.multimodal import PandasMultiModalDataset, TransformMultiModalDataset, WindowedTransform
from librep.transforms.fft import FFT
from librep.utils.workflow import SimpleTrainEvalWorkflow, MultiRunWorkflow
from librep.estimators import RandomForestClassifier, SVC, KNeighborsClassifier
from librep.metrics.report import ClassificationReport
from librep.transforms.resampler import SimpleResampler

2022-09-28 01:06:41.251383: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-28 01:06:41.251404: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data from base dataset
Change the path here to use in other dataset as base

In [20]:
# Path for KuHar resampled to 30Hz view with the same activities (and labels numbers)
# It is assumed that the directory will contain (train.csv, test.csv and validation.csv)
dataset_path = Path("../data/views/KuHar/resampled_view_20Hz")

Once paths is defined, we can load the CSV as pandas dataframes

In [21]:
# Kuhar dataframes
train_base, validation_base, test_base = PandasDatasetsIO(dataset_path).load()

Letś take a look in the train dataframes

In [22]:
train_base.head()

Unnamed: 0.1,Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,...,gyro-z-59,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user,normalized activity code
0,0,0.001911,-0.014536,0.005845,0.003675,-0.014972,0.025607,0.000478,-0.031141,-0.014827,...,0.004456,23.223,26.26,26.249,0,300,1,2100,1051,1
1,1,0.004114,-0.003186,0.000759,0.01245,-0.032074,0.00727,-0.00047,0.00698,0.0214,...,0.002979,56.292,59.245,59.245,0,300,1,5700,1037,1
2,2,-0.011282,-0.002432,-0.003199,0.008152,-0.021763,0.000309,-0.004968,-0.009551,0.001497,...,0.003343,27.267,30.29,30.291,0,300,1,2700,1075,1
3,3,-0.009241,-0.004666,0.021606,-0.0072,0.003091,0.00163,0.005057,-0.008149,0.013167,...,-0.002053,39.42,42.441,42.44,0,300,6,3900,1008,1
4,4,-0.013083,-0.005612,0.001645,0.006823,-0.004159,0.000415,0.008178,0.002637,-0.000827,...,0.002603,23.703,26.656,26.656,0,300,1,2400,1038,1


## Creating a Librep dataset from pandas dataframes

Change the features to use in other datasets

In [6]:
# Kuhar features to select
features = [
    "accel-x",
    "accel-y",
    "accel-z",
    "gyro-x",
    "gyro-y",
    "gyro-z"
]

combined_df = pd.concat([train_base, validation_base, test_base], ignore_index=True)
df2 = combined_df.query("`activity code` == 0 or `activity code` == 1 or `activity code` == 11 or `activity code` == 14  or `activity code` == 15 or `activity code` == 16")
df2.loc[ df2['activity code'] == 0, 'activity code'] = 3
df2.loc[ df2['activity code'] == 1, 'activity code'] = 2
df2.loc[ df2['activity code'] == 11, 'activity code'] = 4
df2.loc[ df2['activity code'] == 14, 'activity code'] = 5
df2.loc[ df2['activity code'] == 15, 'activity code'] = 1
df2.loc[ df2['activity code'] == 16, 'activity code'] = 0


In [7]:
df2.head()

Unnamed: 0.1,Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,...,gyro-z-59,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user,normalized activity code
0,0,0.001911,-0.014536,0.005845,0.003675,-0.014972,0.025607,0.000478,-0.031141,-0.014827,...,0.004456,23.223,26.26,26.249,3,300,1,2100,1051,1
1,1,0.004114,-0.003186,0.000759,0.01245,-0.032074,0.00727,-0.00047,0.00698,0.0214,...,0.002979,56.292,59.245,59.245,3,300,1,5700,1037,1
2,2,-0.011282,-0.002432,-0.003199,0.008152,-0.021763,0.000309,-0.004968,-0.009551,0.001497,...,0.003343,27.267,30.29,30.291,3,300,1,2700,1075,1
3,3,-0.009241,-0.004666,0.021606,-0.0072,0.003091,0.00163,0.005057,-0.008149,0.013167,...,-0.002053,39.42,42.441,42.44,3,300,6,3900,1008,1
4,4,-0.013083,-0.005612,0.001645,0.006823,-0.004159,0.000415,0.008178,0.002637,-0.000827,...,0.002603,23.703,26.656,26.656,3,300,1,2400,1038,1


In [8]:
# Creating the train dataset

combined_train_dset = PandasMultiModalDataset(
    df2,
    feature_prefixes=features,
    label_columns="activity code",
    as_array=True
)

## Inspect sample

In [9]:
# Lets print the first sample of combined_train dataset.
# Is a tuple, with an vector of 360 elements as first element and the label as second
x = combined_train_dset[0]
#print(x)

In [10]:
# Inspecting sample
#print(f"The sample 0: {x[0]}")
print(f"Shape of sample 0: {x[0].shape}")
print(f"The label of sample 0: {x[1]}")

Shape of sample 0: (360,)
The label of sample 0: 3


## Fourier Transform

In [11]:
#fft_transform = FFT(centered = True)
fft_transform = FFT()
transformer = TransformMultiModalDataset(transforms=[fft_transform], new_window_name_prefix="fft.")

### Use FFT in the base dataset

In [12]:
combined_train_dset_fft = transformer(combined_train_dset)

In [13]:
combined_train_dset[:][0]

array([[ 1.91093286e-03, -1.45361925e-02,  5.84452385e-03, ...,
         3.45654902e-03,  2.32869360e-03,  4.45589801e-03],
       [ 4.11395657e-03, -3.18646610e-03,  7.58931558e-04, ...,
        -9.94428406e-04, -1.82853273e-03,  2.97903419e-03],
       [-1.12820040e-02, -2.43180090e-03, -3.19908050e-03, ...,
         3.56838998e-03,  4.38234273e-03,  3.34301636e-03],
       ...,
       [-2.98870193e+00, -1.39523386e-01, -6.52441382e+00, ...,
        -7.48947780e-02, -3.92595416e-01, -2.17587752e-01],
       [ 7.96914982e+00,  1.20087272e+01, -3.34976629e+00, ...,
         1.08870693e-01,  4.74598944e-01,  2.26891463e-02],
       [-8.22046480e+00, -4.99540410e+00,  1.68743679e+01, ...,
         6.03348395e-01,  2.79978155e-01, -1.70569178e-01]])

In [14]:
combined_train_dset_fft[:][0]

array([[2.08923330e-02, 1.12081089e-01, 6.03699767e-02, ...,
        5.31635769e-02, 3.12414586e-02, 1.40369727e-01],
       [1.53802877e-02, 8.24343989e-02, 4.18766153e-02, ...,
        9.66322286e-03, 2.59289385e-02, 2.52929170e-02],
       [5.21272671e-02, 4.82816195e-02, 8.93573044e-02, ...,
        1.93188738e-02, 4.41668326e-02, 6.26341618e-02],
       ...,
       [1.66125362e+01, 6.98974508e+00, 1.66400551e+01, ...,
        1.28320305e+01, 1.65101034e+00, 1.28001347e+00],
       [2.83744371e+01, 2.64833653e+01, 2.09319955e+01, ...,
        3.63490975e+00, 4.42132193e+00, 1.50278269e+00],
       [2.12598609e+01, 1.75026493e+01, 1.66059199e+01, ...,
        2.61860549e+00, 4.18991461e+00, 2.71745950e+00]])

## Loading data from a target dataset B - MotionSense
In this case we use the MotionSense dataset resampled at 20 Hz

In [15]:
dataset_path = Path("../data/views/MotionSense/resampled_view_20Hz")

In [16]:
# Motionsense dataframe
train_targetB, validation_targetB, test_targetB = PandasDatasetsIO(dataset_path).load()


In [17]:
train_base.head()

Unnamed: 0.1,Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,...,gyro-z-59,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user,normalized activity code
0,0,0.001911,-0.014536,0.005845,0.003675,-0.014972,0.025607,0.000478,-0.031141,-0.014827,...,0.004456,23.223,26.26,26.249,0,300,1,2100,1051,1
1,1,0.004114,-0.003186,0.000759,0.01245,-0.032074,0.00727,-0.00047,0.00698,0.0214,...,0.002979,56.292,59.245,59.245,0,300,1,5700,1037,1
2,2,-0.011282,-0.002432,-0.003199,0.008152,-0.021763,0.000309,-0.004968,-0.009551,0.001497,...,0.003343,27.267,30.29,30.291,0,300,1,2700,1075,1
3,3,-0.009241,-0.004666,0.021606,-0.0072,0.003091,0.00163,0.005057,-0.008149,0.013167,...,-0.002053,39.42,42.441,42.44,0,300,6,3900,1008,1
4,4,-0.013083,-0.005612,0.001645,0.006823,-0.004159,0.000415,0.008178,0.002637,-0.000827,...,0.002603,23.703,26.656,26.656,0,300,1,2400,1038,1


In [18]:
# MotionSense features to select
features = [
    "userAcceleration.x",
    "userAcceleration.y",
    "userAcceleration.z",
    "rotationRate.x",
    "rotationRate.y",
    "rotationRate.z"
]

# Creating the train dataset

combined_target_dsetB = PandasMultiModalDataset(
    pd.concat([train_targetB, validation_targetB, test_targetB], ignore_index=True),
    feature_prefixes=features,
    label_columns="activity code",
    as_array=True
)

In [19]:
# Lets print the first sample of combined_target dataset.
# Is a tuple, with an vector of 180 elements as first element and the label as second
x = combined_target_dsetB[0]
#print(x)

KeyError: 'activity code'

In [None]:
# Inspecting sample
print(f"The sample 0: {x[0]}")
print(f"Shape of sample 0: {x[0].shape}")
print(f"The label of sample 0: {x[1]}")

In [None]:
combined_target_dsetB_fft = transformer(combined_target_dsetB)

In [None]:
combined_target_dsetB[:][0]

In [None]:
combined_target_dsetB_fft[:][0]

## Train and evaluate Random Forest classifier

In [None]:
# The reporter will be the same

reporter = ClassificationReport(
    use_accuracy=True,
    use_f1_score=True,
    use_classification_report=True,
    use_confusion_matrix=True,
    plot_confusion_matrix=True,
    #normalize='true'
)

experiment = SimpleTrainEvalWorkflow(
    estimator=RandomForestClassifier, 
    estimator_creation_kwags ={'n_estimators':100}, 
    do_not_instantiate=False, 
    do_fit=True, 
    evaluator=reporter)

multi_run_experiment = MultiRunWorkflow(workflow=experiment, num_runs=10, debug=False)

In [None]:
results = multi_run_experiment(combined_train_dset_fft, combined_target_dsetB_fft)

mean_acc = np.average(
    [res["result"][0]["accuracy"] for res in results["runs"]]
)
mean_f1 = np.average(
    [res["result"][0]["f1 score (weighted)"] for res in results["runs"]]
)
print(f"Mean accuracy (10 runs): {mean_acc:.4f}. Mean f1-score: {mean_f1:.4f}")

## Train and evaluate Support Vector Machine classifier

In [None]:
experiment = SimpleTrainEvalWorkflow(estimator=SVC, estimator_creation_kwags ={'C':3.0, 'kernel':"rbf"} , do_not_instantiate=False, do_fit=True, evaluator=reporter)
multi_run_experiment = MultiRunWorkflow(workflow=experiment, num_runs=10, debug=False)

In [None]:
results = multi_run_experiment(combined_train_dset_fft, combined_target_dsetB_fft)

mean_acc = np.average(
    [res["result"][0]["accuracy"] for res in results["runs"]]
)
mean_f1 = np.average(
    [res["result"][0]["f1 score (weighted)"] for res in results["runs"]]
)
print(f"Mean accuracy (10 runs): {mean_acc:.4f}. Mean f1-score: {mean_f1:.4f}")

## Train and evaluate K Neighbors Classifier classifier

In [None]:
experiment = SimpleTrainEvalWorkflow(estimator=KNeighborsClassifier, estimator_creation_kwags ={'n_neighbors' :1} , do_not_instantiate=False, do_fit=True, evaluator=reporter)
multi_run_experiment = MultiRunWorkflow(workflow=experiment, num_runs=10, debug=False)

In [None]:
results = multi_run_experiment(combined_train_dset_fft, combined_target_dsetB_fft)

mean_acc = np.average(
    [res["result"][0]["accuracy"] for res in results["runs"]]
)
mean_f1 = np.average(
    [res["result"][0]["f1 score (weighted)"] for res in results["runs"]]
)
print(f"Mean accuracy (10 runs): {mean_acc:.4f}. Mean f1-score: {mean_f1:.4f}")