# Crossvalidation experiments with KuHar as base and Charm as target

This notebook will perform crossvalidation experiments using the KuHar and MotionSense datasets at 20 Hz as the training datasets. It will contain the following steps:

1. Quick load train, test and validation CSV subsets from the balanced KuHar and MotionSense datasets at 20 Hz using `PandasDatasetsIO` helper
2. Quick load train, test and validation CSV subsets from other relevant datasets using `PandasDatasetsIO` helper
3. Subclassing the `Dataset` interface using `PandasMultiModalDataset`
4. Apply the Fourier transform on Charm
5. Apply universal UMAP
6. Train SVM, KNN and Random Forest classification models on the KuHar and MotionSense datasets in the frequency domain with dimensionality reduction
7. Evaluate SVM, KNN and Random Forest classification models on Charm in the frequency domain with dimensionality reduction

The experiments will evaluate the performance of SVM, KNN and RF models trained on the balanced KuHar and MotionSense datasets and tested on Charm in the frequency domain with dimensionality reduction.

In [1]:
from pathlib import Path  # For defining dataset Paths
import sys
sys.path.append("../../..")

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
#from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

# Librep imports
from librep.utils.dataset import PandasDatasetsIO          # For quick load train, test and validation CSVs
from librep.datasets.har.loaders import ExtraSensoryBalancedResampledView20HZ, UCIHARUnbalancedView, WISDMInterpolatedUnbalancedView, CHARMUnbalancedView

from librep.datasets.multimodal import PandasMultiModalDataset, TransformMultiModalDataset, WindowedTransform
from librep.transforms.fft import FFT
from librep.utils.workflow import SimpleTrainEvalWorkflow, MultiRunWorkflow
from librep.estimators import RandomForestClassifier, SVC, KNeighborsClassifier
from librep.metrics.report import ClassificationReport
from librep.transforms.resampler import SimpleResampler

2022-10-03 13:00:57.055249: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-10-03 13:00:57.240078: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Datasets to train the manifold

## Load ExtraSensory

In [None]:
# Load the ExtraSensory view through its librep loader; train and validation are requested as one split
extrasensory_view_dir = "../../../data/views/ExtraSensory/balanced_view_resampled_20hz"
loader = ExtraSensoryBalancedResampledView20HZ(extrasensory_view_dir, download=False)
train_val_es, test_es = loader.load(concat_train_validation=True)

# Show the column slice (first column and last six dropped) that is later used as the feature matrix
train_val_es.data.iloc[:, 1:-6]

In [None]:
# Feature matrix (first column and last six dropped) and labels of the ExtraSensory train+validation split
es_frame = train_val_es.data
extrasensory_X = np.array(es_frame.iloc[:, 1:-6])
extrasensory_Y = np.array(es_frame.loc[:, 'activity code'])

# One 'E' marker per sample so the origin of each row can be recovered after merging
tam = len(extrasensory_Y)
extrasensory_id_dataset = np.array(['E'] * tam)

## Load UCI-HAR

In [None]:
# Load the UCI-HAR view through its librep loader; train and validation are requested as one split
uci_view_dir = "../../../data/views/UCI-HAR/unbalanced_view_train_test-v1"
loader = UCIHARUnbalancedView(uci_view_dir, download=False)
train_val_uci, test_uci = loader.load(concat_train_validation=True)

# Store the labels of the train+validation split as integers
uci_labels = train_val_uci.data['activity code']
train_val_uci.data['activity code'] = uci_labels.astype('int')

In [None]:
# Resample UCI-HAR to 60 samples (per the original note, this brings the dataset to 20 Hz)
resampler = SimpleResampler(new_sample_size=60)
transformer = TransformMultiModalDataset(
    transforms=[resampler],
    new_window_name_prefix="resampled.",
)
train_val_uci, test_uci = transformer(train_val_uci), transformer(test_uci)

# Features and labels of the resampled train+validation split
uci_X, uci_Y = train_val_uci.X, train_val_uci.y

# One 'U' marker per sample so the origin of each row can be recovered after merging
tam = len(uci_Y)
uci_id_dataset = np.array(['U'] * tam)

## Load WISDM

In [None]:
# Load the WISDM view through its librep loader; train and validation are requested as one split
wisdm_view_dir = "../../../data/views/WISDM/interpolated_unbalanced_view_train_test-v1"
loader = WISDMInterpolatedUnbalancedView(wisdm_view_dir, download=False)
train_val_wisdm, test_wisdm = loader.load(concat_train_validation=True)

# Rename the label column "activity" to "activity code" (the name used for the other
# datasets in this notebook), then store the labels as integers
wisdm_frame = train_val_wisdm.data
wisdm_frame.rename(columns={'activity': 'activity code'}, inplace=True)
wisdm_frame['activity code'] = wisdm_frame['activity code'].astype('int')


In [None]:
# Feature matrix (every column but the last two) and labels of the WISDM train+validation split
wisdm_frame = train_val_wisdm.data
wisdm_X = np.array(wisdm_frame.iloc[:, :-2])
wisdm_Y = np.array(wisdm_frame.loc[:, 'activity code'])

# One 'W' marker per sample so the origin of each row can be recovered after merging
tam = len(wisdm_Y)
wisdm_id_dataset = np.array(['W'] * tam)

## Concatenate datasets

In [None]:
# Names of the WISDM feature columns; reused as the header of the merged train and test DataFrames
columns = train_val_wisdm.data.columns[:-2].tolist()

# Show the window names of the three training datasets side by side
train_val_es.window_names, train_val_uci.window_names, train_val_wisdm.window_names

In [None]:
# Stack the feature matrices row-wise, in the order WISDM, UCI-HAR, ExtraSensory
train_X = np.concatenate((wisdm_X, uci_X, extrasensory_X), axis=0)
train_X.shape

In [None]:
# Labels, concatenated in the same dataset order as the features (WISDM, UCI-HAR, ExtraSensory)
train_Y = np.concatenate((wisdm_Y, uci_Y, extrasensory_Y), axis=0)
train_Y.shape

In [None]:
# Dataset-of-origin markers, concatenated in the same dataset order as the features
train_id_dataset = np.concatenate(
    (wisdm_id_dataset, uci_id_dataset, extrasensory_id_dataset), axis=0
)

In [None]:
# Merged training frame: the feature columns plus an 'Id Dataset' column holding the origin marker
train_universal = pd.DataFrame(train_X, columns=columns).assign(
    **{'Id Dataset': train_id_dataset}
)

# Datasets to evaluate the manifold

## Load KuHar

In [None]:
# Path for KuHar resampled to 20Hz view with the same activities (and labels numbers)
# It is assumed that the directory will contain (train.csv, test.csv and validation.csv)
# NOTE(review): this path climbs four directory levels, whereas the loader cells in this
# notebook use three ("../../../data/views/...") — confirm that this is intentional.
dataset_path = Path("../../../../data/views/KuHar/resampled_view_20Hz")

In [None]:
# Load the three KuHar splits from `dataset_path` (set in the previous cell)
train_kh, validation_kh, test_kh = PandasDatasetsIO(dataset_path).load()

In [None]:
# Kuhar features to select
features = ["accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"]

# Keep only the rows whose normalized activity code is 0, 1, 2, 3, 4 or 5.
# The same filter expression is applied to each of the three splits.
kuhar_activity_filter = "`normalized activity code` == 0 or `normalized activity code` == 1 or `normalized activity code` == 2 or `normalized activity code` == 3  or `normalized activity code` == 4 or `normalized activity code` == 5"
train_kh, validation_kh, test_kh = (
    split.query(kuhar_activity_filter)
    for split in (train_kh, validation_kh, test_kh)
)

test_kh

In [None]:
# Preview the KuHar test columns used as features below (first column and last ten dropped)
test_kh.iloc[:,1:-10]

In [None]:
# Feature matrix (first column and last ten dropped) and labels of the KuHar test split
kuhar_X = np.array(test_kh.iloc[:, 1:-10])
kuhar_Y = np.array(test_kh.loc[:, 'normalized activity code'])

# One 'K' marker per sample so the origin of each row can be recovered after merging
tam = len(kuhar_Y)
kuhar_id_dataset = np.array(['K'] * tam)

## Load MotionSense

In [None]:
# Directory of the MotionSense view; overwrites the KuHar `dataset_path` set earlier.
# NOTE(review): this path climbs four directory levels, whereas the loader cells in this
# notebook use three ("../../../data/views/...") — confirm that this is intentional.
dataset_path = Path("../../../../data/views/MotionSense/resampled_view_20Hz")

In [None]:
# Motionsense dataframe: load the three splits from `dataset_path`, then display the test split
train_motion, validation_motion, test_motion = PandasDatasetsIO(dataset_path).load()
test_motion

In [None]:
# Preview the MotionSense test columns used as features below (first column and last six dropped)
test_motion.iloc[:,1:-6]

In [None]:
# Feature matrix (first column and last six dropped) and labels of the MotionSense test split
motion_X = np.array(test_motion.iloc[:, 1:-6])
motion_Y = np.array(test_motion.loc[:, 'normalized activity code'])

# One 'M' marker per sample so the origin of each row can be recovered after merging
tam = len(motion_Y)
motion_id_dataset = np.array(['M'] * tam)

## Load CHARM

In [None]:
# Load the CHARM view through its librep loader; train and validation are requested as one split
loader = CHARMUnbalancedView("../../../data/views/CHARM/unbalanced_view_train_test-v1", download=False)
train_val_charm, test_charm = loader.load(concat_train_validation=True)

# Store the labels as integers in BOTH splits. The cells below (class selection, relabelling,
# feature extraction) work on `test_charm` only, so casting just `train_val_charm` would leave
# the split that is actually used with whatever dtype the loader produced.
for charm_split in (train_val_charm, test_charm):
    charm_split.data['activity code'] = charm_split.data['activity code'].astype('int')

In [None]:
# Selecting only classes of interest, identified by their CHARM activity code
charm_codes_of_interest = [
    0,  # Sitting in a Chair
    2,  # Standing
    6,  # Walking
    7,  # Running
    8,  # Walking Upstairs
    9,  # Walking Downstairs
]
rows_to_keep = test_charm.data['activity code'].isin(charm_codes_of_interest)
test_charm.data = test_charm.data[rows_to_keep]

In [None]:
# CHARM activity code -> common activity code:
#   0 - Sit, 1 - Stand, 2 - Walk, 3 - Upstairs, 4 - Downstairs, 5 - Run/Jogging
# The replacements are applied one after another, in this order. That is safe here because
# no target value equals a CHARM code that is still waiting to be replaced.
charm_to_common_code = {0: 0, 2: 1, 6: 2, 7: 5, 8: 3, 9: 4}
for charm_code, common_code in charm_to_common_code.items():
    rows = test_charm.data['activity code'] == charm_code
    test_charm.data.loc[rows, 'activity code'] = common_code

In [None]:
# Display the CHARM test frame after class selection and relabelling
test_charm.data

In [None]:
# Preview the CHARM test columns used as features below (every column but the last two)
test_charm.data.iloc[:,:-2]

In [None]:
# Feature matrix (every column but the last two) and labels of the CHARM test split
charm_frame = test_charm.data
charm_X = np.array(charm_frame.iloc[:, :-2])
charm_Y = np.array(charm_frame.loc[:, 'activity code'])

# One 'C' marker per sample so the origin of each row can be recovered after merging
tam = len(charm_Y)
charm_id_dataset = np.array(['C'] * tam)

In [None]:
# Inspect the window names and window slices of the CHARM test dataset
test_charm.window_names, test_charm.window_slices

## Prepare the test data

In [None]:
# Merge the evaluation datasets row-wise; features, labels and origin markers all use the
# same dataset order (CHARM, MotionSense, KuHar) so the rows stay aligned
test_X = np.concatenate((charm_X, motion_X, kuhar_X), axis=0)
test_Y = np.concatenate((charm_Y, motion_Y, kuhar_Y), axis=0)
test_id_dataset = np.concatenate(
    (charm_id_dataset, motion_id_dataset, kuhar_id_dataset), axis=0
)

In [None]:
# Merged evaluation frame: feature columns, then the activity label, then the origin marker
test = pd.DataFrame(test_X, columns=columns).assign(
    **{
        'normalized activity code': test_Y,
        'Id Dataset': test_id_dataset,
    }
)

In [None]:
# Display the merged evaluation frame
test

In [None]:
# Kuhar features to select
features = ["accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"]

# Creating the datasets
# Both are built with the same settings; the dataset-of-origin marker ("Id Dataset")
# is used as the label column.
multimodal_kwargs = dict(
    feature_prefixes=features,
    label_columns="Id Dataset",
    as_array=True,
)

# Train
train_universal = PandasMultiModalDataset(train_universal, **multimodal_kwargs)

# Test
test = PandasMultiModalDataset(test, **multimodal_kwargs)

# Evaluate the manifold

In [None]:
# 1. Perform standard scaler in train dataset
#train_scaler = StandardScaler()
#train_scaler.fit(train_val[:][0])
# OK Standard scaler was fit over train dataset.

Let's create the transforms. In general (by default) transforms are applied over each window of the dataset, separately. We can control how a transform is applied by wrapping it in `WindowedTransform`. 

The `WindowedTransform` receives, as argument to the constructor:

- The transform to be wrapped
- `fit_on`: can be "all" (apply fit over the whole dataset), "window" (apply fit over each window) or None (does not do fit).
- `transform_on`: can be "all" (apply transform over the whole dataset) or "window" (apply transform over each window)


One transform will be created:

- `fft_transform`: Apply the transforms over windows of the dataset

In [None]:
# Create the objects
# (the scaler step is disabled; it is kept here only for reference)
#scaler_transform = WindowedTransform(
#    transform=train_scaler, fit_on=None, transform_on="all")

fft_transform = FFT()

# Compose the transform
# Only the FFT is applied: the scaler entry in the list below is commented out.
# NOTE(review): the "scaled." window-name prefix looks like a leftover from the disabled
# scaler step — the windows created here come from the FFT alone; confirm before renaming.
transformer = TransformMultiModalDataset(
    transforms=[#scaler_transform,
                fft_transform], new_window_name_prefix="scaled."
)

In [None]:
# Run the composed transformer over the training and evaluation datasets, producing new datasets
train_universal_fft, test_fft = (
    transformer(dataset) for dataset in (train_universal, test)
)

In [None]:
# Checking the whole data...
# Note: this shows the first element of `train_universal[:]`, i.e. the dataset BEFORE the
# FFT transform (the transformed dataset is `train_universal_fft`).
train_universal[:][0]

## Classification

Let's take the transformed datasets and train using RandomForest, SVM and KNN 3 times each. Then take the average accuracy and f1-score over the runs

In [None]:
# # The reporter will be the same

# reporter = ClassificationReport(
#     use_accuracy=True,
#     use_f1_score=True,
#     use_classification_report=True,
#     use_confusion_matrix=True,
#     plot_confusion_matrix=True,
#     normalize='true'
# )

### RandomForest

In [None]:
# experiment = SimpleTrainEvalWorkflow(
#     estimator=RandomForestClassifier,
#     do_not_instantiate=False,
#     do_fit=True,
#     evaluator=reporter,
# )

# multi_run_experiment = MultiRunWorkflow(workflow=experiment, num_runs=3, debug=False)
# results = multi_run_experiment(train_val_fft, [test_fft])

# mean_acc = np.average(
#     [res["result"][0]["accuracy"] for res in results["runs"]]
# )
# mean_f1 = np.average(
#     [res["result"][0]["f1 score (weighted)"] for res in results["runs"]]
# )
# print(f"Mean accuracy (3 runs): {mean_acc:.4f}. Mean f1-score: {mean_f1:.4f}")

### SVM

In [None]:
# experiment = SimpleTrainEvalWorkflow(
#     estimator=SVC,
#     do_not_instantiate=False,
#     do_fit=True,
#     evaluator=reporter,
# )

# multi_run_experiment = MultiRunWorkflow(workflow=experiment, num_runs=3, debug=False)
# results = multi_run_experiment(train_val_fft, [test_fft])

# mean_acc = np.average(
#     [res["result"][0]["accuracy"] for res in results["runs"]]
# )
# mean_f1 = np.average(
#     [res["result"][0]["f1 score (weighted)"] for res in results["runs"]]
# )
# print(f"Mean accuracy (3 runs): {mean_acc:.4f}. Mean f1-score: {mean_f1:.4f}")

### KNN

In [None]:
# experiment = SimpleTrainEvalWorkflow(
#     estimator=KNeighborsClassifier,
#     do_not_instantiate=False,
#     do_fit=True,
#     evaluator=reporter,
# )

# multi_run_experiment = MultiRunWorkflow(workflow=experiment, num_runs=3, debug=False)
# results = multi_run_experiment(train_val_fft, [test_fft])

# mean_acc = np.average(
#     [res["result"][0]["accuracy"] for res in results["runs"]]
# )
# mean_f1 = np.average(
#     [res["result"][0]["f1 score (weighted)"] for res in results["runs"]]
# )
# print(f"Mean accuracy (3 runs): {mean_acc:.4f}. Mean f1-score: {mean_f1:.4f}")

## Plot UMAP and T-SNE

In [None]:
def plot(df, figsize: tuple = (5, 5), title: str = None, labels: dict = None):
    """Scatter-plot a 2-D embedding, drawing one group of points per label.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with the columns ``x`` and ``y`` (point coordinates) and ``label``
        (the value the points are grouped by).
    figsize : tuple
        Figure size forwarded to ``plt.subplots``.
    title : str, optional
        Title of the axes. No title is set when it is ``None``.
    labels : dict, optional
        Maps raw values of the ``label`` column to the text shown in the legend.
        Values missing from the mapping are shown unchanged instead of raising
        ``KeyError``. When ``None``, every raw value is shown unchanged.
    """
    fig, ax = plt.subplots(figsize=figsize)
    for raw_label, group_df in df.groupby("label"):
        # Fall back to the raw value so an incomplete mapping cannot abort the plot
        legend_text = raw_label if labels is None else labels.get(raw_label, raw_label)
        ax.scatter(group_df.x, group_df.y, label=legend_text)
    ax.legend()
    if title is not None:
        # Set the title on the axes created above rather than on pyplot's "current" axes
        ax.set_title(title)
    plt.show()

In [None]:
# Activity names keyed by activity code (these are the CHARM codes used in the
# class-selection cell). Kept under its own name: the plotting cells expect `labels` to be
# keyed by dataset marker ('K', 'M', ...), so binding this dict to `labels` would break them
# whenever this cell is run after the dataset-label cell.
charm_activity_labels = {
    0: "Sitting in a Chair",
    1: "Sitting in a Couch",
    2: "Standing",
    3: "Lying up",
    4: "Lying side",
    5: "Device on surface",
    6: "Walking",
    7: "Running",
    8: "Walking Upstairs",
    9: "Walking Downstairs",
}
print(charm_activity_labels)

In [None]:
# Legend text for each dataset-of-origin marker stored in the 'Id Dataset' column
labels = dict(
    K='KuHar',
    M='MotionSense',
    C='CHARM',
    E='ExtraSensory',
    W='WISDM',
    U='UCI',
)

### UMAP

In [None]:
# Fit a 2-component UMAP (fixed random_state) on the FFT-transformed training datasets.
# The fitted `model` is reused by the next cell to project the evaluation datasets.
model = UMAP(n_components=2, random_state=42)
train_embedding = model.fit_transform(train_universal_fft[:][0])

# Points are grouped by the dataset label (the 'Id Dataset' marker)
result = pd.DataFrame(train_embedding, columns=["x", "y"]).assign(
    label=train_universal_fft[:][1]
)
plot(result, title="UMAP on ExtraSensory, UCI-HAR, and WISDM FFT data", labels=labels)

In [None]:
# Project the FFT-transformed evaluation datasets with the UMAP `model` fitted in the
# previous cell (no refit), then plot the points grouped by dataset label
test_embedding = model.transform(test_fft[:][0])
result = pd.DataFrame(test_embedding, columns=["x", "y"]).assign(
    label=test_fft[:][1]
)
plot(result, title="UMAP projection on KuHar, MotionSense, and CHARM FFT data", labels=labels)

### T-SNE

In [None]:
# t-SNE (2 components, fixed random_state) on the FFT-transformed training datasets.
# The FFT dataset is used so that the data match both the plot title and the UMAP cell;
# the untransformed `train_universal` was embedded here before.
# The estimator gets its own name so that the fitted UMAP `model` is not overwritten and the
# UMAP projection cell can still be re-run after this one.
tsne_model = TSNE(n_components=2, random_state=42)
result = pd.DataFrame(tsne_model.fit_transform(train_universal_fft[:][0]), columns=["x", "y"])
result["label"] = train_universal_fft[:][1]
plot(result, title="T-SNE on ExtraSensory, UCI-HAR, and WISDM FFT data", labels=labels)