In [1]:
import sys
from pathlib import Path  # For defining dataset Paths

# Relative prefix that the dataset paths used later in the notebook are built from.
Root = "../../../../../.."

# Extend the module search path before the project imports below run
# (presumably so the local `librep` package can be imported -- verify against the repo layout).
sys.path.append("../../../..")

In [2]:
import time
import warnings
# Suppress every warning for the rest of the session. This keeps the cell
# outputs short, but it also hides deprecation and convergence warnings raised
# by the libraries used below.
warnings.filterwarnings('ignore')

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from umap import UMAP
#from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE

import plotly.express as px
import plotly.graph_objects as go
import itertools
from itertools import combinations

# Librep imports
from librep.utils.dataset import PandasDatasetsIO          # For quick load train, test and validation CSVs
from librep.datasets.har.loaders import (
    KuHar_BalancedView20HzMotionSenseEquivalent
)
from librep.datasets.multimodal import PandasMultiModalDataset, TransformMultiModalDataset, WindowedTransform
from librep.transforms.fft import FFT
from librep.transforms. stats import StatsTransform
from librep.utils.workflow import SimpleTrainEvalWorkflow, MultiRunWorkflow
from librep.estimators import RandomForestClassifier, SVC, KNeighborsClassifier
from librep.metrics.report import ClassificationReport
from librep.transforms.resampler import SimpleResampler

2022-12-19 16:48:26.194363: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-19 16:48:26.194379: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


# Datasets to train the manifold

In [4]:
# Readable name for each integer activity code; the position in the list is the code.
activity_names = [
    "sit",
    "stand",
    "walk",
    "stair up",
    "stair down",
    "run",
    "stair up and down",
]
labels_activity = dict(enumerate(activity_names))

# Activity codes in ascending order.
classes = list(labels_activity)
print(labels_activity)

# Display name for each dataset identifier (currently an identity mapping).
labels_dataset = {
    name: name
    for name in ['KuHar', 'RealWorld', 'MotionSense', 'ExtraSensory', 'WISDM', 'UCI']
}

{0: 'sit', 1: 'stand', 2: 'walk', 3: 'stair up', 4: 'stair down', 5: 'run', 6: 'stair up and down'}


## Load Data

In [5]:
# Load the train and test splits from the combined "AllDatasets" view.
# Alternative KuHar-only view kept for reference:
#     Root + "/data/views/KuHar/balanced_20Hz_motionsense_equivalent"
dataset_view_path = f"{Root}/data/views/AllDatasets/balanced_20Hz_filtered"
loader = KuHar_BalancedView20HzMotionSenseEquivalent(dataset_view_path, download=False)

# The validation split is merged into the training split; labels come from the
# "standard activity code" column.
train_data, test_data = loader.load(
    concat_train_validation=True,
    label="standard activity code",
)
train_data, test_data

(PandasMultiModalDataset: samples=54522, features=360, no. window=6, label_columns='standard activity code',
 PandasMultiModalDataset: samples=6880, features=360, no. window=6, label_columns='standard activity code')

In [6]:
# Target dimensionalities to sweep. The value 180 is special-cased by
# `evaluate`, which skips UMAP for it and uses the FFT output directly.
# A wider sweep was tried at some point:
#     [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 50, 100, 150, 200, 250, 300, 360]
dimensions_umap = [*range(1, 11), 15, 20, 25, 50, 100, 150, 180]

# Values of the 'DataSet' column that are evaluated one at a time.
datasets = ["KuHar", "RealWorld", "MotionSense", "WISDM", "UCI"]

In [7]:
# Identification columns come first, followed by the aggregated metric columns.
columns = ['Classifier', 'Umap dimension', 'Dataset']

# Overall metrics: one "<metric> - <stat>" column per combination.
metrics = ['accuracy', 'f1 score (weighted)']
stats = ['mean', 'std']
columns.extend(
    f'{metric} - {stat}'
    for metric, stat in itertools.product(metrics, stats)
)

# Per-class metrics: one "<metric> - <stat> - <activity>" column per combination.
metrics_class = ['f1-score', 'precision', 'recall', 'support']
columns.extend(
    f'{metric} - {stat} - {activity}'
    for metric, stat, activity in itertools.product(
        metrics_class, stats, labels_activity.values()
    )
)

# Column-oriented accumulator. Every column needs its own list object, so a
# comprehension is used instead of dict.fromkeys(columns, []).
df_results = {name: list() for name in columns}

# Raw per-classifier results, kept alongside the aggregated table.
results_dict = {}
for classifier in ('RandomForest', 'SVC', 'KNN'):
    results_dict[classifier] = {
        'Umap dimension': [],
        'Dataset': [],
        'result': [],
    }

In [8]:
def create_data_multimodal(data):
    """Wrap ``data`` in a ``PandasMultiModalDataset``.

    Features are selected by the six accelerometer and gyroscope axis
    prefixes, the label is read from the "standard activity code" column, and
    the dataset is created with ``as_array=True``.

    Parameters
    ----------
    data :
        The object handed to ``PandasMultiModalDataset`` as its first argument
        (the callers in this notebook pass a filtered pandas DataFrame).

    Returns
    -------
    PandasMultiModalDataset
        The dataset built from ``data``.
    """
    sensor_prefixes = ["accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"]
    return PandasMultiModalDataset(
        data,
        feature_prefixes=sensor_prefixes,
        label_columns="standard activity code",
        as_array=True,
    )

In [9]:
def evaluate(dimension, dataset, train, test, evaluators, df, results_dict, labels_activity, metrics_class, 
             reporter, full_dimension=180):
    """Run every evaluator on FFT (and optionally UMAP-reduced) data and record the scores.

    The train and test sets are transformed with a centered FFT. Unless
    ``dimension`` equals ``full_dimension``, a UMAP model with ``dimension``
    components is fitted on the FFT of ``train`` and applied after the FFT.
    Each workflow in ``evaluators`` is then run through ``MultiRunWorkflow``,
    and the mean and standard deviation over its runs are appended to ``df``.

    Parameters
    ----------
    dimension : int
        Number of UMAP components. When equal to ``full_dimension`` no UMAP
        step is applied.
    dataset : str
        Dataset name, stored verbatim in the result records.
    train, test :
        Datasets accepted by ``TransformMultiModalDataset``. ``test`` must
        expose ``test.data['standard activity code']``.
    evaluators : dict
        Maps an estimator name to a dict with the keys ``'experiment'`` (the
        workflow to run) and ``'num_runs'``.
    df : dict
        Column name -> list accumulator; mutated in place.
    results_dict : dict
        Per-estimator raw results with the keys ``'Umap dimension'``,
        ``'Dataset'`` and ``'result'``; mutated in place.
    labels_activity : dict
        Maps an integer activity code to its readable name.
    metrics_class : list
        Names of the per-class entries read from the classification report.
    reporter :
        Not used by this function; kept so existing call sites keep working.
    full_dimension : int, optional
        Value of ``dimension`` that means "skip UMAP". Defaults to 180, the
        value previously hard-coded here.

    Returns
    -------
    tuple
        ``(df, results_dict)``, the same objects that were passed in.
    """
    fft_transform = FFT(centered=True)

    if dimension == full_dimension:
        # Baseline: the FFT output is used without dimensionality reduction.
        transformer = TransformMultiModalDataset(transforms=[fft_transform],
                                                 new_window_name_prefix="fft.")
    else:
        # Fit UMAP once on the FFT of the training split only. The wrapper
        # below uses fit_on=None, so it re-uses this fitted model.
        transformer_fft = TransformMultiModalDataset(transforms=[fft_transform],
                                                     new_window_name_prefix="reduced.")
        reducer = UMAP(n_components=dimension, random_state=42)
        reducer.fit(transformer_fft(train)[:][0])

        umap_transform = WindowedTransform(
            transform=reducer, fit_on=None, transform_on="all"
        )
        transformer = TransformMultiModalDataset(transforms=[fft_transform, umap_transform],
                                                 new_window_name_prefix="reduced.")

    train_fft = transformer(train)
    test_fft = transformer(test)

    # Activity codes present in the test split; it does not depend on the
    # estimator, so it is computed once. Classes missing from the split get NaN.
    labels = test.data['standard activity code'].unique()

    for estimator, evaluator in evaluators.items():
        multi_run_experiment = MultiRunWorkflow(
            workflow=evaluator['experiment'],
            num_runs=evaluator['num_runs'],
            debug=False)

        results = multi_run_experiment(train_fft, test_fft)
        results_dict[estimator]['Umap dimension'].append(dimension)
        results_dict[estimator]['Dataset'].append(dataset)
        results_dict[estimator]['result'].append(results)

        df['Classifier'].append(estimator)
        df['Umap dimension'].append(dimension)
        df['Dataset'].append(dataset)

        runs = results["runs"]

        # Overall metrics, aggregated over the runs.
        for metric in ('accuracy', 'f1 score (weighted)'):
            values = [res["result"][0][metric] for res in runs]
            df[f'{metric} - mean'].append(np.average(values))
            df[f'{metric} - std'].append(np.std(values))

        # Per-class metrics; report entries are keyed by the code as a string.
        for metric in metrics_class:
            for index, activity in labels_activity.items():
                if index in labels:
                    values = [
                        res['result'][0]['classification report'][str(index)][metric]
                        for res in runs
                    ]
                    mean_value = np.average(values)
                    std_value = np.std(values)
                else:
                    mean_value = std_value = np.nan
                df[f'{metric} - mean - {activity}'].append(mean_value)
                df[f'{metric} - std - {activity}'].append(std_value)
    return df, results_dict

In [10]:
start = time.time()

# A single reporter instance is shared by all workflows.
# (The `normalize='true'` and `display_labels=labels` options are left unset.)
reporter = ClassificationReport(
    use_accuracy=True,
    use_f1_score=True,
    use_classification_report=True,
    use_confusion_matrix=True,
    plot_confusion_matrix=False,
)

# (name, estimator class, constructor kwargs, number of runs)
estimator_specs = [
    ('RandomForest', RandomForestClassifier, {'n_estimators': 100}, 10),
    ('SVC', SVC, {'C': 3.0, 'kernel': "rbf"}, 1),
    ('KNN', KNeighborsClassifier, {'n_neighbors': 1}, 1),
]

evaluators = {
    name: {
        'experiment': SimpleTrainEvalWorkflow(
            estimator=estimator_class,
            estimator_creation_kwags=creation_kwargs,
            do_not_instantiate=False,
            do_fit=True,
            evaluator=reporter),
        'num_runs': num_runs,
    }
    for name, estimator_class, creation_kwargs, num_runs in estimator_specs
}

# `evaluate` compares the integer activity codes against this column, so it is
# cast to int up front.
label_column = 'standard activity code'
train_data.data[label_column] = train_data.data[label_column].astype('int')
test_data.data[label_column] = test_data.data[label_column].astype('int')

k = 1
for dataset in datasets:

    # Restrict both splits to the rows of the current dataset.
    train = create_data_multimodal(
        train_data.data[train_data.data['DataSet'].isin([dataset])]
    )
    test = create_data_multimodal(
        test_data.data[test_data.data['DataSet'].isin([dataset])]
    )

    # The timer restarts per dataset, so the printed time is cumulative over
    # the dimensions already processed for this dataset.
    new_start = time.time()
    for dimension in dimensions_umap:
        df_results, results_dict = evaluate(dimension, dataset, train, test, evaluators, df_results,
                                            results_dict, labels_activity, metrics_class, reporter)
        new_end = time.time()
        elapsed_minutes, elapsed_seconds = divmod(int(new_end - new_start), 60)
        print(f'Combination: {k} \t Time of execution: {elapsed_minutes} minutes and {elapsed_seconds} seconds')
        k += 1

end = time.time()
total = int(end - start)
total_minutes, leftover_seconds = divmod(total, 60)
print(f'Time of execution: {total} seconds')
print(f'Time of execution: {total_minutes} minutes and {leftover_seconds} seconds')
print(f'Time of execution: {total // 86400} days, {(total // 3600) % 24} hours, {total_minutes % 60} minutes and {leftover_seconds} seconds')

Combination: 1 	 Time of execution: 0 minutes and 19 seconds
Combination: 2 	 Time of execution: 0 minutes and 31 seconds
Combination: 3 	 Time of execution: 0 minutes and 43 seconds
Combination: 4 	 Time of execution: 0 minutes and 56 seconds
Combination: 5 	 Time of execution: 1 minutes and 10 seconds
Combination: 6 	 Time of execution: 1 minutes and 23 seconds
Combination: 7 	 Time of execution: 1 minutes and 37 seconds
Combination: 8 	 Time of execution: 1 minutes and 51 seconds
Combination: 9 	 Time of execution: 2 minutes and 6 seconds
Combination: 10 	 Time of execution: 2 minutes and 22 seconds
Combination: 11 	 Time of execution: 2 minutes and 39 seconds
Combination: 12 	 Time of execution: 2 minutes and 59 seconds
Combination: 13 	 Time of execution: 3 minutes and 22 seconds
Combination: 14 	 Time of execution: 3 minutes and 54 seconds
Combination: 15 	 Time of execution: 4 minutes and 39 seconds
Combination: 16 	 Time of execution: 5 minutes and 34 seconds
Combination: 17 	 

In [11]:
# NOTE: this rebinds `df_results` from the dict of lists filled by `evaluate`
# to a DataFrame, so the experiment cell cannot be re-run after this one
# without first re-creating the accumulators.
df_results = pd.DataFrame(data=df_results)

# Show only the RandomForest rows.
is_random_forest = df_results['Classifier'] == 'RandomForest'
df_results.loc[is_random_forest]

Unnamed: 0,Classifier,Umap dimension,Dataset,accuracy - mean,accuracy - std,f1 score (weighted) - mean,f1 score (weighted) - std,f1-score - mean - sit,f1-score - mean - stand,f1-score - mean - walk,...,support - mean - stair down,support - mean - run,support - mean - stair up and down,support - std - sit,support - std - stand,support - std - walk,support - std - stair up,support - std - stair down,support - std - run,support - std - stair up and down
0,RandomForest,1,KuHar,0.654472,0.000000,0.652102,1.110223e-16,0.683544,0.705882,0.652174,...,39.0,34.0,,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,
3,RandomForest,2,KuHar,0.743496,0.009697,0.742913,9.575583e-03,0.786043,0.774419,0.748931,...,38.3,36.0,,1.720465,1.720465,0.600000,0.830662,1.100000,0.000000,
6,RandomForest,3,KuHar,0.765854,0.006350,0.764473,6.159894e-03,0.852841,0.819236,0.722910,...,37.8,34.8,,1.345362,1.345362,0.458258,0.830662,0.748331,0.400000,
9,RandomForest,4,KuHar,0.760569,0.004244,0.759171,4.217539e-03,0.823757,0.790392,0.733333,...,37.6,34.7,,0.830662,0.830662,0.000000,0.640312,0.916515,0.458258,
12,RandomForest,5,KuHar,0.765041,0.005077,0.764478,5.166508e-03,0.838398,0.789223,0.746354,...,38.4,35.2,,0.748331,0.748331,0.806226,1.445683,1.113553,0.400000,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
240,RandomForest,25,UCI,0.806259,0.005032,0.808513,4.665965e-03,0.777643,0.719082,0.920757,...,117.9,,,4.526588,4.526588,0.447214,0.943398,0.943398,,
243,RandomForest,50,UCI,0.794039,0.004979,0.796886,4.777976e-03,0.774622,0.700727,0.912268,...,120.8,,,3.130495,3.130495,0.781025,1.135782,0.871780,,
246,RandomForest,100,UCI,0.803130,0.003158,0.805601,3.182458e-03,0.781522,0.719324,0.922152,...,118.2,,,1.743560,1.743560,0.663325,1.400000,1.077033,,
249,RandomForest,150,UCI,0.793443,0.005504,0.797224,5.155517e-03,0.767118,0.675795,0.912941,...,112.2,,,4.583667,4.583667,0.979796,1.562050,1.720465,,


In [17]:
# Save results
import json


def _json_default(value):
    """Fallback for ``json.dump``: convert NumPy arrays and scalars to plain Python values.

    ``json`` calls this only for objects it cannot serialize itself, so output
    that already serialized correctly is unchanged.
    """
    if isinstance(value, np.ndarray):
        return value.tolist()
    if isinstance(value, np.generic):
        return value.item()
    raise TypeError(f'Object of type {type(value).__name__} is not JSON serializable')


# `pd.DataFrame(...)` accepts both the dict-of-lists accumulator and an
# already-built DataFrame, so this cell no longer depends on whether the
# conversion cell was executed first. Missing per-class metrics are written
# as NaN, which Python's json module emits and reads back by default.
with open('df_results_umap_dimension.json', 'w') as file:
    json.dump(pd.DataFrame(df_results).to_dict(), file, default=_json_default)

# The raw workflow results are opaque here; NOTE(review): they may contain
# NumPy values (e.g. a confusion matrix), which the fallback converts.
with open('results_dict_umap_dimension.json', 'w') as file:
    json.dump(results_dict, file, default=_json_default)