In [25]:
# Enable IPython's autoreload extension in mode 2, so that edits made to imported
# modules (e.g. a local librep checkout) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [1]:
from pathlib import Path  # For defining dataset Paths
import sys                # For include librep package

# This must be done if librep is not installed via pip,
# because this directory (examples) sits outside the librep package root
sys.path.append("..")

# Third party imports
import pandas as pd
import numpy as np

# Librep imports
from librep.utils.dataset import PandasDatasetsIO          # For quick load train, test and validation CSVs
from librep.datasets.multimodal import PandasMultiModalDataset # Wrap CSVs to librep's `Dataset` interface

2022-09-20 04:42:28.054738: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-20 04:42:28.054768: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
# KuHar view resampled to 20Hz, with the same activities (and label numbers).
# The directory is expected to contain train.csv, validation.csv and test.csv.
dataset_path = Path("..") / "data" / "views" / "KuHar" / "resampled_view_20Hz"

In [3]:
# Read the KuHar splits found under `dataset_path` into three dataframes
kuhar_io = PandasDatasetsIO(dataset_path)
train, validation, test = kuhar_io.load()

In [4]:
# Preview the first five rows of the training dataframe
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,...,gyro-z-59,accel-start-time,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user
0,0,0.001911,-0.014536,0.005845,0.003675,-0.014972,0.025607,0.000478,-0.031141,-0.014827,...,0.004456,23.235,23.223,26.26,26.249,0,300,1,2100,1051
1,1,0.004114,-0.003186,0.000759,0.01245,-0.032074,0.00727,-0.00047,0.00698,0.0214,...,0.002979,56.292,56.292,59.245,59.245,0,300,1,5700,1037
2,2,-0.011282,-0.002432,-0.003199,0.008152,-0.021763,0.000309,-0.004968,-0.009551,0.001497,...,0.003343,27.268,27.267,30.29,30.291,0,300,1,2700,1075
3,3,-0.009241,-0.004666,0.021606,-0.0072,0.003091,0.00163,0.005057,-0.008149,0.013167,...,-0.002053,39.421,39.42,42.441,42.44,0,300,6,3900,1008
4,4,-0.013083,-0.005612,0.001645,0.006823,-0.004159,0.000415,0.008178,0.002637,-0.000827,...,0.002603,23.703,23.703,26.656,26.656,0,300,1,2400,1038


In [5]:
# Column prefixes of the KuHar features to select: one per sensor axis
features = [
    f"{sensor}-{axis}"
    for sensor in ("accel", "gyro")
    for axis in ("x", "y", "z")
]

# Wrap each dataframe in librep's dataset interface.
# All three splits are built with exactly the same options.
dataset_kwargs = dict(
    feature_prefixes=features,
    label_columns="activity code",
    as_array=True,
)

train_dataset = PandasMultiModalDataset(train, **dataset_kwargs)
validation_dataset = PandasMultiModalDataset(validation, **dataset_kwargs)
test_dataset = PandasMultiModalDataset(test, **dataset_kwargs)

In [6]:
# Print the first sample of the train dataset.
# Indexing returns a tuple: the flattened feature vector first, the label second.
# NOTE(review): this comment used to say the vector has 1800 elements; with the 20Hz
# view (six feature prefixes, columns up to "...-59") it presumably has 360 — confirm.
x = train_dataset[0]
print(x)

(array([ 1.91093286e-03, -1.45361925e-02,  5.84452385e-03,  3.67495627e-03,
       -1.49718059e-02,  2.56068907e-02,  4.77538088e-04, -3.11405362e-02,
       -1.48270261e-02,  7.69834863e-03,  1.06101665e-02, -5.96475630e-02,
       -3.35511310e-03, -1.65885925e-03,  3.94389738e-02, -4.28711994e-02,
       -4.65577088e-03, -1.44686791e-02, -7.36948774e-03, -3.87024460e-03,
        6.24744252e-02, -1.79626835e-02,  3.22744928e-03, -3.75961022e-03,
        1.46163449e-02, -1.07502353e-02, -9.27218103e-03,  5.06417325e-03,
        1.40691624e-02,  1.60138354e-02, -5.34838152e-02, -3.29858611e-03,
        2.31031426e-02,  2.27906805e-02,  2.54595798e-03,  1.75255266e-02,
       -5.10498318e-03, -2.07463519e-02,  1.32902011e-02,  1.37572046e-02,
        7.17675958e-03, -2.01445217e-02,  5.47817384e-03, -7.66570074e-04,
        1.94831071e-02, -1.11694213e-03,  2.27235363e-02, -1.49616813e-02,
       -9.71672954e-03, -7.12839038e-03,  9.02811373e-03, -1.57676951e-03,
       -5.51378813e-03, 

In [7]:
from librep.datasets.multimodal import TransformMultiModalDataset
from librep.transforms.fft import FFT

In [8]:
# A transform chain holding a single centered FFT; the windows it produces
# are named with the "fft." prefix.
fft_transform = FFT(centered=True)
transformer = TransformMultiModalDataset(
    transforms=[fft_transform],
    new_window_name_prefix="fft.",
)

In [9]:
# Apply the same transformer to each split, in train / validation / test order
train_dataset_fft, validation_dataset_fft, test_dataset_fft = map(
    transformer,
    (train_dataset, validation_dataset, test_dataset),
)

In [11]:
# Shape of the feature matrix of every FFT split (train, validation, test)
for fft_split in (train_dataset_fft, validation_dataset_fft, test_dataset_fft):
    print(fft_split.X.shape)

(3330, 180)
(108, 180)
(378, 180)


In [34]:
from librep.transforms.topo_ae import TopologicalDimensionalityReduction
from librep.estimators.ae.torch.models.topological_ae.topological_ae import TopologicallyRegularizedAutoencoder
# Autoencoder settings: 180 input values per sample and a 3-dimensional inner layer
kwargs = dict(input_dims=180, custom_dim=3)
# Shape handed to the transform; presumably samples are viewed as (batch, 1, 180) — confirm
input_shape = (-1, 1, 180)
transform_topoae = TopologicalDimensionalityReduction(
    ae_model='DeepAEforKuhar180',
    ae_kwargs=kwargs,
    input_shape=input_shape,
)

Topologically Regularized DeepAEforKuhar180
Using python to compute signatures
DeepAEforKuhar180, Input: 180 Inner dim: 3


In [35]:
# Fit the topological autoencoder on the FFT features of the training split.
# NOTE(review): the recorded loss is far from monotonic (0.6178 at epoch 4, 900.0685 at
# epoch 8) and no random seed is set in this notebook, so the fitted model — and every
# number derived from it below — will presumably differ between runs.
transform_topoae.fit(train_dataset_fft.X)

Epoch:1, Loss:5.4388
Epoch:2, Loss:16.0613
Epoch:3, Loss:13.8071
Epoch:4, Loss:0.6178
Epoch:5, Loss:45.1977
Epoch:6, Loss:35.7210
Epoch:7, Loss:66.5052
Epoch:8, Loss:900.0685
Epoch:9, Loss:56.1310
Epoch:10, Loss:41.6147
Epoch:11, Loss:82.9860
Epoch:12, Loss:13.0534
Epoch:13, Loss:28.6053


<librep.transforms.topo_ae.TopologicalDimensionalityReduction at 0x7f50c4654b80>

In [42]:
# Project the FFT test split into the autoencoder's latent space
test_applied_topoae = transform_topoae.transform(test_dataset_fft.X)
print('ORIGINAL', test_applied_topoae.shape)
# Flatten (n_samples, 1, latent_dim) to (n_samples, latent_dim). The latent size is read
# from the array itself instead of being hard-coded, so the reshape stays correct
# if `custom_dim` is changed when the autoencoder is built.
test_applied_topoae = np.reshape(
    test_applied_topoae, (-1, test_applied_topoae.shape[-1])
)
print('RESHAPED', test_applied_topoae.shape)

ORIGINAL (378, 1, 3)
RESHAPED (378, 3)


In [47]:
from librep.metrics.dimred_evaluator import DimensionalityReductionQualityReport
# Quality of the TopoAE projection measured on the *test* split: the FFT features
# are compared against their latent representation.
# NOTE(review): sampling_threshold=378 equals the number of test samples; presumably it
# was chosen so that the whole split is used — confirm against the reporter's docs.
metrics_reporter = DimensionalityReductionQualityReport(sampling_threshold=378)
metrics_test_applied_topoae = metrics_reporter.evaluate(
    [test_dataset_fft.X, test_applied_topoae]
)
# Backward-compatible alias: the original name said "train" although the metrics
# are computed on the test split.
metrics_train_applied_topoae = metrics_test_applied_topoae
print(metrics_test_applied_topoae)

{'residual variance (pearson)': 0.5913135322579415, 'residual variance (spearman)': 0.5816854991996547, 'trustworthiness': 0.7929546573399855, 'continuity': 0.8778444785704314, 'co k nearest neighbor size': 0.24336870026525198, 'local continuity meta criterion': 0.20081550877589027, 'local property': 0.3857387376146404, 'global property': 0.8034139822635268}


In [48]:
from librep.utils.workflow import SimpleTrainEvalWorkflow, MultiRunWorkflow
from librep.estimators import RandomForestClassifier
from librep.metrics.report import ClassificationReport
import yaml

# Report only accuracy and F1 scores; the textual classification report and the
# confusion matrix (computed or plotted) are switched off.
reporter = ClassificationReport(
    use_accuracy=True,
    use_f1_score=True,
    use_classification_report=False,
    use_confusion_matrix=False,
    plot_confusion_matrix=False,
)

# One fit-and-evaluate pass of a random forest with 100 estimators.
# NOTE(review): `estimator_creation_kwags` (sic) is the keyword this call has always
# used; presumably it matches the workflow's signature — confirm before "fixing" it.
experiment = SimpleTrainEvalWorkflow(
    estimator=RandomForestClassifier,
    estimator_creation_kwags={'n_estimators': 100},
    do_not_instantiate=False,
    do_fit=True,
    evaluator=reporter,
)

# Repeat the workflow three times
multi_run_experiment = MultiRunWorkflow(
    workflow=experiment,
    num_runs=3,
    debug=False,
)

In [49]:
# Stack the train and validation rows into a single training dataframe
train_plus_validation = pd.concat([train, validation])
combined_train_dset = PandasMultiModalDataset(
    train_plus_validation,
    feature_prefixes=features,
    label_columns="activity code",
    as_array=True,
)

# Baseline runs on the untransformed features, reported as YAML
result = multi_run_experiment(combined_train_dset, test_dataset)
baseline_report = yaml.dump(result, sort_keys=True, indent=4)
print(baseline_report)

runs:
-   end: 1663664887.552666
    result:
    -   accuracy: 0.701058201058201
        f1 score (macro): 0.6921452380549326
        f1 score (micro): 0.701058201058201
        f1 score (weighted): 0.7099711640614697
    run id: 1
    start: 1663664882.08597
    time taken: 5.46669602394104
-   end: 1663664892.011342
    result:
    -   accuracy: 0.7275132275132276
        f1 score (macro): 0.7144602057859362
        f1 score (micro): 0.7275132275132276
        f1 score (weighted): 0.740566249240519
    run id: 2
    start: 1663664887.5526676
    time taken: 4.458674430847168
-   end: 1663664896.432762
    result:
    -   accuracy: 0.7195767195767195
        f1 score (macro): 0.7042066830506104
        f1 score (micro): 0.7195767195767196
        f1 score (weighted): 0.7349467561028287
    run id: 3
    start: 1663664892.011344
    time taken: 4.421417951583862



In [50]:
# Same train+validation dataset, passed through the FFT transformer
combined_train_dset_fft = transformer(combined_train_dset)

# Runs on the FFT features, reported as YAML
result = multi_run_experiment(combined_train_dset_fft, test_dataset_fft)
fft_report = yaml.dump(result, sort_keys=True, indent=4)
print(fft_report)

runs:
-   end: 1663665019.8602655
    result:
    -   accuracy: 0.8201058201058201
        f1 score (macro): 0.8181350671579567
        f1 score (micro): 0.8201058201058201
        f1 score (weighted): 0.8220765730536835
    run id: 1
    start: 1663665016.84495
    time taken: 3.015315532684326
-   end: 1663665022.5059454
    result:
    -   accuracy: 0.8386243386243386
        f1 score (macro): 0.8357999884987348
        f1 score (micro): 0.8386243386243385
        f1 score (weighted): 0.8414486887499422
    run id: 2
    start: 1663665019.8602676
    time taken: 2.6456778049468994
-   end: 1663665025.1512365
    result:
    -   accuracy: 0.8306878306878307
        f1 score (macro): 0.8259884854853735
        f1 score (micro): 0.8306878306878307
        f1 score (weighted): 0.8353871758902878
    run id: 3
    start: 1663665022.5059476
    time taken: 2.6452889442443848



In [67]:
# Labels of the FFT training split
print(train_dataset_fft.y)

# Shape of the `.data` attribute of the combined train+validation dataset
print(combined_train_dset.data.shape)

[ 0  0  0 ... 17 17 17]
(3438, 370)


In [74]:
import pandas as pd

def _latent_frame_and_dataset(latent, labels):
    """Wrap latent-space samples and their labels for the classification workflow.

    Parameters
    ----------
    latent : array-like of shape (n_samples, latent_dim)
        Latent representation, one row per sample.
    labels : array-like of length n_samples
        Labels, stored in the "y" column of the resulting frame.

    Returns
    -------
    tuple
        ``(frame, dataset)``: the DataFrame holding the latent columns plus "y",
        and the `PandasMultiModalDataset` built on top of it.
    """
    frame = pd.DataFrame(latent)
    frame['y'] = labels
    dataset = PandasMultiModalDataset(frame, label_columns="y", as_array=True)
    return frame, dataset


# Using the latent space
train_applied_topoae = transform_topoae.transform(train_dataset_fft.X)
print('ORIGINAL', train_applied_topoae.shape)
# Flatten (n_samples, 1, latent_dim) to (n_samples, latent_dim); the latent size is read
# from the array so the reshape stays correct if `custom_dim` is changed.
train_applied_topoae = np.reshape(
    train_applied_topoae, (-1, train_applied_topoae.shape[-1])
)
print('RESHAPED', train_applied_topoae.shape)

train_applied_topoae_pd, topoae_train_dset = _latent_frame_and_dataset(
    train_applied_topoae, train_dataset_fft.y
)

# `test_applied_topoae` is the already-flattened test projection computed in an earlier cell
test_applied_topoae_pd, topoae_test_dset = _latent_frame_and_dataset(
    test_applied_topoae, test_dataset_fft.y
)

# NOTE(review): this run trains on the train split only, whereas the raw and FFT runs
# above train on train+validation — keep that in mind when comparing the scores.
result = multi_run_experiment(topoae_train_dset, topoae_test_dset)
print(yaml.dump(result, sort_keys=True, indent=4))

ORIGINAL (3330, 1, 3)
RESHAPED (3330, 3)
runs:
-   end: 1663667430.2590716
    result:
    -   accuracy: 0.29894179894179895
        f1 score (macro): 0.29371077390429573
        f1 score (micro): 0.29894179894179895
        f1 score (weighted): 0.3041728239793022
    run id: 1
    start: 1663667429.8684719
    time taken: 0.39059972763061523
-   end: 1663667430.6462574
    result:
    -   accuracy: 0.2962962962962963
        f1 score (macro): 0.2913466717904681
        f1 score (micro): 0.2962962962962963
        f1 score (weighted): 0.3012459208021245
    run id: 2
    start: 1663667430.2590737
    time taken: 0.38718366622924805
-   end: 1663667431.0362937
    result:
    -   accuracy: 0.30952380952380953
        f1 score (macro): 0.30314404181075016
        f1 score (micro): 0.30952380952380953
        f1 score (weighted): 0.31590357723686885
    run id: 3
    start: 1663667430.6462598
    time taken: 0.3900339603424072



# Dimensionality exploration