# Tutorial 1: Handling HAR dataset views and Transforms

This tutorial will show how to:
1. Quickly load the train, test and validation CSV subsets of a dataset using the `PandasDatasetsIO` helper
2. Wrap dataframes in librep's `Dataset` interface using `PandasMultiModalDataset`
3. Access data and labels
4. Apply transforms to a dataset
5. Apply chained transforms to a dataset

## Common imports and definitions

In [1]:
from pathlib import Path  # For defining dataset Paths
import sys                # For include librep package

# This must be done if librep is not installed via pip,
# as this directory (examples) is separate from the librep package root
sys.path.append("..")

# Third party imports
import pandas as pd
import numpy as np

# Librep imports
from librep.utils.dataset import PandasDatasetsIO          # For quick load train, test and validation CSVs
from librep.datasets.multimodal import PandasMultiModalDataset # Wrap CSVs to librep's `Dataset` interface

2022-08-26 14:25:41.609343: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-26 14:25:41.609365: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data

In [2]:
# KuHar balanced view restricted to the same activities (and label numbers)
# as MotionSense. The directory is expected to contain train.csv, test.csv
# and validation.csv.
kuhar_dataset_path = Path(
    "../data/views/KuHar/balanced_motionsense_equivalent_view"
)

# MotionSense balanced view
motionsense_dataset_path = Path(
    "../data/views/MotionSense/balanced_view"
)

Once the paths are defined, we can load the CSV files as pandas dataframes.

In [3]:
# KuHar dataframes: the loader result is unpacked into three dataframes.
# NOTE(review): assumes load() returns the splits in (train, validation, test)
# order, as the unpacking below implies -- confirm against PandasDatasetsIO.
kh_train, kh_validation, kh_test = PandasDatasetsIO(kuhar_dataset_path).load()

# MotionSense dataframes, unpacked in the same order as the KuHar ones above
ms_train, ms_validation, ms_test = PandasDatasetsIO(motionsense_dataset_path).load()

Let's take a look at the train dataframes.

In [4]:
# Preview the first rows of the KuHar train dataframe
kh_train.head()

Unnamed: 0.1,Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,...,gyro-z-299,accel-start-time,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user
0,0,3.46131,-1.252847,-3.496563,-5.207304,-6.004595,-6.113169,-5.801509,-5.177825,-4.269927,...,0.364809,18.999,18.993,21.989,21.983,0,300,1,1800,1026
1,1,0.543923,2.928824,5.357442,7.666032,9.274595,9.504671,8.480812,7.024246,6.01752,...,0.516896,13.016,13.015,16.006,16.005,0,300,1,1200,1044
2,2,-5.489694,-7.227507,-8.707464,-10.075059,-10.948791,-10.493007,-9.060446,-8.76656,-8.540925,...,-0.478851,23.707,23.706,26.659,26.658,0,300,6,2400,1045
3,3,2.846,3.76,4.0479,3.7381,3.1034,2.1849,1.3306,0.3283,-0.9041,...,-0.23685,89.306,89.024,92.305,92.013,0,300,1,3900,1004
4,4,-4.132881,-4.075327,-3.745526,-3.110118,-2.204922,-1.180573,-0.275069,0.062984,-0.333226,...,0.255464,15.999,15.993,18.989,18.983,0,300,1,1500,1026


In [5]:
# Preview the first rows of the MotionSense train dataframe
ms_train.head()

Unnamed: 0.1,Unnamed: 0,attitude.roll-0,attitude.roll-1,attitude.roll-2,attitude.roll-3,attitude.roll-4,attitude.roll-5,attitude.roll-6,attitude.roll-7,attitude.roll-8,...,userAcceleration.z-145,userAcceleration.z-146,userAcceleration.z-147,userAcceleration.z-148,userAcceleration.z-149,activity code,length,trial_code,index,user
0,0,1.3118,1.309805,1.294033,1.259262,1.214031,1.174594,1.150417,1.126066,1.071678,...,0.198949,-0.241833,-0.228292,-0.409867,-0.227758,0,150,11,300,16
1,1,0.979769,0.853751,0.724747,0.620533,0.563019,0.546236,0.540058,0.531511,0.509747,...,0.061945,0.108357,0.042498,-0.119922,-0.535207,0,150,1,750,7
2,2,2.457231,2.508876,2.562549,2.610262,2.64626,2.662423,2.66341,2.662757,2.656153,...,0.389712,-0.012963,-0.117823,-0.242463,-0.520011,0,150,1,750,11
3,3,-0.816211,-0.847936,-0.773849,-0.642674,-0.511272,-0.443049,-0.422701,-0.404203,-0.357625,...,1.096083,0.919155,0.980044,0.167161,0.291327,0,150,1,450,12
4,4,0.093224,0.153045,0.230516,0.32971,0.430513,0.511403,0.596036,0.68903,0.762821,...,0.559331,0.268818,0.286077,0.244404,0.149644,0,150,1,150,22


## Creating a Librep dataset from pandas dataframes

We use the `PandasMultiModalDataset` class to create librep's dataset.
When calling it, we must define:
- The name of the features (column prefixes) that we want from dataframes
- The name of the label column


The dataset will wrap around librep's `Dataset` class, which overrides the `__getitem__` and `__len__` methods.
- Calling `len(dataset)` will return the number of rows in the dataframe
- Calling `dataset[0]` will return a tuple with 2 elements: 
    - The first element is the sample (at row 0, in this case), with the selected features
    - The second element is the label of this sample

### Kuhar

In [6]:
# KuHar column prefixes to select as features
features = ["accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"]

# Wrap every split in a PandasMultiModalDataset. The three datasets share the
# same feature prefixes and label column, so they are built in a single pass
# and unpacked in the same (train, validation, test) order as the inputs.
kh_train_dataset, kh_validation_dataset, kh_test_dataset = (
    PandasMultiModalDataset(
        split_df,
        feature_prefixes=features,
        label_columns="activity code",
        as_array=True,
    )
    for split_df in (kh_train, kh_validation, kh_test)
)

### MotionSense

In [7]:
# MotionSense column prefixes to select as features
features = [
    "userAcceleration.x", "userAcceleration.y", "userAcceleration.z",
    "rotationRate.x", "rotationRate.y", "rotationRate.z",
]

# Wrap every split in a PandasMultiModalDataset. The three datasets share the
# same feature prefixes and label column, so they are built in a single pass
# and unpacked in the same (train, validation, test) order as the inputs.
ms_train_dataset, ms_validation_dataset, ms_test_dataset = (
    PandasMultiModalDataset(
        split_df,
        feature_prefixes=features,
        label_columns="activity code",
        as_array=True,
    )
    for split_df in (ms_train, ms_validation, ms_test)
)

## Inspect sample

### Kuhar

In [8]:
# Let's print the first sample of the kh_train dataset.
# It is a tuple: a vector of 1800 elements as the first element and the label as the second
x = kh_train_dataset[0]
print(x)

(array([ 3.4613104 , -1.2528467 , -3.496563  , ...,  0.36297613,
        0.40329325,  0.36480874]), 0)


In [9]:
# Inspecting the sample: x[0] is the feature vector and x[1] is its label
print(f"The sample 0: {x[0]}")
print(f"Shape of sample 0: {x[0].shape}")
print(f"The label of sample 0: {x[1]}")

The sample 0: [ 3.4613104  -1.2528467  -3.496563   ...  0.36297613  0.40329325
  0.36480874]
Shape of sample 0: (1800,)
The label of sample 0: 0


### MotionSense

In [10]:
# Let's print the first sample of the ms_train dataset.
# It is a tuple: a vector of 900 elements (6 windows of 150 values each)
# as the first element and the label as the second
x = ms_train_dataset[0]
print(x)

(array([ 1.850310e-01,  1.323820e-01,  8.863600e-02,  8.935600e-02,
        1.302640e-01,  1.320740e-01,  1.471270e-01,  1.316310e-01,
       -1.874900e-02, -7.266600e-02,  2.075100e-02, -1.878720e-01,
       -3.334510e-01, -4.484450e-01, -1.459800e-01,  2.271810e-01,
        2.504250e-01,  2.776350e-01,  1.516580e-01, -2.107200e-02,
        5.909000e-02, -2.750900e-02,  4.955300e-02,  1.133940e-01,
        1.298460e-01,  3.934200e-02, -2.773300e-02, -4.913900e-02,
       -1.986110e-01, -3.969930e-01, -5.211840e-01, -3.820190e-01,
       -2.566590e-01, -6.048700e-02,  1.571870e-01,  2.215400e-01,
        2.895000e-01,  2.309840e-01, -1.455400e-01, -6.067200e-02,
        3.323650e-01,  5.399100e-02, -7.359100e-02, -2.317620e-01,
       -1.721370e-01, -7.184400e-02, -1.080600e-01,  5.770800e-02,
        2.087240e-01,  2.547080e-01,  2.270680e-01,  1.223900e-02,
       -8.920400e-02, -1.924300e-01, -2.729570e-01, -1.280950e-01,
       -1.683400e-02, -8.303300e-02,  1.845200e-02, -1.829600

In [11]:
# Inspecting the MotionSense sample: x[0] is the feature vector and x[1] is its label
print(f"The sample 0: {x[0]}")
print(f"Shape sample 0: {x[0].shape}")
print(f"The label of sample 0: {x[1]}")

The sample 0: [ 1.850310e-01  1.323820e-01  8.863600e-02  8.935600e-02  1.302640e-01
  1.320740e-01  1.471270e-01  1.316310e-01 -1.874900e-02 -7.266600e-02
  2.075100e-02 -1.878720e-01 -3.334510e-01 -4.484450e-01 -1.459800e-01
  2.271810e-01  2.504250e-01  2.776350e-01  1.516580e-01 -2.107200e-02
  5.909000e-02 -2.750900e-02  4.955300e-02  1.133940e-01  1.298460e-01
  3.934200e-02 -2.773300e-02 -4.913900e-02 -1.986110e-01 -3.969930e-01
 -5.211840e-01 -3.820190e-01 -2.566590e-01 -6.048700e-02  1.571870e-01
  2.215400e-01  2.895000e-01  2.309840e-01 -1.455400e-01 -6.067200e-02
  3.323650e-01  5.399100e-02 -7.359100e-02 -2.317620e-01 -1.721370e-01
 -7.184400e-02 -1.080600e-01  5.770800e-02  2.087240e-01  2.547080e-01
  2.270680e-01  1.223900e-02 -8.920400e-02 -1.924300e-01 -2.729570e-01
 -1.280950e-01 -1.683400e-02 -8.303300e-02  1.845200e-02 -1.829600e-02
  3.238000e-02  8.430000e-04 -1.690000e-03  1.066490e-01  1.603170e-01
  1.546510e-01  1.547520e-01  1.644770e-01  1.941880e-01  1.728

## Checking windows

MultiModal datasets have windows because each sample can be composed of elements of different natures. For instance, a single sample of 1800 elements from the KuHar dataset view is composed of accelerometer (3-axis) and gyroscope (3-axis) readings.

We can check which parts of the sample (window slices) correspond to which sensors and axes by inspecting the `window_slices`, `window_names` and `num_windows` attributes.

These values are automatically filled in when using the `PandasMultiModalDataset` class.

### Kuhar windows

In [12]:
# Window metadata of the KuHar train dataset: number of windows,
# the (start, end) slice of each window and the name attached to each slice
print(f"Kuhar train have {kh_train_dataset.num_windows} windows")
print(f"Each sample train can be sliced at: {kh_train_dataset.window_slices}")
print(f"Each slice has the following name associated: {kh_train_dataset.window_names}")

Kuhar train have 6 windows
Each sample train can be sliced at: [(0, 300), (300, 600), (600, 900), (900, 1200), (1200, 1500), (1500, 1800)]
Each slice has the following name associated: ['accel-x', 'accel-y', 'accel-z', 'gyro-x', 'gyro-y', 'gyro-z']


### MotionSense windows

In [13]:
# Window metadata of the MotionSense train dataset: number of windows,
# the (start, end) slice of each window and the name attached to each slice
print(f"Motionsense train have {ms_train_dataset.num_windows} windows")
print(f"Each sample train can be sliced at: {ms_train_dataset.window_slices}")
print(f"Each slice has the following name associated: {ms_train_dataset.window_names}")

Motionsense train have 6 windows
Each sample train can be sliced at: [(0, 150), (150, 300), (300, 450), (450, 600), (600, 750), (750, 900)]
Each slice has the following name associated: ['userAcceleration.x', 'userAcceleration.y', 'userAcceleration.z', 'rotationRate.x', 'rotationRate.y', 'rotationRate.z']


## Transforms

We can apply a chain of transforms to a single dataset using `TransformMultiModalDataset`.
This class will iterate over the whole dataset and, for each window of each sample, will apply the transform.

MultiModalDatasets allow applying transforms to each window of a sample instead of to the whole sample.

In [14]:
from librep.datasets.multimodal import TransformMultiModalDataset
from librep.transforms.fft import FFT
from librep.transforms.stats import StatsTransform
from librep.transforms.resampler import SimpleResampler

In [15]:
# Individual transforms: a resampler configured with new_sample_size=100 and an FFT
resampler = SimpleResampler(new_sample_size=100)
fft_transform = FFT()

# Chain both transforms into a single callable for multimodal datasets.
# NOTE(review): presumably the transforms run in list order (resample, then
# FFT) -- confirm against TransformMultiModalDataset.
transformer = TransformMultiModalDataset(
    transforms=[resampler, fft_transform],
    new_window_name_prefix="resample-100-fft.",
)

### Use Resample and FFT in Kuhar

In [16]:
# Run the transform chain over every KuHar split, keeping the
# (train, validation, test) order of the inputs
(
    resampled_kh_train_dataset_fft,
    resampled_kh_validation_dataset_fft,
    resampled_kh_test_dataset_fft,
) = map(
    transformer,
    (kh_train_dataset, kh_validation_dataset, kh_test_dataset),
)

In [17]:
# Show the first sample of the transformed KuHar train dataset
resampled_kh_train_dataset_fft[0]

(array([1.59463682e+01, 8.65515034e+00, 2.27354416e+01, 1.97537874e+01,
        9.25508722e+00, 7.09820675e+01, 7.20439930e+01, 8.63793839e+01,
        5.51569339e+01, 2.55919794e+01, 3.23705964e+01, 2.55852007e+01,
        2.20892355e+01, 1.81927479e+01, 5.00077102e+01, 1.13724929e+02,
        7.78939711e+01, 6.27460963e+01, 2.61259616e+01, 2.55478458e+01,
        2.59205229e+01, 3.36605549e+01, 3.97725356e+01, 7.62682196e+00,
        3.12322281e+01, 1.70850192e+01, 2.67012667e+01, 2.30589481e+01,
        1.21252088e+01, 4.10023238e+00, 5.18090094e+00, 1.21208434e+01,
        9.54793754e+00, 8.47690534e+00, 9.66471919e+00, 6.59353515e+00,
        4.18492000e+00, 1.85648103e+00, 1.10085834e+01, 9.44515729e+00,
        1.21408122e+01, 3.26294045e-01, 2.99345158e+00, 3.11082536e+00,
        4.64142178e+00, 1.44292597e+00, 8.50990863e+00, 9.02777719e+00,
        1.34475812e+01, 3.02679706e+00, 1.14960365e+01, 3.02679706e+00,
        1.34475812e+01, 9.02777719e+00, 8.50990863e+00, 1.442925

### Use Resample and FFT in MotionSense

In [19]:
# Run the transform chain over every MotionSense split, keeping the
# (train, validation, test) order of the inputs
(
    resampled_ms_train_dataset_fft,
    resampled_ms_validation_dataset_fft,
    resampled_ms_test_dataset_fft,
) = map(
    transformer,
    (ms_train_dataset, ms_validation_dataset, ms_test_dataset),
)