# Resampling KuHar to 30Hz

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("../../../")

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from librep.utils.dataset import PandasDatasetsIO          # For quick load train, test and validation CSVs
from librep.datasets.multimodal import PandasMultiModalDataset # Wrap CSVs to librep's `Dataset` interface

from librep.datasets.multimodal import TransformMultiModalDataset
from librep.transforms.resampler import SimpleResampler

2022-08-31 23:53:01.986959: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-08-31 23:53:01.986979: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


## Loading data

In [3]:
# Directory of the KuHar "balanced view" that is used as input for the resampling.
# It is assumed that the directory contains train.csv, validation.csv and test.csv.
# NOTE(review): this view is described as having the same activities (and label
# numbers) as MotionSense -- confirm, since `act_names` below lists 18 classes.
kuhar_dataset_path = Path("../../../data/views/KuHar/balanced_view")

Once the path is defined, we can load the CSV files as pandas dataframes.

In [4]:
# Read the KuHar train, validation and test splits found under
# `kuhar_dataset_path` into three pandas dataframes.
kuhar_io = PandasDatasetsIO(kuhar_dataset_path)
kh_train, kh_validation, kh_test = kuhar_io.load()

Let's take a look at the train dataframe.

In [5]:
# Preview the first rows of the train split: flattened sensor readings
# followed by the metadata columns (timestamps, activity code, user, ...).
kh_train.head()

Unnamed: 0.1,Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,...,gyro-z-299,accel-start-time,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user
0,0,-0.007251,-0.016431,-0.0019,-0.020529,-0.027133,-0.019558,-0.014525,-0.002541,0.016369,...,0.002956,23.235,23.223,26.26,26.249,0,300,1,2100,1051
1,1,-0.008128,-0.006837,0.008597,0.014337,0.006973,0.00325,-0.005086,-0.014379,-0.007034,...,0.001709,56.292,56.292,59.245,59.245,0,300,1,5700,1037
2,2,-0.033081,-0.037222,-0.043654,-0.038211,0.014246,0.063478,0.043582,-0.013673,-0.029928,...,0.00255,27.268,27.267,30.29,30.291,0,300,1,2700,1075
3,3,-0.00974,-0.016656,0.002454,-0.023503,-0.023115,-0.006241,0.017415,0.014765,0.019231,...,0.002969,39.421,39.42,42.441,42.44,0,300,6,3900,1008
4,4,0.029113,0.042745,0.017337,-0.015903,-0.027398,-0.010438,-0.026766,-0.013397,-0.008499,...,0.006943,23.703,23.703,26.656,26.656,0,300,1,2400,1038


## Creating librep dataset 

In [6]:
# Column-name prefixes of the six sensor channels that make up a sample.
features = ["accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z"]


def _wrap_kuhar_split(split_df):
    """Wrap one KuHar dataframe in librep's `PandasMultiModalDataset`.

    The columns selected through the prefixes in `features` form the sample
    and the "activity code" column is the label. `as_array=True` is forwarded
    unchanged (presumably so that samples come back as numpy arrays -- confirm
    against the librep documentation).
    """
    return PandasMultiModalDataset(
        split_df,
        feature_prefixes=features,
        label_columns="activity code",
        as_array=True,
    )


# Creating the datasets: train, validation and test
kh_train_dataset = _wrap_kuhar_split(kh_train)
kh_validation_dataset = _wrap_kuhar_split(kh_validation)
kh_test_dataset = _wrap_kuhar_split(kh_test)

In [7]:
# List the column labels of the train dataframe to check which sensor and
# metadata columns are available.
kh_train.columns

Index(['Unnamed: 0', 'accel-x-0', 'accel-x-1', 'accel-x-2', 'accel-x-3',
       'accel-x-4', 'accel-x-5', 'accel-x-6', 'accel-x-7', 'accel-x-8',
       ...
       'gyro-z-299', 'accel-start-time', 'gyro-start-time', 'accel-end-time',
       'gyro-end-time', 'activity code', 'length', 'serial', 'index', 'user'],
      dtype='object', length=1810)

## Inspect the sample

In [8]:
# Print the first sample of the kh_train dataset.
# It is a tuple: a flat vector with 1800 elements first, its label second.
x = kh_train_dataset[0]
print(x)

# Inspecting the two parts of the sample separately
sample_values, sample_label = x
print(f"The sample 0: {sample_values}")
print(f"Shape of sample 0: {sample_values.shape}")
print(f"The label of sample 0: {sample_label}")

(array([-0.00725079, -0.01643086, -0.00189972, ...,  0.00295611,
        0.00295611,  0.00295611]), 0)
The sample 0: [-0.00725079 -0.01643086 -0.00189972 ...  0.00295611  0.00295611
  0.00295611]
Shape of sample 0: (1800,)
The label of sample 0: 0


## Checking the window

In [9]:
# Report how the flat sample vector is partitioned into named windows,
# one line per piece of information.
window_report = (
    f"Kuhar train have {kh_train_dataset.num_windows} windows",
    f"Each sample train can be sliced at: {kh_train_dataset.window_slices}",
    f"Each slice has the following name associated: {kh_train_dataset.window_names}",
)
print(*window_report, sep="\n")

Kuhar train have 6 windows
Each sample train can be sliced at: [(0, 300), (300, 600), (600, 900), (900, 1200), (1200, 1500), (1500, 1800)]
Each slice has the following name associated: ['accel-x', 'accel-y', 'accel-z', 'gyro-x', 'gyro-y', 'gyro-z']


## Activities dictionary

In [10]:
# KuHar activity names. The order matters: the position of a name in this
# list is used as its numeric activity code when `act_dict` is built.
act_names = [
    "Stand",
    "Sit",
    "Talk-sit",
    "Talk-stand",
    "Stand-sit",
    "Lay",
    "Lay-stand",
    "Pick",
    "Jump",
    "Push-up",
    "Sit-up",
    "Walk",
    "Walk-backwards",
    "Walk-circle",
    "Run",
    "Stair-up",
    "Stair-down",
    "Table-tennis",
]

In [11]:
# Map every activity name to its numeric code, i.e. its position in `act_names`.
act_dict = {name: code for code, name in enumerate(act_names)}

## Resampling

### Setting the column names

In [12]:
def _sensor_columns(prefix):
    """Return the 90 column names "<prefix>-0" ... "<prefix>-89" of one resampled window."""
    return [f"{prefix}-{position}" for position in range(90)]


# One list of column names per sensor axis.
accel_x, accel_y, accel_z = (_sensor_columns(f"accel-{axis}") for axis in "xyz")
gyro_x, gyro_y, gyro_z = (_sensor_columns(f"gyro-{axis}") for axis in "xyz")

# All 540 names in the same channel order as the windows of the dataset:
# accelerometer x, y, z followed by gyroscope x, y, z.
columns = np.concatenate([accel_x, accel_y, accel_z, gyro_x, gyro_y, gyro_z])

### Creating the resampled dataframes

In [13]:
# Every window goes from 300 to 90 points. The preview of `kh_train` shows
# windows lasting about 3 seconds, so 90 points correspond to roughly 30Hz.
resampler = SimpleResampler(new_sample_size=90)

# Apply the resampler to the windows of a dataset; the resulting windows are
# named with the "resampled-30Hz." prefix.
transformer = TransformMultiModalDataset(
    transforms=[resampler],
    new_window_name_prefix="resampled-30Hz.",
)

In [14]:
# Number of trailing metadata columns copied unchanged from the original
# dataframes: accel/gyro start and end times, "activity code", "length",
# "serial", "index" and "user".
# NOTE(review): the copied "length" column still holds the original window
# length (300 in the preview of `kh_train`), not the resampled 90 -- confirm
# whether it should be updated.
N_METADATA_COLUMNS = 9


def _build_resampled_frames(dataset, source_df):
    """Resample the windows of `dataset` and re-attach the metadata of `source_df`.

    Parameters
    ----------
    dataset
        librep dataset that was built from `source_df`.
    source_df : pandas.DataFrame
        Original dataframe; its last `N_METADATA_COLUMNS` columns are metadata.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The resampled samples alone (named after `columns`) and the same
        samples followed by the metadata columns.

    Raises
    ------
    ValueError
        If the number of resampled samples differs from the number of rows of
        `source_df`, since the metadata could not be paired with the samples.
    """
    samples_df = pd.DataFrame(transformer(dataset)[:][0], columns=columns)
    if len(samples_df) != len(source_df):
        raise ValueError(
            f"Got {len(samples_df)} resampled samples for {len(source_df)} metadata rows"
        )
    # `pd.concat(axis=1)` pairs rows by index label. Dropping the index of the
    # source frame pairs them by position instead, so a non-default index can
    # neither misalign the metadata nor introduce NaN rows.
    metadata_df = source_df.iloc[:, -N_METADATA_COLUMNS:].reset_index(drop=True)
    return samples_df, pd.concat([samples_df, metadata_df], axis=1)


resampled_kh_train_samples_df, resampled_kh_train = _build_resampled_frames(
    kh_train_dataset, kh_train
)
resampled_kh_validation_samples_df, resampled_kh_validation = _build_resampled_frames(
    kh_validation_dataset, kh_validation
)
resampled_kh_test_samples_df, resampled_kh_test = _build_resampled_frames(
    kh_test_dataset, kh_test
)

### Creating the datasets

In [15]:
# Settings shared by the three resampled datasets: the same sensor prefixes
# and label column that were used for the original datasets.
_resampled_dataset_options = dict(
    feature_prefixes=features,
    label_columns="activity code",
    as_array=True,
)

# Wrap each resampled dataframe in librep's dataset interface.
resampled_kh_train_dataset = PandasMultiModalDataset(
    resampled_kh_train, **_resampled_dataset_options
)
resampled_kh_validation_dataset = PandasMultiModalDataset(
    resampled_kh_validation, **_resampled_dataset_options
)
resampled_kh_test_dataset = PandasMultiModalDataset(
    resampled_kh_test, **_resampled_dataset_options
)

## Saving the resampled data

In [27]:
# Directory where the resampled view is going to be stored.
output_path = Path("../../../data/views/KuHar/resampled_view_30Hz")


def _format_user_counts(split_df):
    """Return "user (N samples)" for every user of `split_df`, ordered by user and joined by ", "."""
    samples_per_user = split_df["user"].value_counts().sort_index()
    return ", ".join(
        f"{user} ({no_samples} samples)" for user, no_samples in samples_per_user.items()
    )


# Number of samples of each activity code in every subset.
train_act_samples = resampled_kh_train["activity code"].value_counts().to_dict()
validation_act_samples = resampled_kh_validation["activity code"].value_counts().to_dict()
test_act_samples = resampled_kh_test["activity code"].value_counts().to_dict()

# One bullet per activity. `.get(code, 0)` reports 0 for an activity that is
# missing from a subset instead of failing with a KeyError.
activities = "\n".join(
    f"- {name}: {code} ({train_act_samples.get(code, 0)} train, "
    f"{validation_act_samples.get(code, 0)} validation, "
    f"{test_act_samples.get(code, 0)} test)"
    for name, code in act_dict.items()
)

# Users of each subset with their number of samples, ordered by user.
train_users = _format_user_counts(resampled_kh_train)
validation_users = _format_user_counts(resampled_kh_validation)
test_users = _format_user_counts(resampled_kh_test)

# NOTE(review): the split proportions below are fixed text. The per-activity
# counts in the saved output of this cell (185 train / 6 validation / 21 test)
# do not correspond to 70% / 10% / 20% -- confirm the wording.
description = f"""# Resampled to 30Hz KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
{activities}

## Users
- {len(resampled_kh_train.user.unique())} users train dataset: {train_users}.
- {len(resampled_kh_validation.user.unique())} users validation dataset: {validation_users}.
- {len(resampled_kh_test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)

# IO helper bound to the output directory; it is used to save the view.
pandas_io = PandasDatasetsIO(output_path)

# Resampled to 30Hz KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
- Stand: 0 (185 train, 6 validation, 21 test)
- Sit: 1 (185 train, 6 validation, 21 test)
- Talk-sit: 2 (185 train, 6 validation, 21 test)
- Talk-stand: 3 (185 train, 6 validation, 21 test)
- Stand-sit: 4 (185 train, 6 validation, 21 test)
- Lay: 5 (185 train, 6 validation, 21 test)
- Lay-stand: 6 (185 train, 6 validation, 21 test)
- Pick: 7 (185 train, 6 validation, 21 test)
- Jump: 8 (185 train, 6 validation, 21 test)
- Push-up: 9 (185 train, 6 validation, 21 test)
- Sit-up: 10 (185 train, 6 validation, 21 test)
- Walk: 11 (185 train, 6 validation, 21 test)
- Walk-backwards: 12 (185 train, 6 validation, 21 test)
- Walk-circle: 13 (185 

In [28]:
# Store the three resampled subsets together with the generated description
# through `pandas_io`, which was created for `output_path`.
pandas_io.save(
    train=resampled_kh_train,
    validation=resampled_kh_validation,
    test=resampled_kh_test,
    description=description,
)