# Resampling KuHar with MotionSense activities to 20Hz

In [1]:
# Enable IPython's autoreload extension in mode 2, so edited modules are
# reloaded before each cell runs (no kernel restart needed while developing).
%load_ext autoreload
%autoreload 2

import sys

# Extend the import search path three directories up; presumably that is the
# repository root holding the `librep` package imported below -- TODO confirm.
sys.path.append("../../../")

In [2]:
from pathlib import Path

import numpy as np
import pandas as pd

from librep.utils.dataset import PandasDatasetsIO          # For quick load train, test and validation CSVs
from librep.datasets.multimodal import PandasMultiModalDataset # Wrap CSVs to librep's `Dataset` interface

from librep.datasets.multimodal import TransformMultiModalDataset
from librep.transforms.resampler import SimpleResampler

## Loading data

In [3]:
# Location of the balanced KuHar view whose activities (and label numbers) match
# MotionSense. The directory is expected to hold train.csv, validation.csv and test.csv.
kuhar_views_dir = Path("../../../data/views/KuHar")
kuhar_dataset_path = kuhar_views_dir / "balanced_motionsense_equivalent_view"

Once the path is defined, we can load the CSV files as pandas dataframes.

In [4]:
# Read the KuHar splits from the view directory; the loader's result is
# unpacked as (train, validation, test) dataframes.
kuhar_io = PandasDatasetsIO(kuhar_dataset_path)
kh_train, kh_validation, kh_test = kuhar_io.load()

Let's take a look at the train dataframe.

In [5]:
# Preview the first rows of the train split (feature columns followed by metadata).
kh_train.head()

Unnamed: 0.1,Unnamed: 0,accel-x-0,accel-x-1,accel-x-2,accel-x-3,accel-x-4,accel-x-5,accel-x-6,accel-x-7,accel-x-8,...,accel-start-time,gyro-start-time,accel-end-time,gyro-end-time,activity code,length,serial,index,user,normalized activity code
0,0,3.804435,4.461484,5.098265,5.63333,5.949605,5.865928,5.490818,5.178724,4.903305,...,9.159,9.16,12.199,12.201,0,300,23,900,1101,4
1,1,-2.625497,-2.58011,-2.602727,-2.606654,-2.498523,-2.376706,-2.323041,-2.309243,-2.359475,...,9.167,9.166,12.211,12.208,0,300,43,900,1101,4
2,2,-5.197734,-3.238093,-1.691018,-0.301669,0.546108,1.183569,1.87406,2.399963,2.718681,...,15.262,15.263,18.298,18.299,0,300,24,1500,1101,4
3,3,-3.956416,-3.492933,-3.084537,-2.849711,-2.74826,-2.742799,-2.801168,-2.875001,-2.925813,...,33.572,33.583,36.615,36.626,0,300,41,3300,1101,4
4,4,0.470529,0.25319,0.049246,-0.075565,-0.150161,-0.084733,0.09296,0.292096,0.418061,...,0.008,0.007,3.047,3.048,0,300,30,0,1101,4


## Creating librep dataset 

In [6]:
# Column-name prefixes of the six windows to select: three axes for each sensor,
# i.e. accel-x, accel-y, accel-z, gyro-x, gyro-y, gyro-z (in that order).
features = [
    f"{sensor}-{axis}"
    for sensor in ("accel", "gyro")
    for axis in ("x", "y", "z")
]

# Wrap each split (train, validation, test -- in that order) in a librep dataset
# that uses "normalized activity code" as the label column.
kh_train_dataset, kh_validation_dataset, kh_test_dataset = (
    PandasMultiModalDataset(
        split_df,
        feature_prefixes=features,
        label_columns="normalized activity code",
        as_array=True,
    )
    for split_df in (kh_train, kh_validation, kh_test)
)

In [7]:
# List the column labels of the train split to see which metadata columns exist.
kh_train.columns

Index(['Unnamed: 0', 'accel-x-0', 'accel-x-1', 'accel-x-2', 'accel-x-3',
       'accel-x-4', 'accel-x-5', 'accel-x-6', 'accel-x-7', 'accel-x-8',
       ...
       'accel-start-time', 'gyro-start-time', 'accel-end-time',
       'gyro-end-time', 'activity code', 'length', 'serial', 'index', 'user',
       'normalized activity code'],
      dtype='object', length=1811)

## Inspect the sample

In [8]:
# Fetch the first sample of the train dataset. It is a (values, label) pair:
# the values are a flat vector of 1800 elements and the label is its activity code.
x = kh_train_dataset[0]
print(x)

# Unpack the pair and report each part separately.
sample_values, sample_label = x
print(f"The sample 0: {sample_values}")
print(f"Shape of sample 0: {sample_values.shape}")
print(f"The label of sample 0: {sample_label}")

(array([ 3.8044353 ,  4.4614844 ,  5.0982647 , ..., -0.10786453,
       -0.03970453, -0.00881953]), 4)
The sample 0: [ 3.8044353   4.4614844   5.0982647  ... -0.10786453 -0.03970453
 -0.00881953]
Shape of sample 0: (1800,)
The label of sample 0: 4


## Checking the window

In [9]:
# Summarise how each flat sample vector is partitioned into named windows.
window_report = (
    f"Kuhar train have {kh_train_dataset.num_windows} windows",
    f"Each sample train can be sliced at: {kh_train_dataset.window_slices}",
    f"Each slice has the following name associated: {kh_train_dataset.window_names}",
)
print(*window_report, sep="\n")

Kuhar train have 6 windows
Each sample train can be sliced at: [(0, 300), (300, 600), (600, 900), (900, 1200), (1200, 1500), (1500, 1800)]
Each slice has the following name associated: ['accel-x', 'accel-y', 'accel-z', 'gyro-x', 'gyro-y', 'gyro-z']


## Activities dictionary

In [10]:
# Activity names, ordered so that the list position is used as the activity's
# numeric code when the lookup dictionaries are built from this list.
# NOTE(review): the train preview shows rows with "activity code" 0 stored with
# "normalized activity code" 4 -- confirm that this ordering matches the label
# numbering actually stored in the view before trusting the generated description.
act_names = ["Sit", "Stand", "Walk", "Stair-up", "Stair-down", "Run"]

In [11]:
# Map every activity name to its position in `act_names` (name -> numeric code).
act_dict = {name: code for code, name in enumerate(act_names)}

## Resampling

### Setting the column names

In [12]:
# Build the 60 column names "<prefix>-0" ... "<prefix>-59" of every window, then
# chain them into one flat array in window order (accel x/y/z, then gyro x/y/z).
accel_x, accel_y, accel_z, gyro_x, gyro_y, gyro_z = (
    [f"{prefix}-{position}" for position in range(60)]
    for prefix in ("accel-x", "accel-y", "accel-z", "gyro-x", "gyro-y", "gyro-z")
)
columns = np.concatenate([accel_x, accel_y, accel_z, gyro_x, gyro_y, gyro_z])

### Creating the resampled dataframes

In [13]:
# Resampler configured for 60 points per window (the source windows hold 300
# points each); per the notebook title this corresponds to 20Hz.
resampler = SimpleResampler(new_sample_size=60)

# Dataset-level transform that applies the resampler; the windows it produces
# get the "resampled-20Hz." name prefix.
transformer = TransformMultiModalDataset(
    transforms=[resampler],
    new_window_name_prefix="resampled-20Hz.",
)

In [14]:
# Metadata columns carried over unchanged from the source dataframes. They are
# selected by name: a positional slice of the last nine columns silently leaves
# out "accel-start-time", the first of the ten trailing metadata columns.
# NOTE(review): "length" keeps its original value (300 in the train preview) even
# though every resampled window now has 60 points -- confirm whether downstream
# code expects it to be updated.
metadata_columns = [
    "accel-start-time",
    "gyro-start-time",
    "accel-end-time",
    "gyro-end-time",
    "activity code",
    "length",
    "serial",
    "index",
    "user",
    "normalized activity code",
]


def _resample_to_dataframe(dataset, source_df):
    """Resample every sample of ``dataset`` and re-attach the metadata of ``source_df``.

    Args:
        dataset: librep dataset that was built from ``source_df``.
        source_df: dataframe the dataset was created from; supplies the metadata.

    Returns:
        A dataframe holding the resampled feature columns (named by ``columns``)
        followed by ``metadata_columns``, one row per sample.
    """
    samples_df = pd.DataFrame(transformer(dataset)[:][0], columns=columns)
    # `pd.concat(axis=1)` aligns rows on the index, so give the metadata the same
    # 0..n-1 index as the freshly built samples to pair the rows by position.
    metadata_df = source_df[metadata_columns].reset_index(drop=True)
    assert len(samples_df) == len(metadata_df), "resampling changed the number of samples"
    return pd.concat([samples_df, metadata_df], axis=1)


resampled_kh_train = _resample_to_dataframe(kh_train_dataset, kh_train)
resampled_kh_validation = _resample_to_dataframe(kh_validation_dataset, kh_validation)
resampled_kh_test = _resample_to_dataframe(kh_test_dataset, kh_test)

### Creating the datasets

In [15]:
# Wrap each resampled split (train, validation, test -- in that order) in a
# librep dataset, with the same feature prefixes and label column as the
# original datasets.
(
    resampled_kh_train_dataset,
    resampled_kh_validation_dataset,
    resampled_kh_test_dataset,
) = (
    PandasMultiModalDataset(
        resampled_df,
        feature_prefixes=features,
        label_columns="normalized activity code",
        as_array=True,
    )
    for resampled_df in (resampled_kh_train, resampled_kh_validation, resampled_kh_test)
)

## Saving the resampled data

In [16]:
# Invert `act_dict` (name -> code) into a code -> name lookup.
act_code = {code: name for name, code in act_dict.items()}
# Names of the six normalized activity codes 0..5, keyed by code in ascending order.
normalized_activity_names = {code: act_code[code] for code in range(6)}

In [17]:
# Directory where the resampled view (CSV files plus description) will be written.
output_path = Path("../../../data/views/KuHar/balanced_motionsense_equivalent_resampled_view_20Hz")


def _samples_per_activity(split_df):
    """Return ``{normalized activity code: number of samples}`` for one split."""
    return split_df["normalized activity code"].value_counts().to_dict()


def _describe_users(split_df):
    """Return ``"<user> (<n> samples), ..."`` for one split, ordered by user id."""
    samples_per_user = split_df["user"].value_counts().sort_index()
    return ", ".join(
        f"{user} ({no_samples} samples)" for user, no_samples in samples_per_user.items()
    )


train_act_samples = _samples_per_activity(resampled_kh_train)
validation_act_samples = _samples_per_activity(resampled_kh_validation)
test_act_samples = _samples_per_activity(resampled_kh_test)

# One bullet per activity: "- <code>: <name> (<n> train, <n> validation, <n> test)".
# `.get(code, 0)` reports an activity that is absent from a split as 0 samples
# instead of aborting the whole cell with a KeyError.
activities = "\n".join(
    f"- {code}: {name} ({train_act_samples.get(code, 0)} train, "
    f"{validation_act_samples.get(code, 0)} validation, "
    f"{test_act_samples.get(code, 0)} test)"
    for code, name in normalized_activity_names.items()
)

train_users = _describe_users(resampled_kh_train)
validation_users = _describe_users(resampled_kh_validation)
test_users = _describe_users(resampled_kh_test)


description = f"""# Balanced KuHar View Resampled to 20Hz

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
{activities}

## Users
- {len(resampled_kh_train.user.unique())} users train dataset: {train_users}.
- {len(resampled_kh_validation.user.unique())} users validation dataset: {validation_users}.
- {len(resampled_kh_test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
pandas_io = PandasDatasetsIO(output_path)

# Balanced KuHar View Resampled to 20Hz

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
- 0: Sit (485 train, 34 validation, 41 test)
- 1: Stand (485 train, 34 validation, 41 test)
- 2: Walk (485 train, 34 validation, 41 test)
- 3: Stair-up (485 train, 34 validation, 41 test)
- 4: Stair-down (485 train, 34 validation, 41 test)
- 5: Run (485 train, 34 validation, 41 test)

## Users
- 56 users train dataset: 1001 (11 samples), 1002 (104 samples), 1003 (13 samples), 1004 (48 samples), 1006 (21 samples), 1007 (18 samples), 1008 (29 samples), 1009 (20 samples), 1013 (25 samples), 1014 (65 samples), 1015 (27 samples), 1016 (20 samples), 1017 (12 samples), 1018 (17 samples), 1019 (17 samples), 1020 (11 samples), 1022 (58 s

In [18]:
# Persist the three resampled splits together with the generated description.
pandas_io.save(
    train=resampled_kh_train,
    validation=resampled_kh_validation,
    test=resampled_kh_test,
    description=description,
)