# Pre-processing the KuHar Dataset and Generating Views

In [1]:
%load_ext autoreload
%autoreload 2

import sys

# Make modules two directory levels above the working directory importable
# (presumably where the `librep` package imported below lives -- confirm).
sys.path.append("../../")

In [2]:
from pathlib import Path
from typing import List
import hashlib

import numpy as np
import pandas as pd

from librep.datasets.har.kuhar import (
    RawKuHar,
    RawKuHarIterator,
    KuHarDatasetGenerator
)

from librep.utils.dataset import PandasDatasetsIO

%matplotlib inline

2022-09-06 18:45:59.591266: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-06 18:45:59.591288: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [3]:
# Relative location of the raw time-domain KuHar files.
dataset_dir = Path("../../data/datasets/KuHar/1.Raw_time_domain_data")
# download=False: presumably the files must already be on disk and no fetch
# is attempted -- confirm against RawKuHar.
kuhar_dataset = RawKuHar(dataset_dir, download=False)
# Trailing expression so the notebook displays the dataset's repr.
kuhar_dataset

KuHar Dataset at: '../../data/datasets/KuHar/1.Raw_time_domain_data'

In [4]:
# Activity names ordered by activity code. The codes are taken from the
# dataset's own `activity_names` mapping instead of a hard-coded count of 18,
# so the listing stays correct if the set of activities ever changes.
act_names = [
    kuhar_dataset.activity_names[code]
    for code in sorted(kuhar_dataset.activity_names)
]
act_names

['Stand',
 'Sit',
 'Talk-sit',
 'Talk-stand',
 'Stand-sit',
 'Lay',
 'Lay-stand',
 'Pick',
 'Jump',
 'Push-up',
 'Sit-up',
 'Walk',
 'Walk-backwards',
 'Walk-circle',
 'Run',
 'Stair-up',
 'Stair-down',
 'Table-tennis']

## Creating a Balanced KuHar View

In [5]:
# Iterator over the raw dataset. No `activities` filter is passed here; the
# repr displayed below reports how many users and activities it covers.
iterator = RawKuHarIterator(kuhar_dataset)
iterator

Kuhar Iterator: users=89, activities=18

In [6]:
# Windowing configuration: window length 300 with no overlap between windows
# (units are presumably samples -- confirm in KuHarDatasetGenerator).
kuhar_generator = KuHarDatasetGenerator(iterator, time_window=300, window_overlap=0)
kuhar_generator

Dataset generator: time_window=300, overlap=0

In [7]:
# Build the three subsets of the balanced view. The requested fractions sum
# to 1.0 and the seed is fixed; the digests printed in the next cell can be
# used to check that a re-run produced the same frames.
# NOTE(review): the flag names suggest that no user appears in two subsets and
# that activities are equalised within each subset -- confirm in
# KuHarDatasetGenerator.create_datasets.
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    seed=0
)

Generating full df over KuHar View: 1945it [01:41, 19.14it/s]


In [8]:
# Fingerprint every subset: SHA-1 over the per-row hashes computed by pandas.
# The digests are printed in train / validation / test order.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

ce08f8a1038ab48c890211a6d03233ea1c936c3d
8abd24b492d8cb93bae3055d57720ed7726c562b
7bffb5001651ab21c44b38cba2ed57703c0d3c41


## Normalize the activity label codes

In [9]:
def apply(row):
    """Return a copy of the frame with a "normalized activity code" column.

    Despite the parameter name, ``row`` is used as a whole DataFrame: its
    "activity code" column is translated through a fixed lookup table and the
    result is stored under "normalized activity code".

    The table moves six activities to the front -- Sit (1) -> 0,
    Stand (0) -> 1, Walk (11) -> 2, Stair-up (15) -> 3, Stair-down (16) -> 4,
    Run (14) -> 5 -- and assigns the remaining codes to 6..17 in ascending
    order of their original code.

    Args:
        row: DataFrame containing an "activity code" column.

    Returns:
        A new DataFrame with the extra column. The input frame is not
        modified, which avoids pandas' SettingWithCopyWarning when the input
        is a slice of another frame. Codes absent from the table become NaN.
    """
    code_to_normalized = {
        1: 0,
        0: 1,
        11: 2,
        15: 3,
        16: 4,
        14: 5,
        2: 6,
        3: 7,
        4: 8,
        5: 9,
        6: 10,
        7: 11,
        8: 12,
        9: 13,
        10: 14,
        12: 15,
        13: 16,
        17: 17,
        # Only codes 0-17 are listed among the activity names shown earlier;
        # this entry is kept so the mapping itself is unchanged.
        18: 18
    }
    normalized = row["activity code"].map(code_to_normalized)
    # assign() builds a new frame instead of writing into the caller's object.
    return row.assign(**{"normalized activity code": normalized})

# Add the normalized label column to each subset, in train / validation / test order.
train, validation, test = map(apply, (train, validation, test))

## Save the dataset

In [9]:
output_path = Path("../../data/views/KuHar/balanced_view")

# Per-activity sample counts keyed by the ORIGINAL KuHar activity code, so the
# keys match the codes of `kuhar_dataset.activity_names` used for the lookup
# below. Counting the "normalized activity code" column here would pair each
# activity name with the count of a different activity.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# One bullet per activity: "- <code>: <name> (<n> train, <n> validation, <n> test)".
# .get(code, 0) reports 0 for an activity that is absent from a subset instead
# of raising KeyError.
activities = "\n".join(
    f"- {code}: {name} ({train_act_samples.get(code, 0)} train, "
    f"{validation_act_samples.get(code, 0)} validation, "
    f"{test_act_samples.get(code, 0)} test)"
    for code, name in kuhar_dataset.activity_names.items()
)

# "<user> (<n> samples)" entries, ordered by user id.
train_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in train["user"].value_counts().sort_index().items()
)
validation_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in validation["user"].value_counts().sort_index().items()
)
test_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in test["user"].value_counts().sort_index().items()
)


# Human-readable description stored next to the view (text kept verbatim).
description = f"""# Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# Writer bound to the output directory; it is used by the following cell.
pandas_io = PandasDatasetsIO(output_path)

# Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
- 0: Stand (185 train, 6 validation, 21 test)
- 1: Sit (185 train, 6 validation, 21 test)
- 2: Talk-sit (185 train, 6 validation, 21 test)
- 3: Talk-stand (185 train, 6 validation, 21 test)
- 4: Stand-sit (185 train, 6 validation, 21 test)
- 5: Lay (185 train, 6 validation, 21 test)
- 6: Lay-stand (185 train, 6 validation, 21 test)
- 7: Pick (185 train, 6 validation, 21 test)
- 8: Jump (185 train, 6 validation, 21 test)
- 9: Push-up (185 train, 6 validation, 21 test)
- 10: Sit-up (185 train, 6 validation, 21 test)
- 11: Walk (185 train, 6 validation, 21 test)
- 12: Walk-backwards (185 train, 6 validation, 21 test)
- 13: Walk-circle (185 train, 6 

In [10]:
# Hand the three subsets and the description to the PandasDatasetsIO created
# for `output_path` (presumably written under that directory -- confirm).
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Non-Balanced KuHar View

In [11]:
# Same split settings and seed as the balanced view, but with
# balance_samples=False. The summary printed further below shows differing
# per-activity counts for this view.
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=False,
    seed=0
)

Generating full df over KuHar View: 1945it [01:35, 20.35it/s]


In [12]:
# Fingerprint every subset: SHA-1 over the per-row hashes computed by pandas.
# The digests are printed in train / validation / test order.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

31751c896807dc969d23b5b663b0081591977cac
3891f7989d3769fa0577834a4e51034bade10aaa
d1e70e186d0721a8ea1154733184b507222e9926


## Normalize the activity label codes

In [9]:
def apply(row):
    """Return a copy of the frame with a "normalized activity code" column.

    Despite the parameter name, ``row`` is used as a whole DataFrame: its
    "activity code" column is translated through a fixed lookup table and the
    result is stored under "normalized activity code".

    The table moves six activities to the front -- Sit (1) -> 0,
    Stand (0) -> 1, Walk (11) -> 2, Stair-up (15) -> 3, Stair-down (16) -> 4,
    Run (14) -> 5 -- and assigns the remaining codes to 6..17 in ascending
    order of their original code.

    Args:
        row: DataFrame containing an "activity code" column.

    Returns:
        A new DataFrame with the extra column. The input frame is not
        modified, which avoids pandas' SettingWithCopyWarning when the input
        is a slice of another frame. Codes absent from the table become NaN.
    """
    code_to_normalized = {
        1: 0,
        0: 1,
        11: 2,
        15: 3,
        16: 4,
        14: 5,
        2: 6,
        3: 7,
        4: 8,
        5: 9,
        6: 10,
        7: 11,
        8: 12,
        9: 13,
        10: 14,
        12: 15,
        13: 16,
        17: 17,
        # Only codes 0-17 are listed among the activity names shown earlier;
        # this entry is kept so the mapping itself is unchanged.
        18: 18
    }
    normalized = row["activity code"].map(code_to_normalized)
    # assign() builds a new frame instead of writing into the caller's object.
    return row.assign(**{"normalized activity code": normalized})

# Add the normalized label column to each subset, in train / validation / test order.
train, validation, test = map(apply, (train, validation, test))

## Save the dataset

In [13]:
output_path = Path("../../data/views/KuHar/non_balanced_view")

# Per-activity sample counts keyed by the original KuHar activity code, i.e.
# the same keys as `kuhar_dataset.activity_names`.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# One bullet per activity: "- <code>: <name> (<n> train, <n> validation, <n> test)".
# .get(code, 0) reports 0 for an activity that is absent from a subset instead
# of raising KeyError (nothing forces every activity into every subset here).
activities = "\n".join(
    f"- {code}: {name} ({train_act_samples.get(code, 0)} train, "
    f"{validation_act_samples.get(code, 0)} validation, "
    f"{test_act_samples.get(code, 0)} test)"
    for code, name in kuhar_dataset.activity_names.items()
)

# "<user> (<n> samples)" entries, ordered by user id.
train_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in train["user"].value_counts().sort_index().items()
)
validation_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in validation["user"].value_counts().sort_index().items()
)
test_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in test["user"].value_counts().sort_index().items()
)


# Human-readable description stored next to the view (text kept verbatim).
description = f"""# Non-Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:
{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# Writer bound to the output directory; it is used by the following cell.
pandas_io = PandasDatasetsIO(output_path)

# Non-Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:
- 0: Stand (1373 train, 183 validation, 274 test)
- 1: Sit (1328 train, 160 validation, 324 test)
- 2: Talk-sit (1321 train, 128 validation, 301 test)
- 3: Talk-stand (1344 train, 166 validation, 310 test)
- 4: Stand-sit (1550 train, 141 validation, 337 test)
- 5: Lay (1287 train, 190 validation, 305 test)
- 6: Lay-stand (1277 train, 168 validation, 242 test)
- 7: Pick (992 train, 131 validation, 158 test)
- 8: Jump (537 train, 56 validation, 63 test)
- 9: Push-up (219 train, 229 validation, 21 test)
- 10: Sit-up (823 train, 123 validation, 47 test)
- 11: Walk (627 train, 27 validation, 137 test)
- 12: Walk-backwards (217 train, 8 validation, 65 test)
- 13: Walk-circle (185 train, 6 validation, 50 test)
- 14: Run (488 train, 16 validation, 56 test)
- 15: Stair-up (750 train, 18 validati

In [14]:
# Hand the three subsets and the description to the PandasDatasetsIO created
# for `output_path` (presumably written under that directory -- confirm).
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Balanced KuHar View with Only MotionSense Activities

In [15]:
# Activities kept for the MotionSense-equivalent views. The position of a name
# in this list is the code the activity receives after remapping.
activities_to_select = ["Stair-down", "Stair-up", "Sit", "Stand", "Walk", "Run"]

# Original KuHar codes, in the same order as the names above.
activity_codes = []
for selected_name in activities_to_select:
    activity_codes.append(kuhar_dataset.activity_codes[selected_name])

print(f"MotionSense equivalent activity codes: {activity_codes}")

# Old KuHar code -> new code (0, 1, 2, ... following the list order).
activity_remap = dict(zip(activity_codes, range(len(activity_codes))))
remap_summary = ", ".join(
    f"{old} will become {new}" for old, new in activity_remap.items()
)
print(f"The codes will be remaped as motionsense: {remap_summary}")

MotionSense equivalent activity codes: [16, 15, 1, 0, 11, 14]
The codes will be remaped as motionsense: 16 will become 0, 15 will become 1, 1 will become 2, 0 will become 3, 11 will become 4, 14 will become 5


In [16]:
# Iterator restricted to the selected activity codes; the repr displayed below
# reports how many users and activities it covers.
iterator = RawKuHarIterator(kuhar_dataset, activities=activity_codes)
iterator

Kuhar Iterator: users=89, activities=6

In [17]:
# Same windowing as the full views: window length 300, no overlap
# (units are presumably samples -- confirm in KuHarDatasetGenerator).
kuhar_generator = KuHarDatasetGenerator(iterator, time_window=300, window_overlap=0)
kuhar_generator

Dataset generator: time_window=300, overlap=0

In [18]:
# Balanced split over the selected activities only. `activities_remap` passes
# the old-code -> new-code table built above; the warnings captured in the
# output show it being applied to the "activity code" column of each subset.
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    activities_remap=activity_remap,
    seed=0
)

Generating full df over KuHar View: 625it [00:31, 19.90it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.repl

In [19]:
# Fingerprint every subset: SHA-1 over the per-row hashes computed by pandas.
# The digests are printed in train / validation / test order.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

9d1dc2828f22204cd42a8988ab6ef542be8a7164
89224f59afa7a355f17f90c60cd5ee8d3038661d
2965d7c5682e64ba232262766cc17149a1b8095d


## Normalize the activity label codes

In [9]:
def apply(row):
    """Return a copy of the frame with a "normalized activity code" column.

    Despite the parameter name, ``row`` is used as a whole DataFrame: its
    "activity code" column is translated through a fixed lookup table and the
    result is stored under "normalized activity code".

    The input codes are the MotionSense-style codes printed earlier in the
    notebook (0 Stair-down, 1 Stair-up, 2 Sit, 3 Stand, 4 Walk, 5 Run). The
    table maps Sit -> 0, Stand -> 1, Walk -> 2, Stair-up -> 3,
    Stair-down -> 4 and Run -> 5.

    Args:
        row: DataFrame containing an "activity code" column.

    Returns:
        A new DataFrame with the extra column. The input frame is not
        modified, which avoids pandas' SettingWithCopyWarning when the input
        is a slice of another frame. Codes absent from the table become NaN.
    """
    code_to_normalized = {
        2: 0,
        3: 1,
        4: 2,
        1: 3,
        0: 4,
        5: 5,
    }
    normalized = row["activity code"].map(code_to_normalized)
    # assign() builds a new frame instead of writing into the caller's object.
    return row.assign(**{"normalized activity code": normalized})

# Add the normalized label column to each subset, in train / validation / test order.
train, validation, test = map(apply, (train, validation, test))

## Save the dataset

In [20]:
output_path = Path("../../data/views/KuHar/balanced_motionsense_equivalent_view")

# Per-activity sample counts keyed by the remapped activity code, i.e. the
# `new` values of `activity_remap`.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# One bullet per selected activity, labelled with its new code and KuHar name.
# .get(new, 0) reports 0 for an activity that is absent from a subset instead
# of raising KeyError.
activities = "\n".join(
    f"- {new}: {kuhar_dataset.activity_names[old]} ({train_act_samples.get(new, 0)} train, "
    f"{validation_act_samples.get(new, 0)} validation, "
    f"{test_act_samples.get(new, 0)} test)"
    for old, new in activity_remap.items()
)

# "<user> (<n> samples)" entries, ordered by user id.
train_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in train["user"].value_counts().sort_index().items()
)
validation_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in validation["user"].value_counts().sort_index().items()
)
test_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in test["user"].value_counts().sort_index().items()
)


# Human-readable description stored next to the view (text kept verbatim).
description = f"""# Balanced MotionSense equivalent KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: {', '.join(activities_to_select)}, were selected.
To each activity were assigned the same MotionSense activity code, thus: {', '.join(f'{old} ({kuhar_dataset.activity_names[old]} in KuHar) became {new} (in MotionSense)' for old, new in activity_remap.items())}

{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# Writer bound to the output directory; it is used by the following cell.
pandas_io = PandasDatasetsIO(output_path)

# Balanced MotionSense equivalent KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: Stair-down, Stair-up, Sit, Stand, Walk, Run, were selected.
To each activity were assigned the same MotionSense activity code, thus: 16 (Stair-down in KuHar) became 0 (in MotionSense), 15 (Stair-up in KuHar) became 1 (in MotionSense), 1 (Sit in KuHar) became 2 (in MotionSense), 0 (Stand in KuHar) became 3 (in MotionSense), 11 (Walk in KuHar) became 4 (in MotionSense), 14 (Run in KuHar) became 5 (in MotionSense)

- 0: Stair-down (485 train, 34 validation, 41 test)
- 1: Stair-up (485 train, 34 validation, 41 test

In [21]:
# Hand the three subsets and the description to the PandasDatasetsIO created
# for `output_path` (presumably written under that directory -- confirm).
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Non-Balanced KuHar View with Only MotionSense Activities

In [22]:
# Same generator, split settings, remap table and seed as the balanced
# MotionSense-equivalent view, but with balance_samples=False.
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=False,
    activities_remap=activity_remap,
    seed=0
)

Generating full df over KuHar View: 625it [00:27, 22.52it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.repl

In [23]:
# Fingerprint every subset: SHA-1 over the per-row hashes computed by pandas.
# The digests are printed in train / validation / test order.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

739318d8ef43c5126e478ef88d3171ea8ff362d5
67e119c0cef043f6cb29f432e2e6ee63751e9fc7
af713ad97427feb4864e935857f1c519f3e43c15


## Normalize the activity label codes

In [9]:
def apply(row):
    """Return a copy of the frame with a "normalized activity code" column.

    Despite the parameter name, ``row`` is used as a whole DataFrame: its
    "activity code" column is translated through a fixed lookup table and the
    result is stored under "normalized activity code".

    The input codes are the MotionSense-style codes printed earlier in the
    notebook (0 Stair-down, 1 Stair-up, 2 Sit, 3 Stand, 4 Walk, 5 Run). The
    table maps Sit -> 0, Stand -> 1, Walk -> 2, Stair-up -> 3,
    Stair-down -> 4 and Run -> 5.

    Args:
        row: DataFrame containing an "activity code" column.

    Returns:
        A new DataFrame with the extra column. The input frame is not
        modified, which avoids pandas' SettingWithCopyWarning when the input
        is a slice of another frame. Codes absent from the table become NaN.
    """
    code_to_normalized = {
        2: 0,
        3: 1,
        4: 2,
        1: 3,
        0: 4,
        5: 5,
    }
    normalized = row["activity code"].map(code_to_normalized)
    # assign() builds a new frame instead of writing into the caller's object.
    return row.assign(**{"normalized activity code": normalized})

# Add the normalized label column to each subset, in train / validation / test order.
train, validation, test = map(apply, (train, validation, test))

## Save the dataset

In [24]:
output_path = Path("../../data/views/KuHar/non_balanced_motionsense_equivalent_view")

# Per-activity sample counts keyed by the remapped activity code, i.e. the
# `new` values of `activity_remap`.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# One bullet per selected activity, labelled with its new code and KuHar name.
# .get(new, 0) reports 0 for an activity that is absent from a subset instead
# of raising KeyError (nothing forces every activity into every subset here).
activities = "\n".join(
    f"- {new}: {kuhar_dataset.activity_names[old]} ({train_act_samples.get(new, 0)} train, "
    f"{validation_act_samples.get(new, 0)} validation, "
    f"{test_act_samples.get(new, 0)} test)"
    for old, new in activity_remap.items()
)

# "<user> (<n> samples)" entries, ordered by user id.
train_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in train["user"].value_counts().sort_index().items()
)
validation_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in validation["user"].value_counts().sort_index().items()
)
test_users = ", ".join(
    f"{user} ({no_samples} samples)"
    for user, no_samples in test["user"].value_counts().sort_index().items()
)


# Human-readable description stored next to the view (text kept verbatim).
description = f"""# Non-Balanced MotionSense equivalent KuHar Dataset View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: {', '.join(activities_to_select)}, were selected.
To each activity were assigned the same MotionSense activity code, thus: {', '.join(f'{old} ({kuhar_dataset.activity_names[old]} in KuHar) became {new} (in MotionSense)' for old, new in activity_remap.items())}

{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# Writer bound to the output directory; it is used by the following cell.
pandas_io = PandasDatasetsIO(output_path)

# Non-Balanced MotionSense equivalent KuHar Dataset View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: Stair-down, Stair-up, Sit, Stand, Walk, Run, were selected.
To each activity were assigned the same MotionSense activity code, thus: 16 (Stair-down in KuHar) became 0 (in MotionSense), 15 (Stair-up in KuHar) became 1 (in MotionSense), 1 (Sit in KuHar) became 2 (in MotionSense), 0 (Stand in KuHar) became 3 (in MotionSense), 11 (Walk in KuHar) became 4 (in MotionSense), 14 (Run in KuHar) became 5 (in MotionSense)

- 0: Stair-down (693 train, 43 validation, 57 test)
- 1: Stair-up (706 train, 49 validation, 59 test)
- 2: Sit (1280 train, 138 validation, 394 test)
- 3: Stand (1314 train, 141 validation, 375 test)
- 4: Walk (586 train, 73 validation, 1

In [25]:
# Hand the three subsets and the description to the PandasDatasetsIO created
# for `output_path` (presumably written under that directory -- confirm).
pandas_io.save(train=train, validation=validation, test=test, description=description)