# Pre-processing the KuHar Dataset and Generating Views

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("..")

In [2]:
from pathlib import Path
from typing import List
import hashlib

import numpy as np
import pandas as pd

from librep.datasets.har.kuhar import (
    RawKuHar,
    RawKuHarIterator,
    KuHarDatasetGenerator
)

from librep.utils.dataset import PandasDatasetsIO

%matplotlib inline

In [3]:
# Location of the raw time-domain KuHar files, relative to this notebook.
dataset_dir = Path("../data/datasets/KuHar/1.Raw_time_domain_data")
# NOTE(review): download=False presumably means the files must already exist
# at `dataset_dir` — confirm against RawKuHar.
kuhar_dataset = RawKuHar(dataset_dir, download=False)
# Bare last expression so the notebook displays the dataset's repr.
kuhar_dataset

KuHar Dataset at: '../data/datasets/KuHar/1.Raw_time_domain_data'

In [4]:
# Activity names ordered by activity code. `activity_names` maps
# activity code -> activity name, so the list is derived from the mapping
# itself instead of assuming a fixed number of activities.
act_names = [
    kuhar_dataset.activity_names[code]
    for code in sorted(kuhar_dataset.activity_names)
]
# Bare last expression so the notebook displays the list.
act_names

['Stand',
 'Sit',
 'Talk-sit',
 'Talk-stand',
 'Stand-sit',
 'Lay',
 'Lay-stand',
 'Pick',
 'Jump',
 'Push-up',
 'Sit-up',
 'Walk',
 'Walk-backwards',
 'Walk-circle',
 'Run',
 'Stair-up',
 'Stair-down',
 'Table-tennis']

## Creating a KuHar Balanced View

In [5]:
# Iterator over the raw dataset. No activity filter is passed here, so it
# covers every activity of the dataset (see the repr displayed below).
iterator = RawKuHarIterator(kuhar_dataset)
# Bare last expression so the notebook displays the iterator's repr.
iterator

Kuhar Iterator: users=89, activities=18

In [6]:
# Generator that builds the train/validation/test subsets from the iterator.
# NOTE(review): time_window=300 is presumably the number of samples per
# window and window_overlap=0 means consecutive windows do not overlap —
# confirm against KuHarDatasetGenerator.
kuhar_generator = KuHarDatasetGenerator(iterator, time_window=300, window_overlap=0)
# Bare last expression so the notebook displays the generator's repr.
kuhar_generator

Dataset generator: time_window=300, overlap=0

In [7]:
# Options for the balanced view: a 70/10/20 split in which each user appears
# in exactly one subset and the activity classes are balanced. The fixed seed
# keeps the split reproducible across runs.
split_kwargs = dict(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    seed=0,
)
train, validation, test = kuhar_generator.create_datasets(**split_kwargs)

Generating full df over KuHar View: 1945it [01:39, 19.48it/s]


In [8]:
# Print a SHA-1 fingerprint of each subset (train, validation, test, in that
# order) so a re-run can be compared against the hashes recorded below.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

ca61c922fc72ef34f12bb19c954f63e20d552e1c
53190325648181a44390dd332815ff82d67b6e66
788444769fed9b4a2910739f680a33634a5715dd


In [9]:
output_path = Path("../data/views/KuHar/balanced_view")

# Number of samples per activity code in each subset.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# `activity_names` maps activity code -> activity name. `.get(code, 0)` makes
# an activity that is absent from a subset show up as 0 samples instead of
# raising KeyError.
activities = "\n".join(
    f"- {code}: {name} "
    f"({train_act_samples.get(code, 0)} train, "
    f"{validation_act_samples.get(code, 0)} validation, "
    f"{test_act_samples.get(code, 0)} test)"
    for code, name in kuhar_dataset.activity_names.items()
)

# One "user (N samples)" entry per user, ordered by user id. Users are unique
# in `value_counts()`, so sorting the (user, count) pairs orders by user.
train_users, validation_users, test_users = (
    ", ".join(
        f"{user} ({no_samples} samples)"
        for user, no_samples in sorted(subset["user"].value_counts().items())
    )
    for subset in (train, validation, test)
)


description = f"""# Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# IO helper bound to the view's output directory; used by the next cell to save.
pandas_io = PandasDatasetsIO(output_path)

# Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
- 0: Stand (176 train, 13 validation, 24 test)
- 1: Sit (176 train, 13 validation, 24 test)
- 2: Talk-sit (176 train, 13 validation, 24 test)
- 3: Talk-stand (176 train, 13 validation, 24 test)
- 4: Stand-sit (176 train, 13 validation, 24 test)
- 5: Lay (176 train, 13 validation, 24 test)
- 6: Lay-stand (176 train, 13 validation, 24 test)
- 7: Pick (176 train, 13 validation, 24 test)
- 8: Jump (176 train, 13 validation, 24 test)
- 9: Push-up (176 train, 13 validation, 24 test)
- 10: Sit-up (176 train, 13 validation, 24 test)
- 11: Walk (176 train, 13 validation, 24 test)
- 12: Walk-backwards (176 train, 13 validation, 24 test)
- 13: Walk-circle (

In [10]:
# Save the three subsets and their description through the PandasDatasetsIO
# created for `output_path` in the previous cell.
# NOTE(review): the on-disk layout is decided by PandasDatasetsIO.save and is
# not visible from this notebook.
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Non-Balanced KuHar View

In [11]:
# Same 70/10/20 user-disjoint split and seed as the balanced view, but the
# activity classes are left unbalanced.
split_kwargs = dict(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=False,
    seed=0,
)
train, validation, test = kuhar_generator.create_datasets(**split_kwargs)

Generating full df over KuHar View: 1945it [01:34, 20.60it/s]


In [12]:
# Print a SHA-1 fingerprint of each subset (train, validation, test, in that
# order) so a re-run can be compared against the hashes recorded below.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

7f86d5cebdc4a20af30d506e80f17498af521c92
aaf4800108b16aac80b2169331c5d9f112bd21c3
c41fa9a09d4641d982c215648a8c2e04eef5abde


In [13]:
output_path = Path("../data/views/KuHar/non_balanced_view")

# Number of samples per activity code in each subset.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# `activity_names` maps activity code -> activity name. `.get(code, 0)` makes
# an activity that is absent from a subset show up as 0 samples instead of
# raising KeyError (possible here because the classes are not balanced).
activities = "\n".join(
    f"- {code}: {name} "
    f"({train_act_samples.get(code, 0)} train, "
    f"{validation_act_samples.get(code, 0)} validation, "
    f"{test_act_samples.get(code, 0)} test)"
    for code, name in kuhar_dataset.activity_names.items()
)

# One "user (N samples)" entry per user, ordered by user id. Users are unique
# in `value_counts()`, so sorting the (user, count) pairs orders by user.
train_users, validation_users, test_users = (
    ", ".join(
        f"{user} ({no_samples} samples)"
        for user, no_samples in sorted(subset["user"].value_counts().items())
    )
    for subset in (train, validation, test)
)


description = f"""# Non-Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:
{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# IO helper bound to the view's output directory; used by the next cell to save.
pandas_io = PandasDatasetsIO(output_path)

# Non-Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:
- 0: Stand (1391 train, 141 validation, 298 test)
- 1: Sit (1401 train, 146 validation, 265 test)
- 2: Talk-sit (1356 train, 147 validation, 247 test)
- 3: Talk-stand (1405 train, 146 validation, 269 test)
- 4: Stand-sit (1552 train, 162 validation, 314 test)
- 5: Lay (1359 train, 143 validation, 280 test)
- 6: Lay-stand (1340 train, 128 validation, 219 test)
- 7: Pick (1029 train, 90 validation, 162 test)
- 8: Jump (557 train, 27 validation, 72 test)
- 9: Push-up (432 train, 13 validation, 24 test)
- 10: Sit-up (894 train, 29 validation, 70 test)
- 11: Walk (615 train, 87 validation, 89 test)
- 12: Walk-backwards (232 train, 19 validation, 39 test)
- 13: Walk-circle (176 train, 21 validation, 44 test)
- 14: Run (497 train, 24 validation, 39 test)
- 15: Stair-up (752 train, 30 validatio

In [14]:
# Save the three subsets and their description through the PandasDatasetsIO
# created for `output_path` in the previous cell.
# NOTE(review): the on-disk layout is decided by PandasDatasetsIO.save and is
# not visible from this notebook.
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Balanced KuHar View with Only MotionSense Activities

In [15]:
# KuHar activities that have a MotionSense equivalent. The position of each
# name in this list becomes its new activity code.
activities_to_select = ["Stair-down", "Stair-up", "Sit", "Stand", "Walk", "Run"]

# Original KuHar codes, in the same order as `activities_to_select`.
activity_codes = [kuhar_dataset.activity_codes[label] for label in activities_to_select]

print(f"MotionSense equivalent activity codes: {activity_codes}")

# Mapping: original KuHar code -> new code (index in `activity_codes`).
activity_remap = dict(zip(activity_codes, range(len(activity_codes))))
remap_summary = ", ".join(
    f"{old} will become {new}" for old, new in activity_remap.items()
)
print(f"The codes will be remaped as motionsense: {remap_summary}")

MotionSense equivalent activity codes: [16, 15, 1, 0, 11, 14]
The codes will be remaped as motionsense: 16 will become 0, 15 will become 1, 1 will become 2, 0 will become 3, 11 will become 4, 14 will become 5


In [16]:
# Rebind `iterator` to one restricted to the MotionSense-equivalent activity
# codes computed in the previous cell (replaces the unfiltered iterator).
iterator = RawKuHarIterator(kuhar_dataset, activities=activity_codes)
# Bare last expression so the notebook displays the iterator's repr.
iterator

Kuhar Iterator: users=89, activities=6

In [17]:
# Rebind `kuhar_generator` so it reads from the filtered iterator, using the
# same time_window / window_overlap settings as the full-dataset generator.
kuhar_generator = KuHarDatasetGenerator(iterator, time_window=300, window_overlap=0)
# Bare last expression so the notebook displays the generator's repr.
kuhar_generator

Dataset generator: time_window=300, overlap=0

In [18]:
# Balanced 70/10/20 user-disjoint split of the filtered activities. The
# KuHar activity codes are rewritten according to `activity_remap`.
split_kwargs = dict(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    activities_remap=activity_remap,
    seed=0,
)
train, validation, test = kuhar_generator.create_datasets(**split_kwargs)

Generating full df over KuHar View: 625it [00:31, 20.00it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.repl

In [19]:
# Print a SHA-1 fingerprint of each subset (train, validation, test, in that
# order) so a re-run can be compared against the hashes recorded below.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

d2f65a92a465ec6c7ed4043ddb0d29d1e2512f4d
be157d48d43b077bc741110a36ddede4bbafe27d
fef3d897a8fbd6c20f45f70095c69bde637ce057


In [20]:
output_path = Path("../data/views/KuHar/balanced_motionsense_equivalent_view")

# Number of samples per (remapped) activity code in each subset.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# One line per selected activity, keyed by the new code. `.get(new, 0)` makes
# an activity that is absent from a subset show up as 0 samples instead of
# raising KeyError.
activities = "\n".join(
    f"- {new}: {kuhar_dataset.activity_names[old]} "
    f"({train_act_samples.get(new, 0)} train, "
    f"{validation_act_samples.get(new, 0)} validation, "
    f"{test_act_samples.get(new, 0)} test)"
    for old, new in activity_remap.items()
)

# One "user (N samples)" entry per user, ordered by user id. Users are unique
# in `value_counts()`, so sorting the (user, count) pairs orders by user.
train_users, validation_users, test_users = (
    ", ".join(
        f"{user} ({no_samples} samples)"
        for user, no_samples in sorted(subset["user"].value_counts().items())
    )
    for subset in (train, validation, test)
)


description = f"""# Balanced MotionSense equivalent KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: {', '.join(activities_to_select)}, were selected.
To each activity were assigned the same MotionSense activity code, thus: {', '.join(f'{old} ({kuhar_dataset.activity_names[old]} in KuHar) became {new} (in MotionSense)' for old, new in activity_remap.items())}

{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# IO helper bound to the view's output directory; used by the next cell to save.
pandas_io = PandasDatasetsIO(output_path)

# Balanced MotionSense equivalent KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: Stair-down, Stair-up, Sit, Stand, Walk, Run, were selected.
To each activity were assigned the same MotionSense activity code, thus: 16 (Stair-down in KuHar) became 0 (in MotionSense), 15 (Stair-up in KuHar) became 1 (in MotionSense), 1 (Sit in KuHar) became 2 (in MotionSense), 0 (Stand in KuHar) became 3 (in MotionSense), 11 (Walk in KuHar) became 4 (in MotionSense), 14 (Run in KuHar) became 5 (in MotionSense)

- 0: Stair-down (231 train, 10 validation, 104 test)
- 1: Stair-up (231 train, 10 validation, 104 te

In [21]:
# Save the three subsets and their description through the PandasDatasetsIO
# created for `output_path` in the previous cell.
# NOTE(review): the on-disk layout is decided by PandasDatasetsIO.save and is
# not visible from this notebook.
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Non-Balanced KuHar View with Only MotionSense Activities

In [22]:
# Same user-disjoint 70/10/20 split, seed and code remapping as the balanced
# MotionSense view, but the activity classes are left unbalanced.
split_kwargs = dict(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=False,
    activities_remap=activity_remap,
    seed=0,
)
train, validation, test = kuhar_generator.create_datasets(**split_kwargs)

Generating full df over KuHar View: 625it [00:30, 20.35it/s]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  validation.replace({"activity code": activities_remap}, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test.repl

In [23]:
# Print a SHA-1 fingerprint of each subset (train, validation, test, in that
# order) so a re-run can be compared against the hashes recorded below.
for subset in (train, validation, test):
    row_hashes = pd.util.hash_pandas_object(subset).values
    print(hashlib.sha1(row_hashes).hexdigest())

9bb7b85f4d2225a1c792267e1a6ecc094c90e2e0
fc5f14633cc6c27e7a1d6cb880db8309b4a83fa8
17afce3067ecec5fd2c1f10b56ac8be08f4fb8ab


In [24]:
output_path = Path("../data/views/KuHar/non_balanced_motionsense_equivalent_view")

# Number of samples per (remapped) activity code in each subset.
train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()

# One line per selected activity, keyed by the new code. `.get(new, 0)` makes
# an activity that is absent from a subset show up as 0 samples instead of
# raising KeyError (possible here because the classes are not balanced).
activities = "\n".join(
    f"- {new}: {kuhar_dataset.activity_names[old]} "
    f"({train_act_samples.get(new, 0)} train, "
    f"{validation_act_samples.get(new, 0)} validation, "
    f"{test_act_samples.get(new, 0)} test)"
    for old, new in activity_remap.items()
)

# One "user (N samples)" entry per user, ordered by user id. Users are unique
# in `value_counts()`, so sorting the (user, count) pairs orders by user.
train_users, validation_users, test_users = (
    ", ".join(
        f"{user} ({no_samples} samples)"
        for user, no_samples in sorted(subset["user"].value_counts().items())
    )
    for subset in (train, validation, test)
)


description = f"""# Non-Balanced MotionSense equivalent KuHar Dataset View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: {', '.join(activities_to_select)}, were selected.
To each activity were assigned the same MotionSense activity code, thus: {', '.join(f'{old} ({kuhar_dataset.activity_names[old]} in KuHar) became {new} (in MotionSense)' for old, new in activity_remap.items())}

{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
# IO helper bound to the view's output directory; used by the next cell to save.
pandas_io = PandasDatasetsIO(output_path)

# Non-Balanced MotionSense equivalent KuHar Dataset View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: Stair-down, Stair-up, Sit, Stand, Walk, Run, were selected.
To each activity were assigned the same MotionSense activity code, thus: 16 (Stair-down in KuHar) became 0 (in MotionSense), 15 (Stair-up in KuHar) became 1 (in MotionSense), 1 (Sit in KuHar) became 2 (in MotionSense), 0 (Stand in KuHar) became 3 (in MotionSense), 11 (Walk in KuHar) became 4 (in MotionSense), 14 (Run in KuHar) became 5 (in MotionSense)

- 0: Stair-down (242 train, 13 validation, 538 test)
- 1: Stair-up (254 train, 18 validation, 542 test)
- 2: Sit (1273 train, 160 validation, 379 test)
- 3: Stand (1273 train, 160 validation, 397 test)
- 4: Walk (630 train, 57 validation,

In [25]:
# Save the three subsets and their description through the PandasDatasetsIO
# created for `output_path` in the previous cell.
# NOTE(review): the on-disk layout is decided by PandasDatasetsIO.save and is
# not visible from this notebook.
pandas_io.save(train=train, validation=validation, test=test, description=description)