# Pre-processing KuHar Dataset and Generate Views

In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("../../")

In [2]:
from pathlib import Path
from typing import List
import hashlib

import numpy as np
import pandas as pd

from librep.datasets.har.kuhar import (
    RawKuHar,
    RawKuHarIterator,
    KuHarDatasetGenerator
)

from librep.utils.dataset import PandasDatasetsIO

%matplotlib inline

2022-12-17 13:17:13.734206: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-12-17 13:17:13.734224: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


ImportError: cannot import name 'download_unzip_check' from 'librep.utils.file_ops' (/home/ic-unicamp/miniconda3/lib/python3.9/site-packages/librep/utils/file_ops.py)

In [None]:
dataset_dir = Path("../../data/datasets/KuHar/1.Raw_time_domain_data")
kuhar_dataset = RawKuHar(dataset_dir, download=False)
kuhar_dataset

In [None]:
act_names = [kuhar_dataset.activity_names[i] for i in range(18)]
act_names

## Creating a KuHar Balanced View

In [None]:
iterator = RawKuHarIterator(kuhar_dataset)
iterator

In [None]:
kuhar_generator = KuHarDatasetGenerator(iterator, time_window=300, window_overlap=0)
kuhar_generator

In [None]:
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    seed=0
)

In [None]:
print(hashlib.sha1(pd.util.hash_pandas_object(train).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(validation).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(test).values).hexdigest())

In [None]:
output_path = Path("../../data/views/KuHar/balanced_view")

train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()
activities = [f"- {name}: {code} ({train_act_samples[name]} train, {validation_act_samples[name]} validation, {test_act_samples[name]} test)" for name, code in kuhar_dataset.activity_names.items()]
activities = "\n".join(activities)

train_users = [f"{user} ({no_samples} samples)" for user, no_samples in sorted(train["user"].value_counts().sort_values().items(), key=lambda x: x[0])]
train_users = ', '.join(train_users)
validation_users = [f"{user} ({no_samples} samples)" for user, no_samples in sorted(validation["user"].value_counts().sort_values().items(), key=lambda x: x[0])]
validation_users = ', '.join(validation_users)
test_users = [f"{user} ({no_samples} samples)" for user, no_samples in sorted(test["user"].value_counts().items(), key=lambda x: x[0])]
test_users = ', '.join(test_users)


description = f"""# Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:
{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
pandas_io = PandasDatasetsIO(output_path)

In [None]:
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Non-Balanced KuHar Balanced View

In [None]:
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=False,
    seed=0
)

In [None]:
print(hashlib.sha1(pd.util.hash_pandas_object(train).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(validation).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(test).values).hexdigest())

In [None]:
output_path = Path("../../data/views/KuHar/non_balanced_view")

train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()
activities = [f"- {name}: {code} ({train_act_samples[name]} train, {validation_act_samples[name]} validation, {test_act_samples[name]} test)" for name, code in kuhar_dataset.activity_names.items()]
activities = "\n".join(activities)

train_users = [f"{user} ({no_samples} samples)" for user, no_samples in sorted(train["user"].value_counts().sort_values().items(), key=lambda x: x[0])]
train_users = ', '.join(train_users)
validation_users = [f"{user} ({no_samples} samples)" for user, no_samples in sorted(validation["user"].value_counts().sort_values().items(), key=lambda x: x[0])]
validation_users = ', '.join(validation_users)
test_users = [f"{user} ({no_samples} samples)" for user, no_samples in sorted(test["user"].value_counts().items(), key=lambda x: x[0])]
test_users = ', '.join(test_users)


description = f"""# Non-Balanced KuHar View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:
{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
pandas_io = PandasDatasetsIO(output_path)

In [None]:
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Balanced Kuhar with Only MotionSense activities

In [None]:
activities_to_select = [
    "Stair-down",
    "Stair-up",
    "Sit",
    "Stand",
    "Walk",
    "Run"
]

activity_codes = [
    kuhar_dataset.activity_codes[act_name]
    for act_name in activities_to_select
]

print(f"MotionSense equivalent activity codes: {activity_codes}")

activity_remap = {
    code: i
    for i, code in enumerate(activity_codes)
}
print(f"The codes will be remaped as motionsense: {', '.join(f'{old} will become {new}' for old, new in activity_remap.items())}")

In [None]:
iterator = RawKuHarIterator(kuhar_dataset, activities=activity_codes)
iterator

In [None]:
kuhar_generator = KuHarDatasetGenerator(iterator, time_window=300, window_overlap=0)
kuhar_generator

In [None]:
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=True,
    activities_remap=activity_remap,
    seed=0
)

In [None]:
print(hashlib.sha1(pd.util.hash_pandas_object(train).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(validation).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(test).values).hexdigest())

In [None]:
output_path = Path("../../data/views/KuHar/balanced_motionsense_equivalent_view")

train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()
activities = [
    f"- {new}: {kuhar_dataset.activity_names[old]} ({train_act_samples[new]} train, {validation_act_samples[new]} validation, {test_act_samples[new]} test)"
    for old, new in activity_remap.items()
]
activities = "\n".join(activities)

train_users = [
    f"{user} ({no_samples} samples)"
    for user, no_samples in sorted(
        train["user"].value_counts().sort_values().items(), key=lambda x: x[0]
    )
]
train_users = ", ".join(train_users)
validation_users = [
    f"{user} ({no_samples} samples)"
    for user, no_samples in sorted(
        validation["user"].value_counts().sort_values().items(), key=lambda x: x[0]
    )
]
validation_users = ", ".join(validation_users)
test_users = [
    f"{user} ({no_samples} samples)"
    for user, no_samples in sorted(
        test["user"].value_counts().items(), key=lambda x: x[0]
    )
]
test_users = ", ".join(test_users)


description = f"""# Balanced MotionSense equivalent KuHar Dataset

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

After splits, the datasets were balanced in relation to the activity code column, that is, each subset have the same number of activitiy samples.

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: {', '.join(activities_to_select)}, were selected.
To each activity were assigned the same MotionSense activity code, thus: {', '.join(f'{old} ({kuhar_dataset.activity_names[old]} in KuHar) became {new} (in MotionSense)' for old, new in activity_remap.items())}

{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
pandas_io = PandasDatasetsIO(output_path)

In [None]:
pandas_io.save(train=train, validation=validation, test=test, description=description)

## Creating a Non-Balanced Kuhar with Only MotionSense activities

In [None]:
train, validation, test = kuhar_generator.create_datasets(
    train_size=0.7,
    validation_size=0.1,
    test_size=0.2,
    ensure_distinct_users_per_dataset=True,
    balance_samples=False,
    activities_remap=activity_remap,
    seed=0
)

In [None]:
print(hashlib.sha1(pd.util.hash_pandas_object(train).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(validation).values).hexdigest())
print(hashlib.sha1(pd.util.hash_pandas_object(test).values).hexdigest())

In [None]:
output_path = Path("../../data/views/KuHar/non_balanced_motionsense_equivalent_view")

train_act_samples = train["activity code"].value_counts().to_dict()
validation_act_samples = validation["activity code"].value_counts().to_dict()
test_act_samples = test["activity code"].value_counts().to_dict()
activities = [
    f"- {new}: {kuhar_dataset.activity_names[old]} ({train_act_samples[new]} train, {validation_act_samples[new]} validation, {test_act_samples[new]} test)"
    for old, new in activity_remap.items()
]
activities = "\n".join(activities)

train_users = [
    f"{user} ({no_samples} samples)"
    for user, no_samples in sorted(
        train["user"].value_counts().sort_values().items(), key=lambda x: x[0]
    )
]
train_users = ", ".join(train_users)
validation_users = [
    f"{user} ({no_samples} samples)"
    for user, no_samples in sorted(
        validation["user"].value_counts().sort_values().items(), key=lambda x: x[0]
    )
]
validation_users = ", ".join(validation_users)
test_users = [
    f"{user} ({no_samples} samples)"
    for user, no_samples in sorted(
        test["user"].value_counts().items(), key=lambda x: x[0]
    )
]
test_users = ", ".join(test_users)


description = f"""# Non-Balanced MotionSense equivalent KuHar Dataset View

This view contains train, validation and test subsets in the following proportions:
- Train: 70% of samples
- Validation: 10% of samples
- Test: 20% of samples

## Activities:

This view contains only samples with activities codes equivalent to MotionSense.
In this way, only activities: {', '.join(activities_to_select)}, were selected.
To each activity were assigned the same MotionSense activity code, thus: {', '.join(f'{old} ({kuhar_dataset.activity_names[old]} in KuHar) became {new} (in MotionSense)' for old, new in activity_remap.items())}

{activities}

## Users
- {len(train.user.unique())} users train dataset: {train_users}.
- {len(validation.user.unique())} users validation dataset: {validation_users}.
- {len(test.user.unique())} users test dataset: {test_users}.

**NOTE**: Each subset contain samples from distinct users, that is, samples of one user belongs exclusivelly to one of three subsets.

"""

print(description)
pandas_io = PandasDatasetsIO(output_path)

In [None]:
pandas_io.save(train=train, validation=validation, test=test, description=description)