In [1]:
import os
import numpy as np
import pandas as pd
import time
from collections import defaultdict
from random import shuffle
from typing import List
from librep.actions.train import plot_histogram

In [2]:
# Human-readable label for each integer activity id. Passed as `labels_dict`
# to `plot_histogram` by `plot_meta_histogram`.
# NOTE(review): presumably these ids match the values of the metadata "class"
# column -- confirm against the dataset documentation.
activity_names = {
    0: "Stand",
    1: "Sit",
    2: "Talk-sit",
    3: "Talk-stand",
    4: "Stand-sit",
    5: "Lay",
    6: "Lay-stand",
    7: "Pick",
    8: "Jump",
    9: "Push-up",
    10: "Sit-up",
    11: "Walk",
    12: "Walk-backwards",
    13: "Walk-circle",
    14: "Run",
    15: "Stair-up",
    16: "Stair-down",
    17: "Table-tennis"
}

In [3]:
def read_data_meta(directory, datafile="data.npz", metafile="meta.csv"):
    """Load the sample array and its metadata table from a dataset directory.

    Parameters
    ----------
    directory : str
        Directory that contains both files.
    datafile : str, optional
        Name of the ``.npz`` archive; the array stored under the key
        ``"data"`` is returned.
    metafile : str, optional
        Name of the CSV file holding the metadata.

    Returns
    -------
    tuple
        ``(data, meta)`` where ``data`` is the NumPy array read from the
        archive and ``meta`` is the ``pandas.DataFrame`` parsed from the CSV.
    """
    data_fname = os.path.join(directory, datafile)
    meta_fname = os.path.join(directory, metafile)
    # np.load on an .npz archive returns an NpzFile that keeps the underlying
    # file open; the context manager closes it once the array has been read.
    with np.load(data_fname) as archive:
        data = archive["data"]
    meta = pd.read_csv(meta_fname)
    return data, meta

def train_val_test_split(df: pd.DataFrame, users: List[int], activities: List[int], train_size: float, validation_size: float, test_size: float):
    """Split ``df`` into train/validation/test subsets by user.

    The users are shuffled and cut into three consecutive groups, so that no
    user appears in more than one subset. A split is accepted only when the
    set of values in the ``"class"`` column of every subset equals
    ``set(activities)``. Up to 10 random shuffles are attempted.

    Parameters
    ----------
    df : pd.DataFrame
        Table with at least the columns ``"user"`` and ``"class"``.
    users : List[int]
        User ids to distribute. The list is not modified.
    activities : List[int]
        Class ids that every subset must contain.
    train_size : float
        Fraction of the users assigned to the train subset.
    validation_size : float
        Fraction of the users assigned to the validation subset.
    test_size : float
        Accepted for interface symmetry but not used: the test subset
        receives all users left after the train and validation cuts.

    Returns
    -------
    list
        ``[train_df, validation_df, test_df]`` on success, or
        ``[None, None, None]`` (after printing a message) when no attempt
        produced three subsets covering the requested activities.
    """
    retries = 10
    n_users = len(users)
    expected_classes = set(activities)

    # Round away floating-point noise before truncating: 0.7 + 0.1 evaluates
    # to 0.7999999999999999, so a bare int() would turn 10 * 0.8 into 7 and
    # leave the validation slice empty.
    train_end = int(round(n_users * train_size, 9))
    validation_end = int(round(n_users * (train_size + validation_size), 9))

    # Shuffle a private copy so the caller's list keeps its order.
    shuffled_users = list(users)

    for _ in range(retries):
        shuffle(shuffled_users)
        user_groups = [
            shuffled_users[:train_end],                # [start, train_end)
            shuffled_users[train_end:validation_end],  # [train_end, validation_end)
            shuffled_users[validation_end:],           # [validation_end, end]
        ]
        # Keep, for each group, only the rows that belong to its users.
        all_sets = [df[df["user"].isin(group)] for group in user_groups]
        # Every subset must contain exactly the requested activities.
        if all(set(subset["class"]) == expected_classes for subset in all_sets):
            return all_sets
    print("Does not found a 3 sets that contain the respective activities!")
    return [None, None, None]

def balance(dataframe):
    """Down-sample every class to the size of the smallest class.

    Rows are grouped by the ``"class"`` column and, for each class (in sorted
    order), as many rows as the least frequent class has are drawn at random
    without replacement.

    Parameters
    ----------
    dataframe : pd.DataFrame
        Table with a ``"class"`` column.

    Returns
    -------
    pd.DataFrame
        Concatenation of the per-class samples, keeping the original index
        labels. An input without any class (e.g. an empty frame) yields an
        empty frame with the same columns instead of raising.
    """
    class_counts = dataframe.groupby("class").size()
    if class_counts.empty:
        # pd.concat([]) would raise; there is nothing to balance.
        return dataframe.iloc[0:0]

    # The target size is the same for every class, so compute it once.
    target = class_counts.min()
    sampled = [
        dataframe.loc[dataframe["class"] == label].sample(n=target)
        for label in class_counts.index
    ]
    return pd.concat(sampled)

def plot_meta_histogram(metadata):
    """Plot the number of rows per class found in ``metadata``.

    Rows are counted per value of the ``"class"`` column and the counts are
    handed to ``plot_histogram`` together with ``activity_names`` as labels
    and the smallest/largest count as ``minv``/``maxv``.

    Parameters
    ----------
    metadata : pd.DataFrame
        Table with a ``"class"`` column.
    """
    sizes = metadata.groupby(["class"], as_index=False).size()["size"]
    # NOTE(review): with as_index=False the counts are indexed by row position,
    # so the dict keys below are 0..k-1 rather than the class ids. They match
    # the keys of `activity_names` only when the classes present are exactly
    # 0..k-1 -- confirm this is what `plot_histogram` expects.
    plot_histogram(
        sizes.to_dict(), labels_dict=activity_names,
        minv=sizes.min(), maxv=sizes.max(),
        min_max_v_xticks=1
    )

def select_from_meta(data, metadata):
    """Gather the samples of ``data`` referenced by ``metadata["np_index"]``.

    Parameters
    ----------
    data : np.ndarray
        Array whose first axis enumerates the samples. Any number of trailing
        axes is accepted.
    metadata : pd.DataFrame
        Table with an ``"np_index"`` column holding positions along the first
        axis of ``data``.

    Returns
    -------
    np.ndarray
        Float array of shape ``(len(metadata),) + data.shape[1:]`` in which
        row ``i`` is ``data[metadata["np_index"].iloc[i]]``.
    """
    positions = np.asarray(metadata["np_index"], dtype=np.intp)
    # Allocate a NaN-filled float buffer (as before) and fill it in a single
    # vectorised assignment instead of one Python-level copy per row.
    arr = np.full((len(metadata),) + tuple(data.shape[1:]), fill_value=np.nan)
    arr[...] = data[positions]
    return arr

In [9]:
# Views of the windowed (300 samples, no overlap) KuHar data. The split is
# computed once from the first view and then applied to every view by row
# position.
# NOTE(review): this assumes all three views store their samples in the same
# order and with the same length -- confirm.
paths = [
    "data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/",
    "data/dataset_views/kuhar/time-series-300-samples-0-overlap/accelerometer/486c8aad",
    "data/dataset_views/kuhar/time-series-300-samples-0-overlap/gyroscope/486c8aad",
]

data, metadata = read_data_meta(paths[0])
# Remember each row's position so it can be looked up in the data arrays.
metadata["np_index"] = range(len(metadata))
users = sorted(metadata["user"].unique().tolist())
classes = sorted(metadata["class"].unique().tolist())

train_meta, val_meta, test_meta = train_val_test_split(
    df=metadata, users=users, activities=classes, train_size=0.7, 
    validation_size=0.1, test_size=0.2)

# train_val_test_split returns [None, None, None] when no valid split was
# found; stop here with a clear error instead of failing inside balance().
if train_meta is None or val_meta is None or test_meta is None:
    raise RuntimeError(
        "train_val_test_split could not find a user split in which every "
        "subset contains all activities; re-run the cell to try again."
    )

# Equalise the number of samples per class within each subset.
train_meta = balance(train_meta)
val_meta = balance(val_meta)
test_meta = balance(test_meta)

for path in paths:
    print(f"---------- Starting with {path} ----------")
    # Only the array of this view is needed; the split metadata is reused.
    data, metadata = read_data_meta(path)
    for s, set_name in [(train_meta, "train"), (val_meta, "validation"), (test_meta, "test")]:
        arr = select_from_meta(data, s)
        data_output = os.path.join(path, set_name)
        meta_output = os.path.join(path, f"{set_name}.csv")
        # np.savez appends the ".npz" extension to the given file name.
        np.savez(data_output, data=arr)
        print(f"Data saved to '{data_output}.npz'")
        s.to_csv(meta_output, index=False)
        print(f"Metadata saved to '{meta_output}'")

---------- Starting with data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/ ----------
Data saved to 'data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/train.npz'
Metadata saved to 'data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/train.csv'
Data saved to 'data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/validation.npz'
Metadata saved to 'data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/validation.csv'
Data saved to 'data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/test.npz'
Metadata saved to 'data/dataset_views/kuhar/time-series-300-samples-0-overlap/all/486c8aad/test.csv'
---------- Starting with data/dataset_views/kuhar/time-series-300-samples-0-overlap/accelerometer/486c8aad ----------
Data saved to 'data/dataset_views/kuhar/time-series-300-samples-0-overlap/accelerometer/486c8aad/train.npz'
Metadata saved to 'data/dataset_views/kuhar/time-series-3

In [10]:
# Views of the raw KuHar data. The split is computed once from the first view
# and then applied to every view by row position.
# NOTE(review): this assumes all three views store their samples in the same
# order and with the same length -- confirm.
paths = [
    "data/dataset_views/kuhar/raw/all/486c8aad/",
    "data/dataset_views/kuhar/raw/accelerometer/486c8aad",
    "data/dataset_views/kuhar/raw/gyroscope/486c8aad"
]

data, metadata = read_data_meta(paths[0])
# Remember each row's position so it can be looked up in the data arrays.
metadata["np_index"] = range(len(metadata))
users = sorted(metadata["user"].unique().tolist())
classes = sorted(metadata["class"].unique().tolist())

train_meta, val_meta, test_meta = train_val_test_split(
    df=metadata, users=users, activities=classes, train_size=0.7, 
    validation_size=0.1, test_size=0.2)

# train_val_test_split returns [None, None, None] when no valid split was
# found; stop here with a clear error instead of failing inside balance().
if train_meta is None or val_meta is None or test_meta is None:
    raise RuntimeError(
        "train_val_test_split could not find a user split in which every "
        "subset contains all activities; re-run the cell to try again."
    )

# Equalise the number of samples per class within each subset.
train_meta = balance(train_meta)
val_meta = balance(val_meta)
test_meta = balance(test_meta)

for path in paths:
    print(f"---------- Starting with {path} ----------")
    # Only the array of this view is needed; the split metadata is reused.
    data, metadata = read_data_meta(path)
    for s, set_name in [(train_meta, "train"), (val_meta, "validation"), (test_meta, "test")]:
        arr = select_from_meta(data, s)
        data_output = os.path.join(path, set_name)
        meta_output = os.path.join(path, f"{set_name}.csv")
        # np.savez appends the ".npz" extension to the given file name.
        np.savez(data_output, data=arr)
        print(f"Data saved to '{data_output}.npz'")
        s.to_csv(meta_output, index=False)
        print(f"Metadata saved to '{meta_output}'")

---------- Starting with data/dataset_views/kuhar/raw/all/486c8aad/ ----------
Data saved to 'data/dataset_views/kuhar/raw/all/486c8aad/train.npz'
Metadata saved to 'data/dataset_views/kuhar/raw/all/486c8aad/train.csv'
Data saved to 'data/dataset_views/kuhar/raw/all/486c8aad/validation.npz'
Metadata saved to 'data/dataset_views/kuhar/raw/all/486c8aad/validation.csv'
Data saved to 'data/dataset_views/kuhar/raw/all/486c8aad/test.npz'
Metadata saved to 'data/dataset_views/kuhar/raw/all/486c8aad/test.csv'
---------- Starting with data/dataset_views/kuhar/raw/accelerometer/486c8aad ----------
Data saved to 'data/dataset_views/kuhar/raw/accelerometer/486c8aad/train.npz'
Metadata saved to 'data/dataset_views/kuhar/raw/accelerometer/486c8aad/train.csv'
Data saved to 'data/dataset_views/kuhar/raw/accelerometer/486c8aad/validation.npz'
Metadata saved to 'data/dataset_views/kuhar/raw/accelerometer/486c8aad/validation.csv'
Data saved to 'data/dataset_views/kuhar/raw/accelerometer/486c8aad/test.npz