In [1]:
import os
import shutil
from pathlib import Path
from typing import Any, Callable, Dict, List, Optional, Tuple

import pandas as pd

from dataset_processor import (
    AddGravityColumn,
    ButterworthFilter,
    CalcTimeDiffMean,
    Convert_G_to_Ms2,
    PlotDiffMean,
    ResamplerPoly,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    SplitGuaranteeingAllUsersPerSplit,
    BalanceToMinimumClass,
    BalanceToMinimumClassAndUser,
    FilterByCommonRows,
    RenameColumns,
    Pipeline,
)

# NOTE: no global seed is set in the cells shown here; the splitters are seeded via random_state
import numpy as np

from utils import (
    read_kuhar,
    read_motionsense,
    read_wisdm,
    read_uci,
    read_realworld,
    sanity_function,
    real_world_organize,
)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Dataset names; matches the list iterated by the export loop.
datasets: List[str] = ["KuHar", "MotionSense", "UCI", "WISDM"]

# RealWorld activity names and, position by position, the standard activity
# code assigned to each of them. -1 is the code the export loop filters out.
tasks: List[str] = [
    "climbingdown",
    "climbingup",
    "jumping",
    "lying",
    "running",
    "sitting",
    "standing",
    "walking",
]
maping: List[int] = [4, 3, -1, -1, 5, 0, 1, 2]

# Pair every activity name with the code found at the same position.
standard_activity_code_realworld_map: Dict[str, int] = dict(zip(tasks, maping))

# Column renames per dataset. Every dataset is registered with None first;
# only MotionSense then receives an actual map (passed to RenameColumns in
# its pipeline).
columns_to_rename = dict.fromkeys(
    ("KuHar", "MotionSense", "WISDM", "UCI", "RealWorld")
)
columns_to_rename["MotionSense"] = {
    "userAcceleration.x": "accel-x",
    "userAcceleration.y": "accel-y",
    "userAcceleration.z": "accel-z",
    "rotationRate.x": "gyro-x",
    "rotationRate.y": "gyro-y",
    "rotationRate.z": "gyro-z",
}
# Reader function for each dataset. The export loop looks the reader up by
# dataset name and calls it with the Path of that dataset's raw files.
# (Annotated with typing.Callable: the builtin `callable` is a function, not a type.)
functions: Dict[str, Callable[..., Any]] = {
    "KuHar": read_kuhar,
    "MotionSense": read_motionsense,
    "WISDM": read_wisdm,
    "UCI": read_uci,
    "RealWorld": read_realworld,
}
# Where each dataset's raw files live, relative to ../data/original (the
# export loop prepends that prefix). The spelling "domian" is kept exactly as
# given because the string is used as a path.
dataset_path: Dict[str, str] = dict(
    KuHar="KuHar/1.Raw_time_domian_data",
    MotionSense="MotionSense/A_DeviceMotion_data",
    WISDM="WISDM/wisdm-dataset/raw/phone",
    UCI="UCI/RawData",
    RealWorld="RealWorld/realworld2016_dataset",
)

# Columns handed to the pipeline steps as `groupby_column`, per dataset.
# Each value is a list of column names, hence Dict[str, List[str]].
column_group: Dict[str, List[str]] = {
    "KuHar": ["user", "activity code", "csv"],
    "MotionSense": ["user", "activity code", "csv"],
    "WISDM": ["user", "activity code", "window"],
    "UCI": ["user", "activity code", "serial"],
    "RealWorld": ["user", "activity code", "position"],
}

# Building blocks for the per-dataset feature lists. They are always combined
# with `+` or copied with list(), so no two datasets share a list object.
_ACCEL_AXES = ["accel-x", "accel-y", "accel-z"]
_GYRO_AXES = ["gyro-x", "gyro-y", "gyro-z"]

# Signal columns of each dataset; the pipelines pass these to ResamplerPoly
# as `features_to_select`.
feature_columns: Dict[str, List[str]] = {
    "KuHar": _ACCEL_AXES + _GYRO_AXES,
    "MotionSense": _ACCEL_AXES
    + _GYRO_AXES
    + [
        "attitude.roll",
        "attitude.pitch",
        "attitude.yaw",
        "gravity.x",
        "gravity.y",
        "gravity.z",
    ],
    "WISDM": _ACCEL_AXES + _GYRO_AXES,
    "UCI": _ACCEL_AXES + _GYRO_AXES,
    "RealWorld": _ACCEL_AXES + _GYRO_AXES,
    "RecodGait_v1": list(_ACCEL_AXES),
    "RecodGait_v2": list(_ACCEL_AXES),
}

# Translation from each dataset's own activity label to the shared
# "standard activity code". Target codes, as far as this notebook shows:
#   0 sit, 1 stand, 2 walk, 3 stair up, 4 stair down, 5 run
#   -1 = discarded (rows with -1 are filtered out before export).
# Every map starts with all labels set to -1 and then overrides the ones that
# are kept; the overrides do not change key order.
# The values are Optional because the RecodGait entries hold None.
standard_activity_code_map: Dict[str, Optional[Dict[Any, int]]] = {
    # KuHar labels 0..17
    "KuHar": {
        **dict.fromkeys(range(18), -1),
        0: 1,
        1: 0,
        11: 2,
        14: 5,
        15: 3,
        16: 4,
    },
    "MotionSense": {0: 4, 1: 3, 2: 0, 3: 1, 4: 2, 5: 5},
    # WISDM labels A..S (there is no "N" entry).
    # NOTE(review): "C" maps to 6, a code no other dataset here uses — confirm its meaning.
    "WISDM": {
        **dict.fromkeys("ABCDEFGHIJKLMOPQRS", -1),
        "A": 2,
        "B": 5,
        "C": 6,
        "D": 0,
        "E": 1,
    },
    # UCI labels 1..12. Discarded: 6 (laying) and the transitions 7 stand to sit,
    # 8 sit to stand, 9 sit to lie, 10 lie to sit, 11 stand to lie, 12 lie to stand.
    "UCI": {
        **dict.fromkeys(range(1, 13), -1),
        1: 2,  # walk
        2: 3,  # stair up
        3: 4,  # stair down
        4: 0,  # sit
        5: 1,  # stand
    },
    "RealWorld": standard_activity_code_realworld_map,
    # NOTE(review): feature_columns uses "RecodGait_v1"/"RecodGait_v2" while this
    # map uses "RecodGait_v1"/"RecodGait" — confirm which key name is intended.
    "RecodGait_v1": None,
    "RecodGait": None,
}

In [3]:
def _cpc_pipeline(*steps: Any) -> Dict[str, Pipeline]:
    """Wrap ``steps``, in order, in a Pipeline stored under the key the export loop reads."""
    return {"standartized_cpc_dataset": Pipeline(list(steps))}


# One preprocessing pipeline per dataset. Steps run in the order listed.
# NOTE(review): the up/down ratios presumably bring every dataset to the same
# sampling rate as WISDM (which has no resampling step) — confirm.
pipelines: Dict[str, Dict[str, Pipeline]] = {
    "KuHar": _cpc_pipeline(
        CalcTimeDiffMean(
            groupby_column=column_group["KuHar"],
            column_to_diff="accel-start-time",
            new_column_name="timestamp diff",
        ),
        ResamplerPoly(
            features_to_select=feature_columns["KuHar"],
            up=2,
            down=10,
            groupby_column=column_group["KuHar"],
        ),
        AddStandardActivityCode(standard_activity_code_map["KuHar"]),
    ),
    "MotionSense": _cpc_pipeline(
        RenameColumns(columns_map=columns_to_rename["MotionSense"]),
        AddGravityColumn(
            axis_columns=["accel-x", "accel-y", "accel-z"],
            gravity_columns=["gravity.x", "gravity.y", "gravity.z"],
        ),
        Convert_G_to_Ms2(axis_columns=["accel-x", "accel-y", "accel-z"]),
        ButterworthFilter(axis_columns=["accel-x", "accel-y", "accel-z"], fs=50),
        ResamplerPoly(
            features_to_select=feature_columns["MotionSense"],
            up=2,
            down=5,
            groupby_column=column_group["MotionSense"],
        ),
        AddStandardActivityCode(standard_activity_code_map["MotionSense"]),
    ),
    "WISDM": _cpc_pipeline(
        CalcTimeDiffMean(
            groupby_column=column_group["WISDM"],
            column_to_diff="timestamp-accel",
            new_column_name="accel-timestamp-diff",
        ),
        ButterworthFilter(axis_columns=["accel-x", "accel-y", "accel-z"], fs=20),
        AddStandardActivityCode(standard_activity_code_map["WISDM"]),
    ),
    "UCI": _cpc_pipeline(
        Convert_G_to_Ms2(axis_columns=["accel-x", "accel-y", "accel-z"]),
        ButterworthFilter(axis_columns=["accel-x", "accel-y", "accel-z"], fs=50),
        ResamplerPoly(
            features_to_select=feature_columns["UCI"],
            up=2,
            down=5,
            groupby_column=column_group["UCI"],
        ),
        AddStandardActivityCode(standard_activity_code_map["UCI"]),
    ),
    "RealWorld": _cpc_pipeline(
        CalcTimeDiffMean(
            groupby_column=column_group["RealWorld"],
            column_to_diff="accel-start-time",
            new_column_name="timestamp diff",
        ),
        ButterworthFilter(axis_columns=["accel-x", "accel-y", "accel-z"], fs=50),
        ResamplerPoly(
            features_to_select=feature_columns["RealWorld"],
            up=2,
            down=5,
            groupby_column=column_group["RealWorld"],
        ),
        AddStandardActivityCode(standard_activity_code_map["RealWorld"]),
    ),
}

In [4]:
# Create the train, test and validation folders (disabled; the export loop creates per-dataset folders instead)

# os.makedirs("train", exist_ok=True) if not os.path.exists("train") else None
# os.makedirs("test", exist_ok=True) if not os.path.exists("test") else None
# os.makedirs("validation", exist_ok=True) if not os.path.exists("validation") else None

In [5]:
# Two splitters, both configured on the "user" column with
# "standard activity code" as the class column and a fixed random_state.
# They are annotated with their concrete class (not `object`) because the
# export loop calls them directly.

# First split: applied to the whole processed dataset, yielding (train, test).
split_data: SplitGuaranteeingAllClassesPerSplit = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.8,
    random_state=42,
)

# Second split: applied to the train part of the first split, yielding
# (train, validation).
split_data_train_val: SplitGuaranteeingAllClassesPerSplit = SplitGuaranteeingAllClassesPerSplit(
    column_to_split="user",
    class_column="standard activity code",
    train_size=0.9,
    random_state=42,
)

In [6]:
# For every dataset: read the raw files, run its pipeline, drop the rows whose
# standard activity code is -1, split into train/test/validation and write one
# CSV per user under "<dataset>_cpc/<split>/<user>.csv".
# NOTE: CSVs left over from earlier runs are not removed by this cell.
for dataset in datasets:
    reader = functions[dataset]

    # Create the output folders unconditionally. The previous guard tested the
    # unrelated paths "train"/"test"/"validation" in the working directory, so
    # the folders were skipped whenever those happened to exist.
    output_root = Path(f"{dataset}_cpc")
    split_dirs = {
        split_name: output_root / split_name
        for split_name in ("train", "test", "validation")
    }
    for split_dir in split_dirs.values():
        split_dir.mkdir(parents=True, exist_ok=True)

    print(f"Reading {dataset} dataset")
    path = Path(f"../data/original/{dataset_path[dataset]}")
    raw_dataset = reader(path)

    cpc_dataset = pipelines[dataset]["standartized_cpc_dataset"](raw_dataset)
    cpc_dataset = cpc_dataset[cpc_dataset["standard activity code"] != -1]

    # 80/20 train/test first, then 90/10 train/validation inside the train part.
    train, test = split_data(cpc_dataset)
    train, validation = split_data_train_val(train)

    for split_name, split_df in (
        ("train", train),
        ("test", test),
        ("validation", validation),
    ):
        for user, user_df in split_df.groupby("user"):
            # Kept from the original code; presumably redundant because
            # cpc_dataset was already filtered above — confirm before removing.
            user_df = user_df[user_df["standard activity code"] != -1]
            user_df.to_csv(split_dirs[split_name] / f"{user}.csv", index=False)



Reading KuHar dataset


Executing CalcTimeDiffMean
Executing ResamplerPoly


Resampling: 100%|██████████| 1944/1944 [00:09<00:00, 199.66it/s]


Executing AddStandardActivityCode
Reading MotionSense dataset
Executing RenameColumns
Executing AddGravityColumn
Executing Convert_G_to_Ms2
Executing ButterworthFilter
Executing ResamplerPoly


Resampling: 100%|██████████| 360/360 [00:04<00:00, 85.15it/s] 


Executing AddStandardActivityCode
Reading UCI dataset
Executing Convert_G_to_Ms2
Executing ButterworthFilter
Executing ResamplerPoly


Resampling: 100%|██████████| 714/714 [00:02<00:00, 238.63it/s]


Executing AddStandardActivityCode
Reading WISDM dataset
Executing CalcTimeDiffMean
Executing ButterworthFilter
Executing AddStandardActivityCode


In [7]:
# `train` is the variable left behind by the last iteration of the export
# loop, so this shows the training split of the last dataset (WISDM).
train



Unnamed: 0,timestamp-accel,accel-x,accel-y,accel-z,timestamp-gyro,gyro-x,gyro-y,gyro-z,activity code,user,window,accel-timestamp-diff,standard activity code
0,2.522077e+14,0.203999,-0.094127,0.447211,2.522080e+14,-0.875926,0.015412,0.166908,A,1600,1,50331648.0,2
1,2.522078e+14,3.052405,1.244885,2.027853,2.522080e+14,-0.722302,0.377747,-0.281811,A,1600,1,50331648.0,2
2,2.522078e+14,1.529101,2.781768,-0.320552,2.522081e+14,-0.575688,1.215620,-0.242955,A,1600,1,50331648.0,2
3,2.522079e+14,-1.086387,4.100796,-5.051378,2.522081e+14,-0.383247,1.216769,-0.216700,A,1600,1,50331648.0,2
4,2.522079e+14,-3.412541,3.553666,-1.501781,2.522082e+14,-0.241730,0.577081,0.106819,A,1600,1,50331648.0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
665015,3.570793e+14,-1.197990,0.377899,-3.381398,3.570794e+14,0.051526,0.020400,-0.039030,E,1650,1,33554432.0,1
665016,3.570794e+14,-1.284513,0.408743,-3.686069,3.570795e+14,-0.044752,-0.029419,-0.003082,E,1650,1,67108864.0,1
665017,3.570794e+14,-1.399170,0.544469,-3.707562,3.570795e+14,-0.280722,-0.194404,0.025977,E,1650,1,33554432.0,1
665018,3.570795e+14,-1.609891,0.684023,-4.334916,3.570796e+14,-0.133193,-0.210993,0.099131,E,1650,1,67108864.0,1
