In [20]:
import os
import shutil
from pathlib import Path
from typing import Any, Callable, Dict, List, Tuple

import numpy as np
import pandas as pd

from dataset_processor import (
    AddGravityColumn,
    ButterworthFilter,
    CalcTimeDiffMean,
    Convert_G_to_Ms2,
    PlotDiffMean,
    ResamplerPoly,
    Windowize,
    AddStandardActivityCode,
    SplitGuaranteeingAllClassesPerSplit,
    SplitGuaranteeingAllUsersPerSplit,
    BalanceToMinimumClass,
    BalanceToMinimumClassAndUser,
    FilterByCommonRows,
    RenameColumns,
    Pipeline,
)
from utils import (
    read_kuhar,
    read_motionsense,
    read_wisdm,
    read_uci,
    read_realworld,
    sanity_function,
    real_world_organize,
)

In [21]:
# Dataset names listed for this notebook.
datasets: List[str] = ["KuHar", "MotionSense", "UCI", "WISDM"]

# RealWorld activity names, in the order that `maping` is aligned with.
tasks: List[str] = [
    "climbingdown",
    "climbingup",
    "jumping",
    "lying",
    "running",
    "sitting",
    "standing",
    "walking",
]

# Standard activity code for the activity at the same position in `tasks`.
# -1 is the code that the export cell filters out before writing CSVs.
maping: List[int] = [4, 3, -1, -1, 5, 0, 1, 2]

# Pair every RealWorld activity name with its standard activity code.
standard_activity_code_realworld_map: Dict[str, int] = dict(zip(tasks, maping))

# Per-dataset mapping from original column names to the `accel-*` / `gyro-*`
# names used throughout this notebook. None means no mapping is defined for
# that dataset.
columns_to_rename = {
    "KuHar": None,
    "MotionSense": dict(
        [
            ("userAcceleration.x", "accel-x"),
            ("userAcceleration.y", "accel-y"),
            ("userAcceleration.z", "accel-z"),
            ("rotationRate.x", "gyro-x"),
            ("rotationRate.y", "gyro-y"),
            ("rotationRate.z", "gyro-z"),
        ]
    ),
    "WISDM": None,
    "UCI": None,
    "RealWorld": None,
}
# Reader function for each dataset, keyed by dataset name. The export loop
# looks up the reader here and calls it with the dataset's directory path.
# The value type is `Callable[..., Any]`: the builtin `callable` is a
# predicate function, not a type, so it is not a valid annotation.
functions: Dict[str, Callable[..., Any]] = {
    "KuHar": read_kuhar,
    "MotionSense": read_motionsense,
    "WISDM": read_wisdm,
    "UCI": read_uci,
    "RealWorld": read_realworld,
}
# Location of each dataset's raw files, relative to the `../data/original`
# directory that the export loop prepends.
dataset_path: Dict[str, str] = dict(
    KuHar="KuHar/1.Raw_time_domian_data",
    MotionSense="MotionSense/A_DeviceMotion_data",
    WISDM="WISDM/wisdm-dataset/raw/phone",
    UCI="UCI/RawData",
    RealWorld="RealWorld/realworld2016_dataset",
)

# Columns that identify one group of rows in each dataset; the UCI entry is
# passed as `groupby_column` to the resampling step of the pipeline.
# Each value is a list of column names, hence `Dict[str, List[str]]`.
column_group: Dict[str, List[str]] = {
    "KuHar": ["user", "activity code", "csv"],
    "MotionSense": ["user", "activity code", "csv"],
    "WISDM": ["user", "activity code", "window"],
    "UCI": ["user", "activity code", "serial"],
    "RealWorld": ["user", "activity code", "position"],
}

# Axis names shared by the per-dataset feature lists below.
_accel_axes = ("accel-x", "accel-y", "accel-z")
_gyro_axes = ("gyro-x", "gyro-y", "gyro-z")

# Feature (sensor) columns available in each dataset. Every entry is built as
# its own list so that no two datasets share the same list object.
feature_columns: Dict[str, List[str]] = {
    "KuHar": [*_accel_axes, *_gyro_axes],
    "MotionSense": [
        *_accel_axes,
        *_gyro_axes,
        "attitude.roll",
        "attitude.pitch",
        "attitude.yaw",
        "gravity.x",
        "gravity.y",
        "gravity.z",
    ],
    "WISDM": [*_accel_axes, *_gyro_axes],
    "UCI": [*_accel_axes, *_gyro_axes],
    "RealWorld": [*_accel_axes, *_gyro_axes],
    "RecodGait_v1": [*_accel_axes],
    "RecodGait_v2": [*_accel_axes],
}

# Translation from each dataset's native activity label to the shared
# "standard activity code". Judging by the UCI comments and the RealWorld
# table: 0 = sit, 1 = stand, 2 = walk, 3 = stair up, 4 = stair down, 5 = run.
# -1 marks labels that the export loop discards before writing CSVs.
# NOTE(review): WISDM "C" maps to 6, a code no other dataset here uses —
# confirm it is intentional.
# NOTE(review): the RecodGait keys here ("RecodGait_v1", "RecodGait") do not
# match the ones in `feature_columns` ("RecodGait_v1", "RecodGait_v2") —
# confirm which spelling is intended. Both entries are None (no mapping
# defined), which the value annotation below does not reflect.
standard_activity_code_map: Dict[str, Dict[Any, int]] = {
    "KuHar": {
        0: 1,
        1: 0,
        2: -1,
        3: -1,
        4: -1,
        5: -1,
        6: -1,
        7: -1,
        8: -1,
        9: -1,
        10: -1,
        11: 2,
        12: -1,
        13: -1,
        14: 5,
        15: 3,
        16: 4,
        17: -1,
    },
    "MotionSense": {0: 4, 1: 3, 2: 0, 3: 1, 4: 2, 5: 5},
    "WISDM": {
        "A": 2,
        "B": 5,
        "C": 6,
        "D": 0,
        "E": 1,
        "F": -1,
        "G": -1,
        "H": -1,
        "I": -1,
        "J": -1,
        "K": -1,
        "L": -1,
        "M": -1,
        "O": -1,
        "P": -1,
        "Q": -1,
        "R": -1,
        "S": -1,
    },
    "UCI": {
        1: 2,  # walk
        2: 3,  # stair up
        3: 4,  # stair down
        4: 0,  # sit
        5: 1,  # stand
        6: -1,  # Laying
        7: -1,  # stand to sit
        8: -1,  # sit to stand
        9: -1,  # sit to lie
        10: -1,  # lie to sit
        11: -1,  # stand to lie
        12: -1,  # lie to stand
    },
    # RealWorld reuses the name -> code table derived from `tasks` / `maping`.
    "RealWorld": standard_activity_code_realworld_map,
    "RecodGait_v1": None,
    "RecodGait": None,
}

In [22]:
# Steps of the UCI preprocessing chain, created in the order they are run.
# The accelerometer axes are converted first (presumably from g to m/s^2,
# going by the step's name — confirm in dataset_processor).
_uci_unit_conversion = Convert_G_to_Ms2(axis_columns=["accel-x", "accel-y", "accel-z"])

# Butterworth filtering of the same axes; `fs=50` is presumably the sampling
# rate in Hz — confirm against ButterworthFilter.
_uci_butterworth = ButterworthFilter(
    axis_columns=["accel-x", "accel-y", "accel-z"],
    fs=50,
)

# Resample every UCI feature column with ratio up/down = 2/5, one group of
# (user, activity code, serial) rows at a time.
_uci_resampler = ResamplerPoly(
    features_to_select=feature_columns["UCI"],
    up=2,
    down=5,
    groupby_column=column_group["UCI"],
)

# Attach the shared "standard activity code" using the UCI label table.
_uci_activity_coder = AddStandardActivityCode(standard_activity_code_map["UCI"])

# Registry of pipelines: dataset name -> pipeline name -> Pipeline.
pipelines: Dict[str, Dict[str, Pipeline]] = {
    "UCI": {
        "standartized_cpc_dataset": Pipeline(
            [
                _uci_unit_conversion,
                _uci_butterworth,
                _uci_resampler,
                _uci_activity_coder,
            ]
        ),
    }
}

In [23]:
# Create the train, test and validation output folders.
# `exist_ok=True` already makes the call a no-op when the folder exists, so a
# separate `os.path.exists` check is unnecessary.
for split_dir in ("train", "test", "validation"):
    os.makedirs(split_dir, exist_ok=True)

In [24]:
# Arguments common to both splitters: split on the "user" column, track the
# "standard activity code" classes, and use a fixed random state.
_user_split_options = dict(
    column_to_split="user",
    class_column="standard activity code",
    random_state=42,
)

# Applied to the full processed dataset to obtain (train, test), with
# train_size=0.8.
split_data: object = SplitGuaranteeingAllClassesPerSplit(
    train_size=0.8, **_user_split_options
)

# Applied to the resulting train part to obtain (train, validation), with
# train_size=0.9.
split_data_train_val: object = SplitGuaranteeingAllClassesPerSplit(
    train_size=0.9, **_user_split_options
)

In [25]:
def _export_split_by_user(split_df: pd.DataFrame, out_dir: str) -> None:
    """Write one CSV per user of ``split_df`` into ``out_dir``.

    Rows whose "standard activity code" is -1 are dropped before writing.
    Each file is named ``<user>.csv`` and is written without the index
    column. A user whose rows are all -1 still gets a header-only file.

    Args:
        split_df: Frame with at least the "user" and
            "standard activity code" columns.
        out_dir: Existing directory that receives the CSV files.
    """
    for user, user_df in split_df.groupby("user"):
        kept_rows = user_df[user_df["standard activity code"] != -1]
        kept_rows.to_csv(f"{out_dir}/{user}.csv", index=False)


for dataset in ["UCI"]:
    reader = functions[dataset]

    print(f"Reading {dataset} dataset")
    path = Path(f"../data/original/{dataset_path[dataset]}")
    raw_dataset = reader(path)

    cpc_dataset = pipelines[dataset]["standartized_cpc_dataset"](raw_dataset)

    # First split off the test part, then carve the validation part out of
    # what remains of train.
    train, test = split_data(cpc_dataset)
    train, validation = split_data_train_val(train)

    # Same export for every split; only the frame and target folder differ.
    for out_dir, split_df in (
        ("train", train),
        ("test", test),
        ("validation", validation),
    ):
        _export_split_by_user(split_df, out_dir)



Reading UCI dataset


Executing Convert_G_to_Ms2
Executing ButterworthFilter
Executing ResamplerPoly


Resampling: 100%|██████████| 714/714 [00:02<00:00, 257.42it/s]


Executing AddStandardActivityCode


In [26]:
# Display the training split left in `train` by the export loop.
train

Unnamed: 0,level_0,accel-x,accel-y,accel-z,gyro-x,gyro-y,gyro-z,txt,user,serial,index,activity code,standard activity code
0,52654,0.268158,0.531227,-0.504270,0.069122,0.407724,-0.283770,../data/original/UCI/RawData/gyro_exp05_user03...,3,5,243,5,1
1,52655,0.333556,0.828683,-0.699662,0.138770,0.546359,-0.240244,../data/original/UCI/RawData/gyro_exp05_user03...,3,5,244,5,1
2,52656,1.947536,0.593388,-0.075724,0.110050,0.232923,0.057342,../data/original/UCI/RawData/gyro_exp05_user03...,3,5,245,5,1
3,52657,0.325792,0.823183,-0.385482,0.203203,-0.241347,0.180162,../data/original/UCI/RawData/gyro_exp05_user03...,3,5,246,5,1
4,52658,0.650040,0.436959,-0.595097,0.181359,-0.232444,0.188626,../data/original/UCI/RawData/gyro_exp05_user03...,3,5,247,5,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
230228,814814,3.410958,-2.592294,-2.578998,0.648240,-0.067370,0.223439,../data/original/UCI/RawData/gyro_exp61_user30...,30,61,15718,2,3
230229,814815,1.344289,-3.016130,-1.412952,0.877219,0.487476,-0.021490,../data/original/UCI/RawData/gyro_exp61_user30...,30,61,15719,2,3
230230,814816,1.765617,-2.673173,-1.026110,1.202162,1.161450,-0.328383,../data/original/UCI/RawData/gyro_exp61_user30...,30,61,15720,2,3
230231,814817,1.119660,-1.496941,-0.710179,1.167825,0.940050,-0.355065,../data/original/UCI/RawData/gyro_exp61_user30...,30,61,15721,2,3
