## Unzip The Raw Data

In [3]:
# NOTE(review): shell redirections are applied left to right, so `2>&1 > /dev/null`
# discards only stdout while stderr still reaches the notebook output. If the intent
# is to silence both streams the order would be `> /dev/null 2>&1` -- confirm intent.
!./unzip.sh UCI_HAR_Dataset.zip 2>&1 > /dev/null

## Building The Dataset

In [1]:
import numpy as np
import glob
import os

def get_one_hot(targets, nb_classes):
    """One-hot encode integer class indices.

    Args:
        targets: Integer class indices, expected in ``[0, nb_classes)``.
            May be a NumPy array or any array-like (list, tuple, scalar).
        nb_classes: Number of classes, i.e. the length of each one-hot vector.

    Returns:
        An array of shape ``targets.shape + (nb_classes,)`` whose last axis
        holds 1 at the position of the class index and 0 elsewhere.
    """
    # Convert once so that plain lists/tuples/scalars work as well; reading
    # ``targets.shape`` directly only works for ndarrays.
    indices = np.asarray(targets)
    # Row k of the identity matrix is the one-hot vector of class k.
    encoded = np.eye(nb_classes)[indices.reshape(-1)]
    return encoded.reshape(indices.shape + (nb_classes,))

def load_y(subset):
    """Load the labels of one subset and return them one-hot encoded.

    Args:
        subset: Name of the split; it selects the file
            ``UCI_HAR_Dataset/UCI_HAR_Dataset/{subset}/y_{subset}.txt``.

    Returns:
        The one-hot encoded labels produced by ``get_one_hot``; checked to be
        of shape (7352, 6) for 'train' and (2947, 6) for 'test'.

    Raises:
        AssertionError: If the encoded labels do not have the expected shape
            for 'train'/'test', or if the first encoded row does not match
            the first label read from the file.
    """
    # Get the path
    path = f'UCI_HAR_Dataset/UCI_HAR_Dataset/{subset}/y_{subset}.txt'

    # Read the file
    y = np.loadtxt(path, delimiter=',', dtype=int)

    # One-hot encode labels. Labels are shifted by one so they can be used as
    # zero-based indices; the class count is the number of distinct labels.
    one_hot_labels = get_one_hot(y - 1, len(np.unique(y)))
    if subset == 'train':
        assert one_hot_labels.shape == (7352, 6), f"Wrong dimensions: {one_hot_labels.shape} should be (7352, 6)"
    if subset == 'test':
        assert one_hot_labels.shape == (2947, 6), f"Wrong dimensions: {one_hot_labels.shape} should be (2947, 6)"

    # Spot-check the encoding on the first sample. The position is computed
    # up front so the failure message can report it without raising itself.
    first_encoded = np.where(one_hot_labels[0] == np.max(one_hot_labels[0]))[0][0]
    assert y[0] - 1 == first_encoded, f"Value mismatch {first_encoded} vs {y[0] - 1}"
    return one_hot_labels

def build_data(subset):
    """Load the nine inertial signal files of one subset into a single array.

    Args:
        subset: One of 'train', 'val' or 'test'; selects the folder
            ``UCI_HAR_Dataset/UCI_HAR_Dataset/{subset}/Inertial Signals/``.

    Returns:
        Array of shape (number of samples, number of timesteps, number of
        signals). The last axis follows the fixed order body_acc x/y/z,
        body_gyro x/y/z, total_acc x/y/z.

    Raises:
        ValueError: If ``subset`` is not one of the accepted names.
        AssertionError: If the folder does not hold exactly nine ``.txt``
            files or the combined array does not have the expected shape.
    """
    if subset not in ['train', 'val', 'test']:
        raise ValueError(f"Invalid subset: {subset}")

    folder_path = f"UCI_HAR_Dataset/UCI_HAR_Dataset/{subset}/Inertial Signals/"

    # Sanity check: the folder must contain exactly the nine signal files.
    found_files = glob.glob(os.path.join(folder_path, '*.txt'))
    assert len(found_files) == 9, (
        f"Expected 9 signal files in {folder_path}, found {len(found_files)}"
    )

    # Fixed signal order; glob order is not relied upon because it would make
    # the channel axis depend on directory listing order.
    signal_order = [
        "body_acc_x_",
        "body_acc_y_",
        "body_acc_z_",
        "body_gyro_x_",
        "body_gyro_y_",
        "body_gyro_z_",
        "total_acc_x_",
        "total_acc_y_",
        "total_acc_z_",
    ]
    signal_files = [f"{folder_path}{x}{subset}.txt" for x in signal_order]

    # Load every signal file; each one is read exactly once.
    signals_data = [np.loadtxt(x) for x in signal_files]

    # Transpose so that shape is (number of samples, number of timesteps, number of signals)
    signals_data = np.transpose(signals_data, (1, 2, 0))

    # Verify final shape of combined data:
    # (7352 train / 2947 test samples, 128 timesteps, 9 signals).
    # NOTE(review): 'val' is accepted above but checked against the test
    # sample count here -- confirm whether a 'val' split is expected to exist.
    expected_samples = 7352 if subset == 'train' else 2947
    expected_shape = (expected_samples, 128, len(signal_files))
    assert signals_data.shape == expected_shape, (
        f"Wrong dimensions: {signals_data.shape} should be {expected_shape}"
    )
    return signals_data

def load_data():
    """Load the train and test splits.

    Returns:
        A 4-tuple ``(train signals, train labels, test signals, test labels)``
        built from ``build_data`` and ``load_y`` for 'train' and 'test'.
    """
    train_signals = build_data('train')
    train_labels = load_y('train')
    test_signals = build_data('test')
    test_labels = load_y('test')
    return train_signals, train_labels, test_signals, test_labels

# Load both splits once: X_* receive the build_data() arrays and y_* the
# load_y() one-hot labels, in the order returned by load_data().
X_train, y_train, X_test, y_test = load_data()

(7352, 128)
(2947, 128)
