### Transformer Training Preprocessing Steps
1. Load all matrices from prepared_data/
2. Convert annotations to the selected HAC label standard (e.g. Walmsley2020 or Willetts2018)
3. Split into Train-Validation-Test -> using the split mentioned in the paper
4. Save everything as .npy files in the specified directory

In [None]:
import numpy as np
file_path = 'capture24/prepared_data_512'

# ---- Load the prepared arrays -------------------------------------------
# Accelerometer data.
X_from_npy = np.load(f'{file_path}/X.npy')

# Raw annotations. allow_pickle=True lets NumPy unpickle object arrays;
# unpickling can execute arbitrary code, so only load files you trust.
Y_from_npy = np.load(f'{file_path}/Y_anno.npy', allow_pickle=True)

# Labels from the Walmsley2020 and WillettsSpecific2018 files.
Y_W2020 = np.load(f'{file_path}/Y_Walmsley2020.npy')
Y_Willetts2018 = np.load(f'{file_path}/Y_WillettsSpecific2018.npy')

# Time stamps.
T_from_npy = np.load(f'{file_path}/T.npy')

# Patient ids, still in their on-disk form at this point.
P_from_npy = np.load(f'{file_path}/P.npy')

# ---- Quick visual sanity check ------------------------------------------
# Entry 10 of the data and of each label array, then the first ten time stamps.
print(X_from_npy[10])
print(Y_from_npy[10])
print(Y_W2020[10])
print(Y_Willetts2018[10])
print(T_from_npy[:10])

# Turn each patient id into an int by parsing the piece that follows the
# first 'P' (ids presumably look like 'P001' -> 1; confirm against the data).
P_from_npy = [int(pid.split('P')[1]) for pid in P_from_npy]

In [None]:
# Create Training - Test Split
# Capture-24 specifies the split as follows:
# Participants 001 - 100 -> Training - for final (001-080: Train, 081-100: Validation)
# Participants (P)101 - 150 -> Test

# The easiest way to split is probably to use a pandas DataFrame for filtering
import pandas as pd

# Gather the loaded arrays as DataFrame columns so that rows can be
# filtered by participant id.
columns = {
    # One entry per element along the first axis of X_from_npy.
    "Accelerometer_data": list(X_from_npy),
    "Raw_Labels": Y_from_npy,
    "Walmsley2020_Labels": Y_W2020,
    "Willetts2018_Labels": Y_Willetts2018,
    "Time_Stamps": T_from_npy,
    "Patient_ID": P_from_npy,
}
df = pd.DataFrame(columns)

# Participant ids up to and including this value go to training;
# every id above it goes to test.
LAST_TRAIN_PARTICIPANT = 100
in_train = df['Patient_ID'] <= LAST_TRAIN_PARTICIPANT
in_test = df['Patient_ID'] > LAST_TRAIN_PARTICIPANT

# Re-number rows from 0 in each subset so positional and label indexing agree.
train_df = df.loc[in_train].reset_index(drop=True)
test_df = df.loc[in_test].reset_index(drop=True)


In [None]:
import os
import json

# Build the label <-> index mapping from the training labels only.
# Series.unique() already returns distinct values, so sorting it directly
# yields a deterministic class order (no extra set() needed).
unique_labels = sorted(train_df['Willetts2018_Labels'].unique())
label_to_index = {label: index for index, label in enumerate(unique_labels)}
index_to_label = dict(enumerate(unique_labels))


# Encode the labels - we use the Willetts2018_Labels - in numpy format
Y_train = np.array(train_df['Willetts2018_Labels'])
Y_test = np.array(test_df['Willetts2018_Labels'])

# Fail early, and list every offender, if the test split contains a label
# that never occurs in training; otherwise the lookup below would raise a
# bare KeyError naming only the first such label.
unseen_labels = sorted(set(Y_test) - set(label_to_index))
if unseen_labels:
    raise KeyError(
        f"Test labels missing from the training label set: {unseen_labels}"
    )

Y_train_id = np.array([label_to_index[label] for label in Y_train])
Y_test_id = np.array([label_to_index[label] for label in Y_test])

# dir_path keeps its trailing slash; os.path.join does not add a second
# separator, so the output paths no longer contain '//'.
dir_path = 'capture24/final_data_512/'
os.makedirs(dir_path, exist_ok=True)

# NOTE: JSON object keys are always strings, so the integer keys of
# index_to_label are written as "0", "1", ...; convert them back with int()
# after loading the file.
with open(os.path.join(dir_path, 'label_to_index.json'), 'w', encoding='utf-8') as f:
    json.dump({
        "label_to_index": label_to_index,
        "index_to_label": index_to_label
    }, f)


# Cast every row's accelerometer array to float32. No reshape happens here:
# np.stack below only adds a leading axis over the rows.
# NOTE(review): the result is [B, T, C] only if each row is already (T, C) - confirm.
X_train = train_df["Accelerometer_data"].apply(
    lambda x: np.asarray(x, dtype=np.float32)
)

X_test = test_df["Accelerometer_data"].apply(
    lambda x: np.asarray(x, dtype=np.float32)
)


# Save the inputs as one stacked array per split
np.save(os.path.join(dir_path, 'X_train.npy'), np.stack(X_train.values))
np.save(os.path.join(dir_path, 'X_test.npy'), np.stack(X_test.values))

# Save the integer-encoded labels
np.save(os.path.join(dir_path, 'Y_train.npy'), Y_train_id)
np.save(os.path.join(dir_path, 'Y_test.npy'), Y_test_id)