In [None]:
import numpy as np
import importlib
import matplotlib.pyplot as plt
import random
from tensorflow import keras
import torch
from tensorflow.keras import layers
import keras.backend as K
from keras import regularizers
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping, TensorBoard
from keras.layers import Dense, Dropout, Flatten, Conv2D, MaxPooling2D, Conv1D, MaxPooling1D, Lambda
from keras.layers import GRU, Dense, Activation, Dropout, concatenate, Input, BatchNormalization
from keras.models import Sequential, Model
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau,ModelCheckpoint
from imblearn.over_sampling import SMOTE
from collections import Counter
from sklearn.preprocessing import StandardScaler
import math
from sklearn.metrics import classification_report, confusion_matrix
import torch.nn as nn
import time
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import roc_auc_score, balanced_accuracy_score, accuracy_score, f1_score
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F
from pathlib import Path
import shutil

# Code

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; later cells
# read the gait CSV files from paths under drive/MyDrive/.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# List the test-trial CSV files (one *_x.csv / *_y.csv pair per trial) on the mounted drive.
!ls drive/MyDrive/GaitData/data/test/

Trial01_x.csv  Trial02_x.csv  Trial03_x.csv  Trial04_x.csv
Trial01_y.csv  Trial02_y.csv  Trial03_y.csv  Trial04_y.csv


In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from tensorflow.keras import layers, models


def preprocess_data(x_file_path, y_file_path, window_rows=40, rows_per_label=4):
    """Load one trial and attach its sparse labels to the sensor rows.

    The sensor file holds ``rows_per_label`` rows for every row of the label
    file. Only complete windows of ``window_rows`` sensor rows are kept. Inside
    the kept rows, the k-th label is written onto the last row of the k-th
    group of ``rows_per_label`` rows (positions 3, 7, 11, ... with the
    defaults); every other row gets NaN in the ``label`` column.

    Parameters
    ----------
    x_file_path : str or path-like
        Header-less CSV with the columns timestamp, accel_x, accel_y, accel_z,
        gyro_x, gyro_y, gyro_z. The file name must look like
        ``Trial<NN>_x.csv``; ``<NN>`` is stored in the ``id`` column.
    y_file_path : str or path-like
        Header-less CSV with the columns timestamp, label.
    window_rows : int, optional
        Number of sensor rows per complete window (default 40).
    rows_per_label : int, optional
        Number of sensor rows per label row (default 4). Must divide
        ``window_rows``.

    Returns
    -------
    pandas.DataFrame
        Columns ``id``, the seven sensor columns and ``label``; the row index
        of the sensor file is preserved. The frame is empty when the files do
        not contain a single complete window.

    Raises
    ------
    ValueError
        If the window parameters are inconsistent, or the trial number cannot
        be parsed from the file name.
    """
    if window_rows <= 0 or rows_per_label <= 0 or window_rows % rows_per_label:
        raise ValueError("window_rows must be a positive multiple of rows_per_label")
    labels_per_window = window_rows // rows_per_label

    # Load data
    x_data = pd.read_csv(x_file_path, header=None,
                         names=['timestamp', 'accel_x', 'accel_y', 'accel_z', 'gyro_x', 'gyro_y', 'gyro_z'])
    y_data = pd.read_csv(y_file_path, header=None, names=['timestamp', 'label'])

    print(len(x_data))
    print(len(y_data))

    # Keep only windows that are complete in BOTH files, so a label file that
    # is slightly too short drops trailing windows instead of raising.
    num_windows = min(len(x_data) // window_rows, len(y_data) // labels_per_window)
    num_rows = num_windows * window_rows

    merged_data = x_data.iloc[:num_rows].copy()

    # Place all labels in one aligned assignment instead of copying the window
    # and writing a single cell per label.
    label_rows = merged_data.index[rows_per_label - 1::rows_per_label]
    label_values = y_data['label'].to_numpy()[:num_windows * labels_per_window]
    merged_data['label'] = pd.Series(label_values, index=label_rows).reindex(merged_data.index)

    # "Trial07_x.csv" -> 7
    trial_id = int(Path(x_file_path).name.split('_')[0][5:])
    merged_data.insert(0, 'id', trial_id)

    unique_labels = merged_data['label'].unique()
    print("Unique labels:", unique_labels)

    label_counts = merged_data['label'].value_counts()
    print("\nLabel counts:")
    for label, count in label_counts.items():
        print(f"Label {label}: {count} occurrences")
    print(merged_data.head(1))
    return merged_data



# Zero-padded IDs of the training trials ("01" ... "29"), matching the
# Trial<NN>_x.csv / Trial<NN>_y.csv file names.
# (For a quick smoke test, shrink this to e.g. ["01"].)
trial_ids = [f"{n:02d}" for n in range(1, 30)]

train_dir = "drive/MyDrive/GaitData/data/train"

# Collect the per-trial frames and concatenate ONCE at the end; concatenating
# inside the loop would re-copy the whole accumulated frame on every iteration.
trial_frames = []
for trial_id in trial_ids:
    x_file_path = f"{train_dir}/Trial{trial_id}_x.csv"
    y_file_path = f"{train_dir}/Trial{trial_id}_y.csv"
    combined_data = preprocess_data(x_file_path, y_file_path)
    trial_frames.append(combined_data)

train_df = pd.concat(trial_frames, ignore_index=True)

# Number of labelled rows per class over all training trials.
label_counts = train_df["label"].value_counts()



37890
9473
Unique labels: [nan  0.  1.  2.  3.]

Label counts:
Label 0.0: 6441 occurrences
Label 3.0: 1530 occurrences
Label 2.0: 788 occurrences
Label 1.0: 711 occurrences
   id  timestamp   accel_x   accel_y   accel_z    gyro_x    gyro_y    gyro_z  \
0   1        0.0  4.435275  8.196063  2.974488  0.014215 -0.039157 -0.016744   

   label  
0    NaN  
70172
17543
Unique labels: [nan  0.  1.  3.  2.]

Label counts:
Label 0.0: 12563 occurrences
Label 3.0: 2852 occurrences
Label 2.0: 1148 occurrences
Label 1.0: 977 occurrences
   id  timestamp   accel_x   accel_y   accel_z    gyro_x    gyro_y    gyro_z  \
0   2        0.0  1.726654  9.619981  1.723327 -0.001997  0.067502  0.126057   

   label  
0    NaN  
43146
10787
Unique labels: [nan  0.  1.  2.  3.]

Label counts:
Label 0.0: 8292 occurrences
Label 2.0: 1106 occurrences
Label 3.0: 709 occurrences
Label 1.0: 673 occurrences
   id  timestamp   accel_x   accel_y   accel_z    gyro_x    gyro_y    gyro_z  \
0   3        0.0 -0.800765  9.4

In [None]:
# Total number of sensor rows collected across all training trials.
print(train_df.shape[0])

1370320


In [None]:
def build_sequences(frame, trial_ids, sequence_length):
    """Cut each trial into overlapping windows and label every window by majority vote.

    For every trial a window of ``sequence_length`` consecutive rows is taken
    at each possible start row (stride 1). The window's features are all
    columns except ``label``, ``id`` and ``timestamp``; its label is the most
    frequent non-NaN value of ``label`` inside the window, with ties resolved
    towards the smallest label (the same result as ``Series.mode()[0]``).

    Parameters
    ----------
    frame : pandas.DataFrame
        Must contain the columns ``id``, ``timestamp`` and ``label`` plus the
        sensor columns.
    trial_ids : iterable
        Values of ``frame['id']`` to include, in output order.
    sequence_length : int
        Number of rows per window.

    Returns
    -------
    sequences : numpy.ndarray, shape (n_windows, sequence_length, n_features)
    labels : numpy.ndarray, shape (n_windows,)
    window_trial_ids : numpy.ndarray, shape (n_windows,)
        Trial ID each window was cut from.

    Raises
    ------
    ValueError
        If some window contains no labelled row at all.
    """
    n_features = frame.drop(columns=['label', 'id', 'timestamp']).shape[1]
    seq_parts, label_parts, id_parts = [], [], []

    for trial_id in trial_ids:
        trial_data = frame[frame['id'] == trial_id]
        num_sequences = len(trial_data) - sequence_length + 1
        if num_sequences <= 0:
            # Trial missing or shorter than one window: contributes nothing.
            continue

        values = trial_data.drop(columns=['label', 'id', 'timestamp']).to_numpy()
        # sliding_window_view appends the window axis last -> (n, features, length);
        # swap to (n, length, features). This is a view; the copy happens once
        # in np.concatenate below.
        windows = np.lib.stride_tricks.sliding_window_view(values, sequence_length, axis=0)
        seq_parts.append(windows.transpose(0, 2, 1))

        # Per-window class counts from a running (cumulative) one-hot sum.
        labels = trial_data['label'].to_numpy(dtype=float)
        classes = np.unique(labels[~np.isnan(labels)])   # sorted ascending
        one_hot = labels[:, None] == classes[None, :]    # NaN rows match no class
        running = np.vstack([np.zeros((1, len(classes)), dtype=np.int64),
                             np.cumsum(one_hot, axis=0)])
        counts = running[sequence_length:] - running[:-sequence_length]
        if (counts.sum(axis=1) == 0).any():
            raise ValueError(f"trial {trial_id}: at least one window contains no labelled row")
        # argmax returns the first maximum, i.e. the smallest label on ties.
        label_parts.append(classes[counts.argmax(axis=1)])
        id_parts.append(np.full(num_sequences, trial_id))

    if not seq_parts:
        return (np.empty((0, sequence_length, n_features)),
                np.empty(0),
                np.empty(0, dtype=int))
    return np.concatenate(seq_parts), np.concatenate(label_parts), np.concatenate(id_parts)


num_sensors = 6
sequence_length = 40

# Trials 2, 11 and 25 are held out for validation; every other trial is used for training.
trial_ids_val = [2, 11, 25]
trial_ids_train = [x for x in range(1, 30) if x not in trial_ids_val]

xTrain, yTrain, all_trial_ids_train = build_sequences(train_df, trial_ids_train, sequence_length)
xVal, yVal, all_trial_ids_val = build_sequences(train_df, trial_ids_val, sequence_length)


In [None]:

from imblearn.under_sampling import RandomUnderSampler

# Each sample is a (sequence_length, num_sensors) window.
input_shape = (sequence_length, num_sensors)

print(xTrain.shape, yTrain.shape)
print(xVal.shape, yVal.shape)

# Upper bound on the number of windows kept per class. It is capped by the
# size of the rarest class so the sampler never asks for more samples than a
# class actually has (which would raise).
MAX_SAMPLES_PER_CLASS = 12409
class_values, class_counts = np.unique(yTrain, return_counts=True)
samples_per_class = min(MAX_SAMPLES_PER_CLASS, int(class_counts.min()))

# random_state makes the selected subset reproducible across runs.
undersampler = RandomUnderSampler(
    sampling_strategy={label: samples_per_class for label in class_values},
    random_state=42,
)

# The sampler works on 2-D input, so flatten each window to one row ...
xTrain_reshaped = xTrain.reshape(xTrain.shape[0], -1)

# Perform undersampling
xTrain_resampled, yTrain_resampled = undersampler.fit_resample(xTrain_reshaped, yTrain)

# ... and restore the (window, time step, sensor) layout afterwards.
xTrain_resampled = xTrain_resampled.reshape(-1, sequence_length, num_sensors)

# NOTE(review): the DataLoader cell further down builds its tensors from
# xTrain / yTrain, not from the resampled arrays produced here — confirm
# which training set is intended.

# Check class distribution after undersampling
print("Class distribution after undersampling:")
unique, counts = np.unique(yTrain_resampled, return_counts=True)
for label, count in zip(unique, counts):
    print(f"Value {label}: {count} occurrences")


(1204186, 40, 6) (1204186,)
(165003, 40, 6) (165003,)
Class distribution after undersampling:
Value 0.0: 12409 occurrences
Value 1.0: 12409 occurrences
Value 2.0: 12409 occurrences
Value 3.0: 12409 occurrences


In [None]:
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout

# def build_model():
#     model = Sequential()
#     model.add(Conv1D(32, kernel_size=3, activation='relu', input_shape=input_shape))
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(Conv1D(64, kernel_size=3, activation='relu'))
#     model.add(MaxPooling1D(pool_size=2))
#     model.add(Flatten())
#     model.add(Dense(128, activation='relu'))
#     model.add(Dropout(0.5))  # Adding dropout layer
#     model.add(Dense(4, activation='softmax'))  # Assuming 4 output classes
#     model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
#     return model

class CNN(nn.Module):
    """Three-layer 2-D CNN classifier for (time steps x sensors) windows.

    Input is a float tensor of shape ``(batch, height, width)``; a channel
    axis is added internally. Each convolution is followed by ReLU and batch
    normalisation, the first two additionally by 2x2 max pooling. The flattened
    feature map then passes through two fully connected layers (ReLU, batch
    norm, dropout) and a final linear layer producing one logit per class.

    Parameters
    ----------
    num_classes : int, optional
        Number of output logits. When omitted, the notebook-level variable
        ``n_classes`` is used, so ``CNN()`` keeps working as before.
    input_shape : tuple of (int, int), optional
        ``(height, width)`` of one input window, used to size the first fully
        connected layer. Defaults to ``(40, 6)``.

    Raises
    ------
    ValueError
        If ``input_shape`` is too small to survive the two pooling stages.
    """

    def __init__(self, num_classes=None, input_shape=(40, 6)):
        super().__init__()
        if num_classes is None:
            # Backward compatible default: read the notebook-level global.
            num_classes = n_classes

        # The padded 3x3 convolutions keep the spatial size; each 2x2 / stride-2
        # pooling halves it (rounding down), and there are two poolings.
        height, width = input_shape
        pooled_h, pooled_w = height // 2 // 2, width // 2 // 2
        if pooled_h < 1 or pooled_w < 1:
            raise ValueError(f"input_shape {input_shape} is too small for two 2x2 poolings")
        self.flat_features = 128 * pooled_h * pooled_w

        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=1)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1)
        self.conv3 = nn.Conv2d(64, 128, kernel_size=(3, 3), padding=1)
        self.pool = nn.MaxPool2d(kernel_size=(2, 2), stride=2)
        self.fc1 = nn.Linear(self.flat_features, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_classes)
        self.dropout = nn.Dropout(p=0.2)
        self.bn1 = nn.BatchNorm2d(32)
        self.bn2 = nn.BatchNorm2d(64)
        self.bn3 = nn.BatchNorm2d(128)
        self.bn4 = nn.BatchNorm1d(128)
        self.bn5 = nn.BatchNorm1d(64)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x`` of shape ``(batch, height, width)``."""
        x = x.unsqueeze(1)  # Add channel dimension
        x = self.bn1(F.relu(self.conv1(x)))
        x = self.pool(x)
        x = self.bn2(F.relu(self.conv2(x)))
        x = self.pool(x)
        x = self.bn3(F.relu(self.conv3(x)))
        # Flatten per sample. Unlike view(-1, N) this can never fold extra
        # spatial positions into the batch axis: a window of the wrong size
        # fails loudly in fc1 instead of silently changing the batch size.
        x = x.flatten(start_dim=1)
        x = self.bn4(F.relu(self.fc1(x)))
        x = self.dropout(x)
        x = self.bn5(F.relu(self.fc2(x)))
        x = self.dropout(x)
        x = self.fc3(x)
        return x



In [None]:
# Distinct class labels present in the training targets; their count sizes
# the network's output layer.
class_labels = np.unique(yTrain)
n_classes = len(class_labels)

model = CNN()

# 'balanced' class weights for the loss, computed from the training targets.
balanced_weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=yTrain)
weights = torch.tensor(balanced_weights, dtype=torch.float)

criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# # Evaluate the model
# test_loss, test_accuracy = model.evaluate(xVal, yVal)
# y_pred = model.predict(xVal)
# y_pred_labels = np.argmax(y_pred, axis=1)
# cm = confusion_matrix(yVal, y_pred_labels)
# print('Confusion Matrix:', cm)
# print('Classification Report:', classification_report(yVal, y_pred_labels))

In [None]:
# Convert both splits to tensors: float32 inputs and int64 class indices.
x_train_tensor = torch.tensor(xTrain, dtype=torch.float32)
y_train_tensor = torch.tensor(yTrain, dtype=torch.long)
x_val_tensor = torch.tensor(xVal, dtype=torch.float32)
y_val_tensor = torch.tensor(yVal, dtype=torch.long)

# Pair every window with its label.
train_dataset = TensorDataset(x_train_tensor, y_train_tensor)
val_dataset = TensorDataset(x_val_tensor, y_val_tensor)

# Training batches are reshuffled each epoch; validation keeps a fixed order.
train_loader = DataLoader(dataset=train_dataset, batch_size=128, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=128, shuffle=False)

In [None]:
# Early stopping: give up after `patience` consecutive epochs without a new
# best validation loss. `trials` counts those epochs.
patience, trials = 15, 0
best_acc, best_loss = 0, float('inf')

# The weights with the best validation balanced accuracy are checkpointed here.
checkpoint_path = 'best.pth'
checkpoint_saved = False

# Training loop
for epoch in range(8):
    start = time.time()

    # ---- training pass ----
    model.train()
    train_loss = 0
    train_truth, train_pred = [], []
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        preds = outputs.argmax(dim=1)
        train_pred.extend(preds.cpu().numpy())
        train_truth.extend(targets.numpy())
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item()
    train_acc = accuracy_score(train_truth, train_pred)
    train_bal_acc = balanced_accuracy_score(train_truth, train_pred)

    # ---- validation pass (no gradients, eval-mode batch norm / dropout) ----
    val_truth = []
    val_pred = []
    val_loss = 0
    model.eval()
    with torch.no_grad():
        for inputs, targets in val_loader:
            outputs = model(inputs)
            preds = outputs.argmax(dim=1)
            val_pred.extend(preds.cpu().numpy())
            val_truth.extend(targets.numpy())
            loss = criterion(outputs, targets)
            val_loss += loss.item()
    val_acc = accuracy_score(val_truth, val_pred)
    val_bal_acc = balanced_accuracy_score(val_truth, val_pred)

    # Losses are reported as the mean over batches.
    print(
        f'Epoch: {epoch:3d}. Loss: {train_loss/len(train_loader):.4f}. '
        f'Acc.: {train_acc:2.2%} Bal Acc.: {train_bal_acc:2.2%}  '
        f'            Val Loss: {val_loss/len(val_loader):.4f} Val Acc.: {val_acc:2.2%}   '
        f'Val Bal Acc.: {val_bal_acc:2.2%}  time: {time.time() - start:.2f}s   trial: {trials}'
    )

    # Checkpoint on the best validation balanced accuracy ...
    if val_bal_acc > best_acc:
        best_acc = val_bal_acc
        torch.save(model.state_dict(), checkpoint_path)
        checkpoint_saved = True
        print(f'Epoch {epoch} best model saved with accuracy: {best_acc:2.2%}')

    # ... but early-stop on the validation loss.
    if val_loss < best_loss:
        trials = 0
        best_loss = val_loss
    else:
        trials += 1
        if trials >= patience:
            print(f'Early stopping on epoch {epoch}')
            break

# Restore the best checkpoint written during THIS run so that later cells
# predict with the best-validated weights rather than the last epoch's.
if checkpoint_saved:
    model.load_state_dict(torch.load(checkpoint_path))
    model.eval()

Epoch:   0. Loss: 0.1785. Acc.: 90.42% Bal Acc.: 94.02%              Val Loss: 0.3373 Val Acc.: 86.22%   Val Bal Acc.: 92.10%  time: 917.46s   trial: 0
Epoch 0 best model saved with accuracy: 92.10%
Epoch:   1. Loss: 0.1053. Acc.: 93.79% Bal Acc.: 96.50%              Val Loss: 0.2353 Val Acc.: 91.08%   Val Bal Acc.: 93.83%  time: 907.31s   trial: 0
Epoch 1 best model saved with accuracy: 93.83%
Epoch:   2. Loss: 0.0835. Acc.: 94.93% Bal Acc.: 97.24%              Val Loss: 0.2496 Val Acc.: 91.44%   Val Bal Acc.: 93.89%  time: 902.74s   trial: 0
Epoch 2 best model saved with accuracy: 93.89%
Epoch:   3. Loss: 0.0708. Acc.: 95.60% Bal Acc.: 97.67%              Val Loss: 0.2313 Val Acc.: 92.66%   Val Bal Acc.: 94.00%  time: 917.37s   trial: 1
Epoch 3 best model saved with accuracy: 94.00%
Epoch:   4. Loss: 0.0617. Acc.: 96.11% Bal Acc.: 97.98%              Val Loss: 0.2403 Val Acc.: 92.39%   Val Bal Acc.: 93.80%  time: 896.12s   trial: 0
Epoch:   5. Loss: 0.0555. Acc.: 96.50% Bal Acc.: 98.

In [None]:
def extractFeat(xt,xv,winSz,timeStart,timeEnd,timeStep):
    """Slice a measurement matrix into sliding time windows.

    Starting at ``timeStart``, a window ``[t0, t0 + winSz]`` (both ends
    inclusive) is moved forward by ``timeStep`` for as long as its end does
    not exceed ``timeEnd``. For each position the rows of ``xv`` whose time
    stamp in ``xt`` lies inside the window are collected unchanged.

    Parameters
    ----------
    xt : numpy.ndarray
        Time stamp of each row of ``xv``.
    xv : numpy.ndarray
        2-D array of measurements, one row per time stamp.
    winSz, timeStart, timeEnd, timeStep : float
        Window length, first window start, latest allowed window end, and
        shift between consecutive windows.

    Returns
    -------
    tList : numpy.ndarray
        Midpoint time of every window.
    featList : numpy.ndarray
        The stacked windows, built with ``np.array`` from the per-window row
        blocks (a regular 3-D array when every window holds the same number
        of rows).
    """
    centers = []
    windows = []

    win_start = timeStart
    win_end = win_start + winSz

    while win_end <= timeEnd:
        # The window's midpoint serves as its reference time.
        centers.append((win_start + win_end) / 2)

        # Raw rows falling inside the current window (bounds inclusive).
        inside = np.logical_and(xt >= win_start, xt <= win_end)
        windows.append(xv[inside, :])

        # Slide forward; the start is accumulated step by step.
        win_start = win_start + timeStep
        win_end = win_start + winSz

    return np.array(centers), np.array(windows)

def loadTrial(dataFolder,id):
    """Load the measurement and label CSV files of one trial.

    Reads ``Trial<id>_x.csv`` and ``Trial<id>_y.csv`` (``id`` zero-padded to
    two digits) from ``dataFolder``. In both files the first column is the
    time stamp.

    Parameters
    ----------
    dataFolder : str or path-like
        Folder containing the trial files; a trailing slash is optional.
    id : int
        Trial number.

    Returns
    -------
    xt : numpy.ndarray
        Time stamps of the measurements.
    xv : numpy.ndarray
        Measurement columns (everything after the time stamp).
    yt : numpy.ndarray
        Time stamps of the labels.
    yv : numpy.ndarray of int
        Labels. Missing labels (NaN in the file, e.g. unlabelled test trials)
        are returned as ``-1`` instead of being cast to an undefined integer.
    """
    folder = Path(dataFolder)

    x = np.genfromtxt(folder / 'Trial{:02d}_x.csv'.format(id), delimiter=',')
    xt = x[:,0]
    xv = x[:,1:]

    y = np.genfromtxt(folder / 'Trial{:02d}_y.csv'.format(id), delimiter=',')
    yt = y[:,0]
    raw_labels = y[:,1]
    # Casting NaN to int is undefined and triggers a RuntimeWarning; use an
    # explicit sentinel for missing labels instead.
    yv = np.where(np.isnan(raw_labels), -1, raw_labels).astype(int)

    # Returning x measurements and y labels
    return xt, xv, yt, yv

In [None]:
# Folder that receives one prediction file per test trial (plus a copy of
# the corresponding measurement file).
pred_path = Path('drive/MyDrive/Predictions')
pred_path.mkdir(parents=True, exist_ok=True)

timeStep = 0.1   # shift between consecutive windows
winSz = 1        # window length
test_path = 'drive/MyDrive/GaitData/data/test/'
testIDs = [1,2,3,4]

model.eval()
for trial_id in testIDs:
    # Loading the raw data
    xt, xv, yt, yv = loadTrial(test_path, id=trial_id)
    pred_trial_path = pred_path/f'Trial{trial_id:02d}_y.csv'

    # Time span covered by both the measurements and the label time stamps
    timeStart = np.max((np.min(xt),np.min(yt)))
    timeEnd = np.min((np.max(xt),np.max(yt)))

    # Cut the measurements into windows; the windows themselves are the model input.
    # NOTE(review): the window-centre times returned by extractFeat are
    # discarded, so prediction i is written next to yt[i] — confirm that this
    # alignment is intended.
    _, feat = extractFeat(xt,xv,winSz,timeStart,timeEnd,timeStep)
    x_test_tensor = torch.tensor(np.array(feat), dtype=torch.float32)

    # Batched inference over the test windows, in order.
    test_dataset = TensorDataset(x_test_tensor)
    test_loader = DataLoader(test_dataset, batch_size=128, shuffle=False)
    preds = []
    with torch.no_grad():
        for (inputs,) in test_loader:
            outputs = model(inputs)
            preds.extend(outputs.argmax(dim=1).cpu().numpy())

    if not preds:
        raise ValueError(f'Trial {trial_id:02d}: no complete window, nothing to predict')

    # One prediction per label time stamp is required: repeat the last
    # prediction if there are too few, drop the surplus if there are too many.
    preds = np.array(preds)
    if len(preds) < len(yt):
        preds = np.concatenate((preds, np.full(len(yt) - len(preds), preds[-1])))
    else:
        preds = preds[:len(yt)]

    shutil.copy('{}Trial{:02d}_x.csv'.format(test_path,trial_id), pred_path)

    # Two columns: time stamp, predicted label.
    out = np.column_stack((yt, preds))
    np.savetxt(pred_trial_path, out, delimiter=',')
    print(pred_trial_path)

  yv = y[:,1].astype(int)


drive/MyDrive/Predictions/Trial01_y.csv
drive/MyDrive/Predictions/Trial02_y.csv
drive/MyDrive/Predictions/Trial03_y.csv
drive/MyDrive/Predictions/Trial04_y.csv
