In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, TimeSeriesSplit
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

In [2]:
def get_ds_infos(path=None):
    """
    Read the CSV file that holds the data subjects' attributes.

    EEE4114: Technically we do not need these data, as we are not concerned about identifying the subjects.

    Data Columns (as described by the dataset authors):
    0: code [1-24]
    1: weight [kg]
    2: height [cm]
    3: age [years]
    4: gender [0:Female, 1:Male]

    Args:
        path: Optional location of ``data_subjects_info.csv``. When omitted, the
            original hard-coded location is used, so existing ``get_ds_infos()``
            calls behave exactly as before.

    Returns:
        A pandas DataFrame that contains information about data subjects' attributes
    """
    if path is None:
        # Machine-specific fallback kept only for backward compatibility;
        # pass `path` explicitly to run the notebook on another machine.
        path = 'C:/Users/emanu/Documents/GithubRepos/EEE4114F-ML-Project-2025/data/data_subjects_info.csv'

    dss = pd.read_csv(path)
    print("[INFO] -- Data subjects' information is imported.")

    return dss

def set_data_types(data_types=["userAcceleration"]):
    """
    Translate sensor names into the CSV column names that belong to them.

    EEE4114F: Choose sensors that you think are useful or would like to include in training.
    You can choose all of them, or you could opt to try a limited set of input features

    Args:
        data_types: A list of sensor data type from this list: [attitude, gravity, rotationRate, userAcceleration]

    Returns:
        A list with one entry per requested sensor; each entry is the list of that
        sensor's three column names. "attitude" maps to roll/pitch/yaw, every other
        name maps to x/y/z.
    """
    # The default list is never mutated here, so sharing it between calls is harmless.
    euler_axes = ("roll", "pitch", "yaw")
    cartesian_axes = ("x", "y", "z")

    return [
        [sensor + "." + axis
         for axis in (euler_axes if sensor == "attitude" else cartesian_axes)]
        for sensor in data_types
    ]


def create_time_series(dt_list, act_labels, trial_codes, mode="mag", labeled=True, data_dir=None):
    """
    Load every per-subject recording and stack them into one long table.

    EEE4114F: This defines what data you would like to include for a given set.

    Args:
        dt_list: A list of columns that shows the type of data we want
            (one list of three column names per sensor).
        act_labels: list of activities.
        trial_codes: list of trial codes corresponding to each activity
            (``trial_codes[i]`` belongs to ``act_labels[i]``).
        mode: It can be "raw" (all dimensions of each data type) or "mag" (only magnitude per data type).
        labeled: True if labeled dataset (adds 'act', 'trial', 'sub_id'), False otherwise.
        data_dir: Optional directory that contains the ``<act>_<trial>/sub_<id>.csv``
            files. When omitted, the original hard-coded location is used.

    Returns:
        pd.DataFrame: Time-series sensor data with optional labels. In "mag" mode
        each sensor contributes one column named after the sensor (the part of the
        column name before the last dot); in "raw" mode the original column names
        are kept. 'act' holds the index of the activity within ``act_labels``.

    Raises:
        ValueError: if ``mode`` is neither "mag" nor "raw".
    """
    if mode not in ("mag", "raw"):
        # Fail before any file is read instead of after loading everything.
        raise ValueError(f'mode must be "mag" or "raw", got {mode!r}')

    if data_dir is None:
        # Machine-specific fallback kept only for backward compatibility.
        data_dir = 'C:/Users/emanu/Documents/GithubRepos/EEE4114F-ML-Project-2025/data/A_DeviceMotion_data'

    num_data_cols = len(dt_list) if mode == "mag" else 3 * len(dt_list)
    # +3 for act, trial, sub_id labels
    total_cols = num_data_cols + 3 if labeled else num_data_cols

    ds_list = get_ds_infos()  # Only the "code" column (subject IDs) is used here

    print("[INFO] -- Creating Time-Series")
    # Collect one array per file and concatenate once at the end; growing the
    # result with np.append inside the loop re-copies all previous rows per file.
    # The empty seed keeps the width and float dtype even when no file is read.
    chunks = [np.zeros((0, total_cols))]
    for sub_id in ds_list["code"]:
        for act_id, act in enumerate(act_labels):
            for trial in trial_codes[act_id]:
                fname = f'{data_dir}/{act}_{trial}/sub_{int(sub_id)}.csv'
                raw_data = pd.read_csv(fname)
                raw_data = raw_data.drop(['Unnamed: 0'], axis=1)
                vals = np.zeros((len(raw_data), num_data_cols))
                for x_id, axes in enumerate(dt_list):
                    if mode == "mag":
                        # Calculate magnitude of 3 axes per sensor type
                        vals[:, x_id] = (raw_data[axes] ** 2).sum(axis=1) ** 0.5
                    else:
                        vals[:, x_id * 3:(x_id + 1) * 3] = raw_data[axes].values
                if labeled:
                    # Repeat (act, trial, sub_id) for every row; np.tile keeps the
                    # (n, 3) shape even for a file with zero rows.
                    lbls = np.tile(np.array([act_id, trial, sub_id]), (len(raw_data), 1))
                    vals = np.concatenate((vals, lbls), axis=1)
                chunks.append(vals)

    dataset = np.concatenate(chunks, axis=0)

    cols = []
    for axes in dt_list:
        if mode == "raw":
            cols += axes
        else:
            # Strip the axis suffix whatever its length: slicing off two characters
            # turned "attitude.roll" into "attitude.ro".
            cols.append(str(axes[0]).rsplit(".", 1)[0])

    if labeled:
        cols += ["act", "trial", "sub_id"]

    dataset = pd.DataFrame(data=dataset, columns=cols)
    return dataset

In [3]:
# Activity codes, in the order that defines the integer class label (0..5)
ACT_LABELS = ["dws", "ups", "wlk", "jog", "std", "sit"]
# Trial numbers available for each activity, keyed by activity code
TRIAL_CODES = dict(zip(
    ACT_LABELS,
    ([1, 2, 11], [3, 4, 12], [7, 8, 15], [9, 16], [6, 14], [5, 13]),
))

## Parameters used to build the labeled time-series from "(A)DeviceMotion_data".
## Available sensors: attitude(roll, pitch, yaw); gravity(x, y, z);
## rotationRate(x, y, z); userAcceleration(x, y, z)
sdt = ["attitude", "gravity", "rotationRate", "userAcceleration"]
print(f"[INFO] -- Selected sensor data types: {sdt}")
act_labels = ACT_LABELS  # choose all 6 classes
print(f"[INFO] -- Selected activites: {act_labels}")
trial_codes = [TRIAL_CODES[activity] for activity in act_labels]
dt_list = set_data_types(sdt)
# "raw" keeps every axis of every sensor as its own column
dataset = create_time_series(
    dt_list,
    act_labels,
    trial_codes,
    mode="raw",
    labeled=True,
)
print(f"[INFO] -- Shape of time-Series dataset:{dataset.shape}")
dataset.head()

[INFO] -- Selected sensor data types: ['attitude', 'gravity', 'rotationRate', 'userAcceleration']
[INFO] -- Selected activites: ['dws', 'ups', 'wlk', 'jog', 'std', 'sit']
[INFO] -- Data subjects' information is imported.
[INFO] -- Creating Time-Series
[INFO] -- Shape of time-Series dataset:(1412865, 15)


Unnamed: 0,attitude.roll,attitude.pitch,attitude.yaw,gravity.x,gravity.y,gravity.z,rotationRate.x,rotationRate.y,rotationRate.z,userAcceleration.x,userAcceleration.y,userAcceleration.z,act,trial,sub_id
0,1.528132,-0.733896,0.696372,0.741895,0.669768,-0.031672,0.316738,0.77818,1.082764,0.294894,-0.184493,0.377542,0.0,1.0,1.0
1,1.527992,-0.716987,0.677762,0.753099,0.657116,-0.032255,0.842032,0.424446,0.643574,0.219405,0.035846,0.114866,0.0,1.0,1.0
2,1.527765,-0.706999,0.670951,0.759611,0.649555,-0.032707,-0.138143,-0.040741,0.343563,0.010714,0.134701,-0.167808,0.0,1.0,1.0
3,1.516768,-0.704678,0.675735,0.760709,0.647788,-0.04114,-0.025005,-1.048717,0.03586,-0.008389,0.136788,0.094958,0.0,1.0,1.0
4,1.493941,-0.703918,0.672994,0.760062,0.64721,-0.05853,0.114253,-0.91289,0.047341,0.199441,0.353996,-0.044299,0.0,1.0,1.0


In [4]:
def windows(dataset, window_size=400, stride=200):
    """
    Slice dataset into overlapping windows of fixed length.

    The function slides over the rows of ``dataset`` in their stored order and
    does not look at any recording/subject boundary, so a window can contain
    rows with different 'act' values; such a window gets the majority label.

    Args:
        dataset (pd.DataFrame): Time-series data with sensor features and 'act' label column.
            Every column other than 'act' is treated as a feature.
        window_size (int): Number of samples per window.
        stride (int): Step size between windows.

    Returns:
        tuple: ``(X, Y)`` where
            X (pd.DataFrame) has one row per window. Each row is the window
            flattened feature-by-feature: all ``window_size`` samples of the first
            feature, then all samples of the second feature, and so on.
            Y (pd.Series, name 'act') holds the most frequent label of each window
            (the smallest label wins a tie).
        Both are empty when ``dataset`` has fewer than ``window_size`` rows.

    Raises:
        ValueError: if ``window_size`` or ``stride`` is not positive.
    """
    if window_size <= 0 or stride <= 0:
        raise ValueError("window_size and stride must be positive integers")

    features = dataset.drop(columns=['act']).values
    # Cast once up front; np.bincount needs non-negative integers.
    labels = dataset['act'].values.astype(int)

    windowed_data = []
    windowed_labels = []

    # The range bound guarantees every window is complete, so no length check
    # is needed inside the loop.
    for start in range(0, len(dataset) - window_size + 1, stride):
        end = start + window_size

        # Transpose before flattening so each feature's samples stay contiguous.
        windowed_data.append(features[start:end].T.flatten())
        # Use the most frequent label in the window as the label
        windowed_labels.append(np.bincount(labels[start:end]).argmax())

    # Build the outputs from lists so an empty result stays a valid, empty frame/series
    X = pd.DataFrame(windowed_data)
    Y = pd.Series(windowed_labels, name='act')

    return X, Y

In [5]:
class MotionSense(Dataset):
    """
    Map-style PyTorch dataset over windowed features and their labels.

    Args:
        X: pandas object whose ``.values`` holds one row per sample.
        Y: pandas object whose ``.values`` holds one label per sample.
        transform: Optional callable applied to the feature tensor of a sample
            each time it is fetched. Labels are never transformed.

    Raises:
        ValueError: if X and Y do not contain the same number of samples.
    """

    def __init__(self, X, Y, transform=None):
        if len(X) != len(Y):
            raise ValueError(
                f"X and Y must have the same number of samples, got {len(X)} and {len(Y)}"
            )
        # Tensors keep the dtype of the underlying numpy arrays; callers cast
        # with .float()/.long() where a specific dtype is required.
        self.X = torch.tensor(X.values)
        self.Y = torch.tensor(Y.values)
        self.transform = transform

    def __getitem__(self, index):
        """Return the ``(features, label)`` pair stored at ``index``."""
        if torch.is_tensor(index):
            index = int(index.item())

        x = self.X[index]
        y = self.Y[index]

        # The transform was previously stored but silently ignored.
        if self.transform is not None:
            x = self.transform(x)

        return x, y

    def __len__(self):
        """Number of samples in the dataset."""
        return len(self.X)

In [6]:
# Hold out whole subjects, so no subject contributes rows to both train and test
unique_sub_ids = dataset['sub_id'].unique()
train_subs, test_subs = train_test_split(
    unique_sub_ids,
    test_size=0.15,
    random_state=12,
)

train_df = dataset.loc[dataset['sub_id'].isin(train_subs)].reset_index(drop=True)
test_df = dataset.loc[dataset['sub_id'].isin(test_subs)].reset_index(drop=True)

# Keep only the 12 sensor channels (attitude uses roll/pitch/yaw, the rest x/y/z)
feature_cols = ['attitude.' + axis for axis in ('roll', 'pitch', 'yaw')] + [
    sensor + '.' + axis
    for sensor in ('gravity', 'rotationRate', 'userAcceleration')
    for axis in ('x', 'y', 'z')
]
filtered_train_df = train_df[[*feature_cols, 'act']]
filtered_test_df = test_df[[*feature_cols, 'act']]

# Cut both partitions into 400-sample windows that advance by 200 samples
X_train, Y_train = windows(filtered_train_df, window_size=400, stride=200)
X_test, Y_test = windows(filtered_test_df, window_size=400, stride=200)

# Cross-validation folds are drawn from the training windows only
ts_cv = TimeSeriesSplit(n_splits=5, gap=20, test_size=1000)

# Materialise the (train_idx, val_idx) index pairs so they can be reused later
all_splits = [(fit_idx, holdout_idx) for fit_idx, holdout_idx in ts_cv.split(X_train, Y_train)]

# # 4. For each split, you get train_idx, val_idx within training data
# for train_idx, val_idx in all_splits:
#     X_tr = X_train.iloc[train_idx].reset_index(drop=True)
#     Y_tr = Y_train.iloc[train_idx].reset_index(drop=True)
#     X_val = X_train.iloc[val_idx].reset_index(drop=True)
#     Y_val = Y_train.iloc[val_idx].reset_index(drop=True)

#     # Create datasets and loaders for training and validation here
#     train_ds = MotionSense(X_tr, Y_tr)
#     val_ds = MotionSense(X_val, Y_val)

#     train_dl = DataLoader(train_ds, batch_size=32, shuffle=True)
#     val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)

# test_ds = MotionSense(X_test, Y_test)
# test_dl = DataLoader(test_dataset, batch_size=32, shuffle=False)

# # Here is an example of the TimeSeriesSplit. It does create cross-validation data by default. 
# ts_cv = TimeSeriesSplit(
#     n_splits=5,
#     gap=20,
#     #max_train_size=10000,
#     test_size=1000
# )

# all_splits = list(ts_cv.split(X ,Y)) # These are indices, not the data itself.

# # Choose the first split for example
# train_idx, test_idx = all_splits[0]

# X_train = X.iloc[train_idx].reset_index(drop=True)
# Y_train = Y.iloc[train_idx].reset_index(drop=True)
# X_test = X.iloc[test_idx].reset_index(drop=True)
# Y_test = Y.iloc[test_idx].reset_index(drop=True)

In [7]:
# train_df.head()
# test_df.head()

In [8]:
# train_dataset = MotionSense(X_train, Y_train)
# test_dataset = MotionSense(X_test, Y_test)

# train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
# test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [9]:
# # Define the improved fully connected network
# class FCNet(nn.Module):
#     def __init__(self, input_size, hidden_size, num_classes):
#         super(FCNet, self).__init__()
#         self.fc1 = nn.Linear(input_size, hidden_size)
#         self.bn1 = nn.BatchNorm1d(hidden_size)
#         self.dropout1 = nn.Dropout(0.3)
        
#         self.fc2 = nn.Linear(hidden_size, hidden_size // 2)
#         self.bn2 = nn.BatchNorm1d(hidden_size // 2)
#         self.dropout2 = nn.Dropout(0.3)
        
#         self.fc3 = nn.Linear(hidden_size // 2, num_classes)
#         self.relu = nn.ReLU()

#     def forward(self, x):
#         x = x.view(x.size(0), -1)  # Flatten input
#         x = self.fc1(x)
#         x = self.bn1(x)
#         x = self.relu(x)
#         x = self.dropout1(x)
        
#         x = self.fc2(x)
#         x = self.bn2(x)
#         x = self.relu(x)
#         x = self.dropout2(x)
        
#         x = self.fc3(x)
#         return x

# # Parameters - adjust based on your dataset's window size and features
# input_size = 12 * 400  # 6 sensors * 400 time steps
# hidden_size = 200     # Increased for better representation
# num_classes = 6       # Number of activity classes

# # Initialize model, loss, and optimizer with weight decay for regularization
# model = FCNet(input_size, hidden_size, num_classes)
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

# # Example training loop
# num_epochs = 20  # Extended training

# train_losses = []
# train_accuracies = []

# for epoch in range(num_epochs):
#     model.train()
#     running_loss = 0.0
#     correct = 0
#     total = 0

#     for inputs, labels in train_loader:
#         optimizer.zero_grad()
#         outputs = model(inputs.float())
#         loss = criterion(outputs, labels.long())
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

#     epoch_loss = running_loss / len(train_loader)
#     epoch_acc = 100 * correct / total

#     train_losses.append(epoch_loss)
#     train_accuracies.append(epoch_acc)

#     print(f"Epoch {epoch+1}: Loss = {epoch_loss:.4f}, Accuracy = {epoch_acc:.2f}%")

In [10]:
# Define the improved fully connected network
class FCNet(nn.Module):
    """
    Fully connected classifier: two hidden blocks followed by a linear output layer.

    Each hidden block is Linear -> BatchNorm1d -> ReLU -> Dropout(0.3). The second
    hidden layer has ``hidden_size // 2`` units. The network returns raw logits.

    Args:
        input_size: Number of values per sample after flattening.
        hidden_size: Width of the first hidden layer.
        num_classes: Number of output logits.
    """

    def __init__(self, input_size, hidden_size, num_classes):
        super().__init__()
        narrow_size = hidden_size // 2

        # First hidden block
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.bn1 = nn.BatchNorm1d(hidden_size)
        self.dropout1 = nn.Dropout(0.3)

        # Second hidden block
        self.fc2 = nn.Linear(hidden_size, narrow_size)
        self.bn2 = nn.BatchNorm1d(narrow_size)
        self.dropout2 = nn.Dropout(0.3)

        # Output layer and the activation shared by both hidden blocks
        self.fc3 = nn.Linear(narrow_size, num_classes)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Flatten each sample to one vector and return the class logits."""
        flat = x.view(x.size(0), -1)
        hidden = self.dropout1(self.relu(self.bn1(self.fc1(flat))))
        hidden = self.dropout2(self.relu(self.bn2(self.fc2(hidden))))
        return self.fc3(hidden)

# Parameters - adjust based on your dataset's window size and features
input_size = 12 * 400  # 12 feature columns (4 sensor types * 3 axes) * 400 samples per window
hidden_size = 100     # Width of the first hidden layer; FCNet uses hidden_size // 2 for the second
num_classes = 6       # Number of activity classes

In [11]:
num_epochs = 50

best_val_acc = 0
best_model_state = None

# Train a fresh model on every cross-validation fold and remember the weights of
# the fold whose final-epoch validation accuracy is highest.
for fold, (train_idx, val_idx) in enumerate(all_splits):
    print(f"Fold {fold + 1}")

    X_tr = X_train.iloc[train_idx].reset_index(drop=True)
    Y_tr = Y_train.iloc[train_idx].reset_index(drop=True)
    X_val = X_train.iloc[val_idx].reset_index(drop=True)
    Y_val = Y_train.iloc[val_idx].reset_index(drop=True)

    train_ds = MotionSense(X_tr, Y_tr)
    val_ds = MotionSense(X_val, Y_val)

    # BatchNorm1d cannot compute batch statistics from a single sample in
    # training mode, so drop the trailing batch only when it would hold exactly
    # one sample. All other fold sizes are batched exactly as before.
    train_dl = DataLoader(
        train_ds,
        batch_size=32,
        shuffle=True,
        drop_last=(len(train_ds) > 1 and len(train_ds) % 32 == 1),
    )
    val_dl = DataLoader(val_ds, batch_size=32, shuffle=False)

    model = FCNet(input_size, hidden_size, num_classes)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    for epoch in range(num_epochs):
        model.train()
        running_loss = 0
        correct = 0
        total = 0

        for inputs, labels in train_dl:
            optimizer.zero_grad()
            outputs = model(inputs.float())
            loss = criterion(outputs, labels.long())
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # argmax is not differentiable, so no `.data` detour is needed
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

        # Mean of the per-batch losses, and accuracy over all samples seen this epoch
        train_loss = running_loss / len(train_dl)
        train_acc = 100 * correct / total

        # Validation
        model.eval()
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for inputs, labels in val_dl:
                outputs = model(inputs.float())
                predicted = outputs.argmax(dim=1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        val_acc = 100 * val_correct / val_total
        print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Train Acc={train_acc:.2f}%, Val Acc={val_acc:.2f}%")

    # Track best validation accuracy to select the best fold model.
    # `val_acc` is the value from the last epoch of this fold.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() hands back the live parameter tensors; clone them so the
        # snapshot cannot change if this model is ever trained further.
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }

Fold 1
Epoch 1: Train Loss=1.3445, Train Acc=52.96%, Val Acc=46.50%
Epoch 2: Train Loss=0.9815, Train Acc=70.25%, Val Acc=50.20%
Epoch 3: Train Loss=0.7648, Train Acc=79.81%, Val Acc=52.00%
Epoch 4: Train Loss=0.5792, Train Acc=88.88%, Val Acc=59.60%
Epoch 5: Train Loss=0.4210, Train Acc=93.35%, Val Acc=60.50%
Epoch 6: Train Loss=0.3065, Train Acc=96.86%, Val Acc=62.20%
Epoch 7: Train Loss=0.2472, Train Acc=96.74%, Val Acc=67.80%
Epoch 8: Train Loss=0.1984, Train Acc=97.58%, Val Acc=57.90%
Epoch 9: Train Loss=0.1595, Train Acc=98.19%, Val Acc=60.90%
Epoch 10: Train Loss=0.1473, Train Acc=98.31%, Val Acc=68.00%
Epoch 11: Train Loss=0.1178, Train Acc=98.19%, Val Acc=68.70%
Epoch 12: Train Loss=0.1051, Train Acc=98.31%, Val Acc=68.40%
Epoch 13: Train Loss=0.0924, Train Acc=98.43%, Val Acc=64.40%
Epoch 14: Train Loss=0.0860, Train Acc=98.55%, Val Acc=68.10%
Epoch 15: Train Loss=0.0724, Train Acc=98.91%, Val Acc=65.40%
Epoch 16: Train Loss=0.0695, Train Acc=99.03%, Val Acc=64.80%
Epoch 17: 

Epoch 34: Train Loss=0.0338, Train Acc=98.94%, Val Acc=92.70%
Epoch 35: Train Loss=0.0370, Train Acc=99.01%, Val Acc=91.40%
Epoch 36: Train Loss=0.0392, Train Acc=99.08%, Val Acc=94.90%
Epoch 37: Train Loss=0.0410, Train Acc=98.87%, Val Acc=95.00%
Epoch 38: Train Loss=0.0289, Train Acc=99.29%, Val Acc=85.80%
Epoch 39: Train Loss=0.0264, Train Acc=99.22%, Val Acc=91.50%
Epoch 40: Train Loss=0.0291, Train Acc=99.22%, Val Acc=93.00%
Epoch 41: Train Loss=0.0378, Train Acc=98.83%, Val Acc=92.80%
Epoch 42: Train Loss=0.0326, Train Acc=98.90%, Val Acc=92.80%
Epoch 43: Train Loss=0.0409, Train Acc=98.73%, Val Acc=90.40%
Epoch 44: Train Loss=0.0360, Train Acc=99.01%, Val Acc=95.00%
Epoch 45: Train Loss=0.0552, Train Acc=98.66%, Val Acc=92.10%
Epoch 46: Train Loss=0.0605, Train Acc=98.27%, Val Acc=92.60%
Epoch 47: Train Loss=0.0349, Train Acc=98.76%, Val Acc=91.30%
Epoch 48: Train Loss=0.0405, Train Acc=98.76%, Val Acc=92.20%
Epoch 49: Train Loss=0.0302, Train Acc=99.12%, Val Acc=93.00%
Epoch 50

In [12]:
print(f"Best validation accuracy from CV: {best_val_acc:.2f}%")
print("Retraining on full training data...")

full_train_ds = MotionSense(X_train, Y_train)
# BatchNorm1d cannot compute batch statistics from a single sample in training
# mode, so drop the trailing batch only when it would hold exactly one sample.
full_train_dl = DataLoader(
    full_train_ds,
    batch_size=32,
    shuffle=True,
    drop_last=(len(full_train_ds) > 1 and len(full_train_ds) % 32 == 1),
)

model = FCNet(input_size, hidden_size, num_classes)
# Warm-start from the best fold when one was selected; otherwise keep the fresh
# initialisation instead of failing on load_state_dict(None).
if best_model_state is not None:
    model.load_state_dict(best_model_state)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

for epoch in range(num_epochs):
    model.train()
    running_loss = 0
    correct = 0
    total = 0

    for inputs, labels in full_train_dl:
        optimizer.zero_grad()
        outputs = model(inputs.float())
        loss = criterion(outputs, labels.long())
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        # argmax is not differentiable, so no `.data` detour is needed
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Mean of the per-batch losses, and accuracy over all samples seen this epoch
    epoch_loss = running_loss / len(full_train_dl)
    epoch_acc = 100 * correct / total

    print(f"Retrain Epoch {epoch+1}: Loss = {epoch_loss:.4f}, Accuracy = {epoch_acc:.2f}%")

Best validation accuracy from CV: 95.60%
Retraining on full training data...
Retrain Epoch 1: Loss = 0.0891, Accuracy = 97.66%
Retrain Epoch 2: Loss = 0.0561, Accuracy = 98.41%
Retrain Epoch 3: Loss = 0.0484, Accuracy = 98.46%
Retrain Epoch 4: Loss = 0.0504, Accuracy = 98.48%
Retrain Epoch 5: Loss = 0.0380, Accuracy = 98.82%
Retrain Epoch 6: Loss = 0.0355, Accuracy = 98.87%
Retrain Epoch 7: Loss = 0.0419, Accuracy = 98.73%
Retrain Epoch 8: Loss = 0.0330, Accuracy = 99.03%
Retrain Epoch 9: Loss = 0.0342, Accuracy = 99.06%
Retrain Epoch 10: Loss = 0.0369, Accuracy = 98.75%
Retrain Epoch 11: Loss = 0.0362, Accuracy = 98.87%
Retrain Epoch 12: Loss = 0.0325, Accuracy = 99.04%
Retrain Epoch 13: Loss = 0.0365, Accuracy = 98.92%
Retrain Epoch 14: Loss = 0.0294, Accuracy = 99.14%
Retrain Epoch 15: Loss = 0.0361, Accuracy = 98.84%
Retrain Epoch 16: Loss = 0.0313, Accuracy = 99.01%
Retrain Epoch 17: Loss = 0.0332, Accuracy = 98.84%
Retrain Epoch 18: Loss = 0.0284, Accuracy = 99.16%
Retrain Epoch 

In [13]:
# Score the retrained model on the windows of the held-out subjects
test_ds = MotionSense(X_test, Y_test)
test_dl = DataLoader(test_ds, batch_size=32, shuffle=False)

model.eval()  # inference behaviour for dropout and batch norm
test_correct = test_total = 0

with torch.no_grad():
    for inputs, labels in test_dl:
        outputs = model(inputs.float())
        # Index of the largest logit in each row is the predicted class
        predicted = outputs.max(dim=1).indices
        test_total += labels.shape[0]
        test_correct += int((predicted == labels).sum())

test_acc = 100 * test_correct / test_total
print(f"Test Accuracy: {test_acc:.2f}%")

Test Accuracy: 85.09%
