In [1]:
import pandas as pd

df = pd.read_csv('./data/run_or_walk.csv')

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
df.head()

Unnamed: 0,date,time,username,wrist,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
0,2017-6-30,13:51:15:847724020,viktor,0,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
1,2017-6-30,13:51:16:246945023,viktor,0,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2,2017-6-30,13:51:16:446233987,viktor,0,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
3,2017-6-30,13:51:16:646117985,viktor,0,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
4,2017-6-30,13:51:16:846738994,viktor,0,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [3]:
df.isnull().sum()

date              0
time              0
username          0
wrist             0
activity          0
acceleration_x    0
acceleration_y    0
acceleration_z    0
gyro_x            0
gyro_y            0
gyro_z            0
dtype: int64

In [4]:
# Drop unnecessary cols
df = df.drop(columns=['username', 'wrist'])
# Truncate timestamp to microsecs since datetime only support up till microsecs
df['time'] = df['time'].apply(lambda x: x[:-3])
df['datetime'] = pd.to_datetime(df['date'] + ' ' + df['time'], format='%Y-%m-%d %H:%M:%S:%f')
df = df.drop(columns=['date', 'time'])
df.set_index('datetime', inplace=True)
df.head()

Unnamed: 0_level_0,activity,acceleration_x,acceleration_y,acceleration_z,gyro_x,gyro_y,gyro_z
datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2017-06-30 13:51:15.847724,0,0.265,-0.7814,-0.0076,-0.059,0.0325,-2.9296
2017-06-30 13:51:16.246945,0,0.6722,-1.1233,-0.2344,-0.1757,0.0208,0.1269
2017-06-30 13:51:16.446233,0,0.4399,-1.4817,0.0722,-0.9105,0.1063,-2.4367
2017-06-30 13:51:16.646117,0,0.3031,-0.8125,0.0888,0.1199,-0.4099,-2.9336
2017-06-30 13:51:16.846738,0,0.4814,-0.9312,0.0359,0.0527,0.4379,2.4922


In [5]:
import numpy as np
from scipy.stats import iqr, skew, kurtosis

def rms(x):
    return np.sqrt(np.mean(x**2))
chunk_size = 54
# Filter away chunks where activity is not all the same value (some 10s intervals are 53 rows long, others are 54)
filtered_df = df.groupby(np.arange(len(df)) // chunk_size).filter(lambda x: len(x['activity'].unique()) == 1)
transformed_df = filtered_df.groupby(np.arange(len(filtered_df)) // chunk_size).agg({
    'acceleration_x': ['median', iqr, 'var', skew, kurtosis, rms],
    'acceleration_y': ['median', iqr, 'var', skew, kurtosis, rms],
    'acceleration_z': ['median', iqr, 'var', skew, kurtosis, rms],
    'gyro_x': ['median', iqr, 'var', skew, kurtosis, rms],
    'gyro_y': ['median', iqr, 'var', skew, kurtosis, rms],
    'gyro_z': ['median', iqr, 'var', skew, kurtosis, rms],
    'activity': lambda x: x.iloc[0]
})
transformed_df.columns = ['_'.join(col).strip() for col in transformed_df.columns.values]
transformed_df = transformed_df.rename(columns={'activity_<lambda>': 'activity'})

In [6]:
transformed_df['activity'].value_counts()

activity
1    811
0    808
Name: count, dtype: int64

In [7]:
y = transformed_df['activity'].to_numpy()
x = transformed_df.drop(['activity'], axis=1).to_numpy()

In [8]:
from sklearn.model_selection import train_test_split

# Stratified sampling to get good ratio of class 0 vs class 1
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42, stratify=y)

In [9]:
unique, counts = np.unique(y_train, return_counts=True)
print(np.asarray((unique, counts)).T)

[[  0 646]
 [  1 649]]


In [10]:
unique, counts = np.unique(y_test, return_counts=True)
print(np.asarray((unique, counts)).T)

[[  0 162]
 [  1 162]]


In [11]:
import torch
from torch.utils.data import TensorDataset, DataLoader

x_train = torch.from_numpy(x_train).to(torch.float32)
x_test = torch.from_numpy(x_test).to(torch.float32)
y_train = torch.from_numpy(y_train).to(torch.long)
y_test = torch.from_numpy(y_test).to(torch.long)

train_dataset = TensorDataset(x_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

In [12]:
import torch
import torch.nn as nn

torch.manual_seed(42)
class Model(nn.Module):
    def __init__(self, input_size, hidden_size1, hidden_size2, output_size, dropout_prob=0.2):
        super(Model, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size1)
        self.layer2 = nn.Linear(hidden_size1, hidden_size2)
        self.layer3 = nn.Linear(hidden_size2, output_size)
        self.dropout = nn.Dropout(dropout_prob)
        self.relu = nn.ReLU()


    def forward(self, x):
        x = self.relu(self.layer1(x))
        x = self.dropout(x)
        x = self.relu(self.layer2(x))
        x = self.dropout(x)
        x = self.layer3(x)
        return x

In [13]:
INPUT_SIZE = x_train.shape[1]
HIDDEN_SIZE1 = 32
HIDDEN_SIZE2 = 32
OUTPUT_SIZE = 2
DROPOUT_PROB = 0.2

model = Model(INPUT_SIZE, HIDDEN_SIZE1, HIDDEN_SIZE2, OUTPUT_SIZE, DROPOUT_PROB)

In [14]:
import torch.optim as optim
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

NUM_EPOCHS = 5
LEARNING_RATE = 0.01

def train(model, train_loader, optimizer, criterion):
    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for inputs, targets in train_loader:
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        _, predicted = outputs.max(1)
        total += targets.size(0)
        correct += predicted.eq(targets).sum().item()

    train_loss = running_loss / len(train_loader)
    train_accuracy = 100. * correct / total

    return train_loss, train_accuracy

def evaluate(model, x_test, y_test):
    print(f'Evaluating model performance for test set of size {len(x_test)}')
    model.eval()
    with torch.no_grad():
        y_pred = model(x_test)
        _, y_pred = torch.max(y_pred, 1)
    print(f'Accuracy: {100. * accuracy_score(y_test, y_pred)}%')
    print(f'Confusion matrix:\n {confusion_matrix(y_test, y_pred)}')
    print(f'Classification report:\n {classification_report(y_test, y_pred)}')

In [15]:
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

for epoch in range(NUM_EPOCHS):
    train_loss, train_accuracy = train(model, train_loader, optimizer, criterion)
    print(f'Epoch {epoch+1}/{NUM_EPOCHS}, Train Loss: {train_loss:.4f}, Train Accuracy: {train_accuracy:.2f}%')

Epoch 1/5, Train Loss: 0.1698, Train Accuracy: 92.74%
Epoch 2/5, Train Loss: 0.0156, Train Accuracy: 99.69%
Epoch 3/5, Train Loss: 0.0011, Train Accuracy: 100.00%
Epoch 4/5, Train Loss: 0.0050, Train Accuracy: 99.77%
Epoch 5/5, Train Loss: 0.0029, Train Accuracy: 99.85%


In [16]:
evaluate(model, x_test, y_test)

Evaluating model performance for test set of size 324
Accuracy: 100.0%
Confusion matrix:
 [[162   0]
 [  0 162]]
Classification report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       162
           1       1.00      1.00      1.00       162

    accuracy                           1.00       324
   macro avg       1.00      1.00      1.00       324
weighted avg       1.00      1.00      1.00       324



In [17]:
import os
from datetime import datetime

os.makedirs('models', exist_ok=True)
dt_string = datetime.now().strftime("%d_%m_%Y-%H_%M_%S")
torch.save(model.state_dict(), f'models/mlp_{dt_string}.pt')

In [22]:
model_params_filename = f'models/mlp_{dt_string}_params.txt'
with open(model_params_filename, 'w+') as file:
    for name, param in model.named_parameters():
        file.write(f'Layer: {name}\n')
        # Convert the parameter values to a NumPy array
        param_array = param.data.cpu().numpy()
        
        if "weight" in name:
            for row in param_array:
                file.write('{')
                for val in row:
                    file.write(f'%.6f, ' % val)
                file.write('}')
                file.write('\n')
        else: # bias
            file.write('{')
            for val in param_array:
                file.write(f'%.6f, ' % val)
            file.write('}')
            file.write('\n')
        file.write('\n')