<a href="https://colab.research.google.com/github/pitwegner/UTS_ML2019_Project/blob/master/A2_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import glob
import numpy as np
import math
# Where the project data is read from: 'google' mounts Google Drive,
# 'github' downloads and unpacks the repository archive. Any other value
# leaves `location_prefix` empty, i.e. paths are relative to the working dir.
DATA_SOURCE = 'google'
location_prefix = ''
if DATA_SOURCE == 'github':
    import urllib.request
    import zipfile
    dl_location = '/content/'
    # Fetch the repository snapshot and extract it next to the archive.
    archive_path, _ = urllib.request.urlretrieve(
        'https://github.com/pitwegner/UTS_ML2019_Project/archive/master.zip',
        filename=dl_location + 'master.zip',
    )
    with zipfile.ZipFile(archive_path, 'r') as archive:
        archive.extractall(dl_location)
    location_prefix = '/content/UTS_ML2019_Project-master/'
elif DATA_SOURCE == 'google':
    from google.colab import drive
    drive.mount('/content/drive')
    location_prefix = '/content/drive/My Drive/'

In [None]:
# Activity labels for segments and nurse ID.
activities = pd.read_csv(location_prefix + "train/activities_train.csv")
print("Reading Mocap Data")
bar_length = 50
# Sort the paths: glob order is filesystem dependent, and the row order of
# `mocap` feeds into the seeded shuffles further down.
files = sorted(glob.glob(location_prefix + "train/mocap/segment*.csv"))
if not files:
    raise FileNotFoundError(
        "No mocap segment files found under " + location_prefix + "train/mocap/"
    )
# Collect the per-segment frames and concatenate once at the end. Growing a
# DataFrame inside the loop copies all previous rows on every iteration, and
# DataFrame.append no longer exists in pandas >= 2.0.
segment_frames = []
for i, mf in enumerate(files, start=1):
    progress = math.ceil(bar_length * i / len(files))
    print("\r", "[" + "=" * progress + " " * (bar_length - progress) + "] " + "{0:.2f}".format(100 * i / len(files)) + '%', end="")
    # Fill gaps forward, then backward; columns that are entirely empty become 0.
    segment_frames.append(pd.read_csv(mf).ffill().bfill().fillna(0))
# A fresh 0..n-1 row index is required because windows are later addressed by
# absolute row position; the elapsed-time column is not used as a feature.
mocap = pd.concat(segment_frames, ignore_index=True).drop(columns=['time_elapsed'])

In [None]:
# Min-max scale every column, using the column-wise extrema of the whole frame.
column_min = mocap.min()
column_range = mocap.max() - column_min
mocap_normalized = (mocap - column_min) / column_range
# segment_id is an identifier, not a feature: restore its unscaled values.
mocap_normalized['segment_id'] = mocap['segment_id']
mocap = mocap_normalized
# Sorted distinct activity ids; a label's position in this array is its class index.
activity_arr = np.sort(activities.activity_id.unique())

In [None]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
import torch.optim as optim

# Seed the torch and numpy global RNGs with the same fixed value (0).
for seed_rng in (torch.manual_seed, np.random.seed):
    seed_rng(0)

class Dataset(data.Dataset):
    """Row-window view over a mocap frame with per-segment activity labels.

    `train` is a DataFrame with a `segment_id` column plus feature columns;
    `labels` is a DataFrame with `segment_id` and `activity_id` columns.
    """

    def __init__(self, train, labels):
        self.data = train
        self.labels = labels

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Return `(X, y)` for the rows selected by `index`.

        `X` holds the selected rows without the `segment_id` column. `y` is
        `[class index, segment id]`, where the class index is the position of
        the segment's activity id in the module-level `activity_arr` and the
        segment id is the first distinct one found in the selection.
        In this file `index` is a row slice produced by the sampler.
        """
        window = self.data[index]
        sid = window.segment_id.unique()[0]
        segment_labels = self.labels[self.labels.segment_id == sid]
        aid = segment_labels.activity_id.values[0]
        class_index = activity_arr.tolist().index(aid)

        features = window.drop(columns=['segment_id']).values
        target = np.array([class_index, sid])
        return features, target

# Number of consecutive rows that make up one sampled window.
window_length = 200
# Wrap the normalised mocap frame together with the activity labels.
dataset = Dataset(train=mocap, labels=activities)

In [None]:
# For every subject, print how many segments exist per activity id
# (activity ids in ascending order).
for subject in activities.subject.unique():
    subject_rows = activities[activities.subject == subject]
    segment_counts = []
    for activity in np.sort(subject_rows.activity_id.unique()):
        n_segments = len(subject_rows[subject_rows.activity_id == activity].segment_id)
        segment_counts.append((activity, n_segments))
    print(subject, ":", segment_counts)

In [None]:
class SimpleCNN(torch.nn.Module):
    """One conv layer, 2x2 max-pooling and two fully connected layers.

    Layer sizes are derived from module-level state: `window_length` (rows per
    window), the feature count of `dataset`, and the number of distinct
    activity ids in `activities` (size of the output layer).
    """

    def __init__(self):
        super().__init__()

        # Convolution / classifier hyper-parameters.
        self.kernel_size = 3
        self.stride = 1
        self.padding = 1
        self.output_channels = 24
        self.hidden_parameters = 64

        # Spatial size after the convolution, per input dimension.
        n_features = dataset[0:1][0].shape[1]
        self.output_x = self._conv_output_size(window_length)
        self.output_y = self._conv_output_size(n_features)

        # The 2x2 pooling halves both spatial dimensions (rounding down).
        flat_features = self.output_channels * int(self.output_x / 2) * int(self.output_y / 2)
        n_classes = len(activities.activity_id.unique())

        self.conv1 = torch.nn.Conv2d(
            1,
            self.output_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
        )
        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = torch.nn.Linear(flat_features, self.hidden_parameters)
        self.fc2 = torch.nn.Linear(self.hidden_parameters, n_classes)

    def _conv_output_size(self, size):
        """Return the convolution output length for an input of length `size`."""
        return int((size - self.kernel_size + 2 * self.padding) / self.stride) + 1

    def forward(self, x):
        """Map a batch of single-channel windows to unnormalised class scores."""
        pooled = self.pool(F.relu(self.conv1(x)))
        # Flatten to the input width of the first fully connected layer.
        flat = pooled.view(-1, self.fc1.in_features)
        hidden = F.relu(self.fc1(flat))
        return self.fc2(hidden)

In [None]:
from torch.utils.data.sampler import Sampler    

class RandomWindowSampler(Sampler):
    """Yield row slices of `window_length` rows in random order.

    `indices` holds the start row of every window; each iteration visits all
    of them once, in a fresh random permutation drawn from torch's RNG.
    """

    def __init__(self, indices):
        self.indices = indices

    def __len__(self):
        """Return the number of windows."""
        return len(self.indices)

    def __iter__(self):
        # The permutation is drawn when the iterator is created; the slices
        # themselves are built lazily.
        order = torch.randperm(len(self.indices)).tolist()
        starts = (self.indices[position] for position in order)
        return (slice(start, start + window_length) for start in starts)
  

In [None]:
# PERSON_SPLIT=True holds out whole subjects for test/validation; otherwise
# 15% of the segments each go to test and validation, the rest to training.
PERSON_SPLIT = True
TEST_PERSON = 2
VAL_PERSON = 4


def _window_starts(sid):
    """Return every 50th row index of segment `sid`, leaving out the last `window_length` rows."""
    segment_rows = dataset.data[dataset.data.segment_id == sid]
    return list(segment_rows.index[0:-window_length:50])


if not PERSON_SPLIT:
    segments = dataset.data.segment_id.unique()
    split = int(np.floor(0.15 * len(segments)))
    np.random.shuffle(segments)
    test = segments[:split]
    val = segments[split:2 * split]
    train = segments[2 * split:]
    train_indices = [start for sid in train for start in _window_starts(sid)]
    val_indices = [start for sid in val for start in _window_starts(sid)]
    test_indices = [start for sid in test for start in _window_starts(sid)]
else:
    # Group the window starts by the subject that recorded the segment.
    indices = {}
    for sid in dataset.data.segment_id.unique():
        starts = _window_starts(sid)
        person = activities[activities.segment_id == sid].subject.item()
        indices.setdefault(person, []).extend(starts)
    test_indices = indices.pop(TEST_PERSON)
    val_indices = indices.pop(VAL_PERSON)
    # Everything that is left belongs to the training subjects.
    train_indices = [start for starts in indices.values() for start in starts]

train_sampler, val_sampler, test_sampler = (
    RandomWindowSampler(train_indices),
    RandomWindowSampler(val_indices),
    RandomWindowSampler(test_indices),
)

def get_train_loader(batch_size):
    """Return a DataLoader over `dataset` that draws `batch_size` windows per batch from `train_sampler`."""
    loader = torch.utils.data.DataLoader(
        dataset,
        sampler=train_sampler,
        batch_size=batch_size,
        num_workers=2,
    )
    return loader
  
# Fixed-size loaders for validation (128 windows per batch) and testing (4 per batch).
val_loader = torch.utils.data.DataLoader(
    dataset,
    sampler=val_sampler,
    batch_size=128,
    num_workers=2,
)
test_loader = torch.utils.data.DataLoader(
    dataset,
    sampler=test_sampler,
    batch_size=4,
    num_workers=2,
)

In [None]:
import time

def trainNet(net, batch_size, n_epochs, learning_rate):
    """Train `net` with Adam and cross-entropy loss, with early stopping.

    Reads the module-level `get_train_loader` and `val_loader`, appends one
    value per epoch to the module-level `train_losses` and `val_losses`, and
    stores a snapshot of the network with the lowest validation loss in the
    module-level `best_model`.

    Args:
        net: network to train (updated in place).
        batch_size: number of windows per training batch.
        n_epochs: maximum number of epochs.
        learning_rate: Adam learning rate.

    Training stops early once the validation loss has not improved for 10
    consecutive epochs.
    """
    import copy

    # Without this declaration the assignment below would only bind a local.
    global best_model

    print("===== HYPERPARAMETERS =====")
    print("batch_size =", batch_size)
    print("epochs =", n_epochs)
    print("learning_rate =", learning_rate)
    print("=" * 27)

    train_loader = get_train_loader(batch_size)
    n_batches = len(train_loader)

    loss = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    training_start_time = time.time()

    min_val_loss = math.inf
    worse_counter = 0
    print_every = 10

    for epoch in range(n_epochs):

        running_loss = 0.0
        total_train_loss = 0.0
        start_time = time.time()

        worse_counter += 1

        for i, (inputs, labels) in enumerate(train_loader, 0):
            # Add the channel dimension without assuming a batch or window
            # size, so the smaller final batch is trained on as well.
            inputs = inputs.unsqueeze(1)

            optimizer.zero_grad()

            # Forward pass, backward pass, optimize.
            # Column 0 of the labels is the class index, column 1 the segment id.
            outputs = net(inputs)
            loss_size = loss(outputs, labels[:, 0])
            loss_size.backward()
            optimizer.step()
            batch_loss = loss_size.item()
            running_loss += batch_loss
            total_train_loss += batch_loss

            # Print every 10th batch of an epoch
            if (i + 1) % (print_every) == 0:
                print("Epoch {}, {:d}% \t train_loss: {:.2f} took: {:.2f}s".format(epoch+1, int(100 * (i+1) / n_batches), running_loss / print_every, time.time() - start_time))
                running_loss = 0.0
                start_time = time.time()

        total_val_loss = 0.0
        # No gradients are needed for validation.
        with torch.no_grad():
            for inputs, labels in val_loader:
                val_outputs = net(inputs.unsqueeze(1))
                total_val_loss += loss(val_outputs, labels[:, 0]).item()

        loss_avg = total_val_loss / len(val_loader)
        print("Validation loss = {:.2f}".format(loss_avg))
        if loss_avg < min_val_loss:
            min_val_loss = loss_avg
            # Deep copy: a plain reference would keep changing as training continues.
            best_model = copy.deepcopy(net)
            worse_counter = 0

        val_losses.append(loss_avg)
        train_losses.append(total_train_loss / n_batches)
        if worse_counter >= 10:
            break

    print("Training finished, took {:.2f}s".format(time.time() - training_start_time))

In [None]:
# Per-epoch loss histories; trainNet appends to these module-level lists.
train_losses, val_losses = [], []
CNN = SimpleCNN()
best_model = CNN
# .double() converts the network's parameters to float64 in place and returns the network.
trainNet(CNN.double(), batch_size=32, n_epochs=150, learning_rate=1e-5)

In [None]:
import plotly.graph_objects as go
# Plot the per-epoch training and validation loss curves.
fig = go.Figure()
for trace_name, losses in (('train_loss', train_losses), ('val_loss', val_losses)):
    epochs = np.arange(len(losses))
    fig.add_trace(go.Scatter(x=epochs, y=losses, mode='lines', name=trace_name))
fig.show()

In [None]:
from scipy import stats

# Evaluate the final network (CNN) on the test windows: per-window confusion
# matrix plus a majority vote per segment.
np.set_printoptions(suppress=True)
# One row/column per class index (position in activity_arr); rows are the
# predicted class, columns the true class.
n_classes = len(activity_arr)
confusion_matrix = np.zeros((n_classes, n_classes))
votes = {}
print("Starting Test Run")
eval_model = CNN.double()
with torch.no_grad():
    # Unpacking in the loop header avoids rebinding the imported `data` module.
    for i, (inputs, labels) in enumerate(test_loader, 0):
        # unsqueeze adds the channel dimension for any batch size, so the
        # smaller final batch is evaluated too.
        preds = eval_model(inputs.unsqueeze(1)).argmax(1).numpy()
        targets = labels[:, 0].numpy()
        for sid, pred in zip(labels[:, 1].tolist(), preds.tolist()):
            votes.setdefault(sid, []).append(pred)
        # A fancy-indexed `+=` counts a repeated (pred, target) pair only once
        # per batch; np.add.at accumulates every occurrence.
        np.add.at(confusion_matrix, (preds, targets), 1)
        print("\r", "{0:.2f}%".format(100 * i / len(test_loader)), end="")
for sid in votes:
    label = dataset.labels[dataset.labels.segment_id == sid].activity_id.values[0]
    # Most frequent predicted class index (the smallest one on ties), mapped
    # back to an activity id so it is comparable with the true label.
    majority = int(np.bincount(votes[sid]).argmax())
    votes[sid] = [activity_arr[majority], label]
print("\n")
print(votes)
print(confusion_matrix)
print("{0:.2f}%".format(100 * np.trace(confusion_matrix)/np.sum(confusion_matrix)))

In [None]:
from scipy import stats

# Evaluate the best checkpoint (best_model) on the test windows: per-window
# confusion matrix plus a majority vote per segment.
np.set_printoptions(suppress=True)
# One row/column per class index (position in activity_arr); rows are the
# predicted class, columns the true class.
n_classes = len(activity_arr)
confusion_matrix = np.zeros((n_classes, n_classes))
votes = {}
print("Starting Test Run")
eval_model = best_model.double()
with torch.no_grad():
    # Unpacking in the loop header avoids rebinding the imported `data` module.
    for i, (inputs, labels) in enumerate(test_loader, 0):
        # unsqueeze adds the channel dimension for any batch size, so the
        # smaller final batch is evaluated too.
        preds = eval_model(inputs.unsqueeze(1)).argmax(1).numpy()
        targets = labels[:, 0].numpy()
        for sid, pred in zip(labels[:, 1].tolist(), preds.tolist()):
            votes.setdefault(sid, []).append(pred)
        # A fancy-indexed `+=` counts a repeated (pred, target) pair only once
        # per batch; np.add.at accumulates every occurrence.
        np.add.at(confusion_matrix, (preds, targets), 1)
        print("\r", "{0:.2f}%".format(100 * i / len(test_loader)), end="")
for sid in votes:
    label = dataset.labels[dataset.labels.segment_id == sid].activity_id.values[0]
    # Most frequent predicted class index (the smallest one on ties), mapped
    # back to an activity id so it is comparable with the true label.
    majority = int(np.bincount(votes[sid]).argmax())
    votes[sid] = [activity_arr[majority], label]
print("\n")
print(votes)
print(confusion_matrix)
print("{0:.2f}%".format(100 * np.trace(confusion_matrix)/np.sum(confusion_matrix)))