<a href="https://colab.research.google.com/github/pitwegner/UTS_ML2019_Project/blob/master/A2_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import glob
import numpy as np
import math

DATA_SOURCE = 'local' # can be google, github, or local
location_prefix = 'data/' # path of data (or download location for github)

# Resolve location_prefix to the directory that actually holds the data files
if DATA_SOURCE == 'google':
    # Data lives in the user's Google Drive, mounted inside the Colab runtime
    from google.colab import drive
    drive.mount('/content/drive')
    location_prefix = '/content/drive/My Drive/' + location_prefix
elif DATA_SOURCE == 'github':
    # Download the repository archive and unpack it below the download location
    import os
    import urllib.request
    import zipfile
    dl_location = '/content/' + location_prefix
    # urlretrieve opens the target file directly, so the directory has to exist first
    os.makedirs(dl_location, exist_ok=True)
    filename, headers = urllib.request.urlretrieve('https://github.com/pitwegner/UTS_ML2019_Project/archive/master.zip', filename=dl_location + 'master.zip')
    with zipfile.ZipFile(filename, 'r') as zip_ref:
        zip_ref.extractall(dl_location)
    location_prefix = dl_location + 'UTS_ML2019_Project-master/data/'
elif DATA_SOURCE != 'local':
    # Fail early on a typo instead of silently reading from the local path
    raise ValueError("DATA_SOURCE must be 'google', 'github', or 'local', got {!r}".format(DATA_SOURCE))

In [None]:
# Read activity labels for segments and nurse id
activities = pd.read_csv(location_prefix + "activities_train.csv")
# Sorted unique activity ids; the position in this array is used as the class index
activity_arr = activities.activity_id.unique()
activity_arr.sort()

# Read Motion Capture Data
print("Reading Mocap Data")
bar_length = 50
# Sorted so the row order (and therefore the seeded split) does not depend on the OS listing order
files = sorted(glob.glob(location_prefix + "mocap/segment*.csv"))
if not files:
    raise FileNotFoundError("No mocap segment files found in " + location_prefix + "mocap/")

# Collect the per-segment frames and concatenate once: DataFrame.append was
# removed in pandas 2.0 and re-copies the accumulated frame on every call
segment_frames = []
for i, mf in enumerate(files, start=1):
    # Basic NaN value removal: fill forward, then backward, then zero for all-NaN columns
    segment_frames.append(pd.read_csv(mf).ffill().bfill().fillna(0))
    progress = math.ceil(bar_length * i / len(files))
    print("\r", "[" + "=" * progress + " " * (bar_length - progress) + "] " + "{0:.2f}".format(100 * i / len(files)) + '%', end="")

# Renumber rows 0..n-1 across all segments and drop time column, since constant frequency
mocap = pd.concat(segment_frames).reset_index(drop=True).drop(columns=['time_elapsed'])

In [None]:
#num_point = int((len(mocap.columns)-1)/3) #29 points
#num_sample = len(mocap) #1577775 samples

#mocap_nu = np.array(mocap)
#mocap_nu_id = mocap_nu[:, -1]
#mocap_nu_ft = mocap_nu[:, :-1]
#mocap_nu_ft = mocap_nu_ft.T
#mocap_nu_ft = mocap_nu_ft.reshape(num_point, 3, num_sample)

#total_dis = int(num_point * (num_point - 1) / 2) #406 distances
#mocap_dis = np.zeros((total_dis, num_sample))
#print('num_point:', num_point, 'total_dis:', total_dis,'num_sample:', num_sample)

In [None]:
#m = 0
#for l in range(0, num_point):
#    print('i:', l)                     
#    a1 = mocap_nu_ft[l, :, :]
#    for k in range(l + 1, num_point):
#        a2 = mocap_nu_ft[k, :, :]
#        mocap_dis[m, :] = np.sqrt(np.sum((a1 - a2)**2, axis=0, keepdims=True))
#        m += 1

In [None]:
#mocap_distance = np.zeros((num_sample, total_dis + 1)) #shape(1577775, 407)
#mocap_dist = mocap_dis.T #shape(1577775, 406)
#mocap_distance[:, :-1] = mocap_dist
#mocap_distance[:, -1] = mocap_nu_id
#mocap_distance = pd.DataFrame(mocap_distance)

In [None]:
# Min-max normalization of the feature columns (segment_id is an identifier and is left untouched)
feature_columns = mocap.columns.drop('segment_id')
column_min = mocap[feature_columns].min()
# A constant column has max == min; dividing by 1 instead of 0 maps it to 0 rather than NaN
column_range = (mocap[feature_columns].max() - column_min).replace(0, 1)
mocap_normalized = mocap.copy()
mocap_normalized[feature_columns] = (mocap[feature_columns] - column_min) / column_range
mocap = mocap_normalized

In [None]:
import torch
from torch.autograd import Variable
import torch.nn.functional as F
from torch.utils import data
import torch.optim as optim

# Seed the global PyTorch and NumPy random generators so that the window
# permutations (torch.randperm) and the segment shuffle (np.random.shuffle) are repeatable
torch.manual_seed(0)
np.random.seed(0)

class Dataset(data.Dataset):
    """Serves windows of motion-capture rows together with their label.

    Args:
        train: DataFrame of feature columns plus a ``segment_id`` column.
        labels: DataFrame with ``segment_id`` and ``activity_id`` columns.

    Items are requested with a ``slice`` of row positions. Each item is a pair
    ``(X, y)``: ``X`` holds the window's feature values (rows x features) and
    ``y`` is ``np.array([class_index, segment_id])``, where ``class_index`` is
    the position of the segment's activity id among the sorted unique
    activity ids of ``labels``.
    """

    def __init__(self, train, labels):
        self.labels = labels
        self.data = train

        # Precompute both lookups once instead of scanning the label frame
        # and the activity array on every item access.
        sorted_activity_ids = np.sort(labels.activity_id.unique()).tolist()
        self._class_index = {aid: pos for pos, aid in enumerate(sorted_activity_ids)}
        # First label row wins when a segment id occurs more than once
        first_rows = labels.drop_duplicates('segment_id')
        self._segment_activity = dict(zip(first_rows.segment_id.tolist(), first_rows.activity_id.tolist()))

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        # index is a slice of row positions; slice the frame only once
        window = self.data.iloc[index]
        X = window.drop(columns=['segment_id']).values
        # The label is taken from the first segment id present in the window
        sid = window.segment_id.unique()[0]
        aid = self._segment_activity[sid]
        y = np.array([self._class_index[aid], sid])

        return X, y

# One dataset over all mocap rows; the samplers defined later decide which windows go to which split
dataset = Dataset(mocap, activities)

In [None]:
# Overview: for every subject, how many segments exist per activity
print("Person", ":", "[(activity_id, count), ...]")
for person in activities.subject.unique():
    person_rows = activities[activities.subject == person]
    counts = []
    for activity in np.sort(person_rows.activity_id.unique()):
        n_segments = len(person_rows[person_rows.activity_id == activity].segment_id)
        counts.append((activity, n_segments))
    print(person, ":", counts)

In [None]:
from torch.utils.data.sampler import Sampler    

window_length = 200 # = 2*f


class RandomWindowSampler(Sampler):
    """Yields one ``slice`` of ``window_length`` rows per start index, in random order.

    Args:
        indices: sequence of window start positions.
    """

    def __init__(self, indices):
        self.indices = indices

    def __len__(self):
        return len(self.indices)

    def __iter__(self):
        # The permutation is drawn as soon as iteration is requested
        order = torch.randperm(len(self.indices))
        starts = (self.indices[position] for position in order)
        return (slice(start, start + window_length) for start in starts)
  

In [None]:
# Split data either randomly by segments or by person (as official test set introduces a new person)
PERSON_SPLIT = False
TEST_PERSON = 2
VAL_PERSON = 4


def _window_starts(segment_id):
    """Return the window start rows of one segment: every 50th row index, leaving out the last window_length rows."""
    segment_rows = dataset.data[dataset.data.segment_id == segment_id]
    return list(segment_rows.index[0:-window_length:50])


if PERSON_SPLIT:
    # Group the window starts by the subject who recorded the segment
    indices = {}
    for sid in dataset.data.segment_id.unique():
        p = activities[activities.segment_id == sid].subject.item()
        indices.setdefault(p, []).extend(_window_starts(sid))
    test_indices = indices.pop(TEST_PERSON)
    val_indices = indices.pop(VAL_PERSON)
    # Every remaining subject is used for training
    train_indices = [item for sublist in indices.values() for item in sublist]
else:
    segments = dataset.data.segment_id.unique()
    split = int(np.floor(0.15 * len(segments))) # 15%
    np.random.shuffle(segments)
    # 70% training, 15% validation, 15% testing
    test, val, train = segments[:split], segments[split:split+split], segments[split+split:]
    train_indices = [start for sid in train for start in _window_starts(sid)]
    val_indices = [start for sid in val for start in _window_starts(sid)]
    test_indices = [start for sid in test for start in _window_starts(sid)]

train_sampler = RandomWindowSampler(train_indices)
val_sampler = RandomWindowSampler(val_indices)
test_sampler = RandomWindowSampler(test_indices)


# Create data loaders to parallelize batch training to multiple cores
def get_train_loader(batch_size):
    """Build a DataLoader over the training windows with the given batch size."""
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size, sampler=train_sampler, num_workers=3)


val_loader = torch.utils.data.DataLoader(dataset, batch_size=128, sampler=val_sampler, num_workers=3)
test_loader = torch.utils.data.DataLoader(dataset, batch_size=4, sampler=test_sampler, num_workers=3)

In [None]:
class SimpleCNN(torch.nn.Module):
    """One convolution + max-pool followed by two fully connected layers.

    The layer sizes are derived from the module-level ``window_length``, the
    number of feature columns served by ``dataset`` and the number of distinct
    activity ids in ``activities``.
    """

    def __init__(self):
        super().__init__()

        # Convolution hyper-parameters
        self.kernel_size = 3
        self.stride = 1
        self.padding = 1
        self.output_channels = 24
        self.hidden_parameters = 64

        def conv_extent(size):
            # Output length of the convolution along one axis of the given input length
            return int((size - self.kernel_size + 2 * self.padding) / self.stride) + 1

        # Calculate output size after convolution
        self.output_x = conv_extent(window_length)
        self.output_y = conv_extent(dataset[0:1][0].shape[1])

        # The 2x2 pooling halves both axes before the feature maps are flattened
        flat_features = self.output_channels * int(self.output_x / 2) * int(self.output_y / 2)
        n_classes = len(activities.activity_id.unique())

        self.conv1 = torch.nn.Conv2d(
            1,
            self.output_channels,
            kernel_size=self.kernel_size,
            stride=self.stride,
            padding=self.padding,
        )
        self.pool = torch.nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.fc1 = torch.nn.Linear(flat_features, self.hidden_parameters)
        self.fc2 = torch.nn.Linear(self.hidden_parameters, n_classes)

    def forward(self, x):
        """Map a batch of single-channel windows to one score per activity class."""
        flat_features = self.output_channels * int(self.output_x / 2) * int(self.output_y / 2)
        feature_maps = self.pool(F.relu(self.conv1(x)))
        flattened = feature_maps.view(-1, flat_features)
        hidden = F.relu(self.fc1(flattened))
        return self.fc2(hidden)

In [None]:
import time

def trainNet(net, batch_size, n_epochs, learning_rate):
    """Train ``net`` with Adam and cross-entropy loss, stopping early on stalled validation loss.

    After every epoch the average loss over ``val_loader`` is computed. Whenever
    it improves on the best value so far, a copy of the network is stored in the
    module-level ``best_model``. The per-epoch average training and validation
    losses are appended to the module-level ``train_losses`` / ``val_losses``.
    Training stops after ``n_epochs`` epochs or once the validation loss has not
    improved for 10 epochs.

    Args:
        net: model to optimise in place; it is called on batches of shape
            (batch, 1, rows, columns).
        batch_size: batch size handed to ``get_train_loader``.
        n_epochs: maximum number of epochs.
        learning_rate: learning rate for the Adam optimizer.
    """
    import copy  # local import: only needed here to snapshot the best model

    # The snapshot must be visible to the cells that evaluate the model afterwards
    global best_model

    print("Training started")
    train_loader = get_train_loader(batch_size)
    n_batches = len(train_loader)

    # Select loss function and optimizer
    loss = torch.nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=learning_rate)

    min_val_loss = math.inf
    worse_counter = 0
    print_every = 25

    training_start_time = time.time()

    for epoch in range(n_epochs):

        running_loss = 0.0
        total_train_loss = 0.0
        start_time = time.time()

        # Counts epochs since the last improvement; reset below when the loss improves
        worse_counter += 1

        net.train()
        for i, (inputs, labels) in enumerate(train_loader, 0):
            # add a channel axis since we only have 1 channel
            inputs = inputs.unsqueeze(1)

            # Forward pass, backward pass, optimize
            optimizer.zero_grad()
            outputs = net(inputs)
            # column 0 of the label is the class index, column 1 the segment id
            loss_size = loss(outputs, labels[:, 0])
            loss_size.backward()
            optimizer.step()

            # Aggregate losses for plotting and printing
            batch_loss = loss_size.item()
            running_loss += batch_loss
            total_train_loss += batch_loss

            # Print average running loss every 25th batch of an epoch
            if (i + 1) % print_every == 0:
                print("Epoch {}, {:d}% \t train_loss: {:.2f} took: {:.2f}s".format(epoch + 1, int(100 * (i + 1) / n_batches), running_loss / print_every, time.time() - start_time))
                running_loss = 0.0
                start_time = time.time()

        # Run validation pass at end of epoch; no gradients are needed here
        total_val_loss = 0.0
        net.eval()
        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs = inputs.unsqueeze(1)
                val_outputs = net(inputs)
                total_val_loss += loss(val_outputs, labels[:, 0]).item()

        loss_avg = total_val_loss / len(val_loader)
        print("Validation loss = {:.2f}{}".format(loss_avg, ' (worse, {})'.format(worse_counter) if loss_avg >= min_val_loss else ' (better)'))
        if loss_avg < min_val_loss:
            min_val_loss = loss_avg
            worse_counter = 0

            # Save best model for testing. load_state_dict() returns a report of
            # missing/unexpected keys rather than the module, so snapshot the
            # network itself instead.
            best_model = copy.deepcopy(net)

        # Append average loss for plotting
        val_losses.append(loss_avg)
        train_losses.append(total_train_loss / n_batches)

        # Stop training if we haven't improved for 10 epochs
        if worse_counter >= 10:
            break

    # strftime needs a struct_time, not a number of seconds (the hour field wraps after 24h)
    elapsed = time.gmtime(time.time() - training_start_time)
    print("Training finished, took {}".format(time.strftime('%H:%M:%S', elapsed)))

In [None]:
# Fresh network plus the module-level lists that trainNet appends its per-epoch losses to
CNN = SimpleCNN()
train_losses = []
val_losses = []
# Until training provides something better, the test cell evaluates this network
best_model = CNN
# .double() converts the parameters to float64 (presumably to match the dtype of the inputs — confirm)
trainNet(CNN.double(), batch_size=32, n_epochs=150, learning_rate=0.000001)

In [None]:
# Loss visualization
import plotly.graph_objects as go
# One line per loss history, indexed by epoch
fig = go.Figure()
for trace_name, history in (('train_loss', train_losses), ('val_loss', val_losses)):
    epochs = np.arange(len(history))
    fig.add_trace(go.Scatter(x=epochs, y=history, mode='lines', name=trace_name))
fig.show()

In [None]:
from scipy import stats

np.set_printoptions(suppress=True)
# Rows are predicted classes, columns are true classes; sized from the label set
n_classes = len(activity_arr)
confusion_matrix = np.zeros((n_classes, n_classes))
# segment id -> list of per-window predicted class indices
votes = {}
print("Starting Test Run")

eval_model = best_model.double()
eval_model.eval()

# Inference only: no gradients needed
with torch.no_grad():
    # "batch" rather than "data", which would shadow the imported torch.utils.data module
    for i, batch in enumerate(test_loader, 0):
        inputs, labels = batch
        # add a channel axis since we only have 1 channel
        inputs = inputs.reshape((inputs.shape[0], 1, inputs.shape[1], inputs.shape[2]))

        predictions = eval_model(inputs).argmax(1)

        # Collect votes (label column 1 is the segment id)
        for segment, predicted in zip(labels[:, 1].tolist(), predictions.tolist()):
            votes.setdefault(segment, []).append(predicted)

        # Compare test vs. label. np.add.at accumulates repeated (prediction, label)
        # pairs; a fancy-indexed "+= 1" would count duplicates within a batch only once.
        np.add.at(confusion_matrix, (predictions.numpy(), labels[:, 0].numpy()), 1)
        # i + 1 so the progress display ends at 100%
        print("\r", "{0:.2f}%".format(100 * (i + 1) / len(test_loader)), end="")

# Select prediction as vote majority
correct_votes = 0
for sid in votes:
    label = dataset.labels[dataset.labels.segment_id == sid].activity_id.values[0]
    # Most frequent class index; ties resolve to the smallest index
    majority = np.bincount(votes[sid]).argmax()
    votes[sid] = [activity_arr[majority], label]
    correct_votes += int(votes[sid][0] == votes[sid][1])

print("\r", "Confusion matrix for individual windows:")
print(confusion_matrix)
print("Accuracy: {0:.2f}%".format(100 * np.trace(confusion_matrix)/np.sum(confusion_matrix)))
print("\r", "Vote prediction for entire segments:")
print(votes)
print("Accuracy: {0:.2f}%".format(100 * correct_votes/len(votes)))