In [1]:
# https://stackoverflow.com/questions/45113245/how-to-get-mini-batches-in-pytorch-in-a-clean-and-efficient-way
# https://distill.pub/2016/misread-tsne/
# http://setosa.io/ev/principal-component-analysis/

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader

from tensorboardX import SummaryWriter

from sklearn.preprocessing import LabelEncoder

In [3]:
SESSION = 1
SPLIT = 128

EPOCHS = 200
BATCH_SIZE = 128
INPUT_SIZE = 59
ENCODE_DIM = 8
INSIDE_BATCH = False

BASE_FOLDER = Path('../data')
SESSIONS = {0: 22, 1: 153, 2: 153}

device = torch.device("cpu")

In [4]:
class GaitDataset(Dataset):
    def __init__(self, filename):
        df = pd.read_csv(BASE_FOLDER.joinpath(Path(filename)), header=None)
        y = df[df.columns[-1]].values
        df.drop([df.columns[-1]], axis=1, inplace=True)
        y = LabelEncoder().fit_transform(y)
        
        self.Xdata = df
        self.Ydata = y
        
    def __len__(self):
        return len(self.Xdata)
    
    def __getitem__(self, index):
        vector = self.Xdata.iloc[index, :].values.astype(np.float32)
        label  = self.Ydata[index]
        
        return vector, label

In [5]:
def get_csv(session, split):
    return 'zju_gaitaccel_session_' + str(session) + '_' + str(split) + '.csv'

In [6]:
train_dataset = GaitDataset(get_csv(SESSION, SPLIT))
dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)
lossloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, drop_last=True)

In [7]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        
        self.encoder = nn.Sequential(
            nn.Linear(INPUT_SIZE, 32),
            nn.ReLU(True),
            nn.Linear(32, 16),
            nn.ReLU(True),
            nn.Linear(16, ENCODE_DIM),
            nn.ReLU(True))

        self.decoder = nn.Sequential(             
            nn.Linear(ENCODE_DIM, 16),
            nn.ReLU(True),
            nn.Linear(16, 32),
            nn.ReLU(True),
            nn.Linear(32, INPUT_SIZE),
            nn.Sigmoid())

    def forward(self,x):
        x_enc = self.encoder(x)
        x_dec = self.decoder(x_enc)
        return x_dec, x_enc

In [8]:
class SeparatorLoss(nn.Module):
    def __init__(self, loader, encoder, writer):
        super(SeparatorLoss, self).__init__()
        self.loader = loader
        self.loader_iter = iter(loader)
        self.encoder = encoder
        self.pdist = nn.PairwiseDistance(p=2)
        self.writer = writer
        
    def forward(self, x_pred, x_true, encoded, labels, n_iter):    
        if INSIDE_BATCH:
            # Option A: inside current batch
            
            # Prepare similarity
            sep = torch.pdist(encoded, 2)
        else:
            # Option B: with other batch
        
            # Get a batch from the same dataset
            try:
                batch_X, batch_y = next(self.loader_iter)
            except StopIteration:
                self.loader_iter = iter(self.loader)
                batch_X, batch_y = next(self.loader_iter)
                
            # Encode it
            batch_encoded = self.encoder(batch_X)

            # Prepare same class vector
            X_labels = labels.view(-1, 1).repeat(1, BATCH_SIZE).view(1, BATCH_SIZE * BATCH_SIZE)
            batch_labels = batch_y.view(-1, 1).repeat(1, BATCH_SIZE).view(1, BATCH_SIZE * BATCH_SIZE)
            same = (X_labels == batch_labels).to(device, dtype=torch.float32)
            same = same * SESSIONS[SESSION] - 1
                
            # Prepare/repeat matrices
            dist_X = encoded.repeat(1, BATCH_SIZE).view(-1, ENCODE_DIM)
            dist_batch = batch_encoded.expand(BATCH_SIZE, BATCH_SIZE, ENCODE_DIM).reshape(-1, ENCODE_DIM)

            # Calculate pairwise distances
            sep = (self.pdist(dist_X, dist_batch) * same).sum() / BATCH_SIZE
            writer.add_scalar('data/sep', sep, n_iter)
 
        mse = F.mse_loss(x_pred, x_true)
        writer.add_scalar('data/mse', mse, n_iter)
#         print('sep: {:.4f}, mse: {:.4f}'.format(sep, mse))
        return sep + mse

In [9]:
writer = SummaryWriter(log_dir='runs/SeparatorTestEmbed')
# writer = SummaryWriter()
model = Autoencoder().cpu()
# distance = nn.MSELoss()
distance = SeparatorLoss(lossloader, model.encoder, writer)
# optimizer = torch.optim.Adam(model.parameters(), weight_decay=1e-5)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.0003, momentum = 0.9)
# print(model)

In [10]:
%%time
i = 0
for epoch in range(EPOCHS):
    for data in dataloader:
        vec, labels = data
        vec = Variable(vec, requires_grad=True).cpu()
        # ===================forward=====================
        dec, enc = model(vec)
        loss = distance(dec, vec, enc, labels, i)
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        i += 1
    # ===================log========================
    print('epoch [{}/{}], loss: {:.4f}'.format(epoch + 1, EPOCHS, loss.item()))
    writer.add_scalar('data/loss', loss.item(), epoch + 1)
    
    # Add embeddings
    if epoch % 10 == 0:
        encs = torch.Tensor([])
        labels = []
        for data in dataloader:
            vec, label = data
            vec = Variable(vec).cpu()
            _, enc = model(vec)

            encs = torch.cat((encs, enc))
            labels.extend([str(l.tolist()) for l in label])
            
        writer.add_embedding(encs, metadata=labels, global_step=epoch + 1, tag='train')
        
# writer.export_scalars_to_json('./all_scalars.json')
writer.close()

epoch [1/200], loss: 0.2537
EMBED
epoch [2/200], loss: -4.4228
epoch [3/200], loss: -0.4008
epoch [4/200], loss: -1.8963
epoch [5/200], loss: 4.0500
epoch [6/200], loss: 0.1372
epoch [7/200], loss: 0.8437
epoch [8/200], loss: 5.3576
epoch [9/200], loss: 0.4029
epoch [10/200], loss: 0.4785
epoch [11/200], loss: 3.2824
EMBED
epoch [12/200], loss: 0.2484
epoch [13/200], loss: 0.0125
epoch [14/200], loss: 1.1122
epoch [15/200], loss: -0.0567
epoch [16/200], loss: 0.5823
epoch [17/200], loss: 1.0928
epoch [18/200], loss: 0.4735
epoch [19/200], loss: -0.1486
epoch [20/200], loss: 0.3100
epoch [21/200], loss: 1.0275
EMBED
epoch [22/200], loss: 0.4330
epoch [23/200], loss: 0.3281
epoch [24/200], loss: 0.1735
epoch [25/200], loss: 0.3338
epoch [26/200], loss: 0.2358
epoch [27/200], loss: 0.3983
epoch [28/200], loss: 0.5264
epoch [29/200], loss: 0.7055
epoch [30/200], loss: 0.2485
epoch [31/200], loss: 0.4084
EMBED
epoch [32/200], loss: 0.8920
epoch [33/200], loss: 0.6730
epoch [34/200], loss: 0