In [1]:
# https://stackoverflow.com/questions/45113245/how-to-get-mini-batches-in-pytorch-in-a-clean-and-efficient-way
# https://distill.pub/2016/misread-tsne/
# http://setosa.io/ev/principal-component-analysis/
# https://stackoverflow.com/questions/50544730/how-do-i-split-a-custom-dataset-into-training-and-test-datasets

In [2]:
import numpy as np
import pandas as pd
from pathlib import Path

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler

from tensorboardX import SummaryWriter

from sklearn.preprocessing import LabelEncoder

In [3]:
SESSION = 1
SPLIT = 128
RANDOM_SEED = 42
SHUFFLE = True

VALIDATION_SPLIT = 0.2
EPOCHS = 10
BATCH_SIZE = 128
INPUT_SIZE = 59
ENCODE_DIM = 8
LOG_EMBED = False

BASE_FOLDER = Path('../data')
SESSIONS = {0: 22, 1: 153, 2: 153}

device = torch.device("cpu")

In [4]:
class GaitDataset(Dataset):
    def __init__(self, filename):
        df = pd.read_csv(BASE_FOLDER.joinpath(Path(filename)), header=None)
        y = df[df.columns[-1]].values
        df.drop([df.columns[-1]], axis=1, inplace=True)
        y = LabelEncoder().fit_transform(y)
        
        self.Xdata = df
        self.Ydata = y
        
    def __len__(self):
        return len(self.Xdata)
    
    def __getitem__(self, index):
        vector = self.Xdata.iloc[index, :].values.astype(np.float32)
        label  = self.Ydata[index]
        
        return vector, label

In [5]:
def get_csv(session, split):
    return 'zju_gaitaccel_session_' + str(session) + '_' + str(split) + '.csv'

In [6]:
gait_dataset = GaitDataset(get_csv(SESSION, SPLIT))

# Creating data indices for training and validation splits:
dataset_size = len(gait_dataset)
indices = list(range(dataset_size))
split = int(np.floor(VALIDATION_SPLIT * dataset_size))
if SHUFFLE:
    np.random.seed(RANDOM_SEED)
    np.random.shuffle(indices)
train_indices, valid_indices = indices[split:], indices[:split]

# Creating data samplers and loaders:
train_sampler = SubsetRandomSampler(train_indices)
valid_sampler = SubsetRandomSampler(valid_indices)

train_loader = DataLoader(gait_dataset, batch_size=BATCH_SIZE, 
                          sampler=train_sampler)
valid_loader = DataLoader(gait_dataset, batch_size=BATCH_SIZE,
                          sampler=valid_sampler)
lossloader = DataLoader(gait_dataset, batch_size=BATCH_SIZE, shuffle=True)

In [7]:
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        
        self.encoder = nn.Sequential(
            nn.Linear(INPUT_SIZE, 32),
            nn.ReLU(True),
            nn.Linear(32, ENCODE_DIM),
            nn.ReLU(True))

        self.decoder = nn.Sequential(             
            nn.Linear(ENCODE_DIM, 32),
            nn.ReLU(True),
            nn.Linear(32, INPUT_SIZE))

    def forward(self,x):
        x_enc = self.encoder(x)
        x_dec = self.decoder(x_enc)
        return x_dec, x_enc

In [8]:
x1 = torch.tensor([1, 2, 3])
x1_rep = x1.view(-1, 1).repeat(1, 4).view(1, 12)
x1_rep

tensor([[1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3]])

In [9]:
x2 = torch.tensor([1, 2, 3, 4])
x2_rep = x2.repeat(1, 3)
x2_rep

tensor([[1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4]])

In [10]:
(x1_rep == x2_rep).to(device, dtype=torch.int32) * 25 - 1

tensor([[24, -1, -1, -1, -1, 24, -1, -1, -1, -1, 24, -1]], dtype=torch.int32)

In [11]:
class SeparatorLoss(nn.Module):
    def __init__(self, loader, encoder):
        super(SeparatorLoss, self).__init__()
        self.loader = loader
        self.loader_iter = iter(loader)
        self.encoder = encoder
        self.pdist = nn.PairwiseDistance(p=2)
        
    def forward(self, x_pred, x_true, encoded, labels, n_iter):    
        # Get a batch from the same dataset
        try:
            batch_X, batch_y = next(self.loader_iter)
        except StopIteration:
            self.loader_iter = iter(self.loader)
            batch_X, batch_y = next(self.loader_iter)

        # Encode it
        batch_encoded = self.encoder(batch_X)
        outer_batch_size = batch_y.shape[0]
        inner_batch_size = labels.shape[0]
#         print('OUTER: {}, INNER: {}'.format(outer_batch_size, inner_batch_size))

        # Prepare same class vector
        X_labels = labels.view(-1, 1).repeat(1, outer_batch_size).view(1, inner_batch_size * outer_batch_size)
        batch_labels = batch_y.repeat(1, inner_batch_size)
        same = (X_labels == batch_labels).to(device, dtype=torch.float32)
#         print(same.sum())
        same = same * (SESSIONS[SESSION] ** 2) - 1

        # Prepare matrices
        dist_X = encoded.repeat(1, outer_batch_size).view(-1, ENCODE_DIM)
        dist_batch = batch_encoded.expand(inner_batch_size, outer_batch_size, ENCODE_DIM).reshape(-1, ENCODE_DIM)

        # Calculate pairwise distances
        # TODO: divide by BATCH_SIZE?!
        sep = (self.pdist(dist_X, dist_batch) * same).sum() / BATCH_SIZE
        if self.training:
            writer.add_scalar('data/sep', sep, n_iter)
 
        mse = F.mse_loss(x_pred, x_true)
        if self.training:
            writer.add_scalar('data/mse', mse, n_iter)
#         print('sep: {:.4f}, mse: {:.4f}'.format(sep, mse))
        return sep + mse

In [12]:
def train(epoch, batch_idx, loss_fn):
    loss_fn.train()
    losses = np.array([])
    for data in train_loader:
        vec, labels = data
        vec = Variable(vec, requires_grad=True).cpu()
        
        # ===================forward=====================
        dec, enc = model(vec)
        loss = distance(dec, vec, enc, labels, batch_idx)
        losses = np.append(losses, loss.item())
        
        # ===================backward====================
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        batch_idx += 1
    avg_loss = np.average(losses)
    
    # ===================validate========================
    loss_fn.eval()
    val_losses = np.array([])
    for data in valid_loader:
        vec, labels = data
        vec = Variable(vec, requires_grad=False).cpu()
        
        # ===================forward=====================
        dec, enc = model(vec)
        val_loss = distance(dec, vec, enc, labels, 0).item()
        val_losses = np.append(val_losses, val_loss)
    
    avg_val_loss = np.average(val_losses)
    
    # ===================log========================
    print('epoch [{}/{}], train_loss: {:.4f}, val_loss: {:.4f}'.format(epoch + 1, EPOCHS, avg_loss, avg_val_loss))
    writer.add_scalars('data/loss', {'train_loss': avg_loss,
                                     'val_loss': avg_val_loss}, 
                                     epoch + 1)
    
    # Add embeddings
    if LOG_EMBED:
        if epoch % 10 == 0:
            encs = torch.Tensor([])
            labels = []
            for data in train_loader:
                vec, label = data
                vec = Variable(vec).cpu()
                _, enc = model(vec)

                encs = torch.cat((encs, enc))
                labels.extend([str(l.tolist()) for l in label])
            
        writer.add_embedding(encs, metadata=labels, global_step=epoch + 1, tag='train')
        
    # writer.export_scalars_to_json('./all_scalars.json')
    return batch_idx

In [13]:
# writer = SummaryWriter(log_dir='runs/SeparatorTestEmbed')
writer = SummaryWriter()
model = Autoencoder().cpu()
distance = SeparatorLoss(lossloader, model.encoder)
optimizer = torch.optim.SGD(model.parameters(), lr = 0.0003, momentum = 0.9)

train_idx = 0
for epoch in range(EPOCHS):
    train_idx = train(epoch, train_idx, distance)
writer.close()

epoch [1/10], train_loss: 21.0586, val_loss: 0.2064
epoch [2/10], train_loss: 0.2100, val_loss: 0.2021
epoch [3/10], train_loss: 0.2050, val_loss: 0.1996
epoch [4/10], train_loss: 0.2044, val_loss: 0.2007
epoch [5/10], train_loss: 0.2018, val_loss: 0.1972
epoch [6/10], train_loss: 0.1989, val_loss: 0.1941
epoch [7/10], train_loss: 0.1974, val_loss: 0.1911
epoch [8/10], train_loss: 0.1956, val_loss: 0.1888
epoch [9/10], train_loss: 0.1927, val_loss: 0.1875
epoch [10/10], train_loss: 0.1905, val_loss: 0.1838
