In [4]:
import os
import pickle
import random
import time
from itertools import combinations

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F


In [2]:
# Load the processed activity data into `data`.
# NOTE(review): pickle.load can execute arbitrary code from the file -- only load trusted files.
# NOTE(review): the path is machine-specific; consider moving it to a configurable constant.
with open('/home/ngr/gdrive/wearables/data/processed/MOD_1000_Woman_Activity_Data.pkl', 'rb') as f:
    # the `with` block closes the file on exit, so no explicit f.close() is needed
    data = pickle.load(f)


In [7]:
# peek at three randomly drawn keys of `data`
np.random.choice(list(data), 3)

array(['1455-32-NA', '2312-7-1', '2124-8-1'], dtype='<U15')

In [27]:
# run one record's second element through a freshly initialised LSTM, shaped as (seq_len, batch=1, features)
lstm_out, hidden = nn.LSTM(input_size=1, hidden_size=64)(
    torch.FloatTensor(np.array(list(map(float, data['1455-32-NA'][1]))))
    .view(len(data['1455-32-NA'][1]), 1, -1)
)

In [36]:
# project the 64 LSTM features of every time step down to 2 and inspect the resulting shape
nn.Linear(in_features=64, out_features=2)(lstm_out).shape

torch.Size([20219, 1, 2])

In [26]:
# apply LayerNorm to the same record, reshaped to (seq_len, 1, 1)
# NOTE(review): the normalised axis has size 1 here, which would explain the near-zero values in the output -- confirm this is intended
nn.LayerNorm(normalized_shape=1)(
    torch.FloatTensor(np.array(list(map(float, data['1455-32-NA'][1])))).view(-1, 1, 1)
)

tensor([[[-0.0002]],

        [[-0.0007]],

        [[-0.0007]],

        ...,

        [[-0.0027]],

        [[ 0.0007]],

        [[ 0.0061]]], grad_fn=<NativeLayerNormBackward>)

In [None]:
class BieberLSTM(nn.Module):
    """Toy LSTM sequence tagger over a small fixed vocabulary, with padding-aware loss.

    Pipeline: token ids -> embedding -> (packed) LSTM -> linear projection -> log-softmax
    over the non-padding tags.

    Tag/column convention: the ``<PAD>`` tag (id 0) has no output column, so real tag id
    ``t`` (1..nb_tags) corresponds to output column ``t - 1``. Add 1 to an ``argmax`` over
    the last axis of the output to recover a tag id.

    Args:
        nb_layers: number of stacked LSTM layers.
        nb_lstm_units: hidden size of the LSTM.
        embedding_dim: size of the token embeddings.
        batch_size: default batch size used by ``init_hidden()`` when none is given.
    """

    def __init__(self, nb_layers, nb_lstm_units=100, embedding_dim=3, batch_size=3):
        # nn.Module must be initialised before any sub-module can be registered
        super().__init__()

        self.vocab = {'<PAD>': 0, 'is': 1, 'it': 2, 'too': 3, 'late': 4, 'now': 5, 'say': 6, 'sorry': 7, 'ooh': 8,
                      'yeah': 9}
        self.tags = {'<PAD>': 0, 'VB': 1, 'PRP': 2, 'RB': 3, 'JJ': 4, 'NNP': 5}

        self.nb_layers = nb_layers
        self.nb_lstm_units = nb_lstm_units
        self.embedding_dim = embedding_dim
        self.batch_size = batch_size

        # don't count the padding tag for the classifier output
        self.nb_tags = len(self.tags) - 1

        # build actual NN
        self.__build_model()

    def __build_model(self):
        """Create the embedding, LSTM and output projection layers."""
        # whenever the embedding sees the padding index it'll make the whole vector zeros
        self.word_embedding = nn.Embedding(
            num_embeddings=len(self.vocab),
            embedding_dim=self.embedding_dim,
            padding_idx=self.vocab['<PAD>']
        )

        # design LSTM (batch_first: inputs/outputs are (batch, seq, feature))
        self.lstm = nn.LSTM(
            input_size=self.embedding_dim,
            hidden_size=self.nb_lstm_units,
            num_layers=self.nb_layers,
            batch_first=True,
        )

        # output layer which projects back to tag space
        self.hidden_to_tag = nn.Linear(self.nb_lstm_units, self.nb_tags)

    def init_hidden(self, batch_size=None):
        """Return a fresh random ``(h_0, c_0)`` pair for the LSTM.

        Args:
            batch_size: batch dimension of the state; defaults to ``self.batch_size``.

        Returns:
            Tuple of two tensors of shape ``(nb_layers, batch_size, nb_lstm_units)``,
            created on the same device/dtype as the model weights.
        """
        if batch_size is None:
            batch_size = self.batch_size

        # follow the model's own placement instead of relying on an external on_gpu flag
        weight = self.word_embedding.weight
        shape = (self.nb_layers, batch_size, self.nb_lstm_units)
        hidden_a = torch.randn(shape, device=weight.device, dtype=weight.dtype)
        hidden_b = torch.randn(shape, device=weight.device, dtype=weight.dtype)

        return (hidden_a, hidden_b)

    def forward(self, X, X_lengths):
        """Compute per-token log-probabilities over the non-padding tags.

        Args:
            X: integer token ids of shape ``(batch_size, seq_len)``; a trailing singleton
                axis ``(batch_size, seq_len, 1)`` is also accepted and squeezed away.
            X_lengths: true (unpadded) length of each sequence. ``pack_padded_sequence``
                is called with its default ``enforce_sorted=True``, so lengths must be in
                decreasing order.

        Returns:
            Tensor of shape ``(batch_size, seq_len, nb_tags)`` holding log-softmax scores.
        """
        # the embedding expects (batch, seq); drop a trailing feature axis of size 1
        if X.dim() == 3:
            X = X.squeeze(-1)
        batch_size, seq_len = X.shape

        # reset the LSTM hidden state. Must be done before you run a new batch. Otherwise the LSTM will treat
        # a new batch as a continuation of a sequence. Sized from the actual batch, not the configured one.
        self.hidden = self.init_hidden(batch_size)

        # ---------------------
        # 1. embed the input
        # Dim transformation: (batch_size, seq_len) -> (batch_size, seq_len, embedding_dim)
        X = self.word_embedding(X)

        # ---------------------
        # 2. Run through RNN
        # Dim transformation: (batch_size, seq_len, embedding_dim) -> (batch_size, seq_len, nb_lstm_units)

        # pack_padded_sequence so that padded items in the sequence won't be shown to the LSTM
        X = torch.nn.utils.rnn.pack_padded_sequence(X, X_lengths, batch_first=True)

        # now run through LSTM
        X, self.hidden = self.lstm(X, self.hidden)

        # undo the packing operation; total_length keeps the original padded length even when the
        # longest sequence in this batch is shorter than seq_len
        X, _ = torch.nn.utils.rnn.pad_packed_sequence(X, batch_first=True, total_length=seq_len)

        # ---------------------
        # 3. Project to tag space
        # Dim transformation: (batch_size, seq_len, nb_lstm_units) -> (batch_size * seq_len, nb_lstm_units)
        X = X.reshape(-1, X.shape[2])

        # run through actual linear layer
        X = self.hidden_to_tag(X)

        # ---------------------
        # 4. Create softmax activations bc we're doing classification
        # Dim transformation: (batch_size * seq_len, nb_tags) -> (batch_size, seq_len, nb_tags)
        X = F.log_softmax(X, dim=1)
        Y_hat = X.view(batch_size, seq_len, self.nb_tags)

        return Y_hat

    def loss(self, Y_hat, Y, X_lengths):
        """Masked negative log-likelihood averaged over the non-padding tokens.

        Args:
            Y_hat: log-probabilities from ``forward``, ``(batch_size, seq_len, nb_tags)``.
            Y: integer tag ids with the same leading shape as ``Y_hat``; 0 marks padding.
            X_lengths: unused; kept for interface compatibility.

        Returns:
            Scalar tensor. Zero when the batch contains no non-padding token.
        """
        # flatten ALL sequences into one long sequence and compute the loss on that

        # flatten all the labels
        Y = Y.reshape(-1)

        # flatten all predictions
        Y_hat = Y_hat.reshape(-1, self.nb_tags)

        # create a mask by filtering out all tokens that ARE NOT the padding token
        tag_pad_token = self.tags['<PAD>']
        mask = (Y > tag_pad_token).to(Y_hat.dtype)

        # count how many tokens we have; clamp so an all-padding batch does not divide by zero
        nb_tokens = mask.sum().clamp(min=1.0)

        # <PAD> (id 0) has no output column, so real tag t lives in column t - 1.
        # Padding positions are clamped to column 0 and then zeroed out by the mask.
        class_idx = (Y - 1).clamp(min=0)
        rows = torch.arange(Y_hat.shape[0], device=Y_hat.device)

        # pick the values for the label and zero out the rest with the mask
        Y_hat = Y_hat[rows, class_idx] * mask

        # compute cross entropy loss which ignores all <PAD> tokens
        ce_loss = -torch.sum(Y_hat) / nb_tokens

        return ce_loss

In [81]:
# exclude sequences with fewer than 1000 time steps

In [82]:
# length of the second element of every record, keyed by pid
seq_lengths = {pid: len(record[1]) for pid, record in data.items()}
print('Sequences min_n_t: {}\tmax_n_t: {}'.format(min(seq_lengths.values()), max(seq_lengths.values())))

# collect the pids whose sequence has fewer than 1000 entries, then drop them from data (in place)
exclude_pids = [pid for pid, n_t in seq_lengths.items() if n_t < 1000]
for pid in exclude_pids:
    del data[pid]

Sequences min_n_t: 14	max_n_t: 131072


In [83]:
# recompute the lengths to report the shortest and longest sequence currently in data
seq_lengths = {pid: len(record[1]) for pid, record in data.items()}
print('Sequences min_n_t: {}\tmax_n_t: {}'.format(min(seq_lengths.values()), max(seq_lengths.values())))

Sequences min_n_t: 1262	max_n_t: 131072


In [117]:
## data loader

In [114]:
class actigraphydata(torch.utils.data.Dataset):
    """Map-style dataset over a dict of per-participant records.

    Each item is ``{'x': <transformed sequence>, 'y': <integer label tensor>}`` where the
    label is parsed from the second dash-separated field of the record's key
    (e.g. ``'1455-32-NA'`` -> 32).
    """

    def __init__(self, datapkl, max_seq_length, transform=None):
        """
        Args:
            datapkl (dict): Maps a pid string of the form ``'<id>-<int>-<suffix>'`` to a
                record accepted by the transform.
            max_seq_length (int): Forwarded to the transform as ``max_length``.
            transform (callable, optional): Called as
                ``transform(record, max_length=max_seq_length)`` and must return a pair
                whose second element becomes ``'x'``. Defaults to the module-level
                ``transform`` function, looked up when an item is fetched.
        """
        self.datapkl = datapkl
        self.max_seq_length = max_seq_length
        self.transform_fn = transform
        # snapshot the keys once so indexing is O(1) instead of rebuilding the list per item;
        # keys added to or removed from datapkl afterwards are not reflected
        self.pids = list(datapkl.keys())

    def __len__(self):
        """Number of records captured at construction time."""
        return len(self.pids)

    def __getitem__(self, idx):
        """Return the transformed sequence and integer label for the idx-th pid."""
        pid = self.pids[idx]
        # NOTE(review): GA presumably stands for gestational age -- confirm
        GA = int(pid.split('-')[1])
        preprocess = self.transform_fn if self.transform_fn is not None else transform
        t, xt = preprocess(self.datapkl[pid], max_length=self.max_seq_length)
        sample = {'x': xt, 'y': torch.tensor(GA)}
        return sample

In [115]:
# wrap the filtered records in a dataset (sequences cut to 1200 steps) and batch them with shuffling
dataset = actigraphydata(datapkl=data, max_seq_length=1200)
dataloader = torch.utils.data.DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [165]:
def transform(input_seq, max_length=1200):
    """Truncate one record and log-transform its activity values.

    Args:
        input_seq: indexable record; element 0 is the time-of-day sequence and
            element 1 is an iterable of values convertible to float.
        max_length: maximum number of leading entries to keep from each element.

    Returns:
        Tuple ``(tod, activity)`` where ``tod`` is the sliced element 0 and ``activity``
        is a float32 tensor of shape ``(n, 1)`` holding ``log(value + 1)``.
    """
    time_of_day = input_seq[0][:max_length]

    # convert every value to float first, then keep only the leading max_length entries
    values = np.array(list(map(float, input_seq[1])))
    column = torch.FloatTensor(values[:max_length]).view(-1, 1)

    # pseudocount of 1 keeps zero values finite under the log
    log_activity = (column + 1).log()

    return (time_of_day, log_activity)

class EmbeddingNet(nn.Module):
    """LayerNorm -> Conv1d -> LSTM -> Linear network over fixed-length univariate sequences.

    Args:
        hidden_layer_size: number of conv channels and LSTM hidden units.
        max_seq_length: exact length of every input sequence.
        output_size: size of the per-time-step output vector.
    """

    def __init__(self, hidden_layer_size, max_seq_length, output_size):
        super().__init__()
        self.max_seq_length = max_seq_length
        self.hidden_layer_size = hidden_layer_size

        # normalise each sequence over its time axis; normalising a size-1 feature axis would
        # map every value to (x - x) / sqrt(eps) = 0 and discard the input entirely
        self.LN = nn.LayerNorm(max_seq_length)
        self.conv1d = nn.Conv1d(1, hidden_layer_size, 5, 1)
        self.lstm = nn.LSTM(hidden_layer_size, hidden_layer_size)
        self.fc = nn.Linear(hidden_layer_size, output_size)

        # final (h_n, c_n) of the most recent forward pass, detached; None until forward is called
        self.hidden_cell = None

    def forward(self, input_seq):
        """Embed a batch of sequences.

        Args:
            input_seq: tensor reshapeable to ``(batch, 1, max_seq_length)``, e.g.
                ``(batch, max_seq_length, 1)``.

        Returns:
            Tensor of shape ``((max_seq_length - 4) * batch, output_size)``, rows ordered
            time-major (row ``t * batch + b`` is time step ``t`` of sample ``b``). The
            ``- 4`` comes from the kernel-size-5, stride-1, unpadded convolution.
        """
        # (batch, 1, T): one channel, time on the last axis for both LayerNorm and Conv1d
        xt = input_seq.reshape(-1, 1, self.max_seq_length)
        xt = self.LN(xt)

        # (batch, hidden, T - 4)
        xt = self.conv1d(xt)

        # the LSTM is not batch_first, so it wants (time, batch, feature); this needs a real
        # axis permutation -- a plain view would reinterpret memory and mix samples/channels
        xt = xt.permute(2, 0, 1)

        # no initial state is passed, so the LSTM starts from zeros sized for this batch;
        # batches are independent, nothing is carried over from the previous call
        lstm_out, hidden_cell = self.lstm(xt)
        self.hidden_cell = tuple(state.detach() for state in hidden_cell)

        return self.fc(lstm_out.reshape(-1, self.hidden_layer_size))

In [166]:
# Pull one mini-batch so that `x` exists on a fresh kernel (no other cell defines it).
# NOTE(review): assumes `x` was the 'x' entry of a dataloader batch, which matches the recorded shape -- confirm.
x = next(iter(dataloader))['x']
x.shape

torch.Size([32, 1200, 1])

In [167]:
# push the batch through a freshly initialised network and inspect the output shape
EmbeddingNet(hidden_layer_size=32, max_seq_length=1200, output_size=2)(x).shape

torch.Size([38272, 2])

In [None]:
class TripletSelector:
    """
    Interface for triplet mining strategies.

    Implementations return the indices of anchor, positive and negative samples
    as an array-like of shape [N_triplets x 3].
    """

    def __init__(self):
        pass

    def get_triplets(self, embeddings, labels):
        """Return triplet indices for a batch; must be overridden by subclasses."""
        raise NotImplementedError


class AllTripletSelector(TripletSelector):
    """
    Returns all possible triplets
    May be impractical in most cases
    """

    def __init__(self):
        super(AllTripletSelector, self).__init__()

    def get_triplets(self, embeddings, labels):
        """Enumerate every (anchor, positive, negative) index triplet in the batch.

        Args:
            embeddings: unused by this selector; kept for the common interface.
            labels: 1-D tensor of class labels, one per sample.

        Returns:
            LongTensor of shape ``(N_triplets, 3)``. For each label (in ascending order)
            every unordered pair of same-label indices is combined with every index of a
            different label. Shape ``(0, 3)`` when no triplet can be formed.
        """
        labels = labels.detach().cpu().numpy()
        triplets = []
        # np.unique gives a sorted, deterministic label order
        for label in np.unique(labels):
            label_mask = (labels == label)
            label_indices = np.where(label_mask)[0]
            if len(label_indices) < 2:
                # a single sample cannot form an anchor-positive pair
                continue
            negative_indices = np.where(np.logical_not(label_mask))[0]

            # All anchor-positive pairs, each combined with all negatives
            for anchor, positive in combinations(label_indices, 2):
                triplets.extend([anchor, positive, neg_ind] for neg_ind in negative_indices)

        if not triplets:
            # keep the documented [N x 3] shape even when nothing was found
            return torch.empty((0, 3), dtype=torch.long)

        return torch.LongTensor(np.array(triplets))