In [1]:
import numpy as np 
import torch 
from torch import nn, Tensor
import torch.nn.functional as F

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [3]:
class BiLSTMEncoder(nn.Module):
    def __init__(self,seq_len, input_size, hidden_size,linear_filters,embedding_size:int, num_layers = 1,bidirectional=True,batch_size=32):
        super(BiLSTMEncoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.linear_filters = linear_filters
        self.embedding_size = embedding_size
        self.bidirectional = bidirectional
        self.batch_size = batch_size
        self.seq_len = seq_len

        # define LSTM layer
        self.layers = []

        # add linear layers 
        for __id,layer_out in enumerate(self.linear_filters):
            if __id == 0:
                self.layers.append(nn.Linear(self.input_size, layer_out))
            else:
                self.layers.append(nn.Linear(self.linear_filters[__id-1], layer_out))

        # add lstm layer
        self.lstm = nn.LSTM(input_size = layer_out, hidden_size = self.hidden_size,
                            num_layers = self.num_layers, bidirectional=self.bidirectional,
                            batch_first=True)
        
        self.net = nn.Sequential(*self.layers)

        #add embedding out
        if bidirectional:
            self.out_linear = nn.Linear(self.hidden_size*4, self.embedding_size)
        else:
            self.out_linear = nn.Linear(self.hidden_size*2, self.embedding_size)

        
    def forward(self, x_input):
        '''
        : param x_input:               input of shape (seq_len, # in batch, input_size)
        : return lstm_out, hidden:     lstm_out gives all the hidden states in the sequence; hidden gives the hidden state and cell state for the last element in the sequence                         
        '''
        
        x = self.net(x_input)
        lstm_out, self.hidden = self.lstm(x)
        hidden_transformed = torch.concat(self.hidden,0)
        hidden_transformed = torch.transpose(hidden_transformed,0,1)
        hidden_transformed = torch.flatten(hidden_transformed,start_dim=1)
        
        hidden_transformed = self.out_linear(hidden_transformed)
        
        return lstm_out, hidden_transformed

    
class BiLSTMDecoder(nn.Module):
    def __init__(self,seq_len, input_size, hidden_size, linear_filters,embedding_size:int, num_layers = 1,bidirectional=True,batch_size=32, device='cpu'):
        super(BiLSTMDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.linear_filters = linear_filters[::-1]
        self.embedding_size = embedding_size
        self.bidirectional = bidirectional
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.device = device

        if bidirectional:
            self.input_linear = nn.Linear(self.embedding_size,4*self.hidden_size)
        else:
            self.input_linear = nn.Linear(self.embedding_size,2*self.hidden_size)

        # define LSTM layer
        self.layers = []
        # add lstm
        self.lstm = nn.LSTM(input_size = self.linear_filters[0], hidden_size = self.hidden_size,
                            num_layers = self.num_layers, bidirectional=True,
                            batch_first=bidirectional)

                        
        # add linear layers 
        if bidirectional:
            self.layers.append(nn.Linear(2*hidden_size,self.linear_filters[0]))
        else:
            self.layers.append(nn.Linear(hidden_size,self.linear_filters[0]))

        for __id,layer_in in enumerate(self.linear_filters):
            if __id == len(linear_filters)-1:
                self.layers.append(nn.Linear(layer_in,self.input_size))
            else:
                self.layers.append(nn.Linear(layer_in,self.linear_filters[__id+1]))

        self.net = nn.Sequential(*self.layers)

        
        

    def forward(self,encoder_hidden):
        '''
        : param x_input:               input of shape (seq_len, # in batch, input_size)
        : return lstm_out, hidden:     lstm_out gives all the hidden states in the sequence; hidden gives the hidden state and cell state for the last element in the sequence                         
        '''
        
        
        hidden_shape = encoder_hidden.shape
        encoder_hidden = self.input_linear(encoder_hidden)
        
        if self.bidirectional:
            hidden = encoder_hidden.view((self.batch_size,4,self.hidden_size))
            # print(hidden.shape)
            hidden = torch.transpose(hidden,1,0)
            h1,h2,c1,c2 = torch.unbind(hidden,0)
            h,c = torch.stack((h1,h2)),torch.stack((c1,c2))
        else:
            hidden = encoder_hidden.view((self.batch_size,2,self.hidden_size))
            hidden = torch.transpose(hidden,1,0)
            h,c = torch.unbind(hidden,0)
        
        dummy_input = torch.rand((self.batch_size,self.seq_len,self.hidden_size), requires_grad=True)
        dummy_input = dummy_input.to(self.device)
        
        lstm_out, self.hidden = self.lstm(dummy_input,(h,c))
        x = self.net(lstm_out)
        
        return x

class BiLSTMEncDecModel(nn.Module):
    def __init__(self,seq_len, input_size, hidden_size, linear_filters=[128,256,512],embedding_size:int=256, num_layers = 1,bidirectional=True, batch_size=32, device='cpu'):
        super(BiLSTMEncDecModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.linear_filters = linear_filters[::-1]
        self.embedding_size = embedding_size
        self.bidirectional = bidirectional
        self.batch_size = batch_size
        self.seq_len = seq_len
        
        self.encoder = BiLSTMEncoder(seq_len, input_size, hidden_size, linear_filters,embedding_size, num_layers = 1,bidirectional=True,batch_size=32)
        self.decoder = BiLSTMDecoder(seq_len, input_size, hidden_size, linear_filters,embedding_size, num_layers = 1,bidirectional=True,batch_size=32, device=device)
        
    def forward(self,x):
        lstm_out,embedding = self.encoder(x)
        decoder_out = self.decoder(embedding)
        
        return decoder_out, embedding  

In [4]:
ae_model = BiLSTMEncDecModel(seq_len=50, input_size=36, hidden_size=512, linear_filters=[128,256,512], embedding_size=256, num_layers=1,bidirectional=True,batch_size=32, device=device)
prep_dir = '../tmp/random_input_100_epochs.pt'

ae_model.load_state_dict(torch.load(prep_dir))

<All keys matched successfully>

In [5]:
skeleton_data = np.load('../data/skeleton_k10_v7_movements.npz')
skeleton_classes, skeleton_mov = skeleton_data['arr_0'], skeleton_data['arr_1']

In [6]:
from collections import defaultdict

label_map = [(1, 'lying'),
 (2, 'sitting'),
 (3, 'standing'),
 (4, 'walking'),
 (5, 'running'),
 (6, 'cycling'),
 (7, 'Nordic walking'),
 (9, 'watching TV'),
 (10, 'computer work'),
 (11, 'car driving'),
 (12, 'ascending stairs'),
 (13, 'descending stairs'),
 (16, 'vacuum cleaning'),
 (17, 'ironing'),
 (18, 'folding laundry'),
 (19, 'house cleaning'),
 (20, 'playing soccer'),
 (24, 'rope jumping')]

label2Id = {c[1]:i for i,c in enumerate(label_map)}
action_dict = defaultdict(list)
skeleton_Ids = []
for i, a in enumerate(skeleton_classes):
    action_dict[label2Id[a]].append(i)
    skeleton_Ids.append(label2Id[a])

In [7]:
action_dict

defaultdict(list,
            {10: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
             9: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19],
             8: [20, 21, 22, 23, 24, 25, 26, 27, 28, 29],
             5: [30, 31, 32, 33, 34, 35, 36, 37, 38, 39],
             11: [40, 41, 42, 43, 44, 45, 46, 47, 48, 49],
             14: [50, 51, 52, 53, 54, 55, 56, 57, 58, 59],
             15: [60, 61, 62, 63, 64, 65, 66, 67, 68, 69],
             13: [70, 71, 72, 73, 74, 75, 76, 77, 78, 79],
             0: [80, 81, 82, 83, 84, 85, 86, 87, 88, 89],
             6: [90, 91, 92, 93, 94, 95, 96, 97, 98, 99],
             16: [100, 101, 102, 103, 104, 105, 106, 107, 108, 109],
             17: [110, 111, 112, 113, 114, 115, 116, 117, 118, 119],
             4: [120, 121, 122, 123, 124, 125, 126, 127, 128, 129],
             1: [130, 131, 132, 133, 134, 135, 136, 137, 138, 139],
             2: [140, 141, 142, 143, 144, 145, 146, 147, 148, 149],
             12: [150, 151, 152, 153, 154, 155, 156, 157, 158, 15

In [8]:
def get_class_ft(data, model, device, bs=32):
    ns, _, _ = data.shape 
    padded_mat = F.pad(input=data, pad=(0,0,0,0,0,bs-ns), mode='constant', value=0)
    _, vector_out = model(padded_mat.float().to(device)) # batch second mode
    action_feat_mat = vector_out[:ns, :].cpu().detach().numpy()
    return action_feat_mat

In [9]:
ae_model = ae_model.to(device)

action_ft_dict = {a: get_class_ft(torch.from_numpy(skeleton_mov[i, ...]), ae_model, device) for a,i in action_dict.items()}

In [10]:
action_ft_dict[10].shape

(10, 256)

In [11]:
from random import sample, choice

In [12]:
def positive_sampling(label, k=10):
    total_samples = action_ft_dict[label]
    n, _ = total_samples.shape 
    sample_idx = sample(range(n), k)
    samples = total_samples[sample_idx, :]
    return torch.from_numpy(samples) 

In [13]:
positive_sampling(10, 5).shape

torch.Size([5, 256])

In [14]:
def negative_sampling(label, k=10):
    neg_action = set(action_ft_dict.keys())-{label}
    neg_sample_action = sample(neg_action, k)
    sample_point = choice(range(10)) # has to change
    neg_samples = np.array([action_ft_dict[a][sample_point, :] for a in neg_sample_action])
    neg_samples = torch.from_numpy(neg_samples)
    return neg_samples

In [15]:
negative_sampling(10, 5).shape

torch.Size([5, 256])

In [16]:
disMet = nn.CosineSimilarity(dim=1, eps=1e-6)
pos_thr = 0.05
neg_thr = 0.85

In [17]:
input1 = torch.randn(1, 128)
input2 = torch.randn(100, 128)
cos = nn.CosineSimilarity(dim=1, eps=1e-6)
output = cos(input1, input2)
output.shape

torch.Size([100])

In [18]:
def postive_distance(pred_vector, pos_vectors, agg='mean'):
    pos_distances = disMet(pred_vector, pos_vectors)
    soft_distance = F.relu(torch.abs(pos_distances)-Tensor([pos_thr]))
    if agg == "mean":
        return soft_distance.mean()
    else:
        return soft_distance.sum()

In [19]:
pred_vector = torch.randn((1, 256))

In [20]:
positive_sampling(10, 10).shape

torch.Size([10, 256])

In [21]:
postive_distance(pred_vector, positive_sampling(10, 10))

tensor(0.0175)

In [22]:
def negative_distance(pred_vector, neg_vectors, agg="mean"):
    neg_distances = disMet(pred_vector, neg_vectors)
    soft_distance = F.relu(Tensor([neg_thr])-torch.abs(neg_distances))
    if agg == "mean":
        return soft_distance.mean()
    else:
        return soft_distance.sum()

In [23]:
negative_distance(pred_vector, negative_sampling(10, 10))

tensor(0.8037)

---

In [24]:
import numpy as np 
import torch 
import torch.nn.functional as F 
from torch import nn, Tensor 

from random import sample, choice
from collections import defaultdict
from copy import deepcopy

In [33]:
class ActionTripletLoss(nn.Module):
    # def __init__(self, action_feats, label_map, pos_thr=0.05, neg_thr=0.95, agg="mean"):
    def __init__(self, action_feats, distance='cosine', k=10, pos_thr=0.05, neg_thr=0.95, agg="mean", device="cpu"):
        super(ActionTripletLoss, self).__init__()
        self.action_feats = deepcopy(action_feats)
        self.get_disMet(distance)
        self.k = k
        self.pos_thr = pos_thr
        self.neg_thr = neg_thr
        self.agg = agg 
        self.device = device 

    def get_disMet(self, distance):
        if distance == 'cosine':
            self.disMet = nn.CosineSimilarity(dim=1, eps=1e-6)
        else:
            self.disMet = nn.PairwiseDistance(p=2)

    def positive_sampling(self, label, k=10):
        total_samples = self.action_feats[label]
        self.n, _ = total_samples.shape 
        sample_idx = sample(range(self.n), k)
        samples = total_samples[sample_idx, :]
        return torch.from_numpy(samples) 

    def negative_sampling(self, label, k=10):
        neg_action = set(self.action_feats.keys())-{label}
        neg_sample_action = sample(neg_action, k)
        sample_point = choice(range(self.n)) # has to change
        neg_samples = np.array([self.action_feats[a][sample_point, :] for a in neg_sample_action])
        neg_samples = torch.from_numpy(neg_samples)
        return neg_samples

    def postive_distance(self, pred_vector, pos_vectors, agg='mean'):
        pos_distances = self.disMet(pred_vector, pos_vectors)
        soft_distance = F.relu(torch.abs(pos_distances)-Tensor([self.pos_thr]))
        if agg == "mean":
            return soft_distance.mean()
        else:
            return soft_distance.sum()

    
    def negative_distance(self, pred_vector, neg_vectors, agg="mean"):
        neg_distances = self.disMet(pred_vector, neg_vectors)
        soft_distance = F.relu(Tensor([self.neg_thr])-torch.abs(neg_distances))
        if agg == "mean":
            return soft_distance.mean()
        else:
            return soft_distance.sum()

    def forward(self, pred_fts, labels):
        pos_loss, neg_loss = 0, 0
        for i,l in enumerate(labels):
            ft = pred_fts[i, ...]
            pos_vectors = self.positive_sampling(l, self.k)
            neg_vectors = self.negative_sampling(l, self.k)
            pos_loss += self.postive_distance(ft, pos_vectors, agg=self.agg)
            neg_loss += self.negative_distance(ft, neg_vectors, agg=self.agg)

        pos_loss = pos_loss/len(labels)
        neg_loss = neg_loss/len(labels)
        return pos_loss+neg_loss


In [34]:
skeleton_data = np.load('../data/skeleton_k10_v7_movements.npz')
skeleton_classes, skeleton_mov = skeleton_data['arr_0'], skeleton_data['arr_1']

In [35]:
label_map = [(1, 'lying'),
 (2, 'sitting'),
 (3, 'standing'),
 (4, 'walking'),
 (5, 'running'),
 (6, 'cycling'),
 (7, 'Nordic walking'),
 (9, 'watching TV'),
 (10, 'computer work'),
 (11, 'car driving'),
 (12, 'ascending stairs'),
 (13, 'descending stairs'),
 (16, 'vacuum cleaning'),
 (17, 'ironing'),
 (18, 'folding laundry'),
 (19, 'house cleaning'),
 (20, 'playing soccer'),
 (24, 'rope jumping')]

label2Id = {c[1]:i for i,c in enumerate(label_map)}
action_dict = defaultdict(list)
skeleton_Ids = []
for i, a in enumerate(skeleton_classes):
    action_dict[label2Id[a]].append(i)
    skeleton_Ids.append(label2Id[a])

In [36]:
def get_class_ft(data, model, device, bs=32):
    ns, _, _ = data.shape 
    padded_mat = F.pad(input=data, pad=(0,0,0,0,0,bs-ns), mode='constant', value=0)
    _, vector_out = model(padded_mat.float().to(device)) # batch second mode
    action_feat_mat = vector_out[:ns, :].cpu().detach().numpy()
    return action_feat_mat

In [37]:
ae_model = ae_model.to(device)
action_ft_dict = {a: get_class_ft(torch.from_numpy(skeleton_mov[i, ...]), ae_model, device) for a,i in action_dict.items()}

In [38]:
trpLoss =ActionTripletLoss(action_ft_dict, distance='cosine', k=10, pos_thr=0.90, neg_thr=0.05, agg="mean", device=device)

In [39]:
pred_batch = torch.randn(8, 256)
true_actions = [1, 4, 3, 6, 10, 5, 3, 1]

In [40]:
loss = trpLoss(pred_batch, true_actions)
loss

tensor(0.0157)

In [41]:
loss.grad_fn

---

In [1]:
import numpy as np 

import torch 
import torch.nn.functional as F 
from torch import nn, Tensor 

In [12]:
class ActionTripletLoss(nn.Module):
    # def __init__(self, action_feats, label_map, pos_thr=0.05, neg_thr=0.95, agg="mean"):
    def __init__(self, distance='cosine', k=10, pos_thr=0.05, neg_thr=0.95, theta=1e-4, agg="mean", device="cpu"):
        super(ActionTripletLoss, self).__init__()
        self.distance = distance
        self.get_disMet(distance)
        self.k = k
        self.pos_thr = pos_thr
        self.neg_thr = neg_thr
        self.agg = agg 
        self.device = device 
        self.theta = theta

    def get_disMet(self, distance):
        if distance == 'cosine':
            self.disMet = nn.CosineSimilarity(dim=1, eps=1e-6)
        else:
            self.disMet = nn.PairwiseDistance(p=2)

    def forward(self, pred_fts, pos_fts, neg_fts):
        neg_distances = self.disMet(pred_fts, neg_fts).to(self.device)
        pos_distances = self.disMet(pred_fts, pos_fts).to(self.device)

        if self.distance == 'cosine':
            neg_distances = torch.abs(neg_distances)
            pos_distances = torch.abs(pos_distances)

        triplet_loss = F.relu(pos_distances-neg_distances+self.theta)
        return triplet_loss.mean()


In [13]:
loss_func = ActionTripletLoss(distance='cosine')

In [14]:
pred_fts = torch.randn((32, 128))
pos_fts = torch.randn((32, 128))
neg_fts = torch.randn((32, 128))

loss_func(pred_fts, pos_fts, neg_fts)

tensor(0.0353)

In [10]:
pred_fts+pos_fts

RuntimeError: The size of tensor a (32) must match the size of tensor b (320) at non-singleton dimension 0

---

In [20]:
pred_fts = torch.randn((32, 128))
pos_fts = torch.randn((10, 32, 128))

mat = nn.PairwiseDistance(p=2)
mat(pred_fts, pos_fts).shape

torch.Size([10, 32])

In [1]:
import numpy as np 

import torch 
import torch.nn.functional as F 
from torch import nn, Tensor 

In [2]:
class ActionTripletLoss(nn.Module):
    # def __init__(self, action_feats, label_map, pos_thr=0.05, neg_thr=0.95, agg="mean"):
    def __init__(self, distance='cosine', k=10, pos_thr=0.05, neg_thr=0.95, theta=1e-4, agg="mean", device="cpu"):
        super(ActionTripletLoss, self).__init__()
        self.distance = distance
        self.get_disMet(distance)
        self.k = k
        self.pos_thr = pos_thr
        self.neg_thr = neg_thr
        self.agg = agg 
        self.device = device 
        self.theta = theta

    def get_disMet(self, distance):
        if distance == 'cosine':
            self.disMet = nn.CosineSimilarity(dim=2, eps=1e-6)
        else:
            self.disMet = nn.PairwiseDistance(p=2)

    def forward(self, pred_fts, pos_fts, neg_fts):
        neg_distances = self.disMet(pred_fts, neg_fts).to(self.device)
        pos_distances = self.disMet(pred_fts, pos_fts).to(self.device)

        if self.distance == 'cosine':
            neg_distances = torch.abs(neg_distances)
            pos_distances = torch.abs(pos_distances)

        triplet_loss = F.relu(pos_distances-neg_distances+self.theta)
        return triplet_loss.mean()

In [3]:
loss_func = ActionTripletLoss(distance='cosine')

In [4]:
pred_fts = torch.randn((32, 128))
pos_fts = torch.randn((10, 32, 128))
neg_fts = torch.randn((10, 32, 128))

loss_func(pred_fts, pos_fts, neg_fts)

tensor(0.0323)