# Libraries

In [1]:
import os

# import argparse
import sys
import torch
import yaml
import time
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import nn
from torch.optim.lr_scheduler import LambdaLR


from ae_dataset import make_dataset as ae_make_dataset
from ae_dataset import make_dataloader as ae_make_dataloader

from est_dataset import make_dataset as est_make_dataset
from est_dataset import make_dataloader as est_make_dataloader
from sklearn.model_selection import train_test_split


from tqdm import tqdm
from metrics import compute_pck_pckh, calculate_error
import numpy as np
from torch import nn
from torchsummary import summary


# Model


In [2]:
# Parts of this code are adapted from a dynamic-convolution (dynamic CNN) reference implementation. TODO: add the citation.

class Dynamic_conv2d(nn.Module):
    """2-D convolution whose output is an attention-weighted mix of basis kernels.

    ``n_basis_kernels`` ordinary convolution kernels are applied to the input
    in a single ``F.conv2d`` call.  An ``attention2d`` module computes softmax
    weights from the same input, and the per-kernel outputs are summed with
    those weights.

    Args:
        in_planes: number of input channels.
        out_planes: number of output channels.
        kernel_size: square kernel size (an int); also used by the attention
            branch's first Conv1d.
        stride: convolution stride.
        padding: convolution zero-padding.
        bias: if True, learn one bias vector per basis kernel.
        n_basis_kernels: number of basis kernels that are mixed.
        temperature: softmax temperature forwarded to ``attention2d``.
        pool_dim: axis the attention branch averages over; one of
            ``'freq'`` (dim 2), ``'time'`` (dim 3), ``'chan'`` (dim 1) or
            ``'both'`` (global average over dims 2 and 3).

    Raises:
        ValueError: if ``pool_dim`` is not one of the supported values.
    """

    # pool_dim values that forward() knows how to broadcast.
    _POOL_DIMS = ('freq', 'chan', 'time', 'both')

    def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, bias=False, n_basis_kernels=4,
                 temperature=31, pool_dim='freq'):
        super(Dynamic_conv2d, self).__init__()

        # Fail at construction time instead of with an unbound local in forward().
        if pool_dim not in self._POOL_DIMS:
            raise ValueError(
                "pool_dim must be one of {0}, got {1!r}".format(self._POOL_DIMS, pool_dim))

        self.in_planes = in_planes
        self.out_planes = out_planes
        self.kernel_size = kernel_size
        self.stride = stride
        self.padding = padding
        self.pool_dim = pool_dim

        self.n_basis_kernels = n_basis_kernels
        self.attention = attention2d(in_planes, self.kernel_size, self.stride, self.padding, n_basis_kernels,
                                     temperature, pool_dim)

        self.weight = nn.Parameter(torch.randn(n_basis_kernels, out_planes, in_planes, self.kernel_size, self.kernel_size),
                                   requires_grad=True)

        if bias:
            # torch.Tensor(...) returns uninitialised memory (possibly NaN/inf);
            # start the per-kernel biases from zero instead.
            self.bias = nn.Parameter(torch.zeros(n_basis_kernels, out_planes))
        else:
            self.bias = None

        # Each basis kernel is initialised independently so that the fan-in /
        # fan-out are those of a single [out, in, k, k] kernel.
        for i in range(self.n_basis_kernels):
            nn.init.kaiming_normal_(self.weight[i])

    def forward(self, x):
        """Apply the dynamic convolution.

        Args:
            x: tensor of shape [bs, in_planes, H, W].

        Returns:
            Tensor of shape [bs, out_planes, H', W'].
        """
        # Reshape the attention weights so they broadcast against the stacked
        # per-kernel output of shape [bs, n_ker, out_chan, H', W'].
        if self.pool_dim in ['freq', 'chan']:
            # attention: [bs, n_ker, L] -> [bs, n_ker, 1, 1, L]
            # NOTE(review): for 'chan' the attention branch feeds a [bs, H, W]
            # tensor to a Conv1d built for in_planes channels, so it appears to
            # work only when H == in_planes - confirm this mode is intended.
            softmax_attention = self.attention(x).unsqueeze(2).unsqueeze(3)
        elif self.pool_dim == 'time':
            # attention: [bs, n_ker, L] -> [bs, n_ker, 1, L, 1]
            softmax_attention = self.attention(x).unsqueeze(2).unsqueeze(4)
        elif self.pool_dim == 'both':
            # attention: [bs, n_ker] -> [bs, n_ker, 1, 1, 1]
            softmax_attention = self.attention(x).unsqueeze(-1).unsqueeze(-1).unsqueeze(-1)

        batch_size = x.size(0)

        # Fold the basis-kernel axis into the output-channel axis so a single
        # conv2d evaluates every basis kernel: [n_ker * out_chan, in_chan, k, k].
        aggregate_weight = self.weight.view(-1, self.in_planes, self.kernel_size, self.kernel_size)
        aggregate_bias = self.bias.view(-1) if self.bias is not None else None

        output = F.conv2d(x, weight=aggregate_weight, bias=aggregate_bias, stride=self.stride, padding=self.padding)
        # output size : [bs, n_ker * out_chan, H', W']

        output = output.view(batch_size, self.n_basis_kernels, self.out_planes, output.size(-2), output.size(-1))
        # output size : [bs, n_ker, out_chan, H', W']

        # Sanity check: the attention length must match the axis it is broadcast along.
        if self.pool_dim in ['freq', 'chan']:
            assert softmax_attention.shape[-1] == output.shape[-1]
        elif self.pool_dim == 'time':
            assert softmax_attention.shape[-2] == output.shape[-2]

        # Weighted sum over the basis-kernel axis -> [bs, out_chan, H', W'].
        output = torch.sum(output * softmax_attention, dim=1)

        return output


class attention2d(nn.Module):
    """Attention branch that produces softmax weights over the basis kernels.

    The 4-D input is averaged over the axis selected by ``pool_dim`` and then
    mapped to ``n_basis_kernels`` logits: by Conv1d -> BatchNorm1d -> ReLU ->
    Conv1d for every mode except ``'both'``, and by Linear -> ReLU -> Linear
    for ``'both'``.  The logits are divided by ``temperature`` and normalised
    with a softmax over dim 1 (the basis-kernel axis).
    """

    def __init__(self, in_planes, kernel_size, stride, padding, n_basis_kernels, temperature, pool_dim):
        super().__init__()
        self.pool_dim = pool_dim
        self.temperature = temperature

        # Bottleneck width: a quarter of the input channels, never below 4.
        hidden_planes = max(int(in_planes / 4), 4)

        if pool_dim == 'both':
            # Globally pooled input -> plain two-layer MLP.
            self.fc1 = nn.Linear(in_planes, hidden_planes)
            self.relu = nn.ReLU(inplace=True)
            self.fc2 = nn.Linear(hidden_planes, n_basis_kernels)
            return

        # One axis is kept -> 1-D convolutions along that axis.
        self.conv1d1 = nn.Conv1d(in_planes, hidden_planes, kernel_size, stride=stride, padding=padding, bias=False)
        self.bn = nn.BatchNorm1d(hidden_planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv1d2 = nn.Conv1d(hidden_planes, n_basis_kernels, 1, bias=True)
        self._init_conv_branch()

    def _init_conv_branch(self):
        """Kaiming-initialise the Conv1d layers and reset the BatchNorm1d affine terms."""
        for layer in self.modules():
            if isinstance(layer, nn.Conv1d):
                nn.init.kaiming_normal_(layer.weight, mode='fan_out', nonlinearity='relu')
                if layer.bias is not None:
                    nn.init.constant_(layer.bias, 0)
            if isinstance(layer, nn.BatchNorm1d):
                nn.init.constant_(layer.weight, 1)
                nn.init.constant_(layer.bias, 0)

    def forward(self, x):
        """Return attention weights for a 4-D input ``x`` of shape [bs, chan, H, W].

        Output shape is [bs, n_ker] for ``'both'`` and [bs, n_ker, L] otherwise,
        where L is the length left after the first Conv1d.
        """
        mode = self.pool_dim

        if mode == 'both':
            # Global average over the two spatial axes -> [bs, chan].
            pooled = F.adaptive_avg_pool2d(x, (1, 1)).squeeze(-1).squeeze(-1)
            logits = self.fc2(self.relu(self.fc1(pooled)))
        else:
            # 'freq' averages dim 2, 'time' dim 3, 'chan' dim 1; any other
            # value leaves the input un-pooled.
            # NOTE(review): 'chan' yields [bs, H, W], which only fits conv1d1
            # when H equals in_planes - confirm this mode is actually used.
            axis = {'freq': 2, 'time': 3, 'chan': 1}.get(mode)
            pooled = x if axis is None else torch.mean(x, dim=axis)
            logits = self.conv1d2(self.relu(self.bn(self.conv1d1(pooled))))

        return F.softmax(logits / self.temperature, 1)

In [3]:
# DENOISER 

# Move batch norm up front (translated from Vietnamese: "Doi Batch Norm len truoc").


#d_hidden, dropout_ae
class Encoder(nn.Module):
    """Two-stage convolutional encoder: (Conv2d -> BatchNorm2d -> ReLU -> MaxPool2d) x 2.

    Args:
        shape_data: input shape indexed as (channels, height, width).
        n_kernels: output channels of the first and second convolution.
        kernel_size: kernel sizes of the first and second convolution.
        maxpooling: kernel sizes of the first and second max-pool.

    The spatial sizes after every stage are stored as ``h_conv1``/``w_conv1``,
    ``h_pool1``/``w_pool1``, ``h_conv2``/``w_conv2`` and ``h_pool2``/``w_pool2``.
    They are computed for the fixed padding of 1 (and stride 1) that both
    convolutions use.
    """

    def __init__(self, shape_data, n_kernels, kernel_size, maxpooling):
        super().__init__()
        self.shape_data = shape_data
        self.n_kernels = n_kernels
        self.kernel_size = kernel_size
        self.maxpooling = maxpooling

        k_first, k_second = self.kernel_size[0], self.kernel_size[1]
        p_first, p_second = self.maxpooling[0], self.maxpooling[1]
        ch_first, ch_second = self.n_kernels[0], self.n_kernels[1]

        # Stage 1
        self.conv1 = nn.Conv2d(in_channels=self.shape_data[0], out_channels=ch_first,
                               kernel_size=k_first, padding=1)
        self.atv1 = nn.ReLU()
        self.batchNorm1 = nn.BatchNorm2d(num_features=ch_first)
        self.pool1 = nn.MaxPool2d(kernel_size=p_first)

        # Stage 2
        self.conv2 = nn.Conv2d(in_channels=ch_first, out_channels=ch_second,
                               kernel_size=k_second, padding=1)
        self.atv2 = nn.ReLU()
        self.batchNorm2 = nn.BatchNorm2d(num_features=ch_second)
        self.pool2 = nn.MaxPool2d(kernel_size=p_second)

        # Spatial bookkeeping (height first, then width) for each stage.
        self.h_conv1 = self._after_conv(self.shape_data[1], k_first)
        self.w_conv1 = self._after_conv(self.shape_data[2], k_first)
        self.h_pool1 = self._after_pool(self.h_conv1, p_first)
        self.w_pool1 = self._after_pool(self.w_conv1, p_first)

        self.h_conv2 = self._after_conv(self.h_pool1, k_second)
        self.w_conv2 = self._after_conv(self.w_pool1, k_second)
        self.h_pool2 = self._after_pool(self.h_conv2, p_second)
        self.w_pool2 = self._after_pool(self.w_conv2, p_second)

    @staticmethod
    def _after_conv(size, kernel):
        """Length after a stride-1, padding-1 convolution: size - kernel + 2*1 + 1."""
        return int(size - kernel + 3)

    @staticmethod
    def _after_pool(size, window):
        """Length after a max-pool whose stride equals its window."""
        return int((size - window) / window + 1)

    def forward(self, x):
        """Encode ``x`` by running both conv -> batch-norm -> ReLU -> pool stages in order."""
        stages = (
            (self.conv1, self.batchNorm1, self.atv1, self.pool1),
            (self.conv2, self.batchNorm2, self.atv2, self.pool2),
        )
        for conv, norm, activation, pool in stages:
            x = pool(activation(norm(conv(x))))
        return x


class Decoder(nn.Module):
    """Convolutional decoder that upsamples an encoded map back to the input shape.

    Three convolutions are applied.  After each of the first two, the feature
    map is resized with ``nn.Upsample`` to the spatial size that the matching
    encoder convolution produced (``h_conv2 x w_conv2``, then
    ``h_conv1 x w_conv1``).  The last convolution maps back to
    ``shape_data[0]`` channels and is followed by a ReLU.

    Args:
        shape_data: shape of the original input, indexed as (channels, height, width).
        n_kernels: channel counts of the two encoder stages.
        kernel_size: kernel sizes; index 1 is used by ``conv3``, index 0 by
            ``conv4`` and ``conv5``.
        maxpooling: pooling sizes of the two encoder stages (used only for the
            shape bookkeeping).
    """

    def __init__(self, shape_data, n_kernels, kernel_size, maxpooling):
        super().__init__()
        self.shape_data = shape_data
        self.n_kernels = n_kernels
        self.kernel_size = kernel_size
        self.maxpooling = maxpooling

        k_first, k_second = self.kernel_size[0], self.kernel_size[1]
        p_first, p_second = self.maxpooling[0], self.maxpooling[1]
        ch_first, ch_second = self.n_kernels[0], self.n_kernels[1]

        # Re-derive the spatial sizes of the encoder stages (padding 1, stride 1).
        self.h_conv1 = self._after_conv(self.shape_data[1], k_first)
        self.w_conv1 = self._after_conv(self.shape_data[2], k_first)
        self.h_pool1 = self._after_pool(self.h_conv1, p_first)
        self.w_pool1 = self._after_pool(self.w_conv1, p_first)

        self.h_conv2 = self._after_conv(self.h_pool1, k_second)
        self.w_conv2 = self._after_conv(self.w_pool1, k_second)
        self.h_pool2 = self._after_pool(self.h_conv2, p_second)
        self.w_pool2 = self._after_pool(self.w_conv2, p_second)

        # Layer 3 (hidden layer): keeps the channel count, then undoes pool2.
        self.conv3 = nn.Conv2d(in_channels=ch_second, out_channels=ch_second,
                               kernel_size=k_second, padding=1)
        self.atv3 = nn.ReLU()
        self.batchNorm3 = nn.BatchNorm2d(num_features=ch_second)
        self.up_pool3 = nn.Upsample(size=(self.h_conv2, self.w_conv2))

        # Layer 4: reduces to the first-stage channel count, then undoes pool1.
        self.conv4 = nn.Conv2d(in_channels=ch_second, out_channels=ch_first,
                               kernel_size=k_first, padding=1)
        self.atv4 = nn.ReLU()
        self.batchNorm4 = nn.BatchNorm2d(num_features=ch_first)
        self.up_pool4 = nn.Upsample(size=(self.h_conv1, self.w_conv1))

        # Layer 5 (output): back to the input channel count.
        self.conv5 = nn.Conv2d(in_channels=ch_first, out_channels=self.shape_data[0],
                               kernel_size=k_first, padding=1)
        self.atv5 = nn.ReLU()

    @staticmethod
    def _after_conv(size, kernel):
        """Length after a stride-1, padding-1 convolution: size - kernel + 2*1 + 1."""
        return int(size - kernel + 3)

    @staticmethod
    def _after_pool(size, window):
        """Length after a max-pool whose stride equals its window."""
        return int((size - window) / window + 1)

    def forward(self, x):
        """Decode ``x``; because of the final ReLU the output is non-negative."""
        # NOTE(review): the order here is conv -> ReLU -> BatchNorm, whereas
        # Encoder uses conv -> BatchNorm -> ReLU. Confirm the difference is intended.
        hidden = self.conv3(x)
        hidden = self.up_pool3(self.batchNorm3(self.atv3(hidden)))

        hidden = self.conv4(hidden)
        hidden = self.up_pool4(self.batchNorm4(self.atv4(hidden)))

        return self.atv5(self.conv5(hidden))


class Denoiser(nn.Module):
    """Convolutional auto-encoder: an ``Encoder`` followed by a ``Decoder``.

    Both halves are built from the same four configuration arguments, which
    are also kept on the instance.

    Args:
        shape_data: input shape indexed as (channels, height, width).
        n_kernels: channel counts of the two convolution stages.
        kernel_size: kernel sizes of the two convolution stages.
        maxpooling: pooling sizes of the two stages.
    """

    def __init__(self, shape_data, n_kernels, kernel_size, maxpooling):
        super().__init__()
        self.shape_data, self.n_kernels = shape_data, n_kernels
        self.kernel_size, self.maxpooling = kernel_size, maxpooling

        # Encoder and decoder share one configuration.
        shared_config = (self.shape_data, self.n_kernels, self.kernel_size, self.maxpooling)
        self.encoder = Encoder(*shared_config)
        self.decoder = Decoder(*shared_config)

    def forward(self, x):
        """Encode ``x`` and decode the result."""
        return self.decoder(self.encoder(x))


# if __name__ == '__main__':
#     test_model = AutoEncoder().to("cuda")

#     # ummary(test_model, input_size=(3, 32,136), batch_size=16, device="cuda")
#     pass

# model = AutoEncoder(shape_data = (3,32,136), n_kernels = [64,4], kernel_size = [3,3], maxpooling = [2,2]).to("cuda")
# Build the denoiser and print its layer summary.  The model is placed on the
# GPU only when one is available, so this cell no longer crashes on CPU-only hosts.
model = Denoiser(shape_data = (3,136,32), n_kernels = [26,183], kernel_size = [3,3], maxpooling = [2,2]).to(
    "cuda" if torch.cuda.is_available() else "cpu")
summary(model, (3, 136, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 26, 136, 32]             728
       BatchNorm2d-2          [-1, 26, 136, 32]              52
              ReLU-3          [-1, 26, 136, 32]               0
         MaxPool2d-4           [-1, 26, 68, 16]               0
            Conv2d-5          [-1, 183, 68, 16]          43,005
       BatchNorm2d-6          [-1, 183, 68, 16]             366
              ReLU-7          [-1, 183, 68, 16]               0
         MaxPool2d-8           [-1, 183, 34, 8]               0
           Encoder-9           [-1, 183, 34, 8]               0
           Conv2d-10           [-1, 183, 34, 8]         301,584
             ReLU-11           [-1, 183, 34, 8]               0
      BatchNorm2d-12           [-1, 183, 34, 8]             366
         Upsample-13          [-1, 183, 68, 16]               0
           Conv2d-14           [-1, 26,

In [4]:
# Keypoint Estimator 

class Estimator(nn.Module):
    """Keypoint estimator: a stack of dynamic-convolution blocks and a two-layer head.

    Each of the ``num_layers`` blocks is
    ``Dynamic_conv2d -> BatchNorm2d -> ReLU -> [Dropout] -> MaxPool2d``.
    The flattened result goes through ``Linear -> BatchNorm1d -> ReLU -> Linear``
    and is reshaped to ``[batch, 17, 2]``.

    Args:
        shape_data: input shape indexed as (channels, height, width).
        dropout: dropout probability added after each activation, or None
            for no dropout layers.
        kernel_size: per-layer kernel size of the dynamic convolutions.
        n_kernels: per-layer number of output channels.
        num_layers: number of dynamic-convolution blocks (at least 1).
        pad: per-layer zero-padding of the convolutions.
        stride: per-layer stride of the convolutions.
        maxPooling: per-layer max-pool window (the pool stride equals the window).
        n_basis_kernels: number of basis kernels in every ``Dynamic_conv2d``.
        temperature: attention softmax temperature.
        d_linear: width of the hidden fully connected layer.
        pool_dim: pooling mode forwarded to ``Dynamic_conv2d``.

    The per-layer sequences only need to provide ``num_layers`` entries;
    extra entries are ignored.  The spatial size after each convolution and
    pooling step is stored in ``h_conv``/``w_conv``/``h_pool``/``w_pool``.

    Raises:
        ValueError: if ``num_layers`` is smaller than 1, if a per-layer
            sequence has fewer than ``num_layers`` entries, or if the feature
            map is reduced to nothing before the last layer.
    """

    def __init__(self, shape_data,  
                dropout = None,
                kernel_size = [3, 3, 3,3,3],
                n_kernels =[8, 8, 8,8,8],   
                num_layers = 2,  
                pad = [1, 1, 1,1,1], #fix la 1,k doi 
                stride = [1, 1, 1,1,1], #fix la 1, k doi                              
                maxPooling = [3,3,2,2,2],
                n_basis_kernels = 5, 
                temperature = 31,
                d_linear = 128,
                pool_dim = 'time'): 
        super(Estimator, self).__init__()

        self.num_layers = num_layers  # number of dynamic-convolution blocks
        self.shape_data = shape_data
        self.dropout = dropout
        # Copy the per-layer sequences so the shared default lists in the
        # signature are never aliased by an instance.
        self.kernel_size = list(kernel_size)
        self.n_kernels = list(n_kernels)
        self.pad = list(pad)
        self.stride = list(stride)
        self.maxPooling = list(maxPooling)
        self.n_basis_kernels = n_basis_kernels

        self.temperature = temperature
        self.pool_dim = pool_dim
        self.d_linear = d_linear

        if self.num_layers < 1:
            raise ValueError("num_layers must be at least 1, got {0}".format(self.num_layers))
        per_layer = {"kernel_size": self.kernel_size, "n_kernels": self.n_kernels, "pad": self.pad,
                     "stride": self.stride, "maxPooling": self.maxPooling}
        for arg_name, values in per_layer.items():
            if len(values) < self.num_layers:
                raise ValueError("{0} needs at least {1} entries, got {2}".format(
                    arg_name, self.num_layers, len(values)))

        self.h_conv = [0]*self.num_layers
        self.w_conv = [0]*self.num_layers
        self.h_pool = [0]*self.num_layers
        self.w_pool = [0]*self.num_layers

        cnn = nn.Sequential()
        in_channels = self.shape_data[0]
        height, width = self.shape_data[1], self.shape_data[2]

        for i in range(self.num_layers):
            out_channels = self.n_kernels[i]

            cnn.add_module("conv{0}".format(i),
                           Dynamic_conv2d(in_channels,
                                          out_channels,
                                          self.kernel_size[i],
                                          self.stride[i],
                                          self.pad[i],
                                          n_basis_kernels=self.n_basis_kernels,
                                          temperature=self.temperature,
                                          pool_dim=self.pool_dim))

            # NOTE(review): in PyTorch, `momentum` is the weight given to the
            # *new* batch statistic, so 0.99 makes the running statistics follow
            # the latest batch almost entirely. If this value was carried over
            # from a framework with the opposite convention, 0.01 may have been
            # meant - confirm. Left unchanged here.
            cnn.add_module("batchNorm{0}".format(i),
                           nn.BatchNorm2d(out_channels, momentum = 0.99))

            cnn.add_module("activ{0}".format(i), nn.ReLU())

            if self.dropout is not None:
                cnn.add_module("dropout{0}".format(i), nn.Dropout(self.dropout))

            cnn.add_module("pooling{0}".format(i), nn.MaxPool2d(self.maxPooling[i]))

            # Shape bookkeeping that honours the configured padding and stride
            # (for pad=1, stride=1 this equals the former `size - k + 3`).
            self.h_conv[i] = self._conv_out(height, self.kernel_size[i], self.pad[i], self.stride[i])
            self.w_conv[i] = self._conv_out(width, self.kernel_size[i], self.pad[i], self.stride[i])
            self.h_pool[i] = self._pool_out(self.h_conv[i], self.maxPooling[i])
            self.w_pool[i] = self._pool_out(self.w_conv[i], self.maxPooling[i])

            if self.h_pool[i] < 1 or self.w_pool[i] < 1:
                raise ValueError(
                    "layer {0} reduces the feature map to {1}x{2}; use fewer layers or smaller "
                    "pooling windows".format(i, self.h_pool[i], self.w_pool[i]))

            in_channels = out_channels
            height, width = self.h_pool[i], self.w_pool[i]

        self.cnn = cnn

        flat_features = self.n_kernels[self.num_layers-1]*self.h_pool[self.num_layers-1]*self.w_pool[self.num_layers-1]
        self.fc1 = nn.Linear(in_features=flat_features, out_features=self.d_linear)
        self.batch_norm = nn.BatchNorm1d(num_features=self.d_linear)
        # 34 outputs = 17 x 2, matching the reshape in forward().
        self.fc2 = nn.Linear(in_features=self.d_linear, out_features=34)

    @staticmethod
    def _conv_out(size, kernel, padding, stride):
        """Output length of a convolution along one axis."""
        return (size + 2 * padding - kernel) // stride + 1

    @staticmethod
    def _pool_out(size, window):
        """Output length of a max-pool whose stride equals its window (floor mode)."""
        return (size - window) // window + 1

    def forward(self, x):
        """Map ``x`` of shape [batch, channels, height, width] to [batch, 17, 2].

        The trailing axes are presumably 17 keypoints with (x, y) coordinates;
        the code only fixes the numeric shape.
        """
        batch = x.shape[0]

        x = self.cnn(x)

        x = x.view(x.size(0), -1)
        x = self.fc1(x)
        x = self.batch_norm(x)
        x = nn.functional.relu(x)
        x = self.fc2(x)
        x = x.reshape(batch, 17, 2)

        return x


# Build a default estimator and print its layer summary.  The model is placed
# on the GPU only when one is available, so this cell also runs on CPU-only hosts.
model = Estimator(shape_data = (3,136,32)).to("cuda" if torch.cuda.is_available() else "cpu")

summary(model, (3, 136, 32))


----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv1d-1               [-1, 4, 136]              36
       BatchNorm1d-2               [-1, 4, 136]               8
              ReLU-3               [-1, 4, 136]               0
            Conv1d-4               [-1, 5, 136]              25
       attention2d-5               [-1, 5, 136]               0
    Dynamic_conv2d-6           [-1, 8, 136, 32]           1,080
       BatchNorm2d-7           [-1, 8, 136, 32]              16
              ReLU-8           [-1, 8, 136, 32]               0
         MaxPool2d-9            [-1, 8, 45, 10]               0
           Conv1d-10                [-1, 4, 45]              96
      BatchNorm1d-11                [-1, 4, 45]               8
             ReLU-12                [-1, 4, 45]               0
           Conv1d-13                [-1, 5, 45]              25
      attention2d-14                [-1

In [5]:

class CombinedModel(nn.Module):
    """Chain a denoiser and a predictor, back-propagating through the predictor only.

    Args:
        denoiser: module applied first; its forward pass runs under
            ``torch.no_grad()`` so no gradient reaches its parameters.
        predictor: module applied to the denoiser's output.
    """

    def __init__(self, denoiser, predictor):
        super().__init__()
        self.denoiser = denoiser      # feature extractor / denoiser stage
        self.predictor = predictor    # trainable prediction stage

    def forward(self, x):
        """Return ``predictor(denoiser(x))`` with the denoiser pass detached from autograd."""
        # NOTE(review): no_grad() stops gradients but not BatchNorm running
        # statistics; if this module is put in train() mode, any BatchNorm
        # layers inside the denoiser still update - confirm that is intended.
        with torch.no_grad():
            features = self.denoiser(x)
        return self.predictor(features)



# Assemble the combined model: the denoiser's encoder feeds the keypoint estimator.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

AE = Denoiser(shape_data = (3,136,32), n_kernels = [26,183], kernel_size = [3,3], maxpooling = [2,2])

# The estimator consumes the encoder output: (channels, height, width) after the second pooling.
CNN_input_shape = (AE.encoder.n_kernels[-1], AE.encoder.h_pool2, AE.encoder.w_pool2)

CNN = Estimator(shape_data=CNN_input_shape, 
                              dropout=0.1,
                              n_basis_kernels=16,
                              kernel_size=[2,2,2,2,2],
                              n_kernels=[16,16,16,16,16],
                              num_layers= 5,
                              # One padding value per layer (the list previously had a stray sixth entry).
                              pad=[1,1,1,1,1],
                              stride = [1,1,1,1,1],
                              maxPooling= (3,3,2,2,2),
                              temperature=30, 
                              pool_dim='time',
                              d_linear = 256
                              )

model = CombinedModel(AE.encoder, CNN)
model = model.to(device)
summary(model, (3, 136, 32))

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 26, 136, 32]             728
       BatchNorm2d-2          [-1, 26, 136, 32]              52
              ReLU-3          [-1, 26, 136, 32]               0
         MaxPool2d-4           [-1, 26, 68, 16]               0
            Conv2d-5          [-1, 183, 68, 16]          43,005
       BatchNorm2d-6          [-1, 183, 68, 16]             366
              ReLU-7          [-1, 183, 68, 16]               0
         MaxPool2d-8           [-1, 183, 34, 8]               0
           Encoder-9           [-1, 183, 34, 8]               0
           Conv1d-10               [-1, 45, 35]          16,470
      BatchNorm1d-11               [-1, 45, 35]              90
             ReLU-12               [-1, 45, 35]               0
           Conv1d-13               [-1, 16, 35]             736
      attention2d-14               [-1,

# Trainer

In [6]:
class Denoise_Trainer:
    """Train / validate / test loop for the denoiser.

    Per-epoch results are stored in ``self.metric``: integer keys hold a dict
    with ``"denoiser_train_loss"`` and/or ``"denoiser_val_loss"`` for that
    epoch, and the key ``"test"`` holds the mean test loss.

    Args:
        model: module to train; it is NOT moved to ``self.device`` here.
        train_loader, val_loader, test_loader: iterables of ``(data, label)``
            tensor batches.
        criterion: callable ``criterion(prediction, label)`` returning a scalar loss.
        optimizer: optimizer over the model parameters.
        scheduler: stored on the instance but never stepped by this class.
        model_save_path: directory that receives ``model_best.pth``.
    """

    def __init__(self, model, train_loader, val_loader, test_loader, criterion, optimizer,
                 scheduler, model_save_path="checkpoints"):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.criterion = criterion
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.model_save_path = model_save_path
        self.metric = dict()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.train_epoch_idx = 0
        self.val_epoch_idx = 0
        # Lowest validation loss seen so far; decides when the checkpoint is replaced.
        self.best_val_loss = float("inf")

    def train_epoch(self):
        """Run one optimisation pass over ``train_loader`` and record the mean loss.

        Raises:
            ZeroDivisionError: if the loader yields no batches.
        """
        # setdefault keeps anything already recorded for this epoch.
        epoch_metrics = self.metric.setdefault(self.train_epoch_idx, dict())
        self.model.train()
        losses = []

        for data, label in self.train_loader:
            data = data.to(self.device)
            label = label.to(self.device)
            predict = self.model(data)
            loss = self.criterion(predict, label)

            losses.append(loss.item())

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        epoch_metrics["denoiser_train_loss"] = sum(losses) / len(losses)

        self.train_epoch_idx += 1

    def val_epoch(self):
        """Evaluate on ``val_loader``; save a checkpoint when the loss is the best so far.

        The first validation epoch always saves.  Afterwards the checkpoint is
        replaced only when the loss beats the best loss seen so far, not
        merely the previous epoch's loss.

        Raises:
            ZeroDivisionError: if the loader yields no batches.
        """
        # Index by the *validation* epoch so the results land in the matching
        # entry and no empty entry is created for a future training epoch.
        epoch_metrics = self.metric.setdefault(self.val_epoch_idx, dict())
        self.model.eval()
        losses = []
        for data, label in self.val_loader:
            data = data.to(self.device)
            label = label.to(self.device)
            with torch.no_grad():
                predict = self.model(data)
                loss = self.criterion(predict, label)

            losses.append(loss.item())

        val_loss = sum(losses) / len(losses)
        epoch_metrics["denoiser_val_loss"] = val_loss

        improved = val_loss < self.best_val_loss
        if improved:
            self.best_val_loss = val_loss
        if improved or self.val_epoch_idx == 0:
            self.save_model()
        self.val_epoch_idx += 1

    def test_epoch(self):
        """Evaluate on ``test_loader`` and store the mean loss in ``self.metric["test"]``.

        Raises:
            ZeroDivisionError: if the loader yields no batches.
        """
        self.model.eval()
        losses = []
        for data, label in tqdm(self.test_loader):
            data = data.to(self.device)
            label = label.to(self.device)
            with torch.no_grad():
                predict = self.model(data)
                loss = self.criterion(predict, label)

            losses.append(loss.item())

        self.metric["test"] = sum(losses) / len(losses)

    def _loss_history(self, key):
        """Return the values recorded under ``key`` for every epoch, in epoch order."""
        epochs = sorted(k for k in self.metric if isinstance(k, int))
        return [self.metric[e][key] for e in epochs if key in self.metric[e]]

    def save_model(self):
        """Write model/optimizer state and the loss histories to ``model_best.pth``.

        The histories include every epoch recorded so far, including the
        validation epoch that triggered the save.  The target directory is
        created when it does not exist.
        """
        state_dict = {
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "train_loss_history": torch.tensor(self._loss_history("denoiser_train_loss")),
            "val_loss_history": torch.tensor(self._loss_history("denoiser_val_loss")),
        }

        if self.model_save_path:
            os.makedirs(self.model_save_path, exist_ok=True)
        save_path = os.path.join(self.model_save_path, "model_best.pth")
        torch.save(state_dict, save_path)


class Estimastor_Trainer:
    def __init__(self, model, train_loader, val_loader, test_loader, criterion, optimizer,
                 scheduler = None, model_save_path="checkpoints"):
        self.model = model
        self.train_loader = train_loader
        self.val_loader = val_loader
        self.test_loader = test_loader
        self.criterion = criterion
        self.optimizer = optimizer
        # self.scheduler = scheduler
        self.model_save_path = model_save_path
        self.metric = dict()
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.train_epoch_idx = 0
        self.val_epoch_idx = 0
        self.min_val_loss = 9999

    def train_epoch(self):
        # print(f"Training Estimator.  Epoch {self.train_epoch_idx}")
        self.metric[self.train_epoch_idx] = dict()
        self.model.train()
        losses = []

        for iter, (data, gt) in enumerate(self.train_loader):
            # print(data.shape)
            data = np.transpose(data, (0, 1, 3, 2))
            # print('data_shape', data.shape)
            data = data.to(self.device)
            confidence = gt[:, :, 2:].to(self.device)
            # print('shape confidence', confidence.shape)
            label = gt[:, :, 0:2].to(self.device)
            # print('shape label', label.shape)
            predict = self.model(data)
            # print('shape predict', predict.shape)

            loss = self.criterion(torch.mul(predict, confidence), torch.mul(label, confidence)) / 32

            losses.append(loss.item())
            # print(f"Iter {iter + self.train_epoch_idx * len(self.train_loader)}: MSE - {losses[-1]}")

            self.optimizer.zero_grad()
            loss.backward()
            self.optimizer.step()

        # self.scheduler.step()

        self.metric[self.train_epoch_idx]["train_loss"] = sum(losses) / len(losses)
         
        self.train_epoch_idx += 1

    def val_epoch(self):
        """Evaluate on ``val_loader``, record the metrics and save on improvement.

        The following keys are written to ``self.metric[self.val_epoch_idx]``:
        ``"val loss"``, ``"pck_50"`` ... ``"pck_5"`` (from ``compute_pck_pckh``
        at thresholds 0.5 ... 0.05), and ``"mpjpe"`` / ``"pampjpe"`` (the first
        two entries of the batch-averaged ``calculate_error`` result, times
        1000).  The same values are printed.  ``save_model`` is called on the
        first validation epoch and whenever the loss drops below
        ``self.min_val_loss``.

        Raises:
            ZeroDivisionError: if the loader yields no batches.
        """
        # Write into the entry of the epoch being validated.  The previous code
        # reset metric[train_epoch_idx], which is a different epoch once
        # train_epoch has advanced its counter.
        epoch_metrics = self.metric.setdefault(self.val_epoch_idx, dict())
        self.model.eval()

        # (metric key, threshold passed to compute_pck_pckh)
        pck_levels = (("pck_50", 0.5), ("pck_40", 0.4), ("pck_30", 0.3),
                      ("pck_20", 0.2), ("pck_10", 0.1), ("pck_5", 0.05))
        pck_iter = {name: [] for name, _ in pck_levels}
        error_iter = []
        losses = []

        for data, gt in self.val_loader:
            # Swap the last two axes with a native tensor op instead of np.transpose.
            data = data.permute(0, 1, 3, 2).to(self.device)
            label = gt[:, :, 0:2].to(self.device)       # target coordinates
            confidence = gt[:, :, 2:3].to(self.device)  # per-point weight

            with torch.no_grad():
                predict = self.model(data)
            loss = self.criterion(torch.mul(confidence, predict), torch.mul(confidence, label))
            losses.append(loss.item())

            predict = predict.cpu()
            label = label.cpu()
            error_iter.append(calculate_error(predict, label))

            # compute_pck_pckh receives the tensors with axes 1 and 2 swapped.
            predict = torch.transpose(predict, 1, 2)
            label = torch.transpose(label, 1, 2)
            for name, threshold in pck_levels:
                pck_iter[name].append(compute_pck_pckh(predict, label, threshold))

        # NOTE(review): the factor 1000 looks like a unit conversion (e.g. m -> mm); confirm.
        error = np.mean(error_iter, 0) * 1000
        loss_avg = sum(losses) / len(losses)

        epoch_metrics["val loss"] = loss_avg
        for name, _ in pck_levels:
            epoch_metrics[name] = sum(pck_iter[name]) / len(pck_iter[name])
        epoch_metrics["mpjpe"] = error[0]
        epoch_metrics["pampjpe"] = error[1]

        print(f"Val loss: {loss_avg}")
        for name, _ in pck_levels:
            print(f"val_{name}: ", epoch_metrics[name])
        print("val_mpjpe: ", error[0])
        print("val_pampjpe: ", error[1])

        # Save before advancing the counter: save_model reads the metrics of
        # metric[self.val_epoch_idx].
        if self.val_epoch_idx == 0 or loss_avg < self.min_val_loss:
            self.min_val_loss = loss_avg
            self.save_model()
        self.val_epoch_idx += 1

    def test_epoch(self):
        """Load the best checkpoint, evaluate on ``test_loader`` and store the metrics.

        The results are written to ``self.metric["test"]`` under the keys
        ``"loss"``, ``"pck_50"`` ... ``"pck_5"``, ``"mpjpe_mean"`` and
        ``"pampjpe_mean"``.

        Raises:
            FileNotFoundError: if no checkpoint file can be found.
            ZeroDivisionError: if the loader yields no batches.
        """
        # The historical hard-coded location is tried first so existing setups
        # behave as before.  When it is absent (any other machine), fall back
        # to the trainer's own checkpoint directory.
        # NOTE(review): assumes save_model writes "model_best.pth" into
        # model_save_path, as Denoise_Trainer does - confirm.
        checkpoint_path = "/home/nxhoang/Work/HPE-VinUni/src/model/checkpoints/model_best.pth"
        if not os.path.isfile(checkpoint_path):
            checkpoint_path = os.path.join(self.model_save_path, "model_best.pth")
        # map_location lets a checkpoint saved on a GPU be loaded on a CPU-only host.
        checkpoint = torch.load(checkpoint_path, map_location=self.device)
        self.model.load_state_dict(checkpoint['model_state_dict'])
        self.model.eval()

        # (metric key, threshold passed to compute_pck_pckh)
        pck_levels = (("pck_50", 0.5), ("pck_40", 0.4), ("pck_30", 0.3),
                      ("pck_20", 0.2), ("pck_10", 0.1), ("pck_5", 0.05))
        pck_iter = {name: [] for name, _ in pck_levels}
        error_iter = []
        losses = []

        for data, gt in tqdm(self.test_loader):
            # Swap the last two axes with a native tensor op instead of np.transpose.
            data = data.permute(0, 1, 3, 2).to(self.device)
            label = gt[:, :, 0:2].to(self.device)
            confidence = gt[:, :, 2:].to(self.device)

            with torch.no_grad():
                predict = self.model(data)
            loss = self.criterion(torch.mul(predict, confidence), torch.mul(label, confidence))
            losses.append(loss.item())

            predict = predict.cpu()
            label = label.cpu()
            error_iter.append(calculate_error(predict, label))

            # compute_pck_pckh receives the tensors with axes 1 and 2 swapped.
            predict = torch.transpose(predict, 1, 2)
            label = torch.transpose(label, 1, 2)
            for name, threshold in pck_levels:
                pck_iter[name].append(compute_pck_pckh(predict, label, threshold))

        # NOTE(review): the factor 1000 looks like a unit conversion (e.g. m -> mm); confirm.
        error = np.mean(error_iter, 0) * 1000

        results = dict()
        results["loss"] = sum(losses) / len(losses)
        for name, _ in pck_levels:
            results[name] = sum(pck_iter[name]) / len(pck_iter[name])
        results["mpjpe_mean"] = error[0]
        results["pampjpe_mean"] = error[1]
        self.metric["test"] = results

        

    def save_model(self):
        """Write the current model as ``model_best.pth`` in ``self.model_save_path``.

        The checkpoint dictionary contains the model and optimizer state
        dicts, the train/validation loss histories as tensors, and the PCK,
        ``mpjpe`` and ``pampjpe`` values recorded in
        ``self.metric[self.val_epoch_idx]``.
        """
        current = self.metric[self.val_epoch_idx]
        state_dict = {
            "model_state_dict": self.model.state_dict(),
            "optimizer_state_dict": self.optimizer.state_dict(),
            "train_loss_history": torch.tensor(
                [self.metric[i]["train_loss"] for i in range(self.train_epoch_idx)]
            ),
            # val_epoch() calls save_model() *before* incrementing
            # val_epoch_idx, so the current epoch's entry already exists and
            # must be included (hence the + 1).
            "val_loss_history": torch.tensor(
                [self.metric[i]["val loss"] for i in range(self.val_epoch_idx + 1)]
            ),
        }
        for key in ("pck_50", "pck_40", "pck_30", "pck_20", "pck_10", "pck_5",
                    "mpjpe", "pampjpe"):
            state_dict[key] = current[key]

        # Make sure the target directory exists so the first save cannot fail.
        if self.model_save_path:
            os.makedirs(self.model_save_path, exist_ok=True)
        save_path = os.path.join(self.model_save_path, "model_best.pth")
        torch.save(state_dict, save_path)


In [7]:
# Read the hyper-parameter file; safe_load only builds plain Python data.
with open('/home/nxhoang/Work/HPE-VinUni/src/model/configs/config_withoutBO.yaml') as config_file:
    config = yaml.safe_load(config_file)

In [8]:
def _numbered_ids(prefix, numbers):
    """Return zero-padded identifiers such as 'S01' or 'A22' as a new list."""
    return [f"{prefix}{number:02d}" for number in numbers]


def _dataset_spec(split, scenes='None', subjects='None', actions='all'):
    """Build one dataset selector; 'None' and 'all' are kept as plain strings."""
    return {'split': split, 'scenes': scenes, 'subjects': subjects, 'actions': actions}


# Dataset / split description handed to the make_dataset helpers.
other_config = {
    'modality': 'wifi-csi',
    'protocol': 'protocol2',
    'data_unit': 'frame',
    'random_split': {
        'ratio': 0.8,
        'random_seed': 42,
        'train_dataset': _dataset_spec('training'),
        'val_dataset': _dataset_spec('validation'),
    },
    'cross_scene_split': {
        'train_dataset': _dataset_spec('training', scenes=['E01', 'E02', 'E03']),
        'val_dataset': _dataset_spec('validation', scenes=['E04']),
    },
    # Every fifth subject (S05, S10, ..., S40) is held out for validation.
    'cross_subject_split': {
        'train_dataset': _dataset_spec(
            'training',
            subjects=_numbered_ids('S', (n for n in range(1, 41) if n % 5 != 0)),
        ),
        'val_dataset': _dataset_spec(
            'validation',
            subjects=_numbered_ids('S', range(5, 41, 5)),
        ),
    },
    # All forty subjects on both sides; actions A01-A21 train, A22-A27 validate.
    'manual_split': {
        'train_dataset': _dataset_spec(
            'training',
            subjects=_numbered_ids('S', range(1, 41)),
            actions=_numbered_ids('A', range(1, 22)),
        ),
        'val_dataset': _dataset_spec(
            'validation',
            subjects=_numbered_ids('S', range(1, 41)),
            actions=_numbered_ids('A', range(22, 28)),
        ),
    },
    'split_to_use': 'random_split',
    'init_rand_seed': 0,
    'data_root': '/home/nxhoang/Work/HPE-VinUni/Data',
    'AE_checkpoint': '/home/nxhoang/Work/HPE-VinUni/src/model/checkpoints/AE_model_best.pth',
    'combined_checkpoint': '/home/nxhoang/Work/HPE-VinUni/src/model/checkpoints/combined_model_best.pth',
}


## Load dataset

In [9]:
# Shape handed to Denoiser as `shape_data`.
# NOTE(review): axis meaning is presumably (channels, freq, time) — confirm against the dataset.
data_shape = (3,136,32)

In [10]:
# Build the autoencoder datasets, then split the held-out part 50/50 into
# validation and test subsets (fixed random_state for a repeatable split).
ae_train_dataset, ae_test_dataset = ae_make_dataset(other_config["data_root"], other_config)
rng_generator = torch.manual_seed(other_config['init_rand_seed'])
ae_batch_size = config['ae_batch_size']

train_loader = ae_make_dataloader(
    ae_train_dataset,
    is_training=True,
    generator=rng_generator,
    batch_size=ae_batch_size,
)

val_data, test_data = train_test_split(ae_test_dataset, test_size=0.5, random_state=41)
val_loader = ae_make_dataloader(
    val_data,
    is_training=False,
    generator=rng_generator,
    batch_size=ae_batch_size,
)
test_loader = ae_make_dataloader(
    test_data,
    is_training=False,
    generator=rng_generator,
    batch_size=ae_batch_size,
)


In [11]:
 #Initialize autoencoder denoiser
import math

# Build the autoencoder denoiser from the config hyper-parameters.
denoiser = Denoiser(
    shape_data=data_shape,
    n_kernels=(config['ae_n_kernels_1'], config['ae_n_kernels_2']),
    kernel_size=[config['ae_kernel_size_1'], config['ae_kernel_size_2']],
    maxpooling=[config['ae_maxpooling_1'], config['ae_maxpooling_2']],
)
denoiser.to(device)


# Train autoencoder denoiser
criterion_ae = nn.MSELoss().to(device)
optimizer_ae = torch.optim.RMSprop(denoiser.parameters(), lr=config['ae_lr'], momentum=config['ae_momentum'])
# optimizer_ae = torch.optim.Adam(denoiser.parameters(), lr=config['ae_lr'])

n_epochs_ae = config['ae_n_epochs']

# LambdaLR multiplies the *initial* learning rate by the returned factor:
# 1 while the scheduler's step count is below 45, exp(-0.1) afterwards.
# math.exp is required here: torch.exp only accepts tensors and would raise a
# TypeError for the Python float as soon as the count reaches 45.
schedule_ae = LambdaLR(optimizer_ae, lr_lambda=lambda epoch: 1 if epoch < 45 else math.exp(-0.1))

trainer_ae = Denoise_Trainer(denoiser, train_loader, val_loader, test_loader, criterion=criterion_ae,
                             optimizer=optimizer_ae, scheduler=schedule_ae)

# perf_counter_ns() has an arbitrary origin; only the difference between the
# two printed values (in nanoseconds) is meaningful.
print("START TRAINING DENOISER AT: ", time.perf_counter_ns())
for epoch in range(n_epochs_ae):
    trainer_ae.train_epoch()
    trainer_ae.val_epoch()
print("END TRAINING DENOISER AT", time.perf_counter_ns())
trainer_ae.test_epoch()

START TRAINING DENOISER AT:  15805580368786


 19%|█▉        | 23/121 [00:00<00:00, 222.21it/s]

END TRAINING DENOISER AT 15854150867688


100%|██████████| 121/121 [00:00<00:00, 225.61it/s]


In [12]:
 # Extract Encoder part
# Keep only the encoder of the trained denoiser, in eval mode and with
# gradients disabled for all of its parameters.
en_denoiser = denoiser.encoder
en_denoiser.eval()
for param in en_denoiser.parameters():
    param.requires_grad = False  # Freeze parameters

#================================================ Initialize estimator ======================================#
# The estimator's input shape is read from the encoder's attributes
# (last kernel count and the h/w recorded after its second pooling stage).
cnn_input_shape = (en_denoiser.n_kernels[-1], en_denoiser.h_pool2, en_denoiser.w_pool2)

# Group the per-stage hyper-parameters before passing them on.
est_kernel_sizes = (
    config['est_kernel_size_1'],
    config['est_kernel_size_2'],
    config['est_kernel_size_3'],
)
est_n_kernels = (
    config['est_n_kernels_1'],
    config['est_n_kernels_2'],
    config['est_n_kernels_3'],
)
est_maxpooling = (
    config['est_maxpooling_1'],
    config['est_maxpooling_2'],
    config['est_maxpooling_3'],
)

estimator = Estimator(
    shape_data=cnn_input_shape,
    dropout=config['est_dropout'],
    n_basis_kernels=config['est_n_basis_kernels'],
    kernel_size=est_kernel_sizes,
    n_kernels=est_n_kernels,
    num_layers=config['est_num_layers'],
    pad=[1, 1, 1, 1, 1],
    stride=[1, 1, 1, 1, 1],
    maxPooling=est_maxpooling,
    temperature=config['est_temperature'],
    pool_dim='time',
    d_linear=config['est_d_hidden'],
)

In [13]:
# Chain the frozen encoder and the estimator into one model on the target device.
combined_model = CombinedModel(en_denoiser, estimator).to(device)

# Estimator datasets: the held-out part is split 50/50 into validation and test.
est_train_dataset, est_test_dataset = est_make_dataset(other_config["data_root"], other_config)
rng_generator1 = torch.manual_seed(other_config['init_rand_seed'])
est_batch_size = config['est_batch_size']

train_loader1 = est_make_dataloader(
    est_train_dataset,
    is_training=True,
    generator=rng_generator1,
    batch_size=est_batch_size,
)
val_data1, test_data1 = train_test_split(est_test_dataset, test_size=0.5, random_state=41)
val_loader1 = est_make_dataloader(
    val_data1,
    is_training=False,
    generator=rng_generator1,
    batch_size=est_batch_size,
)
test_loader1 = est_make_dataloader(
    test_data1,
    is_training=False,
    generator=rng_generator1,
    batch_size=est_batch_size,
)

# Training combined_model with Encoder parameters being frozen
criterion_cb = nn.MSELoss().to(device)
# optimizer_cb = torch.optim.SGD(combined_model.parameters(), lr=config['est_lr'], momentum=config['est_momentum'])
optimizer_cb = torch.optim.Adam(combined_model.parameters(), lr=config['est_lr'])

n_epochs_cb = config["est_n_epochs"]

trainer_cb = Estimastor_Trainer(
    combined_model,
    train_loader1,
    val_loader1,
    test_loader1,
    criterion=criterion_cb,
    optimizer=optimizer_cb,
    scheduler=None,
)

# One training pass followed by one validation pass per epoch, then a final test run.
print("START TRAINING ESTIMATOR AT: ", time.perf_counter_ns())
for epoch in range(n_epochs_cb):
    trainer_cb.train_epoch()
    trainer_cb.val_epoch()
print("END TRAINING ESTIMATOR AT: ", time.perf_counter_ns())
trainer_cb.test_epoch()


START TRAINING ESTIMATOR AT:  15868821974072
Val loss: 0.23916999287292606
val_pck_50:  80.09322837741651
val_pck_40:  71.03313917780227
val_pck_30:  56.74216920374707
val_pck_20:  35.467932049869134
val_pck_10:  11.804580951232953
val_pck_5:  3.16953881962621
val_mpjpe:  178.76235603309067
val_pampjpe:  90.32659576216138
Val loss: 0.21971766015545266
val_pck_50:  81.49221010125359
val_pck_40:  72.9345862320338
val_pck_30:  59.525621211599386
val_pck_20:  39.13606169926986
val_pck_10:  13.63734530697525
val_pck_5:  3.630173893327824
val_mpjpe:  170.19553189394904
val_pampjpe:  88.49437643876477
