In [1]:
import torch

print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA device: {torch.cuda.get_device_name(0)}")

CUDA available: True
CUDA device: NVIDIA GeForce RTX 3050 Ti Laptop GPU


In [2]:
import torch
import torch.nn as nn
import numpy as np

# x = torch.randn(1, 5, 2)
x = torch.from_numpy(np.array([[[1,2], [3,4], [5,6], [7, 8], [9, 10]]])).float()
print(x.shape)
print(x)

conv1d = nn.Conv1d(in_channels=5, out_channels=3, kernel_size=2)

output = conv1d(x)
print(output.shape)
print(output)

torch.Size([1, 5, 2])
tensor([[[ 1.,  2.],
         [ 3.,  4.],
         [ 5.,  6.],
         [ 7.,  8.],
         [ 9., 10.]]])
torch.Size([1, 3, 1])
tensor([[[1.4459],
         [1.3017],
         [0.9227]]], grad_fn=<ConvolutionBackward0>)


In [3]:
print(x)

tensor([[[ 1.,  2.],
         [ 3.,  4.],
         [ 5.,  6.],
         [ 7.,  8.],
         [ 9., 10.]]])


In [4]:
print(output)

tensor([[[1.4459],
         [1.3017],
         [0.9227]]], grad_fn=<ConvolutionBackward0>)


In [5]:
from scripts.parsers import parse_sequences as parse_sequence_info

file_path = 'gait3d\\ListOfSequences.txt'
sequences = parse_sequence_info(file_path)

mocap_keys = []
par_cam_keys = []
par_cam_person = set()
par_after_cloth_change_keys = []
par_after_cloth_change_person = set()

for key, params in sequences.items():
    if params['MoCap_data']:
        mocap_keys.append(key)
        if key[-1] in ["1", "3", "5", "7"]:
            par_cam_keys.append(key)
            par_cam_person.add(key[:-2])
        if key[-1] in ["5", "7"]:
            par_after_cloth_change_keys.append(key)
            par_after_cloth_change_person.add(key[:-2])

print(f"Number of sequences with mocap data: {len(mocap_keys)}")
print(f"Number of sequences with mocap data and parallel cameras: {len(par_cam_keys)}")
print(f"Number of sequences with mocap data, parallel cameras and after clothing change: {len(par_after_cloth_change_keys)}")
print(f"Number of unique participants with mocap data and parallel cameras: {len(par_cam_person)}")
print(f"Number of unique participants with mocap data, parallel cameras and after clothing change: {len(par_after_cloth_change_person)}")
par_after_cloth_change_person

Number of sequences with mocap data: 152
Number of sequences with mocap data and parallel cameras: 76
Number of sequences with mocap data, parallel cameras and after clothing change: 12
Number of unique participants with mocap data and parallel cameras: 32
Number of unique participants with mocap data, parallel cameras and after clothing change: 6


{'p26', 'p27', 'p28', 'p29', 'p30', 'p31'}

In [6]:
import random

random.seed(42)

without_clothing_change = []
while len(without_clothing_change) < 6:
    random_person = random.choice(list(par_cam_person))
    if random_person not in par_after_cloth_change_person:
        without_clothing_change.append(random_person)
        par_cam_person.remove(random_person)

with_clothing_change = []
while len(with_clothing_change) < 4:
    random_person = random.choice(list(par_after_cloth_change_person))
    with_clothing_change.append(random_person)
    par_after_cloth_change_person.remove(random_person)


test_seq_set = ([f'{p_seq}s{seq_idx}' for p_seq in without_clothing_change[:3] for seq_idx in [1, 3]] +
                [f'{p_seq}s{seq_idx}' for p_seq in with_clothing_change[:2] for seq_idx in [5, 7]])

valid_seq_set = ([f'{p_seq}s{seq_idx}' for p_seq in without_clothing_change[3:] for seq_idx in [1, 3]] +
                [f'{p_seq}s{seq_idx}' for p_seq in with_clothing_change[2:] for seq_idx in [5, 7]])

print(f"test sequences: {test_seq_set}")
print(f"valid sequences: {valid_seq_set}")
# without_clothing_change + [only_after_clothing_change] + [only_before_clothing_change]

test sequences: ['p6s1', 'p6s3', 'p19s1', 'p19s3', 'p32s1', 'p32s3', 'p29s5', 'p29s7', 'p30s5', 'p30s7']
valid sequences: ['p23s1', 'p23s3', 'p12s1', 'p12s3', 'p8s1', 'p8s3', 'p26s5', 'p26s7', 'p28s5', 'p28s7']


In [7]:
train_seq_set = ([f'{p_seq}s{seq_idx}' for p_seq in list(par_cam_person) for seq_idx in [1, 3]] +
                 [f'{p_seq}s{seq_idx}' for p_seq in list(par_after_cloth_change_person) for seq_idx in [5, 7]])

print(f"train sequences: {train_seq_set}")

train sequences: ['p17s1', 'p17s3', 'p11s1', 'p11s3', 'p25s1', 'p25s3', 'p26s1', 'p26s3', 'p20s1', 'p20s3', 'p9s1', 'p9s3', 'p10s1', 'p10s3', 'p31s1', 'p31s3', 'p22s1', 'p22s3', 'p1s1', 'p1s3', 'p13s1', 'p13s3', 'p27s1', 'p27s3', 'p24s1', 'p24s3', 'p3s1', 'p3s3', 'p5s1', 'p5s3', 'p2s1', 'p2s3', 'p14s1', 'p14s3', 'p7s1', 'p7s3', 'p21s1', 'p21s3', 'p30s1', 'p30s3', 'p29s1', 'p29s3', 'p18s1', 'p18s3', 'p4s1', 'p4s3', 'p16s1', 'p16s3', 'p15s1', 'p15s3', 'p28s1', 'p28s3', 'p31s5', 'p31s7', 'p27s5', 'p27s7']


In [8]:
for key, params in sequences.items():
    if params['MoCap_data']:
        if key[-1] in ["1", "3", "5", "7"]:
            print(f"{key} | {'train' if key in train_seq_set else '     '} | {'valid' if key in valid_seq_set else '     '} | {'test' if key in test_seq_set else '    '} |")

p1s1 | train |       |      |
p1s3 | train |       |      |
p2s1 | train |       |      |
p2s3 | train |       |      |
p3s1 | train |       |      |
p3s3 | train |       |      |
p4s1 | train |       |      |
p4s3 | train |       |      |
p5s1 | train |       |      |
p5s3 | train |       |      |
p6s1 |       |       | test |
p6s3 |       |       | test |
p7s1 | train |       |      |
p7s3 | train |       |      |
p8s1 |       | valid |      |
p8s3 |       | valid |      |
p9s1 | train |       |      |
p9s3 | train |       |      |
p10s1 | train |       |      |
p10s3 | train |       |      |
p11s1 | train |       |      |
p11s3 | train |       |      |
p12s1 |       | valid |      |
p12s3 |       | valid |      |
p13s1 | train |       |      |
p13s3 | train |       |      |
p14s1 | train |       |      |
p14s3 | train |       |      |
p15s1 | train |       |      |
p15s3 | train |       |      |
p16s1 | train |       |      |
p16s3 | train |       |      |
p17s1 | train |       |   

In [9]:
print(f"Train size: {len(train_seq_set)} | {100*len(train_seq_set)/72:.2f}%")
print(f"Test size: {len(test_seq_set)} | {100*len(test_seq_set)/72:.2f}%")
print(f"Valid size: {len(valid_seq_set)} | {100*len(valid_seq_set)/72:.2f}%")

Train size: 56 | 77.78%
Test size: 10 | 13.89%
Valid size: 10 | 13.89%


In [10]:
import json

selected_names_file = "./datasets/mediapipe/selected_joint_names.json"
input_data_file = "./datasets/mediapipe/dataset_v2.json"
output_data_file = "./datasets/mocap/dataset_v2.json"

with open(input_data_file, 'r') as file:
    raw_input = json.load(file)

with open(output_data_file, 'r') as file:
    raw_output = json.load(file)

with open(selected_names_file, 'r') as file:
    selected_names = json.load(file)

selected_names.pop('15')
selected_names.pop('16')
selected_names.pop('13')
selected_names.pop('14')
selected_names

{'27': 'lfoot',
 '28': 'rfoot',
 '25': 'ltibia',
 '26': 'rtibia',
 '23': 'lfemur',
 '24': 'rfemur',
 '11': 'lhumerus',
 '12': 'rhumerus'}

In [11]:
triang_data_file = "./datasets/mediapipe/triangulation.json"

with open(triang_data_file, 'r') as file:
    triangulation_data = json.load(file)


In [12]:
sequences['p1s1']

{'start_frame': 195,
 'number_of_frames': 135,
 'frame_offset': 0,
 'MoCap_data': True}

In [13]:
input_frames_data = {f"c{c_idx}": [] for c_idx in range(1, 5)}
output_frames_data = []
img_width = 960
img_height = 540

not_found = 0
seq_keys_list = train_seq_set + test_seq_set + valid_seq_set

for seq_key in seq_keys_list:
    for f_idx in range(sequences[seq_key]['number_of_frames']):
    # for f_idx in range(2):
        curr_output_array = []
        output_frame_dict = raw_output[seq_key][f_idx]
        for point_idx, joint_name in selected_names.items():
            curr_output_array.append(output_frame_dict[joint_name])

        curr_output_array_np = np.array(curr_output_array)
        # print(curr_output_array_np)
        
        curr_input_arrays = {f"c{c_idx}": [] for c_idx in range(1, 5)}

        all_found = True
        
        for c_idx in range(1, 5):
            input_frame_list = raw_input[seq_key][f"c{c_idx}"][str(f_idx)]
            if [None, None] in input_frame_list:
                all_found = False
                break
                
            for point_idx, joint_name in selected_names.items(): 
                pixel_coords = input_frame_list[int(point_idx)]
                curr_input_arrays[f"c{c_idx}"].append(pixel_coords)

                # curr_input_arrays[f"c{c_idx}"].append([pixel_coords[0]/img_width, pixel_coords[1]/img_height])
                # conversion from pixels to propotions if needed

        # print(curr_input_arrays)

        if all_found:
            for c_idx in range(1, 5):
                input_frames_data[f"c{c_idx}"].append(np.array(curr_input_arrays[f"c{c_idx}"]))
            #     print(np.array(curr_input_arrays[f"c{c_idx}"]).shape)

            # print(curr_output_array_np.shape)    
            output_frames_data.append(curr_output_array_np)
        else:
            not_found += 1

print(f"Frames with all found mocaps: {len(output_frames_data)}")
print(f"Frames with at least one not found mocap: {not_found}")
print(f"Proportion: {100*len(output_frames_data)/(len(output_frames_data) + not_found):.2f}%")
# print(input_frames_data['c4'][0])

Frames with all found mocaps: 6035
Frames with at least one not found mocap: 3495
Proportion: 63.33%


In [14]:
import torch
from torch.utils.data import Dataset

class MoCapInputDataset(Dataset):
    def __init__(self, seq_keys_list, sequences, selected_names, raw_input, raw_output):
        self.img_width = 960
        self.img_height = 540
        self.input_frames_data = {f"c{c_idx}": [] for c_idx in range(1, 5)}
        self.output_frames_data = []
        self.not_found = 0
              
        for seq_key in seq_keys_list:
            for f_idx in range(sequences[seq_key]['number_of_frames']):
                curr_output_array = []
                output_frame_dict = raw_output[seq_key][f_idx]
                for point_idx, joint_name in selected_names.items():
                    curr_output_array.append(output_frame_dict[joint_name])
        
                curr_output_array_np = np.array(curr_output_array)*255
                # 255 multiplier added to mocap to obtain distance in mm
                curr_input_arrays = {f"c{c_idx}": [] for c_idx in range(1, 5)}
        
                all_found = True
                
                for c_idx in range(1, 5):
                    input_frame_list = raw_input[seq_key][f"c{c_idx}"][str(f_idx)]
                    if [None, None] in input_frame_list:
                        all_found = False
                        break
                        
                    for point_idx, joint_name in selected_names.items(): 
                        pixel_coords = input_frame_list[int(point_idx)]
                        curr_input_arrays[f"c{c_idx}"].append(pixel_coords)
                        # curr_input_arrays[f"c{c_idx}"].append(
                        #     [pixel_coords[0]/self.img_width, 
                        #      pixel_coords[1]/self.img_height])
        
                if all_found:
                    for c_idx in range(1, 5):
                        self.input_frames_data[f"c{c_idx}"].append(np.array(curr_input_arrays[f"c{c_idx}"]))
 
                    self.output_frames_data.append(curr_output_array_np)
                else:
                    self.not_found += 1

        self.length = len(self.output_frames_data)

    def __len__(self):
        return self.length

    def __getitem__(self, idx):
        inputs = [torch.from_numpy(self.input_frames_data[f"c{c_idx}"][idx]).float() for c_idx in range(1, 5)]  # each: (12, 2)
        target = torch.from_numpy(self.output_frames_data[idx]).float()  # (12, 3)
        return inputs, target

In [15]:
# Optuna results
best_params = {'lr': 0.005589010994074508, 'weight_decay': 1.1906353862455155e-05, 'dropout': 0.2214033785244307, 'batch_size': 64, 'activation': 'gelu'}

In [16]:
from torch.utils.data import DataLoader

batch_size = best_params['batch_size']

train_ds = MoCapInputDataset(train_seq_set, sequences, selected_names, raw_input, raw_output)
valid_ds = MoCapInputDataset(valid_seq_set, sequences, selected_names, raw_input, raw_output)
test_ds = MoCapInputDataset(test_seq_set, sequences, selected_names, raw_input, raw_output)

train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(valid_ds, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_ds, batch_size=batch_size, shuffle=False)

In [17]:
train_ds[1]

([tensor([[0.5763, 0.7128],
          [0.6832, 0.6859],
          [0.5925, 0.6036],
          [0.6394, 0.5961],
          [0.6204, 0.4968],
          [0.6190, 0.4932],
          [0.6346, 0.3300],
          [0.6179, 0.3310]]),
  tensor([[0.4995, 0.6134],
          [0.5165, 0.6481],
          [0.4981, 0.5399],
          [0.5199, 0.5501],
          [0.4977, 0.4576],
          [0.5212, 0.4579],
          [0.4863, 0.3278],
          [0.5287, 0.3244]]),
  tensor([[0.4229, 0.6689],
          [0.3215, 0.6541],
          [0.4081, 0.5586],
          [0.3760, 0.5602],
          [0.3785, 0.4515],
          [0.3695, 0.4531],
          [0.3767, 0.2948],
          [0.3661, 0.2951]]),
  tensor([[0.5145, 0.4479],
          [0.5037, 0.4243],
          [0.5159, 0.3783],
          [0.5023, 0.3725],
          [0.5195, 0.3039],
          [0.5010, 0.3042],
          [0.5302, 0.2048],
          [0.4948, 0.2030]])],
 tensor([[ 460.8066,   78.6099,  132.6600],
         [1144.6643,  202.4733,   26.7038],
       

In [77]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class CustomNet(nn.Module):
    def __init__(self):
        super(CustomNet, self).__init__()
        dropout = 0
        # shape (8, 2) -> reshape to (2, 8) 
        self.conv1d1 = nn.ModuleList([
            nn.Conv1d(in_channels=2, out_channels=8, kernel_size=2, padding=1) for _ in range(4)
        ])
        self.conv1d2 = nn.ModuleList([
            nn.Conv1d(in_channels=8, out_channels=1, kernel_size=2) for _ in range(4)
        ])
        
        self.bn1 = nn.BatchNorm1d(32)
        self.fc1 = nn.Linear(32, 48)
        self.dropout1 = nn.Dropout(p=dropout)
        self.bn2 = nn.BatchNorm1d(48)
        self.fc2 = nn.Linear(48, 32)
        self.dropout2 = nn.Dropout(p=dropout)
        self.bn3 = nn.BatchNorm1d(32)
        self.fc3 = nn.Linear(32, 24)

    def forward(self, x):
        # x: 4 tensors of shape (batch, 8, 2)
        conv_outs = []
        for i, xi in enumerate(x):
            xi = xi.permute(0, 2, 1)  # reshape to (batch, 2, 8) 
            # conv = self.conv1d[i](xi)     # (batch, 1, 8)
            conv = self.conv1d1[i](xi)
            conv = self.conv1d2[i](conv)
            conv = conv.squeeze(1)     # (batch, 8)
            conv_outs.append(conv)

        concat = torch.cat(conv_outs, dim=1)  # (batch, 28)

        out = self.bn1(concat)
        out = F.gelu(self.bn2(self.fc1(out)))
        out = self.dropout1(out)
        out = F.gelu(self.bn3(self.fc2(out)))
        out = self.dropout2(out)
        out = self.fc3(out)  # (batch, 24)
        out = out.view(-1, 8, 3)  # reshape to (batch, 8, 3)
        return out


In [78]:
import torch
import torch.nn as nn

class MPJPE(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, predictions, targets):
        # shape (batch, 8, 3)
        # compute euclidean distance for each point pair
        distances = torch.norm(predictions - targets, dim=2)
        mean_distance = distances.mean()
        return mean_distance


In [79]:
model = CustomNet()
optimizer = torch.optim.AdamW(model.parameters(), lr=best_params['lr'], weight_decay=best_params['weight_decay'])
# criterion = torch.nn.MSELoss()
criterion = MPJPE()

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    train_loss = 0.0

    for inputs, targets in train_loader:
        inputs = [inp.float() for inp in inputs]
        targets = targets.float()

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()
        train_loss += loss.item() * targets.size(0)

    avg_train_loss = train_loss / len(train_loader.dataset)

    model.eval()
    val_loss = 0.0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs = [inp.float() for inp in inputs]
            targets = targets.float()

            outputs = model(inputs)
            loss = criterion(outputs, targets)
            val_loss += loss.item() * targets.size(0)

    avg_val_loss = val_loss / len(val_loader.dataset)

    print(f"Epoch {epoch+1}: Train MPJPE = {avg_train_loss:.4f}, Val MPJPE = {avg_val_loss:.4f}")


Epoch 1: Train MPJPE = 1618.7815, Val MPJPE = 1616.9589
Epoch 2: Train MPJPE = 1586.5022, Val MPJPE = 1571.3214
Epoch 3: Train MPJPE = 1522.8847, Val MPJPE = 1482.3061
Epoch 4: Train MPJPE = 1430.8790, Val MPJPE = 1352.7088
Epoch 5: Train MPJPE = 1314.2452, Val MPJPE = 1191.2985
Epoch 6: Train MPJPE = 1177.9182, Val MPJPE = 1048.2659
Epoch 7: Train MPJPE = 1030.5642, Val MPJPE = 885.8311
Epoch 8: Train MPJPE = 878.0416, Val MPJPE = 746.0581
Epoch 9: Train MPJPE = 742.7503, Val MPJPE = 671.9715
Epoch 10: Train MPJPE = 633.2009, Val MPJPE = 624.3766
Epoch 11: Train MPJPE = 546.8538, Val MPJPE = 527.1913
Epoch 12: Train MPJPE = 479.3043, Val MPJPE = 493.6472
Epoch 13: Train MPJPE = 410.8661, Val MPJPE = 324.2700
Epoch 14: Train MPJPE = 359.4240, Val MPJPE = 255.1743
Epoch 15: Train MPJPE = 336.0443, Val MPJPE = 249.3298
Epoch 16: Train MPJPE = 305.0816, Val MPJPE = 235.5810
Epoch 17: Train MPJPE = 281.5029, Val MPJPE = 261.3451
Epoch 18: Train MPJPE = 265.8526, Val MPJPE = 226.7297
Epoch 

In [80]:
model.eval()
test_loss = 0.0
with torch.no_grad():
    for inputs, targets in test_loader:
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        test_loss += loss.item() * targets.size(0)
test_loss /= len(test_loader.dataset)

print(f"Test loss = {test_loss:.4f}")

Test loss = 133.9710


In [81]:
def predict_single(model, inputs_list, device="cpu"):
    model.eval()
    inputs = [torch.from_numpy(inp).float().unsqueeze(0).to(device) for inp in inputs_list]
    
    with torch.no_grad():
        output = model(inputs) 
    
    return output.squeeze(0).cpu().numpy()

In [82]:
list(raw_output[test_seq_set[0]][0].values())

[[8.542352676391602, 3.7042770385742188, 0.4601781368255615],
 [8.733415603637695, 2.050076961517334, 0.3684786558151245],
 [8.922904014587402, 0.40957820415496826, 0.27750110626220703],
 [8.619648933410645, 3.694295883178711, -0.6643123030662537],
 [8.59278678894043, 2.0004396438598633, -0.4504869282245636],
 [8.83590316772461, 0.44832444190979004, -0.34511709213256836],
 [8.306090354919434, 5.630198001861572, 0.5446023941040039],
 [8.643503189086914, 4.524703025817871, 0.8668829202651978],
 [8.496593475341797, 3.8861234188079834, 0.7753106355667114],
 [8.430327415466309, 5.595487594604492, -0.6955798864364624],
 [8.854900360107422, 4.437718391418457, -0.8193653225898743],
 [8.724615097045898, 3.779153823852539, -0.794681966304779]]

In [83]:
test_seq = test_seq_set[2]
print(test_seq)
frame = 90

bvh_sample_data = list(raw_output[test_seq][frame].values())
triangulation_sample_all_data = triangulation_data[test_seq][frame]
triangulation_sample_data = [triangulation_sample_all_data[int(j_idx)] for j_idx in selected_names.keys()]
print(bvh_sample_data)
print()
print(triangulation_sample_data)

p19s1
[[-7.520870685577393, 3.7407069206237793, 0.9504961967468262], [-7.368579387664795, 2.096409797668457, 0.7714738249778748], [-7.216275691986084, 0.45193934440612793, 0.5923908352851868], [-7.585234642028809, 3.6390230655670166, -0.24816995859146118], [-8.271810531616211, 2.0671019554138184, -0.2194407731294632], [-8.13848876953125, 0.6275912523269653, -0.049029216170310974], [-7.470393657684326, 5.788812160491943, 0.9987093806266785], [-7.426072120666504, 4.562010765075684, 1.3400733470916748], [-7.77842903137207, 3.962614059448242, 1.3499772548675537], [-7.44651985168457, 5.737953186035156, -0.41605842113494873], [-7.228679656982422, 4.530491828918457, -0.6374579668045044], [-7.36409330368042, 3.844463348388672, -0.6903193593025208]]

[[170.03086081600878, -1844.6219957094079, 130.88436771912143], [17.02853537503636, -1944.996260944025, 163.4218074608411], [186.44276651431238, -1878.2360880658093, 503.01631521967727], [-41.32293539639176, -2011.1317421384556, 523.3997112667856],

In [84]:
img_width = 960
img_height = 540

mp_input_sample = []

for c_idx in range(1, 5):
    all_frames_for_camera = raw_input[test_seq][f"c{c_idx}"][str(frame)]
    camera_mp_input_sample = []
    
    for point_idx, joint_name in selected_names.items(): 
        pixel_coords = all_frames_for_camera[int(point_idx)]
        camera_mp_input_sample.append(pixel_coords)
        # camera_mp_input_sample.append([pixel_coords[0]/img_width, pixel_coords[1]/img_height])

    mp_input_sample.append(np.array(camera_mp_input_sample))
    
mp_input_sample

[array([[0.20789899, 0.7136336 ],
        [0.20637479, 0.68947649],
        [0.19463433, 0.603342  ],
        [0.19536977, 0.59146768],
        [0.18979931, 0.49922755],
        [0.20980054, 0.49864772],
        [0.17801417, 0.34159309],
        [0.21851808, 0.343532  ]]),
 array([[0.50281024, 0.52718848],
        [0.51541841, 0.52079028],
        [0.50254595, 0.47092509],
        [0.51958108, 0.46615925],
        [0.50304776, 0.41416296],
        [0.51714069, 0.41404751],
        [0.49629128, 0.33432806],
        [0.52493501, 0.33382931]]),
 array([[0.77794898, 0.65248728],
        [0.80610824, 0.65606916],
        [0.7854656 , 0.55615252],
        [0.82918954, 0.56113464],
        [0.78692073, 0.45205042],
        [0.80421245, 0.45428443],
        [0.78226632, 0.3010366 ],
        [0.81265831, 0.30195358]]),
 array([[0.52748507, 0.60908717],
        [0.50190574, 0.61400777],
        [0.53221762, 0.51364052],
        [0.49170411, 0.51178759],
        [0.53315943, 0.38555178],
        

In [85]:
predicted = predict_single(model, mp_input_sample, 'cpu')
predicted

array([[-1683.3273  ,   155.53522 ,   170.442   ],
       [-1914.3871  ,   123.715096,    38.835804],
       [-1839.3185  ,   531.7227  ,   215.95714 ],
       [-1953.0719  ,   525.3908  ,    -8.404432],
       [-1834.1058  ,   943.9152  ,   252.93167 ],
       [-1835.9868  ,   941.33215 ,   -62.155464],
       [-1819.9902  ,  1415.7582  ,   306.61847 ],
       [-1802.6725  ,  1410.7279  ,   -97.65706 ]], dtype=float32)

In [86]:
import plotly.graph_objects as go
import plotly.io as pio
pio.renderers.default = 'iframe'

SCALE_FACTOR = 255

x = [vec[2]*SCALE_FACTOR for vec in bvh_sample_data]
y = [vec[0]*SCALE_FACTOR for vec in bvh_sample_data]
z = [vec[1]*SCALE_FACTOR for vec in bvh_sample_data]

# x_t = [vec[0]/SCALE_FACTOR for vec in triangulation_sample_data]
# y_t = [vec[1]/SCALE_FACTOR for vec in triangulation_sample_data]
# z_t = [vec[2]/SCALE_FACTOR for vec in triangulation_sample_data]

x_t = [vec[0] for vec in triangulation_sample_data]
y_t = [vec[1] for vec in triangulation_sample_data]
z_t = [vec[2] for vec in triangulation_sample_data]

x_p = [vec[2] for vec in predicted]
y_p = [vec[0] for vec in predicted]
z_p = [vec[1] for vec in predicted]
    
fig = go.Figure(
    data=[
        go.Scatter3d(
            x=x, y=y, z=z,
            mode='markers',
            marker=dict(size=5, color='blue'),
            hoverinfo='text',
            name='Joints BVH'),
        go.Scatter3d(
            x=x_t, y=y_t, z=z_t,
            mode='markers',
            marker=dict(size=5, color='red'),
            hoverinfo='text',
            name='Joints triangulation mediapipe'),
        go.Scatter3d(
            x=x_p, y=y_p, z=z_p,
            mode='markers',
            marker=dict(size=5, color='green'),
            hoverinfo='text',
            name='Predicted by NN'),
        ]
)

fig.update_layout(scene=dict(
    xaxis_title='X',
    yaxis_title='Y',
    zaxis_title='Z',
    xaxis=dict(range=[-6000, 6000]),
    yaxis=dict(range=[-6000, 6000]),
    zaxis=dict(range=[-6000, 6000]),
    aspectmode='cube', 
),
title='3D joints plot from bvh file',
width=800,
height=800
)

fig.show()