# Basic Prototype 2 - Paperspace only


Builds on Basic Prototype 1, adds yolov8 pose detection. Landmarks will be saved in a datastructure which then will be visualised and probably exported as a JSON so that it can be trained on Google Colab. This prototype focuses on testing if an AI model can be trained to interpolate and output the correct landmarks to continue the video sequence after being trained on the sequences of the videos.

Pose detection wise, Openpose is better since it has more landmarks and also tracks the foot, it is also based on C++ so it's quicker, but I cannot get openpose to work yet, so I will implement using YOLOv8 for now just to test the AI model structure. 

Mediapipe is also a possibility but it can only detect 1 person at a time and is much slower. 

Still the broadcast on OBS has to be started manually first.

In [15]:
import glob
files = glob.glob('/datasets/dance_prototype_2/*')
print(files)

['/datasets/dance_prototype_2/idle_bob_2.MOV', '/datasets/dance_prototype_2/output.MOV']


In [16]:
!pip install pytchat
!pip install opencv-python
!pip install glob2
!pip install ultralytics
!pip install torch
!pip install numpy
!pip install pandas
!pip install keras==2.3.1
!pip install tensorflow==2.10.1
!pip install tensorflow-probability==0.18.0
!pip install keras-mdn-layer


[0mCollecting keras==2.3.1
  Using cached Keras-2.3.1-py2.py3-none-any.whl (377 kB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: keras 2.10.0
    Uninstalling keras-2.10.0:
      Successfully uninstalled keras-2.10.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow 2.10.1 requires keras<2.11,>=2.10.0, but you have keras 2.3.1 which is incompatible.[0m[31m
[0mSuccessfully installed keras-2.3.1
Collecting keras<2.11,>=2.10.0
  Using cached keras-2.10.0-py2.py3-none-any.whl (1.7 MB)
Installing collected packages: keras
  Attempting uninstall: keras
    Found existing installation: Keras 2.3.1
    Uninstalling Keras-2.3.1:
      Successfully uninstalled Keras-2.3.1
Successfully installed keras-2.10.0
[0m

In [17]:
# imports

import pytchat
import cv2
import glob
import ultralytics
import torch
import time
import numpy as np
from collections import defaultdict
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
import mdn
import random
import tensorflow as tf

files = glob.glob('/datasets/dance_prototype_2/*')
print(files)


['/datasets/dance_prototype_2/idle_bob_2.MOV', '/datasets/dance_prototype_2/output.MOV']


In [18]:
from PIL import Image

def draw_skeleton(result):
    """draw frame from YOLOv8 results"""
    for r in result:
        im_array = r.plot()  # plot a BGR numpy array of predictions
        im = Image.fromarray(im_array[..., ::-1])  # RGB PIL image
    return np.array(im)[..., ::-1]  # Convert PIL Image back to BGR numpy array

In [14]:
# test YOLOv8 pose recognition with 1 file first

"""Check hardware and load model"""


# Check if GPU is available otherwise use CPU for torch
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# load model
video_path = files[1]
model = ultralytics.YOLO('yolov8n-pose.pt')

# If GPU is available set model to use half-precision floating-point numbers
if torch.cuda.is_available():
    model.to(device)


# predict

cap = cv2.VideoCapture(video_path)
start_time = time.time()

# resize cv2 window

# Get the original video dimensions
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))

# Define the desired width and calculate the height to maintain the aspect ratio
desired_width = 540  # You can change this value
aspect_ratio = height / width
desired_height = int(desired_width * aspect_ratio)

# Get video, pose analyse and display pose detection frame by frame
pose_results = defaultdict(int)
i=0

while(cap.isOpened() and time.time()):
    ret, frame = cap.read()
    if ret == True:
        result = model.predict(frame)
        pose_results[i] = result
        i = i+1
        annotated_frame = draw_skeleton(result)
        # Resize the frame while maintaining the aspect ratio
        resized_frame = cv2.resize(annotated_frame, (desired_width, desired_height))
        # no showing the frame because cloud
        # cv2.imshow('Frame', resized_frame)
        
        # # Press Q on keyboard to exit
        # if cv2.waitKey(25) & 0xFF == ord('q'):
        #     break
    else:
        break

cap.release()
cv2.destroyAllWindows 





0: 640x384 1 person, 8.1ms
Speed: 1.9ms preprocess, 8.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 8.4ms
Speed: 2.1ms preprocess, 8.4ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 7.3ms
Speed: 1.9ms preprocess, 7.3ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 7.3ms
Speed: 1.8ms preprocess, 7.3ms inference, 1.0ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 7.2ms
Speed: 1.8ms preprocess, 7.2ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 6.9ms
Speed: 1.8ms preprocess, 6.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 7.1ms
Speed: 1.7ms preprocess, 7.1ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x384 1 person, 7.0ms
Speed: 1.8ms preprocess, 7.0ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 384)

0: 640x

<function destroyAllWindows>

In [19]:
def define_keypoints():
    return {
    "nose": 0,
    "left_eye": 1,
    "right_eye": 2,
    "left_ear": 3,
    "right_ear": 4,
    "left_shoulder": 5,
    "right_shoulder": 6,
    "left_elbow": 7,
    "right_elbow": 8,
    "left_wrist": 9,
    "right_wrist": 10,
    "left_hip": 11,
    "right_hip": 12,
    "left_knee": 13,
    "right_knee": 14,
    "left_ankle": 15,
    "right_ankle": 16
}

In [20]:
from collections import defaultdict

# LSTM - adapted from AI for media 
# https://git.arts.ac.uk/tbroad/AI-4-Media-22-23/blob/main/Week%205.1%20LSTM%20for%20forecasting%20and%20movement%20generation/Generating_Movement_Sequences_with_LSTM.ipynb

# define keypoints
keypoints = {
    "nose": 0,
    "left_eye": 1,
    "right_eye": 2,
    "left_ear": 3,
    "right_ear": 4,
    "left_shoulder": 5,
    "right_shoulder": 6,
    "left_elbow": 7,
    "right_elbow": 8,
    "left_wrist": 9,
    "right_wrist": 10,
    "left_hip": 11,
    "right_hip": 12,
    "left_knee": 13,
    "right_knee": 14,
    "left_ankle": 15,
    "right_ankle": 16
}


# convert tensor of xy coordinate per frame into a pandas time series
keypoints_dict_master = defaultdict(dict)

for frame, results in pose_results.items():
    for idx, person in enumerate(results):
        # Check if person has the attribute 'keypoints' and it has the attribute 'xy'
        if hasattr(person, 'keypoints') and hasattr(person.keypoints, 'xy'):
            tensor_values = person.keypoints.xy
            
            # Convert tensor to dictionary format
            keypoint_coordinates = {key: tensor_values[0][value] for key, value in keypoints.items() if value < len(tensor_values[0])}
            
            # Use idx as a unique identifier for each person
            keypoints_dict_master[frame][idx] = keypoint_coordinates


# GAN - Temporal GANs

In [21]:
# https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html
# https://github.com/pfnet-research/tgan2
# prototyping with this one for now because it seems easy
# https://github.com/amunozgarza/tsb-gan

In [22]:
# actually going to just try adapt a simple DCGAN from pytorch tutorials first because I'm super confused
# https://pytorch.org/tutorials/beginner/dcgan_faces_tutorial.html

#%matplotlib inline
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML

# Set random seed for reproducibility
manualSeed = 999
#manualSeed = random.randint(1, 10000) # use if you want new results
print("Random Seed: ", manualSeed)
random.seed(manualSeed)
torch.manual_seed(manualSeed)
torch.use_deterministic_algorithms(True) # Needed for reproducible results

Random Seed:  999


In [23]:
# Data preparation
# convert to pandas of each keypoint coordinate for each frame

# Flatten the dictionary
rows = []
for frame, persons in keypoints_dict_master.items():
    for person, keypoints in persons.items():
        row = {'frame': frame, 'person': person}
        for keypoint, coordinates in keypoints.items():
            row[f'{keypoint}_x'] = coordinates[0]
            row[f'{keypoint}_y'] = coordinates[1]
        rows.append(row)

# Convert to DataFrame
df = pd.DataFrame(rows)

df

Unnamed: 0,frame,person,nose_x,nose_y,left_eye_x,left_eye_y,right_eye_x,right_eye_y,left_ear_x,left_ear_y,...,right_hip_x,right_hip_y,left_knee_x,left_knee_y,right_knee_x,right_knee_y,left_ankle_x,left_ankle_y,right_ankle_x,right_ankle_y
0,0,0,"tensor(548.0327, device='cuda:0')","tensor(253.8700, device='cuda:0')","tensor(581.6564, device='cuda:0')","tensor(223.7671, device='cuda:0')","tensor(507.9041, device='cuda:0')","tensor(227.8313, device='cuda:0')","tensor(628.7388, device='cuda:0')","tensor(260.6475, device='cuda:0')",...,"tensor(434.2474, device='cuda:0')","tensor(925.0669, device='cuda:0')","tensor(577.7192, device='cuda:0')","tensor(1317.8802, device='cuda:0')","tensor(438.9311, device='cuda:0')","tensor(1312.3335, device='cuda:0')","tensor(548.4530, device='cuda:0')","tensor(1634.7609, device='cuda:0')","tensor(450.7137, device='cuda:0')","tensor(1627.8787, device='cuda:0')"
1,1,0,"tensor(546.6844, device='cuda:0')","tensor(253.2305, device='cuda:0')","tensor(580.1163, device='cuda:0')","tensor(222.3893, device='cuda:0')","tensor(506.0245, device='cuda:0')","tensor(226.8354, device='cuda:0')","tensor(627.4680, device='cuda:0')","tensor(258.5272, device='cuda:0')",...,"tensor(433.1236, device='cuda:0')","tensor(927.5045, device='cuda:0')","tensor(577.2100, device='cuda:0')","tensor(1318.4789, device='cuda:0')","tensor(438.3355, device='cuda:0')","tensor(1311.6891, device='cuda:0')","tensor(549.2289, device='cuda:0')","tensor(1634.0977, device='cuda:0')","tensor(450.1472, device='cuda:0')","tensor(1625.4445, device='cuda:0')"
2,2,0,"tensor(546.6228, device='cuda:0')","tensor(252.8871, device='cuda:0')","tensor(579.6549, device='cuda:0')","tensor(222.0839, device='cuda:0')","tensor(505.6246, device='cuda:0')","tensor(226.8332, device='cuda:0')","tensor(626.6255, device='cuda:0')","tensor(259.0842, device='cuda:0')",...,"tensor(433.4555, device='cuda:0')","tensor(926.9253, device='cuda:0')","tensor(576.4531, device='cuda:0')","tensor(1318.7943, device='cuda:0')","tensor(438.6046, device='cuda:0')","tensor(1311.5265, device='cuda:0')","tensor(550.3529, device='cuda:0')","tensor(1634.6509, device='cuda:0')","tensor(451.1976, device='cuda:0')","tensor(1625.0098, device='cuda:0')"
3,3,0,"tensor(544.1322, device='cuda:0')","tensor(252.4958, device='cuda:0')","tensor(578.2780, device='cuda:0')","tensor(221.6393, device='cuda:0')","tensor(503.2679, device='cuda:0')","tensor(226.2783, device='cuda:0')","tensor(627.1838, device='cuda:0')","tensor(259.2163, device='cuda:0')",...,"tensor(433.1123, device='cuda:0')","tensor(926.6111, device='cuda:0')","tensor(576.8198, device='cuda:0')","tensor(1317.3354, device='cuda:0')","tensor(437.7052, device='cuda:0')","tensor(1310.5420, device='cuda:0')","tensor(550.5244, device='cuda:0')","tensor(1633.7761, device='cuda:0')","tensor(452.7463, device='cuda:0')","tensor(1624.3308, device='cuda:0')"
4,4,0,"tensor(541.8792, device='cuda:0')","tensor(252.8985, device='cuda:0')","tensor(576.3433, device='cuda:0')","tensor(222.9055, device='cuda:0')","tensor(501.8237, device='cuda:0')","tensor(226.7444, device='cuda:0')","tensor(625.3895, device='cuda:0')","tensor(261.7505, device='cuda:0')",...,"tensor(433.0430, device='cuda:0')","tensor(925.6976, device='cuda:0')","tensor(576.6772, device='cuda:0')","tensor(1317.0862, device='cuda:0')","tensor(437.0998, device='cuda:0')","tensor(1309.3882, device='cuda:0')","tensor(549.8689, device='cuda:0')","tensor(1632.8353, device='cuda:0')","tensor(453.4622, device='cuda:0')","tensor(1622.8385, device='cuda:0')"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,259,0,"tensor(577.7954, device='cuda:0')","tensor(227.9273, device='cuda:0')","tensor(609.9917, device='cuda:0')","tensor(195.5345, device='cuda:0')","tensor(539.4564, device='cuda:0')","tensor(202.7698, device='cuda:0')","tensor(664.3377, device='cuda:0')","tensor(226.4945, device='cuda:0')",...,"tensor(495.2859, device='cuda:0')","tensor(912.8978, device='cuda:0')","tensor(647.8175, device='cuda:0')","tensor(1296.1439, device='cuda:0')","tensor(508.0186, device='cuda:0')","tensor(1288.1680, device='cuda:0')","tensor(632.4800, device='cuda:0')","tensor(1611.2267, device='cuda:0')","tensor(527.8955, device='cuda:0')","tensor(1597.7391, device='cuda:0')"
260,260,0,"tensor(576.9205, device='cuda:0')","tensor(226.2275, device='cuda:0')","tensor(608.9510, device='cuda:0')","tensor(193.7941, device='cuda:0')","tensor(538.2885, device='cuda:0')","tensor(201.4135, device='cuda:0')","tensor(663.2939, device='cuda:0')","tensor(225.0246, device='cuda:0')",...,"tensor(494.5893, device='cuda:0')","tensor(913.1488, device='cuda:0')","tensor(648.5759, device='cuda:0')","tensor(1296.0393, device='cuda:0')","tensor(508.2615, device='cuda:0')","tensor(1288.7631, device='cuda:0')","tensor(633.2435, device='cuda:0')","tensor(1612.1533, device='cuda:0')","tensor(528.1224, device='cuda:0')","tensor(1598.9583, device='cuda:0')"
261,261,0,"tensor(578.3176, device='cuda:0')","tensor(223.7889, device='cuda:0')","tensor(610.1107, device='cuda:0')","tensor(191.1844, device='cuda:0')","tensor(538.9859, device='cuda:0')","tensor(199.0859, device='cuda:0')","tensor(663.7563, device='cuda:0')","tensor(222.7586, device='cuda:0')",...,"tensor(494.8425, device='cuda:0')","tensor(913.6420, device='cuda:0')","tensor(649.2749, device='cuda:0')","tensor(1295.1833, device='cuda:0')","tensor(508.4467, device='cuda:0')","tensor(1287.7594, device='cuda:0')","tensor(633.3896, device='cuda:0')","tensor(1610.9912, device='cuda:0')","tensor(527.0237, device='cuda:0')","tensor(1597.7084, device='cuda:0')"
262,262,0,"tensor(579.3990, device='cuda:0')","tensor(224.5129, device='cuda:0')","tensor(610.7571, device='cuda:0')","tensor(191.7813, device='cuda:0')","tensor(539.7208, device='cuda:0')","tensor(199.9005, device='cuda:0')","tensor(663.5959, device='cuda:0')","tensor(222.7588, device='cuda:0')",...,"tensor(496.1383, device='cuda:0')","tensor(913.8532, device='cuda:0')","tensor(649.9473, device='cuda:0')","tensor(1295.8721, device='cuda:0')","tensor(509.9584, device='cuda:0')","tensor(1288.4834, device='cuda:0')","tensor(633.6974, device='cuda:0')","tensor(1612.2711, device='cuda:0')","tensor(527.1512, device='cuda:0')","tensor(1598.1741, device='cuda:0')"


In [24]:
from sklearn.preprocessing import MinMaxScaler
import torch
import traceback
from tqdm import tqdm

global HEIGHT, WIDTH,nc
HEIGHT = 256
WIDTH = 256
nc = 3  #number of channels

# Functions for slicing up data
def slice_sequence_examples(sequence, num_steps):
    xs = []
    for i in range(len(sequence) - num_steps - 1):
        example = sequence[i: i + num_steps]
        xs.append(example)
        
        # output is list of list of num_steps number of rows (e.g. num_setps =  50 will be 50 first rows, all columns)
    return xs

def seq_to_singleton_format(examples):
    # Takes the sliced sequences and separates each sequence into input (all elements except the last one) and output (just the last element).
    # up until last sequence used as primer
    
    xs = []
    ys = []
    for ex in examples:
        xs.append(ex[:-1])
        ys.append(ex[-1])
    return (xs,ys)  

def keypoints_to_image(scaled_keypoints_df):
    colors = {
    "nose": (255, 0, 0),        # Red
    "left_eye": (0, 255, 0),    # Green
    "right_eye": (0, 0, 255),   # Blue
    "left_ear": (255, 255, 0),  # Yellow
    "right_ear": (255, 0, 255), # Magenta
    "left_shoulder": (0, 255, 255),  # Cyan
    "right_shoulder": (255, 165, 0), # Orange
    "left_elbow": (255, 69, 0),     # Red-Orange
    "right_elbow": (0, 128, 0),     # Green (Lime)
    "left_wrist": (255, 20, 147),   # Deep Pink
    "right_wrist": (255, 140, 0),   # Dark Orange
    "left_hip": (0, 128, 128),      # Teal
    "right_hip": (255, 99, 71),     # Tomato
    "left_knee": (0, 255, 0),       # Green (using a different shade)
    "right_knee": (255, 69, 0),     # Red-Orange (using a different shade)
    "left_ankle": (0, 255, 255),    # Cyan (using a different shade)
    "right_ankle": (255, 165, 0)    # Orange (using a different shade)
}   
    # data frame coloumns are in the format of keypoint_x and keypoint_y
    # takes keypoints dataframe and returns a list of frames of keypoints, each keypoint body part is drawn in a different colour according to dict
    # on black (0 pad) background - turn to 4D
    
    frames = []
    
    for i,row in tqdm(scaled_keypoints_df.iterrows()):
        # match column name to keypoint name
        # black background
        img = np.zeros((HEIGHT,WIDTH,3), np.uint8)
        for keypoint in keypoints.keys():
            
            # draw circle at x,y coordinates
            try:
                cv2.circle(img, (int(float(row[f"{keypoint}_x"])*WIDTH), int(float(row[f"{keypoint}_y"])*HEIGHT)), 5, colors[keypoint], -1)
            except Exception as e:
                print(i)
                print(keypoint)
                traceback.print_exc()
                continue
            
        frames.append(img)
    
    return frames
        



# Normalising our data with min max
sc = MinMaxScaler()
scaled = sc.fit_transform(df.values)
scaled_df= pd.DataFrame(scaled, columns=df.columns)
    
# converting df to images
frames = keypoints_to_image(scaled_df) 
        
# Turning our dataframe structure into an array, excluding the first 2 columns of person and frame
seq = np.array(frames)


# Defining and using our window size to create our inputs X and outputs y
SEQ_LEN = 50
slices = slice_sequence_examples(seq, SEQ_LEN+1)
X, y = seq_to_singleton_format(slices)

X = np.array(X)
y = np.array(y)

print("Number of training examples:")
print("X:", X.shape)
print("y:", y.shape)


# X: (number of frames, sequence length, 17 coordinates, x and y so 34)
# y: (number of frames, 17 coordinates, x and y so 34) - no sequence length cause generative


264it [00:00, 4952.77it/s]


Number of training examples:
X: (212, 50, 256, 256, 3)
y: (212, 256, 256, 3)


In [25]:
np.shape(X[-1])

(50, 256, 256, 3)

In [26]:
# BatchSize×Depth×Height×Width×Channels

# Number of workers for dataloader
workers = 2

# Batch size during training
batch_size = 16

# Spatial size of training images. All images will be resized to this
#   size using a transformer.
image_size = HEIGHT

# Number of frames for seed
nf = 50

# Size of z latent vector (i.e. size of genaerator input) (was 100)
nz = 100

# Size of feature maps in generator
ngf = 64

# Size of feature maps in discriminator
ndf = 64

# Number of training epochs
num_epochs = 5

# Learning rate for optimizers
lr = 0.0002

# Beta1 hyperparameter for Adam optimizers
beta1 = 0.5

# Number of GPUs available. Use 0 for CPU mode.
ngpu = 1

In [27]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# Convert to PyTorch tensors
X_tensor = torch.from_numpy(X)
y_tensor = torch.from_numpy(y)

# Create the dataset
dataset = TensorDataset(X_tensor, y_tensor)
# Create the dataloader
dataloader = DataLoader(dataset, batch_size=batch_size,
                                         shuffle=True, num_workers=workers)

# Decide which device we want to run on
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")

# Plot some training images
real_batch = next(iter(dataloader))
# plt.figure(figsize=(8,8))
# plt.axis("off")
# plt.title("Training Images")
# plt.imshow(np.transpose(vutils.make_grid(real_batch[0].to(device)[:64], padding=2, normalize=True).cpu(),(1,2,0)))

In [28]:
# custom weights initialization called on ``netG`` and ``netD``
def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        nn.init.normal_(m.weight.data, 0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        nn.init.normal_(m.weight.data, 1.0, 0.02)
        nn.init.constant_(m.bias.data, 0)

In [29]:
import torch.nn as nn

class Generator(nn.Module):
    def __init__(self, ngpu):
        super(Generator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            
            # Input shape: [batch, nz, 1, 1, 1]
            # Output shape: [batch, ngf * 8, 4, 4, 4]
            # [16, 512, 2, 2, 4]
            nn.ConvTranspose3d(nz, ngf * 8, (2, 2, 4), (1, 1, 2), 0, bias=False),
            nn.BatchNorm3d(ngf * 8),
            nn.ReLU(True),
            
            # Output shape: [batch, ngf * 4, 8, 8, 10]
            # [16, 256, 4, 4, 12]
            nn.ConvTranspose3d(ngf * 8, ngf * 4, (2, 2, 6), (2, 2, 2), 0, bias=False),
            nn.BatchNorm3d(ngf * 4),
            nn.ReLU(True),
            
            # Output shape: [batch, ngf * 2, 16, 16, 15]
            # [16, 128, 8, 8, 17]
            nn.ConvTranspose3d(ngf * 4, ngf * 2, (2, 2, 6), (2, 2, 1), 0, bias=False),
            nn.BatchNorm3d(ngf * 2),
            nn.ReLU(True),
            
            # Output shape: [batch, ngf, 32, 32, 20]
            # [16, 64, 16, 16, 22]
            nn.ConvTranspose3d(ngf * 2, ngf, (2, 2, 6), (2, 2, 1), 0, bias=False),
            nn.BatchNorm3d(ngf),
            nn.ReLU(True),
            
            # Output shape: [batch, ngf, 64, 64, 22]
            # [16, 64, 32, 32, 24]
            nn.ConvTranspose3d(ngf, ngf, (2, 2, 3), (2, 2, 1), 0, bias=False),
            nn.BatchNorm3d(ngf),
            nn.ReLU(True),
            
            # Output shape: [batch, ngf, 128, 128, 46]
            # [16, 64, 64, 64, 50]
            nn.ConvTranspose3d(ngf, ngf, (2, 2, 4), (2, 2, 2), 0, bias=False),
            nn.BatchNorm3d(ngf),
            nn.ReLU(True),
            
            # Output shape: [batch, ngf, 128, 128, 46]
            # [16, 3, 128, 128, 50]
            nn.ConvTranspose3d(ngf, ngf, (2, 2, 1), (2, 2, 1), 0, bias=False),
            nn.BatchNorm3d(ngf),
            nn.ReLU(True),
            
            
            # Desired Output shape: [batch, nc, 256, 256, 50]
            # [16, 3, 256, 256, 50]
            nn.ConvTranspose3d(ngf, nc, (2, 2, 1), (2, 2, 1), 0, bias=False),
            nn.Tanh()
        )
        
    def forward(self, input):
        x = input
        print(x.shape)
        for layer in self.main:
            print(layer)
            print(f"before: {x.shape}")
            x = layer(x)
            print(f"after: {x.shape}")
        return x




In [30]:
# Create the generator
netG = Generator(ngpu).to(device)

# Handle multi-GPU if desired
if (device.type == 'cuda') and (ngpu > 1):
    netG = nn.DataParallel(netG, list(range(ngpu)))

# Apply the ``weights_init`` function to randomly initialize all weights
#  to ``mean=0``, ``stdev=0.02``.
netG.apply(weights_init)

# Print the model
print(netG)

Generator(
  (main): Sequential(
    (0): ConvTranspose3d(100, 512, kernel_size=(2, 2, 4), stride=(1, 1, 2), bias=False)
    (1): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): ConvTranspose3d(512, 256, kernel_size=(2, 2, 6), stride=(2, 2, 2), bias=False)
    (4): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): ConvTranspose3d(256, 128, kernel_size=(2, 2, 6), stride=(2, 2, 1), bias=False)
    (7): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (8): ReLU(inplace=True)
    (9): ConvTranspose3d(128, 64, kernel_size=(2, 2, 6), stride=(2, 2, 1), bias=False)
    (10): BatchNorm3d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (11): ReLU(inplace=True)
    (12): ConvTranspose3d(64, 64, kernel_size=(2, 2, 3), stride=(2, 2, 1), bias=False)
    (13): BatchNorm3d(64, eps=1e-05, momentum=0.1, affin

In [31]:
class Discriminator(nn.Module):
    def __init__(self, ngpu):
        super(Discriminator, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            # Input is [64, 50, 256, 256, 3]
            # kernel size 4,4,4 - 4 frames 4x4 pixels 
            nn.Conv3d(in_channels=nc, out_channels=ndf, kernel_size= (4, 4, 4), stride=(1, 2, 2), padding=1, bias=False),
            nn.LeakyReLU(0.2, inplace=True),
            # torch.Size([64, 64, 128, 128, 1])
            nn.Conv3d(ndf, ndf*2, (4, 4, 4), (2, 2, 2), padding = (1,1,1), bias=False),
            nn.BatchNorm3d(ndf * 2),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv3d(ndf * 2, ndf * 4, (4, 4, 4), (2, 2, 2), 1, bias=False),
            nn.BatchNorm3d(ndf * 4),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv3d(ndf * 4, ndf * 8, (4, 4, 4), (2, 2, 2), 1, bias=False),
            nn.BatchNorm3d(ndf * 8),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Conv3d(ndf * 8, ndf*16, (3, 4, 4), (2, 2, 2), 1, bias=False),
            nn.Flatten(),
            # Flattened Size=Channels×Depth×Height×Width
            nn.Linear(1024*3*8*8, 1),  # Adjusted the input size to the linear layer
            nn.Sigmoid()
        )
    def forward(self, input):
        x = input
        print(x.shape)
        for layer in self.main:
            print(layer)
            print(f"before: {x.shape}")
            x = layer(x)
            print(f"after: {x.shape}")
        return x

In [32]:
# Create the Discriminator
netD = Discriminator(ngpu).to(device)

# Handle multi-GPU if desired
if (device.type == 'cuda') and (ngpu > 1):
    netD = nn.DataParallel(netD, list(range(ngpu)))

# Apply the ``weights_init`` function to randomly initialize all weights
# like this: ``to mean=0, stdev=0.2``.
netD.apply(weights_init)

# Print the model
print(netD)

Discriminator(
  (main): Sequential(
    (0): Conv3d(3, 64, kernel_size=(4, 4, 4), stride=(1, 2, 2), padding=(1, 1, 1), bias=False)
    (1): LeakyReLU(negative_slope=0.2, inplace=True)
    (2): Conv3d(64, 128, kernel_size=(4, 4, 4), stride=(2, 2, 2), padding=(1, 1, 1), bias=False)
    (3): BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (4): LeakyReLU(negative_slope=0.2, inplace=True)
    (5): Conv3d(128, 256, kernel_size=(4, 4, 4), stride=(2, 2, 2), padding=(1, 1, 1), bias=False)
    (6): BatchNorm3d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (7): LeakyReLU(negative_slope=0.2, inplace=True)
    (8): Conv3d(256, 512, kernel_size=(4, 4, 4), stride=(2, 2, 2), padding=(1, 1, 1), bias=False)
    (9): BatchNorm3d(512, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): LeakyReLU(negative_slope=0.2, inplace=True)
    (11): Conv3d(512, 1024, kernel_size=(3, 4, 4), stride=(2, 2, 2), padding=(1, 1, 1), bias

In [33]:
# Initialize the ``BCELoss`` function
criterion = nn.BCELoss()

# Create batch of latent vectors that we will use to visualize
#  the progression of the generator
fixed_noise = torch.randn(64, nz, 1, 1, 1, device=device)

# Establish convention for real and fake labels during training
real_label = 1.
fake_label = 0.

# Setup Adam optimizers for both G and D
optimizerD = optim.Adam(netD.parameters(), lr=lr, betas=(beta1, 0.999))
optimizerG = optim.Adam(netG.parameters(), lr=lr, betas=(beta1, 0.999))

In [34]:
for i,data in enumerate(dataloader, 0):
    print(i)
    print(data[0].shape)
    print(data[1].shape)
  

0
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
1
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
2
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
3
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
4
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
5
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
6
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
7
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
8
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
9
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
10
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
11
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
12
torch.Size([16, 50, 256, 256, 3])
torch.Size([16, 256, 256, 3])
13
torch.Size([4, 50, 256, 256, 3])
torch.Size([4, 256, 256, 3])


In [35]:
# Training Loop

# Lists to keep track of progress
img_list = []
G_losses = []
D_losses = []
iters = 0

print("Starting Training Loop...")
# For each epoch
for epoch in range(num_epochs):
    # For each batch in the dataloader
    for i, data in enumerate(dataloader, 0):
        print(i)
        
        try:
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            ###########################
            ## Train with all-real batch
            netD.zero_grad()
            # Format batch
            # batch, depth, height, width, rgb
            real_cpu_a = data[0].float().to(device)
            # batch, rgb, depth, height, width
            real_cpu = real_cpu_a.permute(0, 4, 1, 2, 3)
            # batch size
            b_size = real_cpu.size(0)
            label = torch.full((b_size,), real_label, dtype=torch.float, device=device)
            # Forward pass real batch through D
            print('Discriminator')
            # batch, rgb, depth,height, width
            output = netD(real_cpu).view(-1)
            # Calculate loss on all-real batch bvNC|
            errD_real = criterion(output, label)
            # Calculate gradients for D in backward pass
            errD_real.backward()
            D_x = output.mean().item()

            ## Train with all-fake batch
            # Generate batch of latent vectors
            noise = torch.randn(b_size, nz, 1, 1, 1, device=device)
            # Generate fake image batch with G
            print('Generator')
            # batch, rgb, height, width, depth
            fake = netG(noise)
            # batch, rgb, depth,height, width
            fake = fake.permute(0, 1, 4, 2, 3)
            label.fill_(fake_label)
            # Classify all fake batch with D
            print('Classify all fake batch with D')
             # batch, rgb, depth,height, width
            output = netD(fake.detach()).view(-1)
            # Calculate D's loss on the all-fake batch
            errD_fake = criterion(output, label)
            # Calculate the gradients for this batch, accumulated (summed) with previous gradients
            errD_fake.backward()
            D_G_z1 = output.mean().item()
            # Compute error of D as sum over the fake and the real batches
            errD = errD_real + errD_fake
            # Update D
            optimizerD.step()

            ############################
            # (2) Update G network: maximize log(D(G(z)))
            ###########################
            netG.zero_grad()
            label.fill_(real_label)  # fake labels are real for generator cost
            # Since we just updated D, perform another forward pass of all-fake batch through D
            output = netD(fake).view(-1)
            # Calculate G's loss based on this output
            errG = criterion(output, label)
            # Calculate gradients for G
            errG.backward()
            D_G_z2 = output.mean().item()
            # Update G
            optimizerG.step()

            # Output training stats
            if i % 50 == 0:
                print('[%d/%d][%d/%d]\tLoss_D: %.4f\tLoss_G: %.4f\tD(x): %.4f\tD(G(z)): %.4f / %.4f'
                    % (epoch, num_epochs, i, len(dataloader),
                        errD.item(), errG.item(), D_x, D_G_z1, D_G_z2))

            # Save Losses for plotting later
            G_losses.append(errG.item())
            D_losses.append(errD.item())

            # Check how the generator is doing by saving G's output on fixed_noise
            if (iters % 500 == 0) or ((epoch == num_epochs-1) and (i == len(dataloader)-1)):
                with torch.no_grad():
                    fake = netG(fixed_noise).detach().cpu()
                img_list.append(vutils.make_grid(fake, padding=2, normalize=True))

            iters += 1
        
        except RuntimeError as e:
        
            # visualise current frame
           
            print(np.shape(real_cpu))
            plt.imshow(np.transpose(vutils.make_grid(real_cpu, padding=2, normalize=True).cpu(),(1,2,0)))
            traceback.print_exc()

Starting Training Loop...
0
Discriminator
torch.Size([16, 3, 50, 256, 256])
Conv3d(3, 64, kernel_size=(4, 4, 4), stride=(1, 2, 2), padding=(1, 1, 1), bias=False)
before: torch.Size([16, 3, 50, 256, 256])
after: torch.Size([16, 64, 49, 128, 128])
LeakyReLU(negative_slope=0.2, inplace=True)
before: torch.Size([16, 64, 49, 128, 128])
after: torch.Size([16, 64, 49, 128, 128])
Conv3d(64, 128, kernel_size=(4, 4, 4), stride=(2, 2, 2), padding=(1, 1, 1), bias=False)
before: torch.Size([16, 64, 49, 128, 128])
after: torch.Size([16, 128, 24, 64, 64])
BatchNorm3d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
before: torch.Size([16, 128, 24, 64, 64])
after: torch.Size([16, 128, 24, 64, 64])
LeakyReLU(negative_slope=0.2, inplace=True)
before: torch.Size([16, 128, 24, 64, 64])
after: torch.Size([16, 128, 24, 64, 64])
Conv3d(128, 256, kernel_size=(4, 4, 4), stride=(2, 2, 2), padding=(1, 1, 1), bias=False)
before: torch.Size([16, 128, 24, 64, 64])
after: torch.Size([16, 256, 12,

RuntimeError: The size of tensor a (50) must match the size of tensor b (256) at non-singleton dimension 2

In [None]:
noise = torch.randn(nf,b_size, nz, 1, 1, device=device)

In [None]:
torch.randn(b_size,nf, nz, 1, 1, device=device).shape

torch.Size([50, 64, 256, 1, 1])

In [None]:
nz

256

In [None]:
plt.figure(figsize=(10,5))
plt.title("Generator and Discriminator Loss During Training")
plt.plot(G_losses,label="G")
plt.plot(D_losses,label="D")
plt.xlabel("iterations")
plt.ylabel("Loss")
plt.legend()
plt.show()

In [None]:
import cv2
import numpy as np

def frames_to_video(frames, output_path='temp_video.mp4', fps=30):
    num_frames, height, width = frames.shape[0:3]
    fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # specify the video codec
    out = cv2.VideoWriter(output_path, fourcc, fps, (width, height))

    for i in range(num_frames):
        frame = frames[i]
        # Convert the frame range from [-1, 1] (as in Tanh output) to [0, 255]
        frame = (((frame + 1) / 2) * 255).astype(np.uint8)
        out.write(frame)

    out.release()
    
fake_video = fake[0].detach().cpu().numpy()  # Take one generated video from the batch
frames_to_video(fake_video, 'temp_video.mp4')



In [None]:
from IPython.display import Video

Video("temp_video.mp4")

# Transformers

In [None]:
# transformers taken from Aksan et al 2021
# https://github.com/eth-ait/motion-transformer
# body pose data deep inertial poser https://dip.is.tue.mpg.de/download.php

In [None]:
# https://docs.ultralytics.com/modes/predict/#keypoints

for person in result:
   print(person.keypoints.xy)

In [None]:
# GAN