In [1]:
import boto3, json, os, glob, cv2
import numpy as np
import math
import re
import time

### Batchify

Goal here is to import the individual arrays from the videos and convert them to one large array of size `batch_size x 28`. There are a few ways to approach this.

1. Choose a predetermined number of frames and either a) pad the remainder of the frames to reach the predetermined number at the end of the video, b) delete the remainder, c) pad if the remainder is at least half of the predetermined number, and delete if the remainder is less than half, d) start the next video where the previous one left off (maybe also including a token to indicate the video is changing)

2. Determine the largest video we have and pad all other videos to reach that length.

I think #1d is a bad idea. For now (in part because I'm using a smaller test dataset) I'm going with #1c, but we can play with this to see how it affects accuracy, if at all. Ultimately, we may decide to implement #2.

In [2]:
#Initialize s3 interface.
s3 = boto3.resource('s3')
# Select our bucket - the OpenPose files for the YouTube videos are in dancegan-yt-vids.
# dancegan_yt_vids = s3.Bucket('dancegan-yt-vids')

from s3fs.core import S3FileSystem
s3fs = S3FileSystem()

We will ultimately have three numpy arrays saved in the `Models` directory: 

1. YouTube videos from Everybody Dance Now: `vids_yt_all.npy`; saved already in `Models` directory
2. YouTube videos from vid2vid: `vids_v2v_all.npy`; individual arrays (300-1888) are now in `Models/DataPreprocessing/NumpyPoses`; individual arrays for (0-299) are on s3 in "`vid-numpy`" bucket with prefix `/vid2vid-poses/`
3. numpy arrays stitched together from Dancing to Music: `vids_d2m_all.npy`; are on s3 in "`dancegan-d2m`" bucket with prefix `/output/ballet/-0APC9DynPE/`, eg, there are three directories: `ballet`, `hiphop`, `zumba` and each of those includes a unique directory (eg, `-0APC9DynPE`) for individual videos within it; I will save these as three separate numpy files: `vids_d2m_ballet.npy`, `vids_d2m_hiphop.npy`, `vids_d2m_zumba.npy` and concatenate them to make `vids_d2m_all.npy`.

### 1. vid2vid arrays

In [6]:
# First, get the 300 from s3.
def single_vid_batchify_s3(video,length):
    '''Takes the video number in string format, gets the 2d numpy array of 
    keypoints and returns a 3d numpy array of mini-batches 
    of passed-in length. Remainders less than half of the length are deleted.
    Remainders of at least half of the length are padded.'''
    # Get the numpy array for the video from s3.
    key = f'vid2vid-poses/poses{video}.npy'
    bucket = 'vid-numpy'
#     key = f'numpy-arrays/poses{video}.npy'
#     bucket = 'dancegan-yt-vids'
    poses = np.load(s3fs.open(f'{bucket}/{key}'))
#     poses = np.load(f'poses{video}.npy')
    # Reshape to 3d array.
    # Make sure the number of frames dimension is divisible by the mini-batch length.
    n_padded_frames = length - (poses.shape[0] % length)
    # Create padded frames to fill the last batch.
    # What number do we want to use for the padding? Will zero work?
    # Using a variable here (instead of np.zeros) so we can easily change it.
    fill_value = 0
    padded_frames = np.full((n_padded_frames,28),fill_value)
    # Append the padded_frames to the end of the original array.
    padded_poses = np.append(poses,padded_frames, axis=0)
    # Now reshape into batches of batch_size frames.
    n_all_frames = padded_poses.shape[0]
    padded_poses = padded_poses.reshape(int(n_all_frames/length),length,28)
    
    return padded_poses

In [7]:
mini_batch_vids_v2v_s3 = {}
length = 300
for i in range(0,300):
    video_number = f'{i:05d}'
    print(f'Creating mini-batch for video-{video_number}.')
    mini_batch_vids_v2v_s3[i] = single_vid_batchify_s3(video_number,length)
    print('-'*72)

Creating mini-batch for video-00000.
------------------------------------------------------------------------
Creating mini-batch for video-00001.
------------------------------------------------------------------------
Creating mini-batch for video-00002.
------------------------------------------------------------------------
Creating mini-batch for video-00003.
------------------------------------------------------------------------
Creating mini-batch for video-00004.
------------------------------------------------------------------------
Creating mini-batch for video-00005.
------------------------------------------------------------------------
Creating mini-batch for video-00006.
------------------------------------------------------------------------
Creating mini-batch for video-00007.
------------------------------------------------------------------------
Creating mini-batch for video-00008.
------------------------------------------------------------------------
Creating m

------------------------------------------------------------------------
Creating mini-batch for video-00077.
------------------------------------------------------------------------
Creating mini-batch for video-00078.
------------------------------------------------------------------------
Creating mini-batch for video-00079.
------------------------------------------------------------------------
Creating mini-batch for video-00080.
------------------------------------------------------------------------
Creating mini-batch for video-00081.
------------------------------------------------------------------------
Creating mini-batch for video-00082.
------------------------------------------------------------------------
Creating mini-batch for video-00083.
------------------------------------------------------------------------
Creating mini-batch for video-00084.
------------------------------------------------------------------------
Creating mini-batch for video-00085.
----------

------------------------------------------------------------------------
Creating mini-batch for video-00152.
------------------------------------------------------------------------
Creating mini-batch for video-00153.
------------------------------------------------------------------------
Creating mini-batch for video-00154.
------------------------------------------------------------------------
Creating mini-batch for video-00155.
------------------------------------------------------------------------
Creating mini-batch for video-00156.
------------------------------------------------------------------------
Creating mini-batch for video-00157.
------------------------------------------------------------------------
Creating mini-batch for video-00158.
------------------------------------------------------------------------
Creating mini-batch for video-00159.
------------------------------------------------------------------------
Creating mini-batch for video-00160.
----------

------------------------------------------------------------------------
Creating mini-batch for video-00227.
------------------------------------------------------------------------
Creating mini-batch for video-00228.
------------------------------------------------------------------------
Creating mini-batch for video-00229.
------------------------------------------------------------------------
Creating mini-batch for video-00230.
------------------------------------------------------------------------
Creating mini-batch for video-00231.
------------------------------------------------------------------------
Creating mini-batch for video-00232.
------------------------------------------------------------------------
Creating mini-batch for video-00233.
------------------------------------------------------------------------
Creating mini-batch for video-00234.
------------------------------------------------------------------------
Creating mini-batch for video-00235.
----------

FileNotFoundError: vid-numpy/vid2vid-poses/poses00298.npy

In [8]:
# Next, get 301-1888 in Models/DataPreprocessing/NumpyPoses.
def single_vid_batchify_local(video,length):
    '''Takes the video number in string format, gets the 2d numpy array of 
    keypoints and returns a 3d numpy array of mini-batches 
    of passed-in length. Remainders less than half of the length are deleted.
    Remainders of at least half of the length are padded.'''
    # Get the numpy array for the video.
    filename = f'poses{video}.npy'
    poses = np.load(f'./NumpyPoses/{filename}')
    # Reshape to 3d array.
    # Make sure the number of frames dimension is divisible by the mini-batch length.
    n_padded_frames = length - (poses.shape[0] % length)
    # Create padded frames to fill the last batch.
    # What number do we want to use for the padding? Will zero work?
    # Using a variable here (instead of np.zeros) so we can easily change it.
    fill_value = 0
    padded_frames = np.full((n_padded_frames,28),fill_value)
    # Append the padded_frames to the end of the original array.
    padded_poses = np.append(poses,padded_frames, axis=0)
    # Now reshape into batches of batch_size frames.
    n_all_frames = padded_poses.shape[0]
    padded_poses = padded_poses.reshape(int(n_all_frames/length),length,28)
    
    return padded_poses

In [21]:
length = 300
for i in range(301,1888):
    video_number = f'{i:05d}'
    print(f'Creating mini-batch for video-{video_number}.')
    mini_batch_vids_v2v_s3[i] = single_vid_batchify_s3(video_number,length)
    print('-'*72)

Creating mini-batch for video-01888.


FileNotFoundError: vid-numpy/vid2vid-poses/poses01888.npy

In [26]:
np.save('mini_batch_vids_v2v_s3.npy',mini_batch_vids_v2v_s3)

In [25]:
type(mini_batch_vids_v2v_s3)

dict

In [5]:
mini_batch_vids_v2v = np.load('mini_batch_vids_v2v_s3.npy', allow_pickle=True)

In [13]:
mini_batch_vids_v2v[()][0].shape

(11, 300, 28)

In [15]:
len(mini_batch_vids_v2v[()])

1874

In [28]:
vids_v2v_all = np.empty((0,300,28))

In [33]:
for key in mini_batch_vids_v2v[()]:
    vids_v2v_all = np.append(vids_v2v_all,mini_batch_vids_v2v[()][key], axis=0)

In [34]:
vids_v2v_all.shape

(38279, 300, 28)

In [35]:
np.save('../vids_v2v_all.npy',vids_v2v_all)

### 2. D2M arrays

Goal is to save as `vids_d2m_all.npy` in the `Models/` directory; are on s3 in "`dancegan-d2m`" bucket with prefix `/output/ballet/-0APC9DynPE/`, eg, there are three directories: `ballet`, `hiphop`, `zumba` and each of those includes a unique directory (eg, `-0APC9DynPE`) for individual videos within it; I will save these as three separate numpy files: `vids_d2m_ballet.npy`, `vids_d2m_hiphop.npy`, `vids_d2m_zumba.npy` and concatenate them to make `vids_d2m_all.npy`.