### KMeans Clustering

Use KMeans clustering on our arrays of poses to get a limited vocabulary to feed into the language model. Ultimately, we will want about 50k poses.

In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans
from matplotlib.colors import LogNorm
from sklearn.preprocessing import MinMaxScaler
import time, cv2, math, json, glob, os

In [2]:
# Function from D2M takes a numpy array, "pose" of the 14 D2M keypoints,
# saves a rendered figure as "outfile".
def vis_single(pose, outfile):
    """Draw a single pose as a coloured stick figure and write it to ``outfile``.

    ``pose`` is indexed as ``pose[joint, 0]`` (x) and ``pose[joint, 1]`` (y);
    a coordinate equal to -1 marks a joint that must not be drawn. Joints
    14-17 of the 18-joint layout are always skipped, so only rows 0-13 of
    ``pose`` are ever read.

    Returns the rendered canvas after it has been handed to ``cv2.imwrite``.
    """
    palette = [[255, 0, 0], [255, 85, 0], [255, 170, 0], [255, 255, 0], [170, 255, 0], [85, 255, 0], [0, 255, 0],
               [0, 255, 85], [0, 255, 170], [0, 255, 255], [0, 170, 255], [0, 85, 255], [0, 0, 255], [85, 0, 255],
               [170, 0, 255], [255, 0, 255], [255, 0, 170], [255, 0, 85]]

    # Limbs as 1-based joint pairs, in drawing order.
    limb_pairs = [[2, 3], [2, 6], [3, 4], [4, 5], [6, 7], [7, 8], [2, 9], [9, 10],
                  [10, 11], [2, 12], [12, 13], [13, 14], [2, 1], [1, 15], [15, 17],
                  [1, 16], [16, 18], [3, 17], [6, 18]]

    skipped_joints = [14, 15, 16, 17]
    stick_width = 4

    # White 256x500 canvas.
    canvas = np.ones((256, 500, 3), np.uint8) * 255

    # Joints: one filled dot per visible joint. The membership test comes
    # first so rows 14-17 are never indexed.
    for joint in range(18):
        if joint in skipped_joints or pose[joint, 0] == -1:
            continue
        dot_center = tuple(pose[joint, 0:2].astype(int))
        cv2.circle(canvas, dot_center, 4, palette[joint], thickness=-1)

    # Limbs: only the first 17 pairs are considered; each one is drawn as a
    # filled ellipse and blended onto the canvas.
    for limb_no, (joint_a, joint_b) in enumerate(limb_pairs[:17]):
        first, second = joint_a - 1, joint_b - 1
        if first in skipped_joints or second in skipped_joints:
            continue

        xs = pose[[first, second], 0]
        ys = pose[[first, second], 1]
        if np.any(ys == -1) or np.any(xs == -1):
            continue

        overlay = canvas.copy()
        mid_x = np.mean(xs)
        mid_y = np.mean(ys)
        delta_y = ys[0] - ys[1]
        delta_x = xs[0] - xs[1]
        length = (delta_y ** 2 + delta_x ** 2) ** 0.5
        angle = math.degrees(math.atan2(delta_y, delta_x))
        outline = cv2.ellipse2Poly((int(mid_x), int(mid_y)),
                                   (int(length / 2), stick_width),
                                   int(angle), 0, 360, 1)
        cv2.fillConvexPoly(overlay, outline, palette[limb_no])
        canvas = cv2.addWeighted(canvas, 0.4, overlay, 0.6, 0)

    cv2.imwrite(outfile, canvas)
    return canvas

In [3]:
# Taken from Stephanie's notebook.
def make_video(images, outvid=None, fps=5, size=None,
               is_color=True, format='MP42'):
    """
    Create a video from a list of images.

    @param      outvid      output video
    @param      images      list of image file paths, written in the given order
    @param      fps         frame per second
    @param      size        (width, height) of each frame
    @param      is_color    color
    @param      format      see http://www.fourcc.org/codecs.php
    @return                 the released VideoWriter, or None when ``images``
                            is empty (no writer is created in that case);
                            see http://opencv-python-tutroals.readthedocs.org/en/latest/py_tutorials/py_gui/py_video_display/py_video_display.html
    @raise      FileNotFoundError   an image path does not exist
    @raise      ValueError          an image file exists but cannot be decoded

    The function relies on http://opencv-python-tutroals.readthedocs.org/en/latest/.
    By default, the video will have the size of the first image.
    It will resize every image to this size before adding them to the video.
    MODIFIED FROM: http://www.xavierdupre.fr/blog/2016-03-30_nojs.html
    """
    from cv2 import VideoWriter, VideoWriter_fourcc, imread, resize
    fourcc = VideoWriter_fourcc(*format)
    vid = None
    try:
        for image in images:
            if not os.path.exists(image):
                raise FileNotFoundError(image)
            img = imread(image)
            if img is None:
                # imread signals an undecodable file by returning None.
                raise ValueError(f'could not read image: {image}')
            if vid is None:
                # The writer is created lazily so the first frame can supply
                # the default frame size.
                if size is None:
                    size = img.shape[1], img.shape[0]
                vid = VideoWriter(outvid, fourcc, float(fps), size, is_color)
            # Resize when EITHER dimension is off; the previous `and` let
            # frames with a single mismatching dimension through unresized.
            if size[0] != img.shape[1] or size[1] != img.shape[0]:
                img = resize(img, size)
            vid.write(img)
    finally:
        # Release the writer even when a later image fails, and tolerate an
        # empty image list (no writer was ever opened).
        if vid is not None:
            vid.release()
    return vid

In [7]:
def get_mean_poses(dataset, dataset_name, k, batch_size=None):
    """Cluster poses with KMeans and save the mean pose of every cluster.

    The poses are min-max scaled to the 0-1 range for clustering only; the
    cluster means, the saved JSON files and the rendered frames all use the
    original, unscaled coordinates.

    Files written (relative to the working directory):
      * ``rendered/outfile_<cluster>.jpg`` - one rendered mean pose per cluster
      * ``pose_vocab_<dataset_name>_<k>.json`` - cluster id -> mean pose
      * ``pose_clusters_<dataset_name>_<k>.json`` - cluster id -> member poses
      * ``mean_poses_<dataset_name>_<k>.avi`` - the rendered means as a video

    Parameters
    ----------
    dataset : numpy array with one flattened pose per row. Each row must hold
        28 values, because every mean pose is reshaped to (14, 2) for rendering.
    dataset_name : str
        Label used in the output file names.
    k : int
        Number of clusters.
    batch_size : int, optional
        When given, fit ``sklearn.cluster.MiniBatchKMeans`` with this batch
        size instead of full ``KMeans``. Meant for dataset / k combinations
        where the full fit does not fit in memory. The default (None) keeps
        the original full KMeans fit.

    Returns
    -------
    The fitted clustering model, so the caller can keep or persist it.
    """
    # Normalize arrays to 0 to 1 range.
    scaler = MinMaxScaler(feature_range=(0, 1))
    data = scaler.fit_transform(dataset)

    # Run KMeans with k clusters.
    start_time = time.time()
    if batch_size is None:
        kmeans = KMeans(n_clusters=k, random_state=0)
    else:
        # Local import: only needed on this optional path.
        from sklearn.cluster import MiniBatchKMeans
        kmeans = MiniBatchKMeans(n_clusters=k, random_state=0, batch_size=batch_size)
    clusters = kmeans.fit(data)
    print(f"Time to fit {k} kmeans clusters:{(time.time()-start_time)/60} minutes")

    # Get the labels for the cluster of each pose.
    cluster_labels = clusters.labels_

    # Group the original (unscaled) poses by cluster id.
    pose_clusters = {cluster_n: [] for cluster_n in range(k)}
    for label, pose in zip(cluster_labels, dataset):
        pose_clusters[int(label)].append(pose.tolist())

    # cv2.imwrite does not create missing directories, so make sure the
    # render folder exists before any frame is written.
    render_dir = 'rendered'
    os.makedirs(render_dir, exist_ok=True)

    # Render a pose for each cluster and save the array.
    pose_vocab = {}
    image_paths = []
    for cluster_n in range(k):
        members = pose_clusters[cluster_n]
        if not members:
            # An empty cluster has no mean pose; skip it rather than
            # failing on the reshape below.
            continue
        cluster_mean = np.mean(np.array(members), axis=0)
        pose_vocab[cluster_n] = cluster_mean.tolist()
        image_path = f'{render_dir}/outfile_{cluster_n}.jpg'
        vis_single(cluster_mean.reshape(14, 2), image_path)
        image_paths.append(image_path)

    # Save vocab (means) to json dictionary.
    with open(f'pose_vocab_{dataset_name}_{k}.json', 'w') as fp:
        json.dump(pose_vocab, fp)
    print(f'vocab poses saved to pose_vocab_{dataset_name}_{k}.json')
    # Save clusters (all poses) to json dictionary.
    with open(f'pose_clusters_{dataset_name}_{k}.json', 'w') as fp:
        json.dump(pose_clusters, fp)
    print(f'all poses with labeled clusters saved to pose_clusters_{dataset_name}_{k}.json')

    # Render pose vocabulary to video. Only the frames written by this call
    # are used, in cluster order; globbing the folder returned them in
    # arbitrary order and also picked up stale frames from earlier runs.
    outfile_name = f'mean_poses_{dataset_name}_{k}'
    fps = 2
    make_video(image_paths, outvid=f'{outfile_name}.avi', fps=fps)
    print(f'video of pose vocab saved to {outfile_name}.avi')

    return clusters


In [4]:
# Load the D2M hip-hop poses and flatten them to one 28-value row per pose.
d2m_hiphop = np.load('../vids_d2m_hiphop.npy').reshape(-1, 28)
print('shape of raw data:', d2m_hiphop.shape)

shape of raw data: (901500, 28)


In [8]:
# Load every D2M pose and flatten them to one 28-value row per pose.
vids_d2m_all = np.load('../vids_d2m_all.npy').reshape(-1, 28)
print('shape of raw data:', vids_d2m_all.shape)

shape of raw data: (6390600, 28)


In [6]:
# Build a 20-pose vocabulary from the hip-hop poses.
get_mean_poses(dataset=d2m_hiphop, dataset_name='d2m_hiphop', k=20)

Time to fit 20 kmeans clusters:2.9488515575726826 minutes
vocab poses saved to pose_vocab_[[288.34825397  44.82094737 288.34825397 ... 155.85936842 297.86583292
  193.92968421]
 [288.07242941  47.26929825 288.07242941 ... 155.13519298 297.59000835
  193.20550877]
 [286.79660485  46.54512281 286.79660485 ... 157.58354386 305.83176274
  195.65385965]
 ...
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]
 [  0.           0.           0.         ...   0.           0.
    0.        ]]_20.json
all poses with labeled clusters saved to pose_clusters_d2m_hiphop_20.json
video of pose vocab saved to mean_poses_d2m_hiphop_20.avi


In [9]:
# Build a 15000-pose vocabulary from the full D2M dataset.
# NOTE(review): the recorded run of this cell failed with a MemoryError while
# allocating a (6390600, 15000) float64 array (714 GiB), so no vocabulary was
# produced for the full dataset at this k.
get_mean_poses(vids_d2m_all, 'd2m_all', k=15000)

MemoryError: Unable to allocate 714. GiB for an array with shape (6390600, 15000) and data type float64

In [10]:
import joblib
# Save a fitted KMeans model to disk.
# NOTE(review): `clusters` is only ever assigned as a local variable inside
# get_mean_poses, so nothing with that name exists at notebook scope and the
# recorded run of this cell failed with NameError. A fitted model has to be
# bound to `clusters` in an earlier cell before this one can work.
# NOTE(review): no dataset named 'youtube' is loaded in the cells shown here;
# confirm which model this file name is meant to describe.
dataset_name = 'youtube'
filename = f'kmeans_model_{dataset_name}.sav'
joblib.dump(clusters, filename)

NameError: name 'clusters' is not defined