In [1]:
import cv2
import mediapipe as mp
import time
import numpy as np
# import numpy as np
import random
from tqdm import tqdm
import scipy.ndimage.interpolation as inter
from scipy.signal import medfilt 
from scipy.spatial.distance import cdist

from keras.optimizers import *
from keras.models import Model
from keras.layers import *
from keras.layers.core import *
from tensorflow.keras.callbacks import *
from keras.layers.convolutional import *
import tensorflow as tf

In [11]:
# Seed every RNG the preprocessing draws from. `sampling_frame` uses both the
# stdlib `random` module and `np.random`; seeding only `random` would leave the
# NumPy draws different on every kernel restart.
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

class Config():
    """Hyper-parameters shared by the preprocessing helpers and the model builders.

    Every argument is optional; ``Config()`` reproduces the values that were
    previously hard-coded, so existing callers are unaffected.

    Args:
        frame_l: number of frames every sequence is resampled to.
        joint_n: number of joints per frame.
        joint_d: number of coordinates per joint.
        clc_num: number of classes.
        feat_d: width of the per-frame distance feature produced by ``get_CG``
            (``joint_n * (joint_n - 1) / 2``; 190 for 20 joints).
        filters: base number of convolution filters.
        nd: stored for completeness; not read by any code visible in this notebook.
    """
    def __init__(self, frame_l=32, joint_n=20, joint_d=3, clc_num=20,
                 feat_d=190, filters=64, nd=60):
        self.frame_l = frame_l # the length of frames
        self.joint_n = joint_n # the number of joints
        self.joint_d = joint_d # the dimension of joints
        self.clc_num = clc_num # the number of class
        self.feat_d = feat_d
        self.filters = filters
        self.nd = nd
# Shared configuration instance; the preprocessing loops and build_DD_Net below read it.
C = Config()

In [12]:
def zoom(p,target_l=32,joints_num=20,joints_dim=3):
    """Median-filter every joint coordinate over time and resample to ``target_l`` frames.

    Args:
        p: array indexed as ``p[frame, joint, dim]``. Only the first
            ``joints_num`` joints and ``joints_dim`` coordinates are read.
        target_l: number of frames in the result.
        joints_num: number of joints to process.
        joints_dim: number of coordinates per joint to process.

    Returns:
        A new array of shape ``(target_l, joints_num, joints_dim)``.

    ``p`` is not modified. Writing the filtered signal back into ``p`` would
    silently alter the caller's data whenever ``p`` is a view (for example a
    frame slice), so the smoothed track is kept in a local instead.
    """
    l = p.shape[0]
    p_new = np.empty([target_l,joints_num,joints_dim])
    for m in range(joints_num):
        for n in range(joints_dim):
            # Kernel-3 median filter removes single-frame spikes before interpolation.
            smoothed = medfilt(p[:,m,n],3)
            # The slice guards against the resampler returning an extra frame.
            p_new[:,m,n] = inter.zoom(smoothed,target_l/l)[:target_l]
    return p_new

def sampling_frame(p,C):
    """Randomly keep about 90-100% of the frames of ``p`` and resample to ``C.frame_l``.

    With probability 0.5 a contiguous window is cut out (this passes a slice,
    i.e. a view of ``p``, on to ``zoom``); otherwise a sorted random subset of
    frame indices is drawn without replacement. The coin flip and the window
    start come from the stdlib ``random`` module, everything else from
    ``np.random``.

    Args:
        p: array indexed as ``p[frame, joint, dim]``.
        C: object exposing ``frame_l``, ``joint_n`` and ``joint_d``.

    Returns:
        The result of ``zoom`` applied to the selected frames.
    """
    total = p.shape[0] # full length
    contiguous = random.uniform(0,1)<0.5
    keep = np.round(np.random.uniform(0.9,1)*total)
    if contiguous:
        # aligned sampling: one unbroken window of `keep` frames
        start = random.randint(0, total-int(keep))
        stop = start+keep
        clip = p[int(start):int(stop),:,:]
    else:
        # unaligned sampling: arbitrary frames, kept in temporal order
        picked = np.sort(np.random.choice(range(0,total),int(keep),replace=False))
        clip = p[picked,:,:]
    return zoom(clip,C.frame_l,C.joint_n,C.joint_d)

from scipy.spatial.distance import cdist
def get_CG(p,C):
    """Per-frame pairwise joint distances, flattened from the strict upper triangle.

    Args:
        p: array where ``p[f]`` holds the ``C.joint_n`` joint coordinates of
            frame ``f``; the first ``C.frame_l`` frames are read.
        C: object exposing ``joint_n`` and ``frame_l``.

    Returns:
        Array of shape ``(C.frame_l, C.joint_n * (C.joint_n - 1) // 2)`` with
        the Euclidean distance of every unordered joint pair, in the row-major
        order produced by ``np.triu_indices``.
    """
    # Strict upper triangle of a joint_n x joint_n matrix: each pair once, no diagonal.
    iu = np.triu_indices(C.joint_n,1,C.joint_n)
    M = []
    for f in range(C.frame_l):
        # Distances between the joints of this frame only. An extra all-zero
        # point used to be appended here, but `iu` never addresses its column,
        # so it was computed and thrown away.
        d_m = cdist(p[f],p[f],'euclidean')
        M.append(d_m[iu])
    return np.stack(M)

def norm_train(p):
    """Centre a pose sequence in place by removing the global mean of each coordinate.

    For each of the first three coordinate channels, the mean taken over all
    frames and joints is subtracted. (A disabled alternative in the original
    subtracted joint index 3 of each frame instead.)

    Args:
        p: array indexed as ``p[frame, joint, dim]`` with at least 3 dims.

    Returns:
        The same array object ``p``, modified in place.
    """
    for axis in range(3):
        channel = p[:,:,axis]
        p[:,:,axis] = channel-np.mean(channel)
    return p
def norm_train2d(p):
    """Centre only the first two coordinate channels of a pose sequence, in place.

    Same as ``norm_train`` but the third channel is left as it is.

    Args:
        p: array indexed as ``p[frame, joint, dim]`` with at least 2 dims.

    Returns:
        The same array object ``p``, modified in place.
    """
    for axis in range(2):
        channel = p[:,:,axis]
        p[:,:,axis] = channel-np.mean(channel)
    return p
# def normlize_test(p):
#     # normolize to start point, use the center for hand case
#     p[:,:,0] = p[:,:,0]-p[:,1:2,0]
#     p[:,:,1] = p[:,:,1]-p[:,1:2,1]
#     p[:,:,2] = p[:,:,2]-p[:,1:2,2]
#     # p[:,:,0] = p[:,:,0]-np.mean(p[:,:,0])
#     # p[:,:,1] = p[:,:,1]-np.mean(p[:,:,1])
#     # p[:,:,2] = p[:,:,2]-np.mean(p[:,:,2])
#     return p
#     return p

In [13]:
# Dropout rate passed to every SpatialDropout1D layer created in build_FM.
drop_rate = 0.1
def poses_diff(x):
    """Difference between consecutive entries along axis 1, resized back to the input size.

    Differencing shortens axis 1 by one; ``tf.image.resize`` (default method)
    then stretches the result back to the original sizes of axes 1 and 2.
    """
    in_shape = x.get_shape()
    target_size = [in_shape[1], in_shape[2]]
    delta = tf.subtract(x[:,1:,...],x[:,:-1,...])
    return tf.image.resize(delta,size=target_size)
def poses_diff_2(x):
    """Pass ``x`` through ``tf.image.resize`` using its own axis-1/axis-2 sizes.

    Unlike ``poses_diff`` no differencing is done (that step is disabled), so
    the resize target equals the input size.
    """
    in_shape = x.get_shape()
    target_size = [in_shape[1], in_shape[2]]
    return tf.image.resize(x,size=target_size)
def pose_motion_2(D, frame_l):
    """Apply ``poses_diff_2`` to ``D`` in a Lambda layer and flatten each frame.

    Returns a tensor reshaped to ``(frame_l, -1)`` per sample.
    """
    resized = Lambda(lambda t: poses_diff_2(t))(D)
    return Reshape((frame_l,-1))(resized)

def pose_motion(P,frame_l):
    """Build the slow and fast motion branches from the pose input ``P``.

    Args:
        P: Keras tensor of poses with frames on axis 1.
        frame_l: number of frames in ``P``. The fast branch is reshaped to
            ``frame_l // 2`` steps, so this assumes an even ``frame_l``.

    Returns:
        ``(P_diff_slow, P_diff_fast)``: frame differences of every frame,
        flattened to ``(frame_l, -1)``, and frame differences of every second
        frame, flattened to ``(frame_l // 2, -1)``.
    """
    P_diff_slow = Lambda(lambda x: poses_diff(x))(P)
    P_diff_slow = Reshape((frame_l,-1))(P_diff_slow)
    # Fast stream: keep every second frame before differencing.
    P_fast = Lambda(lambda x: x[:,::2,...])(P)
    P_diff_fast = Lambda(lambda x: poses_diff(x))(P_fast)
    P_diff_fast = Reshape((frame_l//2,-1))(P_diff_fast)
    # The unused `Reshape(...)(P)` that used to be built here was never
    # returned or connected to anything, so it has been removed.
    return P_diff_slow,P_diff_fast
# def reshape_x_2(D, frame_l):
#     x_1 = Lambda(lambda y: poses_diff_2(y))(D)
#     x_1 = Reshape((frame_l, -1))(D)

def c1D(x,filters,kernel):
    """Bias-free 'same'-padded Conv1D, then BatchNormalization, then LeakyReLU(0.2)."""
    conv = Conv1D(filters, kernel_size=kernel,padding='same',use_bias=False)(x)
    normed = BatchNormalization()(conv)
    return LeakyReLU(alpha=0.2)(normed)

def block(x,filters):
    """Stack two kernel-3 ``c1D`` units with the same number of filters."""
    for _ in range(2):
        x = c1D(x,filters,3)
    return x
    
def d1D(x,filters):
    """Bias-free Dense layer, then BatchNormalization, then LeakyReLU(0.2)."""
    dense = Dense(filters,use_bias=False)(x)
    normed = BatchNormalization()(dense)
    return LeakyReLU(alpha=0.2)(normed)

def build_FM(frame_l=32,joint_n=20,joint_d=3,feat_d=190,filters=16, nd=60):   
    """Build the feature-extraction sub-model that fuses distance and motion streams.

    Args:
        frame_l: number of frames per sequence.
        joint_n: number of joints per frame.
        joint_d: number of coordinates per joint.
        feat_d: width of the per-frame distance feature fed to input ``M``.
        filters: base filter count; later stages use 2x, 4x and 8x of it.
        nd: accepted but not used anywhere in this function.

    Returns:
        A Keras ``Model`` with inputs ``[M, P]`` (shapes ``(frame_l, feat_d)``
        and ``(frame_l, joint_n, joint_d)``) and a single sequence output. With
        frame_l=32 and filters=64 the printed summary reports an output of
        ``(None, 4, 512)``.

    NOTE(review): layers are created in a fixed order; auto-generated layer
    names depend on that order, so reordering statements may affect loading
    weights by name -- confirm before refactoring.
    """
    M = Input(shape=(frame_l,feat_d))
    P = Input(shape=(frame_l,joint_n,joint_d))
    # D = Input(shape =(frame_l, joint_n, joint_d))
    # x_ = pose_motion_2(D, frame_l)
    # Slow stream: every frame; fast stream: every second frame (see pose_motion).
    diff_slow,diff_fast = pose_motion(P,frame_l)
    


    # Distance-feature stream: kernel 1 -> 3 -> 1 convolutions, then halve the time axis.
    x = c1D(M,filters*2,1)
    x = SpatialDropout1D(drop_rate)(x)
    x = c1D(x,filters,3)
    x = SpatialDropout1D(drop_rate)(x)
    x = c1D(x,filters,1)
    x = MaxPooling1D(2)(x)
    x = SpatialDropout1D(drop_rate)(x)

    
    # x_1 = c1D(x_1, filters*2,1)
    # x_1 = SpatialDropout1D(drop_rate)(x_1)
    # x_1 = c1D(x_1, filters, 3)
    # x_1 = SpatialDropout1D(drop_rate)(x_1)
    # x_1 = c1D(x_1, filters,1)
    # x_1 = MaxPooling1D(2)(x_1)
    # x_1 = SpatialDropout1D(drop_rate)(x_1)

    # Slow-motion stream: same layout as above, also pooled by 2.
    x_d_slow = c1D(diff_slow,filters*2,1)
    x_d_slow = SpatialDropout1D(drop_rate)(x_d_slow)
    x_d_slow = c1D(x_d_slow,filters,3)
    x_d_slow = SpatialDropout1D(drop_rate)(x_d_slow)
    x_d_slow = c1D(x_d_slow,filters,1)
    x_d_slow = MaxPool1D(2)(x_d_slow)
    x_d_slow = SpatialDropout1D(drop_rate)(x_d_slow)

    # x = c1D(diff_fast,filters*2,1)
    # x = SpatialDropout1D(drop_rate)(x)
    # x = c1D(x,filters,3) 
    # x = SpatialDropout1D(drop_rate)(x)
    # x = c1D(x,filters,1) 
    # x = SpatialDropout1D(drop_rate)(x)

    # Fast-motion stream: not pooled, because pose_motion already reshapes it
    # to int(frame_l/2) steps.
    x_d_fast = c1D(diff_fast,filters*2,1)
    x_d_fast = SpatialDropout1D(drop_rate)(x_d_fast)
    x_d_fast = c1D(x_d_fast,filters,3) 
    x_d_fast = SpatialDropout1D(drop_rate)(x_d_fast)
    x_d_fast = c1D(x_d_fast,filters,1) 
    x_d_fast = SpatialDropout1D(drop_rate)(x_d_fast)
   
    # Fuse the three streams (default concatenation axis), then run three
    # conv blocks of growing width; the first two are followed by pooling.
    x = concatenate([x,x_d_slow,x_d_fast])
    x = block(x,filters*2)
    x = MaxPool1D(2)(x)
    x = SpatialDropout1D(drop_rate)(x)
    
    x = block(x,filters*4)
    x = MaxPool1D(2)(x)
    x = SpatialDropout1D(drop_rate)(x)

    x = block(x,filters*8)
    x = SpatialDropout1D(drop_rate)(x)
    
    return Model(inputs=[M,P],outputs=x)


def build_DD_Net(C):
    """Assemble the full classifier: feature sub-model, global max pooling, dense head.

    Args:
        C: configuration object exposing ``frame_l``, ``feat_d``, ``joint_n``,
            ``joint_d``, ``filters`` and ``clc_num``.

    Returns:
        A Keras ``Model`` taking ``[M, P]`` (distance features and poses) and
        returning a softmax over ``C.clc_num`` classes.
    """
    M = Input(name='M', shape=(C.frame_l,C.feat_d))
    P = Input(name='P', shape=(C.frame_l,C.joint_n,C.joint_d))
    FM = build_FM(C.frame_l,C.joint_n,C.joint_d,C.feat_d,C.filters)

    x = FM([M,P])

    # Collapse the time axis so the head sees one vector per sequence.
    x = GlobalMaxPool1D()(x)

    x = d1D(x,128)
    x = Dropout(0.5)(x)
    x = d1D(x,128)
    x = Dropout(0.5)(x)
    # Output width follows the config instead of a literal 20, so the class
    # count is defined in one place only (the default Config has clc_num == 20).
    x = Dense(C.clc_num, activation='softmax')(x)

    model = Model(inputs=[M,P],outputs=x)
    return model

In [14]:
import pickle
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# these files if they come from a trusted source.
# `with` closes each file handle as soon as its payload has been read.
with open("train-MSRAfull.pkl", "rb") as train_file:
    Train = pickle.load(train_file)
with open("test-MSRAfull.pkl", "rb") as test_file:
    Test = pickle.load(test_file)

In [15]:
# Build the classifier from the shared config and print its layer summary.
DD_Net = build_DD_Net(C)
DD_Net.summary()

# Restore weights from a local HDF5 file (presumably saved from this same
# architecture -- confirm if the model definition changes).
DD_Net.load_weights('msr-full.h5')

Model: "model_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
M (InputLayer)                  [(None, 32, 190)]    0                                            
__________________________________________________________________________________________________
P (InputLayer)                  [(None, 32, 20, 3)]  0                                            
__________________________________________________________________________________________________
model_2 (Functional)            (None, 4, 512)       1733376     M[0][0]                          
                                                                 P[0][0]                          
__________________________________________________________________________________________________
global_max_pooling1d_1 (GlobalM (None, 512)          0           model_2[0][0]              

In [16]:
# Build the two model inputs for the training split (X_0: joint-distance
# features, X_1: centred poses) together with one-hot targets Y.
X_1 =[]
X_0 =[]
Y =[]
for i in tqdm(range(len(Train['pose']))):
    # np.copy keeps the arrays stored in Train untouched by the in-place steps
    # below. Dimensions come from C rather than literal 20/3 so the config
    # stays the single source of truth.
    p = np.copy(Train['pose'][i]).reshape([-1,C.joint_n,C.joint_d])
    p = zoom(p,target_l=C.frame_l,joints_num=C.joint_n,joints_dim=C.joint_d)
    p = norm_train(p)
    M = get_CG(p,C)

    X_0.append(M)
    X_1.append(p)

    # One-hot target; the stored label is shifted by -1 to get the class index
    # (presumably labels are 1-based -- verify against the dataset).
    label = np.zeros(C.clc_num)
    label[Train['label'][i]-1] = 1
    Y.append(label)

X_0 = np.stack(X_0)
X_1 = np.stack(X_1)
Y = np.stack(Y)

# Fail early if the features do not match the model's input layers.
assert X_0.shape[1:] == (C.frame_l, C.feat_d), X_0.shape
assert X_1.shape[1:] == (C.frame_l, C.joint_n, C.joint_d), X_1.shape
assert len(X_0) == len(X_1) == len(Y)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 284/284 [00:02<00:00, 141.15it/s]


In [17]:
# Build the two model inputs for the test split (X_test_0: joint-distance
# features, X_test_1: centred poses) together with one-hot targets Y_test.
X_test_0 = []
X_test_1 = []
Y_test = []
for i in tqdm(range(len(Test['pose']))):
    # np.copy keeps the arrays stored in Test untouched by the in-place steps
    # below. Dimensions come from C rather than literal 20/3 so the config
    # stays the single source of truth.
    p = np.copy(Test['pose'][i]).reshape([-1,C.joint_n,C.joint_d])
    p = zoom(p,target_l=C.frame_l,joints_num=C.joint_n,joints_dim=C.joint_d)
    p = norm_train(p)
    M = get_CG(p,C)
    X_test_0.append(M)
    X_test_1.append(p)

    # One-hot target; the stored label is shifted by -1 to get the class index
    # (presumably labels are 1-based -- verify against the dataset).
    label = np.zeros(C.clc_num)
    label[Test['label'][i]-1] = 1
    Y_test.append(label)

X_test_0 = np.stack(X_test_0)
X_test_1 = np.stack(X_test_1)
Y_test = np.stack(Y_test)

# Fail early if the features do not match the model's input layers.
assert X_test_0.shape[1:] == (C.frame_l, C.feat_d), X_test_0.shape
assert X_test_1.shape[1:] == (C.frame_l, C.joint_n, C.joint_d), X_test_1.shape
assert len(X_test_0) == len(X_test_1) == len(Y_test)

100%|██████████████████████████████████████████████████████████████████████████████████████████| 283/283 [00:01<00:00, 143.11it/s]


In [22]:
%%time
# Run inference on the whole test split. The returned array (one row of
# softmax class scores per sample) is displayed as the cell output; it is not
# stored in a variable.
DD_Net.predict([X_test_0, X_test_1])

Wall time: 296 ms


array([[9.1901278e-01, 4.3546259e-03, 1.1197896e-03, ..., 2.4988432e-03,
        4.6630856e-03, 8.7476347e-04],
       [1.8763350e-02, 4.3924125e-03, 1.0850975e-02, ..., 1.0427498e-01,
        2.0046495e-02, 9.5603261e-03],
       [9.8102552e-01, 1.1803084e-03, 1.3193022e-04, ..., 4.6765321e-04,
        8.4972644e-04, 3.6899225e-04],
       ...,
       [2.2094119e-04, 9.4077346e-04, 5.8280741e-04, ..., 8.5396208e-03,
        1.5819821e-03, 9.7802907e-01],
       [8.9630258e-04, 1.1736358e-03, 3.3459577e-04, ..., 4.7978181e-03,
        2.0080572e-03, 9.7634965e-01],
       [1.1246058e-02, 1.1092593e-02, 1.4209203e-02, ..., 2.0558046e-02,
        6.1133686e-02, 4.5707768e-01]], dtype=float32)