In [1]:
import json
from pprint import pprint
import glob
import os
import re
import cv2
import pandas as pd
import numpy as np
import sys
from copy import deepcopy
from PIL import Image
from tkinter import *
import datetime
import time

In [2]:
class Points():
    """Keypoints of a single frame, as read from an OpenPose JSON file.

    Attributes (all assigned in ``__init__``):
        frameNumber: frame number parsed from the JSON file name; -1 until set.
        index: position of the frame in the list built by ``LoadTransition``;
            -1 until set.
        xBody, yBody: coordinates of pose keypoints 4 and 7 (the wrists,
            according to the loader's comments).
        xPalmLeft, yPalmLeft, xPalmRight, yPalmRight: coordinates of hand
            keypoint 0 of each hand.
        lOrientation, rOrientation: cross products between the first fingertip
            (hand keypoint 4) and each of the four other fingertips.
        xLCenter, yLCenter, xRCenter, yRCenter: averaged fingertip position of
            each hand.
        xLeft, xRight, yLeft, yRight: empty lists; nothing in the visible code
            fills them (the loaders use local lists of the same names).
    """

    def __init__(self):
        # Bookkeeping values, -1 meaning "not assigned yet".
        self.frameNumber = self.index = -1

        # Every list attribute needs its own fresh list object.
        for name in ("xLeft", "xRight", "yLeft", "yRight", "xBody", "yBody"):
            setattr(self, name, [])

        # Palm positions (hand keypoint 0).
        for name in ("xPalmLeft", "xPalmRight", "yPalmLeft", "yPalmRight"):
            setattr(self, name, 0)

        # orientation = orientation between thumbs and other fingers
        for name in ("lOrientation", "rOrientation"):
            setattr(self, name, [])

        # Hand centres (average of the fingertips).
        for name in ("xLCenter", "xRCenter", "yLCenter", "yRCenter"):
            setattr(self, name, 0)

In [3]:
def LoadTransition():
    """Load the keypoints of every transition frame from ``../jsons/transitions``.

    Each ``*.json`` file is one OpenPose result. The first run of digits in the
    file name is taken as the frame number (the file name is assumed to encode
    it). Files are processed in sorted name order so the result does not depend
    on the order in which the operating system lists the directory.

    Returns:
        list of Points: one entry per file that contains at least one detected
        person. ``index`` of each entry is its position in the returned list.

    Raises:
        IndexError: if a file name contains no digits.
    """
    trans = []
    for f in sorted(glob.glob("../jsons/transitions/*.json")):
        fName = os.path.basename(f)
        # Raw string: "\d" in a normal string literal is an invalid escape sequence.
        number = int(re.split(r"(\d+)", fName)[1])

        with open(f) as df:
            data = json.load(df)

        # A frame without any detected person has no keypoints to compare; skip it
        # instead of aborting the whole load.
        if not data['people']:
            continue
        person = data['people'][0]
        pose = person['pose_keypoints_2d']
        left = person['hand_left_keypoints_2d']
        right = person['hand_right_keypoints_2d']

        point = Points()
        # frameNumber is the frame's position in the video; index is its position in
        # this list, kept separately because later stages reorder candidates.
        point.frameNumber = number
        point.index = len(trans)

        # get the wrist keypoint (index 4 and 7); every keypoint is stored as x, y, c
        point.xBody = [pose[4 * 3], pose[7 * 3]]
        point.yBody = [pose[4 * 3 + 1], pose[7 * 3 + 1]]

        # get the palm center data from hand keypoint 0
        point.xPalmLeft = left[0]
        point.yPalmLeft = left[1]
        point.xPalmRight = right[0]
        point.yPalmRight = right[1]

        # get the fingertip positions (hand keypoints 4, 8, 12, 16, 20)
        tips = range(4, 21, 4)
        xLeft = [left[j * 3] for j in tips]
        yLeft = [left[j * 3 + 1] for j in tips]
        xRight = [right[j * 3] for j in tips]
        yRight = [right[j * 3 + 1] for j in tips]

        # Fingertip orientation: z component of the cross product between the first
        # fingertip and each other fingertip. Written out explicitly because
        # np.cross on 2-element vectors is deprecated in NumPy 2.0.
        for j in range(1, 5):
            point.lOrientation.append(xLeft[0] * yLeft[j] - yLeft[0] * xLeft[j])
            point.rOrientation.append(xRight[0] * yRight[j] - yRight[0] * xRight[j])

        # Center is the average of the 5 fingertips of a hand.
        # NOTE(review): set_data divides by the number of non-zero coordinates
        # instead of 5 - confirm which of the two is intended.
        point.xLCenter = np.sum(xLeft) / 5
        point.xRCenter = np.sum(xRight) / 5
        point.yLCenter = np.sum(yLeft) / 5
        point.yRCenter = np.sum(yRight) / 5

        trans.append(point)
    return trans

In [5]:
# Module-level state shared by the transition search:
#   temp       - frames collected by divide_left / divide_right for one word gap
#   transition - one list of collected frames per word gap, appended to by set_data
temp, transition = [], []

In [6]:
class Distance(object):
    """Score of one candidate transition frame, as produced by ``distance()``.

    Attributes:
        distance: weighted distance between the candidate and the context frames.
        From: frame number of the candidate; -1 until set.
        index: position of the candidate in the candidate list; -1 until set.
        palm_left, palm_right: palm distances of the last comparison made for
            the candidate.
    """

    def __init__(self):
        self.distance = 0
        self.From = -1
        self.index = -1
        # distance() assigns these on every result; initialising them here means
        # the attributes also exist on a freshly created instance.
        self.palm_left = 0
        self.palm_right = 0

In [7]:
def _l2_norm(dx, dy):
    """Return the Euclidean length of the 2-D vector (dx, dy)."""
    return np.sqrt(np.power(dx, 2) + np.power(dy, 2))


def _inside_window(value, bound_a, bound_b, margin):
    """Return True if value lies between the two bounds, widened by margin on both sides."""
    return min(bound_a, bound_b) - margin <= value <= max(bound_a, bound_b) + margin


def distance(framesA, framesB, framesC):
    """Pick the candidate frame that best fits between two frame sequences.

    Every candidate in ``framesC`` is compared with every context frame in
    ``framesA + framesB``. For each comparison the palm distance (left + right,
    plus a penalty when fingertip orientations disagree) is multiplied by a
    weight and accumulated. The weight starts at 0.7, grows by 0.1 per context
    frame in the first half of the context and shrinks by 0.2 per frame from the
    middle on (0.8, 0.9, 1.0, 0.8, 0.6, 0.4 for six context frames).

    A candidate identical to any context frame (zero palm and wrist distance) is
    discarded. The remaining candidates are sorted by accumulated distance and
    the first one whose hand centres lie inside the trajectory window is
    returned. The window spans the hand centres of the last frame of
    ``framesA`` and the first frame of ``framesB``, widened by 20 pixels.

    Args:
        framesA: list of Points preceding the gap.
        framesB: list of Points following the gap.
        framesC: list of candidate Points.

    Returns:
        Distance for the selected candidate (``index`` is its position in
        ``framesC``), or None if no candidate qualifies or either context
        list is empty.
    """
    # Without frames on both sides there is no boundary to bridge.
    if not framesA or not framesB:
        return None

    window_margin = 20          # slack, in pixels, around the trajectory window
    orientation_penalty = 50    # added when more than one fingertip orientation flips

    frames = framesA + framesB
    # The two frames that meet at the boundary. Derived from len(framesA) rather
    # than the fixed indices 2 and 3, so the window stays correct when framesA
    # does not hold exactly three frames.
    last_a = frames[len(framesA) - 1]
    first_b = frames[len(framesA)]
    half = int(len(frames) / 2)

    arr_distance = []
    for i, cand in enumerate(framesC):
        const = 0.7     # initial weight
        ctrConst = 0.1  # step applied to the weight before each comparison
        temp_distance = 0
        is_duplicate = False
        for j, ref in enumerate(frames):
            # L2 distance of the wrists.
            # NOTE(review): this only feeds the duplicate check below and is not
            # part of the weighted score - confirm that is intended.
            wrist_distance = 0
            for k in range(len(cand.xBody)):
                wrist_distance += _l2_norm(cand.xBody[k] - ref.xBody[k],
                                           cand.yBody[k] - ref.yBody[k])

            # L2 distance of the palm centres.
            palm_left = _l2_norm(cand.xPalmLeft - ref.xPalmLeft,
                                 cand.yPalmLeft - ref.yPalmLeft)
            palm_right = _l2_norm(cand.xPalmRight - ref.xPalmRight,
                                  cand.yPalmRight - ref.yPalmRight)

            # Same frame as a context frame: the candidate is dropped, so the rest
            # of the context does not need to be scored.
            if palm_left == 0.0 and palm_right == 0.0 and wrist_distance == 0.0:
                is_duplicate = True
                break

            palm_distance = palm_left + palm_right

            # Count fingertip orientations whose sign differs between the two frames.
            flips = 0
            for k in range(4):
                if cand.lOrientation[k] * ref.lOrientation[k] < 0:
                    flips += 1
                if cand.rOrientation[k] * ref.rOrientation[k] < 0:
                    flips += 1
            if flips > 1:
                palm_distance += orientation_penalty

            # From the middle of the context onwards the weight decreases.
            if j >= half:
                ctrConst = -0.2
            const += ctrConst

            temp_distance += const * palm_distance

        if is_duplicate:
            continue

        entry = Distance()
        entry.From = cand.frameNumber
        entry.distance = temp_distance
        # Palm distances of the last context frame compared.
        entry.palm_left = palm_left
        entry.palm_right = palm_right
        entry.index = i
        arr_distance.append(entry)

    # Closest candidates first (stable sort, ties keep candidate order).
    arr_distance.sort(key=lambda x: x.distance, reverse=False)

    # Return the closest candidate whose hand centres satisfy the trajectory window.
    for entry in arr_distance:
        cand = framesC[entry.index]
        if (_inside_window(cand.xLCenter, last_a.xLCenter, first_b.xLCenter, window_margin)
                and _inside_window(cand.yLCenter, last_a.yLCenter, first_b.yLCenter, window_margin)
                and _inside_window(cand.xRCenter, last_a.xRCenter, first_b.xRCenter, window_margin)
                and _inside_window(cand.yRCenter, last_a.yRCenter, first_b.yRCenter, window_margin)):
            return entry
    return None


In [8]:
def trans_smoothing(frameA, frameB, weightA):
    """Cross-fade two frames.

    Args:
        frameA: first image.
        frameB: second image.
        weightA: weight given to ``frameA``; ``frameB`` receives ``1.0 - weightA``.

    Returns:
        The blended image produced by ``cv2.addWeighted``.
    """
    weight_b = 1.0 - weightA
    return cv2.addWeighted(frameA, weightA, frameB, weight_b, 0)

In [9]:
def divide_left(frameA, frameB, frameT, threshold):
    """Grow the transition sequence leftwards, starting from the frames after the gap.

    Repeatedly asks ``distance()`` for the best candidate in ``frameT``. An
    accepted candidate is inserted at the front of the global ``temp`` list,
    removed from ``frameT`` (which is therefore modified in place) and pushed
    onto the front of a copy of the following context, whose last frame is
    dropped. The threshold is lowered by 100 after every accepted frame.

    The search stops when ``distance()`` returns nothing or when the best
    distance is not below the threshold. A threshold of exactly 1000 is special:
    the candidate is accepted anyway and its distance becomes the new threshold.

    NOTE(review): the 1000 check is applied on every round, so a threshold that
    reaches 1000 after a decrement also triggers it - confirm this is intended.

    Args:
        frameA: context frames before the gap (left untouched).
        frameB: context frames after the gap (left untouched; copies are shifted).
        frameT: candidate frames; accepted candidates are popped from this list.
        threshold: maximum acceptable distance for the next candidate.
    """
    global temp
    following = frameB
    while True:
        best = distance(frameA, following, frameT)
        if not best:
            return

        if best.distance >= threshold:
            if threshold != 1000:
                return
            threshold = best.distance

        picked = frameT[best.index]

        # Shift the following context: the picked frame enters at the front and
        # the last frame falls out.
        shifted = deepcopy(following)
        shifted.insert(0, deepcopy(picked))
        shifted.pop()

        # temp is the global list that stores the transition sequence.
        temp.insert(0, picked)
        frameT.pop(best.index)

        following = shifted
        threshold = threshold - 100

In [10]:
def divide_right(frameA,frameB,frameT, threshold):
    """Grow the transition sequence rightwards, starting from the frames before the gap.

    Repeatedly asks ``distance()`` for the best candidate in ``frameT``. An
    accepted candidate is appended to the global ``temp`` list, removed from
    ``frameT`` (which is therefore modified in place) and appended to a copy of
    the preceding context, whose first frame is dropped. The threshold is
    lowered by 120 after every accepted frame.

    The search stops when ``distance()`` returns nothing or when the best
    distance is not below the threshold. A threshold of exactly 1000 is special:
    the candidate is accepted anyway and its distance becomes the new threshold.

    NOTE(review): the 1000 check is applied on every round, so a threshold that
    reaches 1000 after a decrement also triggers it - confirm this is intended.

    Args:
        frameA: context frames before the gap (left untouched; copies are shifted).
        frameB: context frames after the gap (left untouched).
        frameT: candidate frames; accepted candidates are popped from this list.
        threshold: maximum acceptable distance for the next candidate.
    """
    global temp
    preceding = frameA
    while True:
        best = distance(preceding, frameB, frameT)
        if not best:
            return

        if best.distance >= threshold:
            if threshold != 1000:
                return
            threshold = best.distance

        picked = frameT[best.index]

        # Shift the preceding context: the picked frame enters at the end and the
        # first frame falls out.
        shifted = deepcopy(preceding)
        shifted.append(deepcopy(picked))
        shifted.pop(0)

        # temp is the global list that stores the transition sequence.
        temp.append(picked)
        frameT.pop(best.index)

        preceding = shifted
        threshold = threshold - 120

R :  53.152127197451115
R :  53.68161060096693
R :  62.17241731286414
R :  63.13998703209069
R :  48.942293057706124
R :  35.5034735443986
R :  33.38878473335145
R :  38.92210931101864
R :  38.31790376278826

In [11]:
def readVideo(vid):
    """Read all frames of an opened video capture and return them cropped.

    Every frame is cropped to rows 0-799 and columns 500-1399, so the source
    video must be at least 1400 pixels wide and 800 pixels high. The capture is
    released before returning, also when reading fails.

    Args:
        vid: an opened ``cv2.VideoCapture``.

    Returns:
        uint8 array of shape (frames_read, 800, 900, 3). ``frames_read`` can be
        smaller than the frame count reported by the container, which is only
        an estimate for some formats.
    """
    cap = vid
    fc = 0
    try:
        # Some backends report a non-positive count when it is unknown.
        frameCount = max(int(cap.get(cv2.CAP_PROP_FRAME_COUNT)), 0)
        buf = np.empty((frameCount, 800, 900, 3), np.dtype('uint8'))

        while fc < frameCount:
            ret, frame = cap.read()
            # read() returns (False, None) once no more frames can be decoded;
            # stop before trying to slice the missing frame.
            if not ret:
                break
            # crop the frame. for x from 500 to 1400, y from 0 to 800
            buf[fc] = frame[0:800, 500:1400]
            fc += 1
    finally:
        cap.release()

    # Only hand back the slots that were actually filled.
    return buf[:fc]

In [12]:
def readLargeVideo(vid, trans_idx):
    """Read only the requested frames of a video, cropped.

    Used for the transition video, which is too large to keep every frame in
    memory. The video is decoded sequentially from the start and decoding stops
    once the highest requested frame has been read. Every kept frame is cropped
    to rows 0-799 and columns 500-1399. The capture is released before
    returning, also when reading fails.

    Args:
        vid: an opened ``cv2.VideoCapture``.
        trans_idx: sequence of objects with a ``frameNumber`` attribute (the
            0-based position of the wanted frame in the video).

    Returns:
        uint8 array of shape (len(trans_idx), 800, 900, 3); entry ``i`` holds the
        frame ``trans_idx[i].frameNumber``. Entries whose frame could not be read
        stay black.
    """
    cap = vid
    # zeros instead of empty: a slot that is never filled is a black frame rather
    # than uninitialised memory.
    buf = np.zeros((len(trans_idx), 800, 900, 3), np.dtype('uint8'))
    try:
        # frame number -> output slots that want it (a frame may be requested twice)
        slots = {}
        for i, entry in enumerate(trans_idx):
            slots.setdefault(entry.frameNumber, []).append(i)

        if slots:
            frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
            last_needed = max(slots)
            fc = 0
            while fc < frameCount and fc <= last_needed:
                ret, frame = cap.read()
                # read() returns (False, None) once no more frames can be decoded.
                if not ret:
                    break
                for i in slots.get(fc, ()):
                    buf[i] = frame[0:800, 500:1400]
                fc += 1
    finally:
        cap.release()
    return buf

In [13]:
def _load_gloss_frame(path, number):
    """Read one OpenPose keypoint JSON of a gloss video into a Points object."""
    with open(path) as df:
        person = json.load(df)['people'][0]
    pose = person['pose_keypoints_2d']
    left = person['hand_left_keypoints_2d']
    right = person['hand_right_keypoints_2d']

    # Fingertips are hand keypoints 4, 8, 12, 16, 20; every keypoint is stored as x, y, c.
    tips = range(4, 21, 4)
    xLeft = [left[j * 3] for j in tips]
    yLeft = [left[j * 3 + 1] for j in tips]
    xRight = [right[j * 3] for j in tips]
    yRight = [right[j * 3 + 1] for j in tips]

    point = Points()
    point.frameNumber = number

    # z component of the cross product between the first fingertip and each other
    # fingertip; written out because np.cross on 2-element vectors is deprecated
    # in NumPy 2.0.
    for j in range(1, 5):
        point.lOrientation.append(xLeft[0] * yLeft[j] - yLeft[0] * xLeft[j])
        point.rOrientation.append(xRight[0] * yRight[j] - yRight[0] * xRight[j])

    # Pose keypoints 4 and 7.
    point.xBody = [pose[4 * 3], pose[7 * 3]]
    point.yBody = [pose[4 * 3 + 1], pose[7 * 3 + 1]]

    # Hand keypoint 0 of each hand.
    point.xPalmLeft = left[0]
    point.yPalmLeft = left[1]
    point.xPalmRight = right[0]
    point.yPalmRight = right[1]

    # Hand centre: mean of the non-zero fingertip coordinates.
    # NOTE(review): yields nan when every coordinate of a hand is 0 - confirm how
    # undetected hands should be handled.
    point.xLCenter = np.sum(xLeft) / np.count_nonzero(xLeft)
    point.xRCenter = np.sum(xRight) / np.count_nonzero(xRight)
    point.yLCenter = np.sum(yLeft) / np.count_nonzero(yLeft)
    point.yRCenter = np.sum(yRight) / np.count_nonzero(yRight)
    return point


def set_data():
    """Build the ASL output video for the words typed into the GUI.

    Steps:
      1. Read the threshold and the whitespace-separated words from the widgets.
      2. Load the transition keypoints and, per word, the keypoints of its first
         three and last three frames from ``../jsons/<word>/``.
      3. For every pair of consecutive words, select transition frames with
         ``divide_left`` / ``divide_right`` and store them in the global
         ``transition`` list (one entry per gap).
      4. Read the word videos and the selected transition frames, put a 50/50
         cross-fade in front of every transition frame, and write everything to
         ``outputThesis/output.mp4`` at 10 fps.

    Prints the processing time (measured after the transition keypoints have
    been loaded). Returns None.
    """
    global temp
    threshold = inputSlider.get()
    # split() without an argument ignores repeated / leading / trailing blanks, which
    # would otherwise turn into empty "words".
    input_text = inputText.get().split()
    if not input_text:
        print("No input words given.")
        return

    # Start from a clean state: results of a previous button press would otherwise
    # stay in `transition` and be used for the gaps of this run.
    transition.clear()
    temp = []

    #call the function defined before
    trans = LoadTransition()
    #to measure the time
    start = time.time()
    #define json files basepath
    dir_path = "../jsons/"

    words = []
    for word in input_text:
        word_dir = dir_path + word
        entries = os.listdir(word_dir)  # listed once per word instead of once per file
        total = len(entries)
        # Sorted by frame number: os.listdir returns names in arbitrary order, while
        # the code below relies on the first half of each word being its opening
        # frames and the second half its closing frames.
        numbers = sorted(
            int(re.split(r"(\d+)", os.path.basename(name))[1])
            for name in entries
            if word in name
        )
        #3 is the n from n-frames. get the 3 first and 3 last frames from a gloss. so 1 array contain 6 frames.
        words.append([
            _load_gloss_frame(
                word_dir + "/" + word + "_" + str(number).zfill(12) + "_keypoints.json",
                number,
            )
            for number in numbers
            if number < 3 or number >= total - 3
        ])

    for i in range(0, len(words) - 1):
        half_current = int(len(words[i]) / 2)
        half_next = int(len(words[i + 1]) / 2)
        #arrange frameA and frameB content: closing frames of this word, opening frames of the next
        frameA = [deepcopy(words[i][half_current + j]) for j in range(half_current)]
        frameB = [deepcopy(words[i + 1][j]) for j in range(half_next)]
        #call divide left and divide right; each gets its own copy of the candidates
        divide_left(frameA, frameB, deepcopy(trans), threshold)
        divide_right(frameA, frameB, deepcopy(trans), threshold)
        transition.append(temp.copy())
        temp = []

    #read gloss video and save to array of images
    videos = [readVideo(cv2.VideoCapture('./ASL/' + word + '.mp4')) for word in input_text]

    #read transition video and save to array of images
    written_trans = [
        readLargeVideo(cv2.VideoCapture('./ASL/transitions.mp4'), gap)
        for gap in transition
    ]

    #arrange the output frames from the separated array of frames
    output = []
    for i in range(len(input_text)):
        output.extend(videos[i])
        if i < len(input_text) - 1:
            # Last frame of the word, taken explicitly instead of through a loop
            # variable left over from the previous loop.
            prev = videos[i][-1] if len(videos[i]) else None
            for frame in written_trans[i]:
                if prev is not None:
                    output.append(trans_smoothing(prev, frame, 0.5))
                output.append(frame)
                prev = frame

    if not output:
        print("No frames to write.")
        return

    #write the video
    height, width = output[0].shape[:2]
    # VideoWriter does not create missing directories.
    os.makedirs('outputThesis', exist_ok=True)
    out = cv2.VideoWriter('outputThesis/output.mp4',cv2.VideoWriter_fourcc('F','M','P', '4'), 10, (width,height))
    try:
        for frame in output:
            out.write(frame)
    finally:
        out.release()

    end = time.time()
    print("Time for processing: "+str(end-start))

In [14]:
# Minimal Tk front end: a text entry for the words, a slider for the distance
# threshold and a button that runs set_data.
root = Tk()
root.title("Text to ASL Video")
root.geometry("500x100") #Fixed window size of 500x100 pixels
root.resizable(0, 0) #Don't allow resizing in the x or y direction
inputText = Entry(root)
inputText.pack()
inputSlider = Scale(root, from_=0, to=2000, orient=HORIZONTAL)
# Start at 1000: divide_left / divide_right treat a threshold of exactly 1000 as
# "always accept the best first candidate". At the Scale default of 0 a candidate
# is only accepted when its distance is below 0.
inputSlider.set(1000)
inputSlider.pack()
Button(root, text="Get ASL Result", command=set_data).pack()


# Blocks until the window is closed.
root.mainloop()

Time for processing: 59.02824640274048
