In [6]:
# just in case
!pip install tqdm



In [2]:
# mount google drive
from google.colab import drive
drive.mount('/content/drive/')
# drive.flush_and_unmount()

Mounted at /content/drive/


In [3]:
import keras
from keras.models import load_model
import numpy as np
from sklearn.metrics import accuracy_score

In [5]:
def ExtractFrames(file_path, pos=[0,0.05,0.1,0.15,.02,.25,.30,.35,.40,.45,.50,.55,.60,.65,.70,.75,.80,.85,.9,.95]):
    # Extracts frames from file_path at the positions (relative between 0 and 1) in pos
    
    import os
    
    if not len(pos):
        print("[ExtractFrames]: Invalid positions")
        return None
    
    if not os.path.isfile(file_path) :
        print("[ExtractFrames]: Invalid file path")
        return None
    
    import cv2
    
    # container for frames
    arr = np.empty((len(pos),224,224,3))
    
    cap = cv2.VideoCapture(file_path)
    total_frames = cap.get(cv2.CAP_PROP_FRAME_COUNT)
    
    for k,i in enumerate(pos):
        # get frame number
        position = int(i * total_frames)
        
        # set frame pointer at i and extract frame
        cap.set(cv2.CAP_PROP_POS_FRAMES, i)
        ret, frame = cap.read()
        
        # preprocessing
        frame = cv2.resize(frame, (224,224))
        frame = frame * 1/255.
        frame = np.float32(frame)
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
        
        # insert in container
        arr[k] = frame
        
    # cleanup
    cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
    cap.release()
    
    return arr

In [7]:
# Load our already trained model
model = load_model(r"/content/drive/MyDrive/CSCE636/v5/bidirectionalLSTM_model_V5_lr_0.0005.h5")

In [8]:
# Get frames from our desired test video
TestVideo = ExtractFrames(r"/content/drive/MyDrive/CSCE636/v5/DemoApplyTestVideo_v5.mp4")

In [9]:
# need to reshape input to include sample size.
# in this case sample size is 1
TestVideo = np.expand_dims(TestVideo, axis=0)
TestVideo.shape

(1, 20, 224, 224, 3)

In [10]:
# perform prediction. Note this command works for only a single video
pred = model(TestVideo)

In [11]:
# convert tensor to array
pred = pred.numpy()

In [12]:
# The first index represents shaking hand, the second is not shaking hand
pred

array([[0.9943394 , 0.00566066]], dtype=float32)

In [13]:
index_max = np.argmax(pred, axis=1)
# "Shaking_Hand" - 1, "not Shaking_Hand" - 0
# if argmax is index 0, then it predicted Shaking hand, hence
# assign a 1 or else assign a 0
lookup = {1:0, 0:1}
predicted_labels = np.array([lookup[i] for i in index_max])

In [14]:
# a value of one means we detected the action!
predicted_labels[0]

1