In [None]:
import cv2
import numpy as np
import onnxruntime
import json
import time

def preprocess_video(video, image_size=(224,224), frame_num=50, dtype=np.float32):
    """Turn a clip of frames into a single-item model input batch.

    Args:
        video: array-like indexed as (t, h, w, c), i.e. frames first and
            channels last.
        image_size: currently unused; kept so existing callers keep working.
        frame_num: number of frames the output must contain. Shorter clips are
            zero-padded at the end, longer clips are truncated.
        dtype: dtype of the returned array. Values are divided by 255 only
            when this is ``np.float32``; any other dtype is a plain cast.

    Returns:
        A new array of shape (1, c, frame_num, h, w) with the requested dtype.
    """
    video = np.asarray(video)

    # Force the time axis to exactly `frame_num` frames.
    t, h, w, c = video.shape
    if t < frame_num:
        # Pad with black frames in the clip's own dtype so the concatenation
        # does not silently promote the data.
        padding = np.zeros((frame_num - t, h, w, c), dtype=video.dtype)
        video = np.concatenate([video, padding], axis=0)
    elif t > frame_num:
        video = video[:frame_num]

    # (t, h, w, c) -> (c, t, h, w)
    video = np.transpose(video, (3, 0, 1, 2))

    # (c, t, h, w) -> (1, c, t, h, w)
    video = np.expand_dims(video, axis=0)

    # astype() copies, so the in-place division below never touches the
    # caller's array.
    video = video.astype(dtype)
    if dtype == np.float32:
        video /= 255

    return video

def log_softmax(x, axis=None):
    """Return the log of the softmax of ``x``.

    Uses the log-sum-exp form ``(x - max) - log(sum(exp(x - max)))`` so that
    entries far below the maximum yield a large negative number instead of
    ``-inf`` (which is what ``log(exp(...) / sum)`` produces once the
    exponential underflows to zero).

    Args:
        x: array-like of scores.
        axis: axis along which to normalise. ``None`` (the default)
            normalises over all elements, matching the original behaviour.

    Returns:
        Array with the same shape as ``x``.
    """
    x = np.asarray(x)
    # Subtracting the maximum makes the largest exponent exactly 0, so the
    # sum below is always >= 1 and its log is finite.
    shifted = x - np.max(x, axis=axis, keepdims=True)
    log_norm = np.log(np.sum(np.exp(shifted), axis=axis, keepdims=True))
    return shifted - log_norm

def print_topk(out, k=5):
    """Print the top-k class names and a coarse verdict for each prediction.

    For every row of ``out[0]`` the ``k`` highest-scoring classes are printed,
    followed by one line:

    * ``pred_: abnormal`` if any top-k class is in the abnormal set,
    * ``pred_: doubt`` if none is abnormal but one is in the doubt set,
    * ``pred_: normal`` otherwise.

    Args:
        out: sequence whose first element holds one row of class scores per
            sample (presumably the list returned by ``session.run`` - verify
            against the caller).
        k: how many top classes to report per sample.

    Relies on the module-level ``classes_dict`` to translate a class index
    into its name.
    """
    abnormal_labels = {"doing_sudoku", "drawing", "taking_photo", "writing"}
    doubt_labels = {"calligraphy", "calculating", "talking_on_cell_phone", "texting"}

    scores = [log_softmax(o) for o in out[0]]
    # argsort is ascending, so the best classes sit at the end of each row.
    ranking = np.argsort(scores, axis=1)

    for order in ranking:
        # Reverse first, then slice: unlike order[-k:], this also behaves for k == 0.
        topk = [classes_dict[int(p)] for p in order[::-1][:k]]
        print(topk)
        if any(label in abnormal_labels for label in topk):
            print("pred_: abnormal")
        elif any(label in doubt_labels for label in topk):
            print("pred_: doubt")
        else:
            print("pred_: normal")

In [None]:


# Load the label file and invert it: the JSON maps key -> value, while the
# lookup table used at prediction time maps value -> key
# (presumably class index -> class name; confirm against 54classes.json).
with open("54classes.json", 'r') as f:
    classes = json.load(f)
classes_dict = {index: label for label, index in classes.items()}

# Create the ONNX Runtime session on the CPU provider and remember the names
# of the model's first input and first output for later run() calls.
session_float16 = onnxruntime.InferenceSession(
    "movinet_f16.onnx",
    providers=["CPUExecutionProvider"],
)
input_name, output_name = (
    session_float16.get_inputs()[0].name,
    session_float16.get_outputs()[0].name,
)
print(input_name, output_name)


# Camera configuration. These values are requests to the driver, which may
# not honour them exactly.
fps = 5
image_size = (398, 224)  # (width, height)

cap = cv2.VideoCapture(0, cv2.CAP_DSHOW)
if not cap.isOpened():
    # Fail here with a clear message instead of crashing later on a None frame.
    cap.release()
    raise RuntimeError("Could not open camera 0 (cv2.CAP_DSHOW backend)")

cap.set(cv2.CAP_PROP_FPS, fps)
cap.set(cv2.CAP_PROP_FRAME_WIDTH, image_size[0])
cap.set(cv2.CAP_PROP_FRAME_HEIGHT, image_size[1])

# Capture loop: collect `clip_len` frames at roughly `fps` frames per second,
# run the model on the clip, print the prediction, then start the next clip.
# Press 'q' in the preview window to stop.
clip_len = 50    # frames per clip passed to the model
crop_size = 224  # side length of the square crop stored in the buffer
# Horizontal offset that centres the crop in a frame of width image_size[0]
# (87 px for a 398-px-wide frame).
crop_x = (image_size[0] - crop_size) // 2

buf_idx = 0
try:
    while True:
        buf = np.zeros((clip_len, crop_size, crop_size, 3), np.dtype('uint8'))
        prev = time.time()
        toggle_break = False
        while buf_idx < clip_len:
            time_elapsed = time.time() - prev
            # Read on every iteration, even when the frame is then dropped by
            # the rate limiter below.
            res, frame = cap.read()
            if not res or frame is None:
                raise RuntimeError("Failed to read a frame from the camera")

            # Keep only about `fps` frames per second; the rest are discarded.
            if time_elapsed > 1. / fps:
                prev = time.time()
                # frame.shape is (height, width, channels); image_size is (width, height).
                if frame.shape[:2][::-1] != image_size:
                    frame = cv2.resize(frame, dsize=image_size)
                crop = frame[:, crop_x:crop_x + crop_size]
                buf[buf_idx] = cv2.cvtColor(crop, cv2.COLOR_BGR2RGB)
                buf_idx += 1

                cv2.imshow('frame', crop)
                if cv2.waitKey(1) & 0xFF == ord('q'):
                    toggle_break = True
                    break
        else:
            # Runs only when the buffer filled up without the user pressing 'q'.
            input_data = preprocess_video(buf, frame_num=clip_len, dtype=np.float32)
            out2 = session_float16.run([output_name], {input_name: input_data.astype(np.float16)})
            print_topk(out2)
            buf_idx = 0
            # buf[:40] = buf[10:]
            time.sleep(1)
        if toggle_break:
            break
finally:
    # Always close the preview window and free the camera, even on an error
    # or a kernel interrupt.
    cv2.destroyAllWindows()
    cap.release()