In [32]:
import torch
from movinets import MoViNet
from movinets.config import _C
import numpy as np

num_classes = 54

model = MoViNet(_C.MODEL.MoViNetA2, causal = False, pretrained = False )
model.classifier[3] = torch.nn.Conv3d(2048, 14, (1,1,1))
model.load_state_dict(torch.load('fine14_iter18_a2.pth', map_location=torch.device('cpu')))
# movinet_iter5_a2 현재 sota

model.eval()

#ncthw
data = torch.zeros((4, 3, 50, 224, 224))

# onnx float32
torch.onnx.export(model,
                  data, 
                  "fine14_iter18_a2.onnx", 
                  export_params=True,
                  opset_version=13, 
                  do_constant_folding=False,
                  input_names = ['Input'],
                  output_names = ['Output'],
                  dynamic_axes= {'Input' : {0 : 'batch_size'},
                                 'Output' : {0 : 'batch_size'}}
                 )

In [33]:
import cv2
import mediapy as media

def load_video(file_path, image_size=(224, 224), original_fps=30, new_fps=5, start_time=None, end_time=None, gray=False):
    """Loads a video file into a TF tensor."""
    cap = cv2.VideoCapture(file_path)
    
    frameCount = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
    frameWidth = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH))
    frameHeight = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
    fc = 0
    ret = True

    if start_time and end_time:
        start_frame = original_fps * start_time
        end_frame = original_fps * end_time
    else:
        start_frame = 0
        end_frame = frameCount
    
    fps_factor = original_fps / new_fps
    now_frame = 0
    if gray:
        buf = np.zeros((int((end_frame - start_frame) / fps_factor), image_size[1], image_size[0]), np.dtype('uint8'))
    else:
        buf = np.zeros((int((end_frame - start_frame) / fps_factor), image_size[1], image_size[0], 3), np.dtype('uint8'))
    
    while (fc < frameCount  and ret):
        ret, tmp = cap.read()
        now_frame += 1
        if start_frame > now_frame:
            continue
        if end_frame < now_frame:
            break
        if now_frame % fps_factor == 0:
            tmp = cv2.resize(tmp, dsize=image_size)
            if gray:
                buf[fc] = cv2.cvtColor(tmp, cv2.COLOR_BGR2GRAY)
            else:
                buf[fc] = cv2.cvtColor(tmp, cv2.COLOR_BGR2RGB)
            fc += 1
    cap.release()
    
    return buf

In [34]:
import glob
paths = glob.glob('/home/workspaces/mark/Video_Data/kinetics-datasets-downloader/downloader/dataset/val/sample/adjusting_glasses/*.mp4')

for path in paths[:1]:
    video = load_video(path, image_size=(224, 224))
    media.show_video(video, fps=5)
    print(video.shape)

0
This browser does not support the video tag.


(50, 224, 224, 3)


In [35]:
def preprocess_video(path, image_size=(224,224), frame_num=50, dtype=np.float32):
    # thwc
    video = load_video(path, image_size=image_size)
    
    # set t=frame_num
    t,h,w,c = video.shape
    if t < frame_num:
        fill_n = frame_num - t
        video = np.concatenate([video, torch.zeros((fill_n, h, w, c), dtype=torch.uint8)], axis=0)
    elif t > frame_num:
        video = video[:frame_num]
    
    # cthw
    video = np.transpose(video, (3,0,1,2))
    
    # ncthw
    video = np.expand_dims(video, axis=0)
    
    video = video.astype(dtype)
    if dtype == np.float32:
        video /= 255
    
    return video

import os
def preprocess_func(video_names, image_size=(224,224)):
    
    video_list = []
    for video_name in video_names:
        video = preprocess_video(video_name, image_size)
        video_list.append(video)
    
    video_list = np.concatenate(np.expand_dims(video_list, axis=0), axis=0)
    return video_list

In [5]:
from onnxruntime.quantization import quantize_static, CalibrationDataReader, QuantType
class MovinetDataReader(CalibrationDataReader):
    def __init__(self, video_names, ncthw_data_list=None):
        self.video_names = video_names
        self.preprocess_flag = True
        self.enum_data_dicts = []
        self.datasize = 0
        self.ncthw_data_list = ncthw_data_list
        
    def get_next(self):
        if self.preprocess_flag:
            self.preprocess_flag = False
            if not self.ncthw_data_list:
                self.ncthw_data_list = preprocess_func(self.video_names)
            self.datasize = len(self.ncthw_data_list)
            self.enum_data_dicts = iter([{'Input': ncthw_data} for ncthw_data in self.ncthw_data_list])
        return next(self.enum_data_dicts, None)

In [23]:
video_names = glob.glob("/home/workspaces/mark/Video_Data/kinetics-datasets-downloader/downloader/dataset/val/sample/**/[0-9]*.mp4")
# ncthw_data_list = preprocess_func(video_names)

In [24]:
# float 16 quantization
import onnx
from onnxmltools.utils.float16_converter import convert_float_to_float16_model_path
new_onnx_model = convert_float_to_float16_model_path('fine_iter2_a2.onnx')
onnx.save(new_onnx_model, 'fine_iter2_a2_f16.onnx')

In [25]:
# uint 8 quantization
dr = MovinetDataReader(video_names)

quantize_static('fine_iter2_a2.onnx',
                'fine_iter2_a2_uint8.onnx',
                dr,
                activation_type=QuantType.QUInt8,
                weight_type=QuantType.QUInt8)


In [41]:
import onnxruntime

session_float32 = onnxruntime.InferenceSession("fine14_iter18_a2.onnx", providers=["CPUExecutionProvider"])
# session_float16 = onnxruntime.InferenceSession("fine_iter2_a2_f16.onnx", providers=["CPUExecutionProvider"])
# session_uint8 = onnxruntime.InferenceSession("fine_iter2_a2_uint8.onnx", providers=["CPUExecutionProvider"])
#best-quant-dynamic.onnx

input_name = session_float32.get_inputs()[0].name
output_name = session_float32.get_outputs()[0].name

print(input_name, output_name)

Input Output


In [69]:
img = cv2.imread('cap_8da8ccf3667aeb0.png') #hwc
img = cv2.resize(img, (224, 224))

img = np.transpose(img, (2,0,1)) #chw
img = np.expand_dims(img, axis=0) # nchw
img = img[:, :, None, :, :] # nc1hw
img = np.repeat(img, 50, axis=2) #ncthw

input_data = img/255
input_data = input_data.astype(np.float32)

out = session_float32.run([output_name], {input_name: input_data})
out

[array([[-1.8742428 , -0.84161174,  0.25327837, -2.1319807 , -2.3982642 ,
         -1.0563264 , -0.5731517 , -2.341021  , -1.4404964 , -1.7678291 ,
         -1.682383  , -2.170051  , -0.310915  , -3.2597356 ]],
       dtype=float32)]

In [71]:
input_data.ravel()[0:10]

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 0.9843137 , 0.93333334, 0.9372549 ],
      dtype=float32)

In [39]:
out = session_float32.run([output_name], {input_name: input_data})

In [30]:
import json

# with open("54classes.json", 'r') as f:
with open("14classes.json", 'r') as f:
    classes = json.load(f)
classes_dict = classes
# for key, value in classes.items():
#     classes_dict[value] = key

def log_softmax(x):
    e_x = np.exp(x - np.max(x))
    return np.log(e_x / e_x.sum())

def print_topk(out):
    # ados_dict = {0: 'abnormal', 1: 'normal'}
    abnormal_list = ["texting", "drawing", "taking_photo"]
    doubt_list = ["calling"]

    # abnormal_list = ['그림그리기', '글씨쓰기', '사진찍기']
    # doubt_list = ['계산기사용(실제계산기or휴대폰계산기)', '통화하기(이어폰or핸드폰을귀에대고)', '휴대폰문자하기']
    
    a = [log_softmax(o) for o in out[0]]
    b = np.argsort(a, axis=1)

    k = 2
    for i in range(len(b)):
        topk = [classes_dict[str(p)] for p in b[i][-k:][::-1]]
        # topk = [ados_dict[int(p)] for p in b[i][-k:][::-1]]
        print(topk)
        if any([tk in abnormal_list for tk in topk]):
        # if topk[0] == 'abnormal':
            print("pred_: abnormal")
        elif any([tk in doubt_list for tk in topk]):
            print("pred_: doubt")
        else:
            print("pred_: normal")


# paths = glob.glob('/home/workspaces/mark/Video_Data/kinetics-datasets-downloader/downloader/dataset/val/normal/rolling_eyes/*.mp4')
paths = glob.glob('/home/workspaces/datasets/ados_act/ados/all/**/*.mp4')

import random

for path in random.sample(paths, 30):

    video = load_video(path, image_size=(224, 224))
    media.show_video(video, fps=5)
    
    input_data = preprocess_video(path, dtype=np.float32)
    # input_data2 = preprocess_video(paths[0], dtype=np.uint8)

    out = session_float32.run([output_name], {input_name: input_data})
    # out2 = session_float16.run([output_name], {input_name: input_data.astype(np.float16)})
    # out3 = session_uint8.run([output_name], {input_name: input_data})

    print("result of float32:")
    # print(out[0][0][:10])
    print_topk(out)
    # print("result of float16:")
    # print(out2[0][0][:10])
    # print_topk(out2)
    # print("result of uint8:")
    # print(out3[0][0][:10])
    # print_topk(out3)
    print("\n")
    # break

0
This browser does not support the video tag.


result of float32:
['resting_chin', 'normal']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['yawning', 'drinking']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['taking_photo', 'drinking']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['stretching', 'eating']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['drinking', 'eating']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['fixing_hair', 'stretching']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['stretching', 'eating']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['taking_photo', 'calling']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['texting', 'calling']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['taking_photo', 'drinking']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['calling', 'fixing_hair']
pred_: doubt




0
This browser does not support the video tag.


result of float32:
['taking_photo', 'drinking']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['reading', 'stretching']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['resting_chin', 'normal']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['texting', 'eating']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['drinking', 'yawning']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['drawing', 'stretching']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['fixing_hair', 'stretching']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['calling', 'eating']
pred_: doubt




0
This browser does not support the video tag.


result of float32:
['calling', 'normal']
pred_: doubt




0
This browser does not support the video tag.


result of float32:
['texting', 'normal']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['moving_eye', 'calling']
pred_: doubt




0
This browser does not support the video tag.


result of float32:
['fixing_hair', 'taking_photo']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['eating', 'yawning']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['moving_eye', 'yawning']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['moving_eye', 'yawning']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['moving_eye', 'drawing']
pred_: abnormal




0
This browser does not support the video tag.


result of float32:
['fixing_hair', 'stretching']
pred_: normal




0
This browser does not support the video tag.


result of float32:
['resting_chin', 'calling']
pred_: doubt




0
This browser does not support the video tag.


result of float32:
['texting', 'calling']
pred_: abnormal




In [38]:
a = torch.from_numpy(out[0])

In [15]:
classes_dict['11']

'taking_photo'

In [10]:
classes

{'0': 'adjusting_glasses',
 '1': 'calling',
 '2': 'drawing',
 '3': 'drinking',
 '4': 'eating',
 '5': 'fixing_hair',
 '6': 'moving_eye',
 '7': 'normal',
 '8': 'reading',
 '9': 'resting_chin',
 '10': 'stretching',
 '11': 'taking_photo',
 '12': 'texting',
 '13': 'yawning'}

In [31]:
"{'adjusting_glasses': 0, 'calling': 1, 'drawing': 2, 'drinking': 3, 'eating': 4, 'fixing_hair': 5, 'moving_eye': 6, 'normal': 7, 'reading': 8, 'resting_chin': 9, 'stretching': 10, 'taking_photo': 11, 'texting': 12, 'yawning': 13}".replace("'", '"')

'{"adjusting_glasses": 0, "calling": 1, "drawing": 2, "drinking": 3, "eating": 4, "fixing_hair": 5, "moving_eye": 6, "normal": 7, "reading": 8, "resting_chin": 9, "stretching": 10, "taking_photo": 11, "texting": 12, "yawning": 13}'