In [1]:
!pip3 install --upgrade onnxruntime

In [2]:
!pip install mxnet-cu110
!pip install gluoncv

In [3]:

import numpy as np
import onnxruntime as rt
import urllib.request
import os.path
from PIL import Image
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model
    



In [4]:

def fetch_model(url="https://apache-mxnet.s3-us-west-2.amazonaws.com/onnx/models/gluoncv-vgg16_ucf101-b8e05551.onnx",
                filename="vgg16_ucf101.onnx"):
    """Download the ONNX model once and return the local file name.

    Args:
        url: Location of the ONNX model file to download.
        filename: Local path the model is stored under.

    Returns:
        ``filename``; the file exists when this function returns normally.

    Raises:
        Whatever ``urllib.request.urlretrieve`` raises when the download fails.
        In that case no file is left behind under ``filename``.
    """
    if not os.path.isfile(filename):
        # Download under a temporary name first: an interrupted transfer must
        # not leave a truncated file that the isfile() check above would
        # accept as a complete model on the next run.
        partial_path = filename + ".part"
        try:
            urllib.request.urlretrieve(url, filename=partial_path)
            os.replace(partial_path, filename)
        finally:
            # After a successful rename the partial file no longer exists;
            # after a failure this removes the incomplete download.
            if os.path.exists(partial_path):
                os.remove(partial_path)
    return filename
    
def prepare_img(img_path, input_shape):
    """Load an image file and return it as a float32 batch of one.

    Args:
        img_path: Path of the image file to open.
        input_shape: Sequence whose index 1 is the target height and index 2
            the target width (laid out as batch, height, width, channels).

    Returns:
        A float32 numpy array holding the RGB image resized to the target
        size, with a leading axis of length 1 added.
    """
    target_h = input_shape[1]
    target_w = input_shape[2]
    # PIL's resize takes (width, height), the reverse of the shape order
    rgb = Image.open(img_path).convert('RGB').resize((target_w, target_h))
    pixels = np.asarray(rgb)
    return pixels[np.newaxis, ...].astype('float32')
    
def prepare_label():
    """Return the ``classes`` attribute of a fresh gluoncv ``UCF101Attr``."""
    # Imported locally so gluoncv is only needed when labels are requested
    from gluoncv.data import UCF101Attr
    ucf101_attr = UCF101Attr()
    return ucf101_attr.classes
    

**Replace the image URL in the next cell with the image you want to classify.**

In [5]:

# Sample image to classify; point `url` at a different image to try your own
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/ThrowDiscus.png'
# The value returned by utils.download is used below as the local image file name
im_fname = utils.download(url)
# File name of the ONNX model (downloaded by fetch_model if not already present)
model = fetch_model()


In [6]:

# Use the file that was just downloaded (`im_fname`) rather than a hard-coded
# "/content/..." path, so the cell works outside Colab as well.
img_path = im_fname
# The shape is given as (batch, height, width, channels); prepare_img resizes
# the image to 224x224 and returns a float32 batch of one.
img = prepare_img(img_path, (1, 224, 224, 3))
# Class names used to turn predicted indices into readable labels
label = prepare_label()
    

In [7]:

# Create an ONNX Runtime inference session and look up the model's input name.
# The execution provider is named explicitly: some ONNX Runtime builds refuse
# to create a session without a `providers` argument, and the CPU provider
# matches the CPU `onnxruntime` package installed at the top of the notebook.
onnx_session = rt.InferenceSession(model, providers=['CPUExecutionProvider'])
input_name = onnx_session.get_inputs()[0].name
    

In [8]:

# Run the model on the prepared batch. No output names are requested
# explicitly, so all outputs come back; the first one is kept as the prediction.
pred = onnx_session.run(None, {input_name: img})[0]
    


(Optional) We use MXNet to post-process the result.

Feel free to process the result in your own way.


In [9]:

import mxnet as mx

# Convert the ONNX Runtime output to an MXNet NDArray for post-processing
pred = mx.nd.array(pred)
topK = 5
# Indices of the topK highest scores for the first item in the batch
ind = mx.nd.topk(pred, k=topK)[0].astype('int')
# The softmax does not depend on the loop variable, so compute it once
prob = mx.nd.softmax(pred)[0]
print('The input is classified to be')
for i in range(topK):
    # Plain Python int so it can index both the label list and the NDArray
    class_id = int(ind[i].asscalar())
    print('\t[%s], with probability %.3f.' %
          (label[class_id], prob[class_id].asscalar()))
    

The input is classified to be
	[ThrowDiscus], with probability 0.999.
	[Hammering], with probability 0.001.
	[TennisSwing], with probability 0.000.
	[VolleyballSpiking], with probability 0.000.
	[Basketball], with probability 0.000.


In [10]:
pip install decord



In [11]:
from decord import VideoReader
from gluoncv import utils

# Download a sample video and open it with decord.
# NOTE(review): `video_fname` and `vr` are both reassigned in a later cell
# before `vr` is read from, so this cell looks like a leftover experiment. Confirm.
url = 'https://raw.githubusercontent.com/overtunned/lane_detection/main/Dataset/Produce.mp4'
video_fname = utils.download(url)
vr = VideoReader(video_fname)

In [12]:
import matplotlib.pyplot as plt
import numpy as np
import mxnet as mx
from mxnet import gluon, nd, image
from mxnet.gluon.data.vision import transforms
from gluoncv.data.transforms import video
from gluoncv import utils
from gluoncv.model_zoo import get_model

In [13]:
from gluoncv.utils.filesystem import try_import_decord

# The object returned by try_import_decord is used below as the decord module
decord = try_import_decord()

# Fetch the sample clip and open it for frame-level access
url = 'https://github.com/bryanyzhu/tiny-ucf101/raw/master/abseiling_k400.mp4'
video_fname = utils.download(url)
vr = decord.VideoReader(video_fname)

# Two index sets over frames 0..63: every 2nd frame and every 16th frame
fast_frame_id_list = range(0, 64, 2)
slow_frame_id_list = range(0, 64, 16)
frame_id_list = [*fast_frame_id_list, *slow_frame_id_list]

# Read the selected frames in one batch, then split them into a per-frame list
video_data = vr.get_batch(frame_id_list).asnumpy()
clip_input = [video_data[vid] for vid in range(len(frame_id_list))]

In [14]:
# Validation-time transform with size 224 and per-channel mean/std normalisation
transform_fn = video.VideoGroupValTransform(
    size=224,
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)
# Stack the transformed frames into a single array
clip_input = np.stack(transform_fn(clip_input), axis=0)
# Reshape to (batch, 36, 3, 224, 224), then swap axes 1 and 2 to get
# (batch, 3, 36, 224, 224). 36 is the number of frame ids sampled earlier
# (32 fast + 4 slow).
clip_input = clip_input.reshape(-1, 36, 3, 224, 224).transpose(0, 2, 1, 3, 4)
print('Video data is downloaded and preprocessed.')

Video data is downloaded and preprocessed.


In [15]:
# Name of the model requested from the GluonCV model zoo
model_name = 'slowfast_4x16_resnet50_kinetics400'
# pretrained=True asks get_model for pretrained weights; nclass is set to 400
net = get_model(
    model_name,
    nclass=400,
    pretrained=True,
)
print('%s model is successfully loaded.' % (model_name,))

slowfast_4x16_resnet50_kinetics400 model is successfully loaded.


In [16]:
# Forward the preprocessed clip through the network
pred = net(nd.array(clip_input))

classes = net.classes
topK = 5
# Indices of the topK highest scores for the first item in the batch
ind = nd.topk(pred, k=topK)[0].astype('int')
# The softmax does not depend on the loop variable, so compute it once
prob = nd.softmax(pred)[0]
print('The input video clip is classified to be')
for i in range(topK):
    # Plain Python int so it can index both the class list and the NDArray
    class_id = int(ind[i].asscalar())
    print('\t[%s], with probability %.3f.' %
          (classes[class_id], prob[class_id].asscalar()))

The input video clip is classified to be
	[abseiling], with probability 0.996.
	[rock_climbing], with probability 0.004.
	[ice_climbing], with probability 0.000.
	[paragliding], with probability 0.000.
	[climbing_a_rope], with probability 0.000.
