# Test cnn model

> Conv net, trained with fastai running in onnx.

In [1]:
from expoco.core import *
from expoco.viseme_image.data import *
from expoco.viseme_image.model import *
import numpy as np
from zipfile import ZipFile
from pathlib import Path
import cv2, time, math, json, shutil
import win32api, win32con
import onnx, onnxruntime

# Load model and data

In [2]:
model_path = Path('data/viseme_image_dataset_20220202_131034/model_20220202_134036/resnet_3_256_256.onnx')
dataset_path = model_path.parent.parent

In [3]:
onnx_model = onnx.load(model_path)
onnx.checker.check_model(onnx_model)

In [4]:
x = np.random.randn(1, 3, 256, 256).astype(np.float32)
ort_session = onnxruntime.InferenceSession(str(model_path))
# compute ONNX Runtime output prediction
ort_inputs = {ort_session.get_inputs()[0].name: x}
ort_outs = ort_session.run(None, ort_inputs)
ort_outs

[array([[2.8649325e-09, 3.4617631e-10, 2.3448924e-19, 1.0000000e+00]],
       dtype=float32)]

# make sure we get the right results with images we trained on

In [5]:
win32api.GetAsyncKeyState(win32con.VK_ESCAPE)
viseme_classifier = VisemeClassifier(model_path)
image_helper = ImageHelper()
image_display_helper = ImageDisplayHelper(np.zeros([1,1]), 'expoco: test')
vocab = ['AH', 'EE', 'NO_EXPRESSION', 'OO']
total, correct = 0, 0
with ZipFile(dataset_path/'data.zip') as zip_file:
    name_list = [n for n in zip_file.namelist() if n not in ['AH/', 'EE/', 'NO_EXPRESSION/', 'OO/']]
    for i in range(0, len(name_list), 100):
        total+=1
        actual = name_list[i].split('/')[0]
        zip_file.extract(name_list[i], '/temp')
        raw_image = cv2.imread(f'/temp/{name_list[i]}')
        class_name = viseme_classifier.predict([raw_image])[0]
        if actual == class_name:
            correct += 1
        else:
            image_display_helper.show(
                cv2.putText(raw_image, f'act:{actual} pred:{class_name}', (5,15), fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=.5, color=255))
            while True:
                time.sleep(.25)
                if win32api.GetAsyncKeyState(win32con.VK_ESCAPE):
                    image_display_helper.show(raw_image/2)
                    break
print('done total', total, 'correct', correct, 'acc', correct/total)

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x…

done total 100 correct 100 acc 1.0


# How long will inference take

worst case would be running inference one sample at a time ...

In [6]:
ort_session = onnxruntime.InferenceSession(str(model_path))
one_row_x = np.random.randn(1, 3, 256, 256).astype(np.float32)
ort_inputs = {ort_session.get_inputs()[0].name: one_row_x}

In [7]:
start_time, how_many = time.time(), 50
for i in range(how_many):
    ort_outs = ort_session.run(None, ort_inputs)
print(how_many, 'calls with', one_row_x.shape[0], 'rows took', time.time()-start_time, 'seconds')

50 calls with 1 rows took 2.1565802097320557 seconds


&uarr; quick (o:

how much quicker would processing 2 samples at a time be?

In [8]:
two_rows_x = np.random.randn(2, 3, 256, 256).astype(np.float32)
ort_inputs = {ort_session.get_inputs()[0].name: one_row_x}

In [9]:
start_time, how_many = time.time(), 50
for i in range(how_many):
    ort_outs = ort_session.run(None, ort_inputs)
print(how_many, 'calls with', two_rows_x.shape[0], 'rows took', time.time()-start_time, 'seconds')

50 calls with 2 rows took 2.0057129859924316 seconds


&uarr; similar time for 2x as many preds

# Live test

In [10]:
def _putText(image, text, org):
    cv2.putText(image, text, org, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=.75, color=(0,0,0), thickness=2)
    cv2.putText(image, text, org, fontFace=cv2.FONT_HERSHEY_COMPLEX, fontScale=.75, color=(255,255,255), thickness=1)
    return image

In [13]:
def live_test():
    win32api.GetAsyncKeyState(win32con.VK_ESCAPE)
    video_capture = cv2.VideoCapture(0)
    ort_session = onnxruntime.InferenceSession(str(model_path))
    image_helper = VisemeClassifierImageHelper()
    image_display_helper = ImageDisplayHelper(np.zeros([1,1]), 'expoco: Capture session')
    vocab = ['AH', 'EE', 'NO_EXPRESSION', 'OO']
    try:
        _, _ = video_capture.read()
        while True:
            if win32api.GetAsyncKeyState(win32con.VK_ESCAPE): 
                break
            retval, image = video_capture.read()
            raw_image = image
            image = image = image_helper.prepare_for_inference(image)
            ort_inputs = {'input': image[None, ...]}
            ort_outs = ort_session.run(None, ort_inputs)
            output = ort_outs[0][0]
            class_id = np.argmax(output)
            class_name = vocab[class_id]
            raw_image = image_helper.flip(raw_image)
            raw_image = _putText(raw_image, f'{class_id} {class_name}', (0,20))
            for i in range(4):
                raw_image = _putText(raw_image, f'{np.round(output[i], 2)}', (0,40+(i*20)))
            image_display_helper.show(raw_image)
            time.sleep(.05)
    finally:
        video_capture.release()
#         image_widget.close()

In [14]:
live_test()

Image(value=b'\x89PNG\r\n\x1a\n\x00\x00\x00\rIHDR\x00\x00\x00\x01\x00\x00\x00\x01\x08\x00\x00\x00\x00:~\x9bU\x…

In [None]:
def live_test2():
    win32api.GetAsyncKeyState(win32con.VK_ESCAPE)
    video_capture = cv2.VideoCapture(0)
    viseme_classifier = VisemeClassifier(model_path)
    image_helper = ImageHelper()
    image_display_helper = ImageDisplayHelper(np.zeros([1,1]), 'expoco: Capture session')
    vocab = ['AH', 'EE', 'NO_EXPRESSION', 'OO']
    try:
        _, _ = video_capture.read()
        while True:
            if win32api.GetAsyncKeyState(win32con.VK_ESCAPE): 
                break
            retval, image = video_capture.read()
            viseme_classifier.queue_item(image)
            if len(viseme_classifier.item_queue) > 1:
                class_names = viseme_classifier.predict()
                image = image_helper.flip(image)
                image = _putText(image, f'{class_names}', (0,20))
                image_display_helper.show(image)
            time.sleep(.05)
    finally:
        video_capture.release()
#         image_widget.close()

In [None]:
live_test2()

# TODO

Collect images and save as .png (cropped, but NOT normalized ) - during training we'll have to pre-process and augment https://github.com/cordmaur/Fastai2-Medium/blob/master/01_Create_Datablock.ipynb and [towardsdatascience article](https://towardsdatascience.com/how-to-create-a-datablock-for-multispectral-satellite-image-segmentation-with-the-fastai-v2-bc5e82f4eb5)

do better at onnx export https://github.com/tkeyo/fastai-onnx/blob/main/fastai_to_onnx.ipynb and [dev.to article](https://dev.to/tkeyo/export-fastai-resnet-models-to-onnx-2gj7)

## use smallest image possible

see how this influences inference time

- read from raw np file - .png is smaller and loads to give the same data
    - saving as int might be most efficient - just save as cropped grey image
    - if so, make data prep code sharable between training/inference
- use black and white
- crop 
    - static crop?
    - use face mesh to locate face and crop around it?
        - which is faster/better: face mesh or cv2 HAAR thing
    
## build model that classifies viseme and regresses where face is pointing

https://walkwithfastai.com/Multimodal_Head_and_Kaggle