In [5]:
import cv2
import numpy as np

# Load the sample image from disk.
image = cv2.imread('data/Lena.png', cv2.IMREAD_COLOR)
if image is None:
    # cv2.imread reports an unreadable/missing file by returning None instead
    # of raising, which would otherwise surface later as an obscure error
    # inside blobFromImage.
    raise FileNotFoundError("Could not read image 'data/Lena.png'")

# Create a 4-dimensional blob from the image: resize to 224x224 and subtract
# the per-channel mean, without scaling, channel swapping or cropping.
tensor = cv2.dnn.blobFromImage(image, scalefactor=1.0, size=(224, 224),
                               mean=(104, 117, 123), swapRB=False, crop=False)
"""
Docstring:
    cv2.dnn.blobFromImage
    blobFromImage(image, scalefactor, size, mean, swapRB, crop) 
    
    1. The function resizes the image. If the crop flag is True, the input image is resized
    while preserving the aspect ratio. One dimension (width or height) of the image
    is set to a desirable value and the other is set equal or greater than the
    corresponding value in the size argument. Then, the resulting image from the
    center is cropped to the necessary size. If the crop flag is False, the function just
    resizes to the target spatial size.

    2. The function converts the values of the resized image to a floating-point type, if
    necessary.

    3. The function swaps the first and last channels if the corresponding argument is
    True. This is necessary because OpenCV gives images in the BGR channel order
    after loading, but some Deep Learning models may be trained for images with
    the RGB channel order.

    4. The function then subtracts the mean value from each pixel of the image. The
    corresponding argument may be either a three-value tuple or just a one-value
    tuple. If it is a three-value tuple, each value is subtracted from the corresponding
    channel after the channels are swapped. If it's a single value, it is subtracted from
    each channel.

    5. The function multiplies the resulting image by the scale factor (the second argument).

    6. The function converts the three-dimensional image to a four-dimensional tensor
    with an NCHW order of dimensions.
    
    It's important to say that the preprocessing must be the same as it was while training the
    model. Otherwise, the model may work poorly or even not work at all. If you've trained the
    model yourself, you know all the parameters. But if you've found the model on the internet,
    you need to examine the description of the model or training scripts to get the necessary
    information.
"""

# Batch variant: pack several images (here the same image twice) into a
# single blob, using the same preprocessing settings as the single-image call.
tensors = cv2.dnn.blobFromImages(
    [image] * 2,
    scalefactor=1.0,
    size=(224, 224),
    mean=(104, 117, 123),
    swapRB=False,
    crop=False,
)

# Print both shapes; the labels are padded so the values line up.
for label, blob in (("   Single Tensor shape:", tensor),
                    ("Multiple Tensors shape:", tensors)):
    print(label, blob.shape)

# ==============================================================
# Load GoogleNet pre-trained Inception NN (ImageNet winner 2014)
# http://deeplearning.net/tag/googlenet/
# ==============================================================
# The network is assembled from the Caffe architecture description
# (.prototxt) and the trained weights (.caffemodel).
model = cv2.dnn.readNetFromCaffe(
    'data/bvlc_googlenet.prototxt',
    'data/bvlc_googlenet.caffemodel',
)

# ==============================================================
# Report model Info
# ==============================================================
# Input shape the cost figures are computed for: a batch of one
# 3-channel 224x224 image, matching the single-image blob shape.
net_input_shape = (1, 3, 224, 224)

print('\nInception model info:')
print('gflops:', model.getFLOPS(net_input_shape)*1e-9)
w, b = model.getMemoryConsumption(net_input_shape)
print('weights (mb):', w*1e-6, ', blobs (mb):', b*1e-6)


   Single Tensor shape: (1, 3, 224, 224)
Multiple Tensors shape: (2, 3, 224, 224)

Inception model info:
gflops: 3.1904431360000003
weights (mb): 27.994208 , blobs (mb): 45.92096


In [6]:
# ==============================================================
# Perform a single forward-pass inference and calculate timings
# ==============================================================

# Feed the two-image batch and run the whole network once.
model.setInput(tensors)
prob = model.forward()

# Timing machinery: getPerfProfile reports tick counts, so convert them
# to milliseconds using the tick frequency.
total, timings = model.getPerfProfile()
tick2ms = 1e3/cv2.getTickFrequency()
# '{:.2f}' (two decimals); the previous '{:2f}' only set a minimum width of 2
# and therefore printed six decimals.
print('Inference (ms): {:.2f}\n'.format(total*tick2ms))

layer_names = model.getLayerNames()
print('{: <30} {}'.format('LAYER', 'TIME (ms)'))
# First seven layers ...
for i, t in enumerate(timings[0:7]):
    print('{}. {: <30} {:.2f}'.format(i, layer_names[i], t[0]*tick2ms))
print('...')
# ... and three later layers; start=135 keeps the printed index in sync
# with the slice offset.
for i, t in enumerate(timings[135:138], start=135):
    print('{}. {: <30} {:.2f}'.format(i, layer_names[i], t[0]*tick2ms))
# ==============================================================

Inference (ms): 126.380455

LAYER                          TIME (ms)
0. conv1/7x7_s2                   13.27
1. conv1/relu_7x7                 0.00
2. pool1/3x3_s2                   1.51
3. pool1/norm1                    2.86
4. conv2/3x3_reduce               1.25
5. conv2/relu_3x3_reduce          0.00
6. conv2/3x3                      18.92
...
135. inception_5b/pool_proj         0.46
136. inception_5b/relu_pool_proj    0.00
137. inception_5b/output            0.20


In [7]:
# ==============================================================
# GoogleNet Image Classification
# ==============================================================

def classify(video_src, net, in_layer, out_layer,
             mean_val, category_names, swap_channels=False):
    """
    Classify the frames of a video and overlay the top-5 predictions.

    For every frame read from ``video_src`` the function:

        1. Downscales the frame to a quarter of its width and height,
        2. Converts it into a 224x224 input blob,
        3. Runs a forward pass through ``net``,
        4. Draws the five most probable categories on the frame,
        5. Shows the annotated frame in the 'classification' window.

    The loop ends when no more frames can be read or when Esc is pressed.
    The capture and the window are released even if an error is raised
    while a frame is being processed.

    Parameters
    ----------
    video_src
        Source handed to ``cv2.VideoCapture`` (the notebook passes a file path).
    net
        Network object providing ``setInput(blob, name)`` and ``forward(name)``.
    in_layer, out_layer
        Name of the layer that receives the blob and name of the layer whose
        output is read as the class probabilities.
    mean_val
        Mean to subtract from the input. A ``np.ndarray`` is subtracted from
        the finished blob, so it must broadcast against the blob and follow
        the blob's channel order. Any other value is passed to
        ``cv2.dnn.blobFromImage`` as its ``mean`` argument.
    category_names
        Sequence mapping a class index to its display label.
    swap_channels
        Forwarded to ``blobFromImage`` as ``swapRB``.

    Returns
    -------
    None
    """
    cap = cv2.VideoCapture(video_src)
    try:
        while True:
            # --------------------- Get Video Frame and Status ------------------ #
            status_cap, frame = cap.read()
            if not status_cap:
                break
            frame = cv2.resize(frame, None, fx=0.25, fy=0.25)

            if isinstance(mean_val, np.ndarray):
                # Build the blob with a zero mean and subtract the array mean
                # afterwards; passing a non-zero mean here as well would
                # subtract twice.
                tensor = cv2.dnn.blobFromImage(frame, 1.0, (224, 224),
                                               (0, 0, 0), swap_channels)
                tensor -= mean_val
            else:
                tensor = cv2.dnn.blobFromImage(frame, 1.0, (224, 224),
                                               mean_val, swap_channels)

            # --------------------- Forward Propagation of NN ------------------ #
            net.setInput(tensor, in_layer)
            prob = net.forward(out_layer).flatten()

            # ---------------------- Add Prediction to Frame ------------------- #
            # argsort is ascending, so the last five indices are the top five;
            # the most probable class is drawn last and ends up highest on screen.
            for r, i in enumerate(np.argsort(prob)[-5:], start=1):
                txt = ' "%s"; probability: %.2f' % (category_names[i], prob[i])
                cv2.putText(img=frame,
                            text=txt,
                            org=(0, frame.shape[0] - r*20),
                            fontFace=cv2.FONT_HERSHEY_SIMPLEX,
                            fontScale=0.7,
                            color=(0, 255, 0),
                            thickness=2)

            # --------------------------- Flow Control ------------------------ #
            cv2.imshow('classification', frame)
            # 27 is the Esc key; the mask drops any bits above the low byte
            # before comparing.
            if cv2.waitKey(1) & 0xFF == 27:
                break
    finally:
        cv2.destroyAllWindows()
        cap.release()


In [9]:
# ==============================================================
# Show ImageNet categories
# ==============================================================

# Each line is "<first token> <label text>"; the first token (presumably the
# synset id - verify against the file) is dropped and the rest is the label.
with open('data/synset_words.txt') as synset_file:
    class_names = [line.partition(' ')[2].rstrip() for line in synset_file]
print("ImageNet classes:", len(class_names))
class_names[:10]

ImageNet classes: 1000


['tench, Tinca tinca',
 'goldfish, Carassius auratus',
 'great white shark, white shark, man-eater, man-eating shark, Carcharodon carcharias',
 'tiger shark, Galeocerdo cuvieri',
 'hammerhead, hammerhead shark',
 'electric ray, crampfish, numbfish, torpedo',
 'stingray',
 'cock',
 'hen',
 'ostrich, Struthio camelus']

In [10]:
# ==============================================================
# Classify video with Inception NN
# ==============================================================

# Build a GoogLeNet instance from the Caffe definition and weights.
googlenet_caffe = cv2.dnn.readNetFromCaffe(
    'data/bvlc_googlenet.prototxt',
    'data/bvlc_googlenet.caffemodel',
)

# Clip to classify; "data/Traffic.mp4" is an alternative input.
video_path = "data/shuttle.mp4"

# 'data' / 'prob' are the input and output layer names; the mean triple is
# the same one used when building the example blobs.
classify(
    video_src=video_path,
    net=googlenet_caffe,
    in_layer='data',
    out_layer='prob',
    mean_val=(104, 117, 123),
    category_names=class_names,
)

In [None]:
# ==============================================================
# Classify video with ResNet-50
# ==============================================================

# Paths are relative to data/, like every other cell in this notebook.
resnet_caffe = cv2.dnn.readNetFromCaffe('data/resnet_50.prototxt',
                                        'data/resnet_50.caffemodel')
# The mean is loaded as a NumPy array, so classify() subtracts it from the
# blob itself instead of passing it to blobFromImage.
mean = np.load('data/resnet_50_mean.npy')

classify('data/shuttle.mp4', resnet_caffe, 'data', 'prob', mean, class_names)