In [7]:
# import the necessary packages
from imutils.video import VideoStream, FPS
import numpy as np
import imutils
import time
import cv2

In [None]:
# initialize video stream
vs = VideoStream(src=0).start()
time.sleep(2.0)

# display the video stream
while True:
    frame = vs.read()
    frame = imutils.resize(frame, width=400)

    cv2.imshow("Frame", frame)

    # if the 'q' key is pressed, break from the loop
    if cv2.waitKey(1) == ord("q"):
        break

# do a bit of cleanup
cv2.destroyAllWindows()
vs.stop()

In [8]:
# load trained model and text description of the network architecture (prototxt file)
prototxt = "/Users/franziskamack/Downloads/MobileNetSSD_deploy.prototxt.txt"
model = "/Users/franziskamack/Downloads/MobileNetSSD_deploy.caffemodel"
# use opencv's Deep Neural Network module to read the model in
net = cv2.dnn.readNetFromCaffe(prototxt, model)

In [9]:
# initialize the list of class labels MobileNet SSD was trained to detect
CLASSES = ["background", "aeroplane", "bicycle", "bird", "boat",
    "bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
    "dog", "horse", "motorbike", "person", "pottedplant", "sheep",
    "sofa", "train", "tvmonitor", "scissors", "banana", "apple", "carrot"]
# generate a set of bounding box colors for each class
COLORS = np.random.uniform(0, 255, size=(len(CLASSES), 3))

In [10]:
vs = VideoStream(src=0).start()
time.sleep(2.0)

In [5]:
frame = vs.read()
frame = imutils.resize(frame, width=400)

# grab the frame dimensions and convert it to a blob
(h, w) = frame.shape[:2]
blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)),
0.007843, (300, 300), 127.5)

print(type(blob))
print(blob.shape)
blob

<class 'numpy.ndarray'>
(1, 3, 300, 300)


array([[[[ 0.4901875 ,  0.44312948,  0.4588155 , ...,  0.2627405 ,
           0.2941125 ,  0.2784265 ],
         [ 0.5058735 ,  0.44312948,  0.4588155 , ...,  0.2784265 ,
           0.2941125 ,  0.2627405 ],
         [ 0.4901875 ,  0.4352865 ,  0.4666585 , ...,  0.2941125 ,
           0.3019555 ,  0.2705835 ],
         ...,
         [ 0.0666655 ,  0.0588225 ,  0.0666655 , ..., -0.0666655 ,
          -0.0588225 , -0.0745085 ],
         [ 0.0431365 ,  0.0352935 ,  0.0509795 , ..., -0.0588225 ,
          -0.0588225 , -0.0666655 ],
         [ 0.0117645 ,  0.0274505 ,  0.0431365 , ..., -0.0509795 ,
          -0.0509795 , -0.0588225 ]],

        [[ 1.521542  ,  1.482327  ,  1.49017   , ...,  1.419583  ,
           1.443112  ,  1.419583  ],
         [ 1.545071  ,  1.482327  ,  1.49017   , ...,  1.435269  ,
           1.443112  ,  1.41174   ],
         [ 1.537228  ,  1.474484  ,  1.49017   , ...,  1.443112  ,
           1.450955  ,  1.419583  ],
         ...,
         [ 1.129392  ,  1.105863  

In [6]:
# pass the blob through the network and obtain the detections and predictions
net.setInput(blob)
netOutput = net.forward()
    
print(netOutput.shape)
netOutput

(1, 1, 100, 7)


array([[[[ 0.        , 15.        ,  0.9987282 ,  0.020123  ,
           0.05524048,  0.8954026 ,  0.9950789 ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.        ],
         [ 0.        ,  0.        ,  0.        ,  0.        ,
           0.        ,  0.        ,  0.   

In [None]:
print(netOutput[0,0,:,:].shape)
netOutput[0,0,:,:]

In [None]:
# loop over the frames from the video stream
while True:
    # grab the frame from the threaded video stream and resize it
    # to have a maximum width of 400 pixels
    frame = vs.read()
    frame = imutils.resize(frame, width=400)

    # grab the frame dimensions and convert it to a blob
    (h, w) = frame.shape[0:2]
    blob = cv2.dnn.blobFromImage(cv2.resize(frame, (300, 300)), 0.007843, (300, 300), 127.5)

    # pass the blob through the network and obtain the detections and
    # predictions
    net.setInput(blob)
    netOutput = net.forward()
    
    # loop over the detections
    for detection in netOutput[0,0,:,:]:
        # extract the confidence (i.e., probability) associated with
        # the prediction
        confidence = float(detection[2])

        # filter out weak detections by ensuring the 'confidence' is
        # greater than 80%
        if confidence > 0.8:
            # extract the index of the class label from the 'detection'
            idx = int(detection[1])
            
            # then compute the (x, y)-coordinates of the bounding box for the object
            left = int(detection[3] * w)
            top = int(detection[4] * h)
            right = int(detection[5] * w)
            bottom = int(detection[6] * h)
 
            #draw a rectangle around detected objects
            cv2.rectangle(frame, (left, top), (right, bottom), COLORS[idx], thickness=2)

            # draw the prediction on the frame
            label = "{}: {:.2f}%".format(CLASSES[idx], confidence * 100)
            y = top - 15 if top - 15 > 15 else top + 15
            cv2.putText(frame, label, (left, y),
            cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2)

    # show the output frame
    cv2.imshow("Frame", frame)
    # key = cv2.waitKey(1) & 0xFF

    # if the 'q' key was pressed, break from the loop
    if cv2.waitKey(1) == ord("q"):
        break

In [None]:
# do a bit of cleanup
cv2.destroyAllWindows()
vs.stop()