In [1]:
import cv2
import numpy as np
 
# Load the input image from disk.
IMAGE_PATH = "images/traffic.jpg"
img = cv2.imread(IMAGE_PATH)
# cv2.imread does not raise when the file is missing or cannot be decoded;
# it returns None. Fail here with a clear message instead of letting a later
# cell (cv2.resize) crash with an unrelated-looking error.
if img is None:
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH}")

In [2]:
# Locations of the YOLOv3 model files: network configuration, weights file
# and the list of COCO class names.
yolo_config = "model/yolov3.cfg"
yolo_weight = "model/yolov3.weights"
coco_labels = "model/coco.names"

# Build the network from the weights and configuration files.
net = cv2.dnn.readNet(yolo_weight, yolo_config)

In [5]:
# Load the COCO object names file: one class label per line.
# The position of a label in `classes` is the class id used to look it up
# later; only objects falling in these classes will be detected.
# The path comes from `coco_labels` (defined with the other model paths) so
# it is not duplicated here. Blank lines, such as a trailing empty line at
# the end of the file, are skipped so they do not become empty class names.
with open(coco_labels, "r") as f:
    classes = [line.strip() for line in f if line.strip()]
print(classes)

['person', 'bicycle', 'car', 'motorbike', 'aeroplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'sofa', 'pottedplant', 'bed', 'diningtable', 'toilet', 'tvmonitor', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']


Because we are working with the YOLOv3-416 model, we should resize the image to (416, 416) to get better results.

In [7]:
# Target size of the image: 416 x 416.
fWidth = fHeight = 416

# Scale the image to the target size; cv2.resize is given (width, height).
img = cv2.resize(img, (fWidth, fHeight))

# Keep the dimensions of the resized image for later cells and show them.
height, width, channels = img.shape
print(*img.shape)

416 416 3


<h3>In this step we give the input image to the YOLO object detection network. We cannot pass a plain image to the network directly: it only accepts input in a particular format called a blob. So we first convert the input image to a blob with the cv2.dnn.blobFromImage function, and then send the converted blob data to the YOLO network.</h3>

In [9]:
# Convert the image to a blob: pixel values are scaled by 1/255, no mean is
# subtracted, the R and B channels are swapped and the image is not cropped.
blob = cv2.dnn.blobFromImage(
    img,
    scalefactor=1/255,
    size=(fWidth, fHeight),
    mean=(0, 0, 0),
    swapRB=True,
    crop=False,
)
# Make the blob the input of the YOLO network.
net.setInput(blob)

<img src="./images/architecture.jpg"/><br>
<h3>Here you can see that there are three different output layers (Predict one, Predict two, Predict three). That means three different sets of values come out of the YOLO network. In order to read the output of these layers, we need to know their names.
These output layers are layers 82, 94 and 106.</h3>

In [14]:
# Find names of all layers
layer_names = net.getLayerNames()
print("Following are the names of all the layers:\n",layer_names)
# Find names of the output layers.
# getUnconnectedOutLayers() gives 1-based layer ids (hence the "- 1" below).
# Older OpenCV releases return them as an (N, 1) array and newer ones as a
# flat (N,) array, so flatten first: indexing each entry with [0] fails on
# the flat layout.
output_layer_ids = np.array(net.getUnconnectedOutLayers()).flatten()
output_layers = [layer_names[int(layer_id) - 1] for layer_id in output_layer_ids]
print("Following are the names of all output layers:\n",output_layers)

Following are the names of all the layers:
 ['conv_0', 'bn_0', 'relu_1', 'conv_1', 'bn_1', 'relu_2', 'conv_2', 'bn_2', 'relu_3', 'conv_3', 'bn_3', 'relu_4', 'shortcut_4', 'conv_5', 'bn_5', 'relu_6', 'conv_6', 'bn_6', 'relu_7', 'conv_7', 'bn_7', 'relu_8', 'shortcut_8', 'conv_9', 'bn_9', 'relu_10', 'conv_10', 'bn_10', 'relu_11', 'shortcut_11', 'conv_12', 'bn_12', 'relu_13', 'conv_13', 'bn_13', 'relu_14', 'conv_14', 'bn_14', 'relu_15', 'shortcut_15', 'conv_16', 'bn_16', 'relu_17', 'conv_17', 'bn_17', 'relu_18', 'shortcut_18', 'conv_19', 'bn_19', 'relu_20', 'conv_20', 'bn_20', 'relu_21', 'shortcut_21', 'conv_22', 'bn_22', 'relu_23', 'conv_23', 'bn_23', 'relu_24', 'shortcut_24', 'conv_25', 'bn_25', 'relu_26', 'conv_26', 'bn_26', 'relu_27', 'shortcut_27', 'conv_28', 'bn_28', 'relu_29', 'conv_29', 'bn_29', 'relu_30', 'shortcut_30', 'conv_31', 'bn_31', 'relu_32', 'conv_32', 'bn_32', 'relu_33', 'shortcut_33', 'conv_34', 'bn_34', 'relu_35', 'conv_35', 'bn_35', 'relu_36', 'shortcut_36', 'conv_37'

<h1>Note: To get predictions from a pre-trained model, you only need to pass the input data through the network and read the output layers. The other layers still run during the forward pass, but you only work with them directly when training a model. Just to remind you.</h1>

In [15]:
# Run the forward pass and collect one result array per requested output layer.
outs = net.forward(output_layers)
# Print the shape of every output. Looping instead of indexing outs[0..2]
# keeps this cell working for models with a different number of output layers.
for layer_output in outs:
    print(layer_output.shape)

(507, 85)
(2028, 85)
(8112, 85)


In [19]:
# Generate one random colour (3 values in [0, 255)) for each class.
# A seeded, local generator makes the colours identical on every run
# without touching NumPy's global random state.
COLOR_SEED = 42
colors = np.random.RandomState(COLOR_SEED).uniform(0, 255, size=(len(classes), 3))

In [17]:
# Walk over every candidate detection of every output layer and keep those
# whose best class score is above 0.5.
class_ids, confidences, boxes = [], [], []
for layer_output in outs:
    for detection in layer_output:
        # Entries from index 5 onward are treated as the per-class scores.
        class_scores = detection[5:]
        best_class = np.argmax(class_scores)
        best_score = class_scores[best_class]
        if best_score > 0.5:
            # The first four entries are scaled by the image width/height to
            # get the box centre and size in pixels.
            center_x, center_y = int(detection[0] * width), int(detection[1] * height)
            w, h = int(detection[2] * width), int(detection[3] * height)
            # Convert centre + size into the top-left corner of the rectangle.
            top_left = [int(center_x - w / 2), int(center_y - h / 2)]
            boxes.append(top_left + [w, h])
            confidences.append(float(best_score))
            class_ids.append(best_class)

# Non-maximum suppression with score threshold 0.5 and NMS threshold 0.4.
indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.5, 0.4)


In [20]:
# Draw bounding box with text for each object kept by non-maximum suppression
font = cv2.FONT_HERSHEY_DUPLEX
# The NMSBoxes result may be an (N, 1) array, a flat array or an empty
# sequence depending on the OpenCV version, so flatten it once. Sorting keeps
# the drawing order equal to the order of `boxes`.
kept_indexes = sorted(int(i) for i in np.array(indexes).flatten())
for i in kept_indexes:
    x, y, w, h = boxes[i]
    class_id = class_ids[i]
    label = str(classes[class_id])
    confidence_label = int(confidences[i] * 100)
    # `colors` has one row per class, so it must be indexed by class id.
    # Indexing by box index gave the same class different colours and ran
    # out of range once there were more candidate boxes than classes.
    color = colors[class_id].tolist()
    cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
    # Format label and confidence as text; formatting the tuple
    # (label, confidence_label) printed its repr, e.g. "('car', 98)".
    cv2.putText(img, f'{label} {confidence_label}%', (x-25, y + 75), font, 1, color, 2)

# Show the annotated image in a window and wait for a key press before closing.
cv2.imshow("Image", img)
cv2.waitKey(0)
cv2.destroyAllWindows()