In [1]:
# import the necessary packages
from matplotlib import pyplot as plt
from imutils.video import FPS
import numpy as np
import argparse
import imutils
import cv2

In [39]:
def plt_imshow(title, image):
	"""Show an OpenCV image with matplotlib under the given title.

	The image is converted from BGR to RGB channel order with
	``cv2.cvtColor`` before it is handed to ``plt.imshow``; the grid is
	switched off and the figure is shown immediately.

	Args:
		title: text passed to ``plt.title``.
		image: image to display; it is passed to ``cv2.cvtColor`` with
			``cv2.COLOR_BGR2RGB``, so it is expected in BGR order.
	"""
	# keep the caller's array under its own name; plot the converted copy
	rgb_image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
	plt.imshow(rgb_image)
	plt.title(title)
	plt.grid(False)
	plt.show()

In [42]:
# settings read by the cells below:
#   input      - video file handed to cv2.VideoCapture
#   output     - file the annotated frames are written to
#   prototxt   - network definition handed to cv2.dnn.readNetFromCaffe
#   model      - network weights handed to cv2.dnn.readNetFromCaffe
#   confidence - detections must score above this value to be drawn
args = dict(
	input="/content/real-time-object-detection/input_video_cars.mp4",
	output="output.avi",
	prototxt="model.prototxt.txt",
	model="model_weights.caffemodel",
	confidence=0.5,
)

In [44]:
# class labels the model was trained to detect; the class index read
# from each detection is used directly as an index into this list
CLASSES = ["background", "aeroplane", "bicycle", "bird", "boat",
	"bottle", "bus", "car", "cat", "chair", "cow", "diningtable",
	"dog", "horse", "motorbike", "person", "pottedplant", "sheep",
	"sofa", "train", "tvmonitor"]

# one colour per class (three floats drawn uniformly from [0, 255)),
# used for that class's bounding boxes and labels. The values come from
# a dedicated seeded generator so the palette is the same after every
# kernel restart and the global NumPy random state is left untouched.
COLOR_SEED = 42
COLORS = np.random.default_rng(COLOR_SEED).uniform(
	0, 255, size=(len(CLASSES), 3))

# read the Caffe network from the definition/weights pair named in `args`
print("loading model...")
prototxt_path, weights_path = args["prototxt"], args["model"]
net = cv2.dnn.readNetFromCaffe(prototxt_path, weights_path)

loading model...


In [45]:
# Model Visualization: list every `name:` entry found in the prototxt
model_path = '/content/real-time-object-detection/model.prototxt.txt'

# Load the network definition as a list of raw text lines
with open(model_path, 'r') as f:
    lines = f.readlines()

# A line whose stripped text begins with `name:` carries its name between
# the first pair of double quotes; collect those names in file order
layer_names = [
    stripped.split('"')[1]
    for stripped in (raw.strip() for raw in lines)
    if stripped.startswith("name:")
]

# Emit one name per output line
for layer_name in layer_names:
    print(layer_name)


MobileNet-SSD
conv0
conv0/relu
conv1/dw
conv1/dw/relu
conv1
conv1/relu
conv2/dw
conv2/dw/relu
conv2
conv2/relu
conv3/dw
conv3/dw/relu
conv3
conv3/relu
conv4/dw
conv4/dw/relu
conv4
conv4/relu
conv5/dw
conv5/dw/relu
conv5
conv5/relu
conv6/dw
conv6/dw/relu
conv6
conv6/relu
conv7/dw
conv7/dw/relu
conv7
conv7/relu
conv8/dw
conv8/dw/relu
conv8
conv8/relu
conv9/dw
conv9/dw/relu
conv9
conv9/relu
conv10/dw
conv10/dw/relu
conv10
conv10/relu
conv11/dw
conv11/dw/relu
conv11
conv11/relu
conv12/dw
conv12/dw/relu
conv12
conv12/relu
conv13/dw
conv13/dw/relu
conv13
conv13/relu
conv14_1
conv14_1/relu
conv14_2
conv14_2/relu
conv15_1
conv15_1/relu
conv15_2
conv15_2/relu
conv16_1
conv16_1/relu
conv16_2
conv16_2/relu
conv17_1
conv17_1/relu
conv17_2
conv17_2/relu
conv11_mbox_loc
conv11_mbox_loc_perm
conv11_mbox_loc_flat
conv11_mbox_conf
conv11_mbox_conf_perm
conv11_mbox_conf_flat
conv11_mbox_priorbox
conv13_mbox_loc
conv13_mbox_loc_perm
conv13_mbox_loc_flat
conv13_mbox_conf
conv13_mbox_conf_perm
conv13_mbox_

In [47]:
# grab a reference to the video file, initialize pointer to output
# video file, and initialize the FPS counter
print("Opening video file...")
vs = cv2.VideoCapture(args["input"])

# cv2.VideoCapture does not raise when the file cannot be opened; fail
# here with the offending path instead of letting the frame loop stop
# on its very first read and report an empty run
if not vs.isOpened():
	raise IOError("could not open input video: {}".format(args["input"]))

# the writer stays None until the frame loop creates it from the first
# processed frame, whose size becomes the output frame size
writer = None
fps = FPS().start()

Opening video file...


In [48]:
# Run the detector over every frame of the input video, draw each
# sufficiently confident detection onto the frame, and append the
# annotated frame to the output video.

# restart the throughput timer here so the reported figures cover only
# this loop and not any time spent between running cells
fps = FPS().start()

try:
	# loop over the frames from the video stream
	while True:
		# grab the next frame; a failed read (or a missing frame) is
		# treated as the end of the video
		(grabbed, frame) = vs.read()
		if not grabbed or frame is None:
			break

		# resize the frame to a width of 400 pixels
		frame = imutils.resize(frame, width=400)

		# grab the frame dimensions and convert it to a blob: scale
		# factor 0.007843, 300x300 spatial size, mean value 127.5
		(h, w) = frame.shape[:2]
		blob = cv2.dnn.blobFromImage(frame, 0.007843, (300, 300), 127.5)

		# pass the blob through the network and obtain the detections
		# and predictions
		net.setInput(blob)
		detections = net.forward()

		# loop over the detections
		for i in range(detections.shape[2]):
			# extract the confidence (i.e., probability) associated
			# with the prediction
			confidence = detections[0, 0, i, 2]

			# filter out weak detections by ensuring the `confidence`
			# is greater than the minimum confidence
			if confidence > args["confidence"]:
				# extract the index of the class label, then scale the
				# four box values by the frame width/height to get the
				# (x, y)-coordinates of the bounding box
				idx = int(detections[0, 0, i, 1])
				box = detections[0, 0, i, 3:7] * np.array([w, h, w, h])
				(startX, startY, endX, endY) = box.astype("int")

				# draw the prediction on the frame
				label = "{}: {:.2f}%".format(CLASSES[idx],
					confidence * 100)
				cv2.rectangle(frame, (startX, startY), (endX, endY),
					COLORS[idx], 2)
				# put the label above the box unless that would run
				# off the top of the frame, in which case put it below
				y = startY - 15 if startY - 15 > 15 else startY + 15
				cv2.putText(frame, label, (startX, y),
					cv2.FONT_HERSHEY_SIMPLEX, 0.5, COLORS[idx], 2)

		# update the FPS counter
		fps.update()

		# if the video writer is None *AND* we are supposed to write
		# the output video to disk initialize the writer
		if writer is None and args["output"] is not None:
			fourcc = cv2.VideoWriter_fourcc(*"MJPG")
			# NOTE: the output rate is fixed at 20 fps here and does
			# not follow the frame rate of the input video
			writer = cv2.VideoWriter(args["output"], fourcc, 20,
				(frame.shape[1], frame.shape[0]), True)

		# if the writer is not None, write the frame with the drawn
		# detections to disk
		if writer is not None:
			writer.write(frame)

	# stop the timer and display FPS information
	fps.stop()
	print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
	print("[INFO] approx. FPS: {:.2f}".format(fps.fps()))
finally:
	# do a bit of cleanup; this runs even when a frame fails part-way,
	# so the capture and the output file are always released
	vs.release()

	# check to see if the video writer point needs to be released
	if writer is not None:
		writer.release()

[INFO] elapsed time: 88.41
[INFO] approx. FPS: 10.24


In [None]:
!ffmpeg -i output.avi output.mp4

In [49]:
#@title Display video inline
from IPython.display import HTML
from base64 import b64encode

# read the converted clip through a context manager so the file handle
# is closed as soon as the bytes are in memory
with open("output.mp4", "rb") as video_file:
	mp4 = video_file.read()

# embed the bytes as a base64 data URL inside the <video> element; the
# HTML object is the cell's last expression so the notebook renders it
dataURL = "data:video/mp4;base64," + b64encode(mp4).decode()
HTML("""
<video width=800 controls>
      <source src="%s" type="video/mp4">
</video>
""" % dataURL)