In [1]:
# import necessary libraries
import matplotlib.pyplot as plt 
import torch
import torchvision.transforms as T
import torchvision
import numpy as np 
import cv2
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load torchvision's Faster R-CNN (ResNet-50 + FPN backbone) with pre-trained weights.
# On first use the checkpoint is downloaded into the local torch hub cache
# (see the "Downloading: ..." cell output).
model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
# Switch the model to evaluation (inference) mode; in this mode it returns
# predictions (boxes / labels / scores) rather than training losses.
model.eval()


# COCO category names, used as a lookup table: the integer label predicted by
# the model is used directly as an index into this list, so the ORDER matters.
# Index 0 is the background class. The 'N/A' entries are placeholders that keep
# the remaining names aligned with their label ids
# (presumably ids that are unused in the released COCO annotations -- TODO confirm).
COCO_INSTANCE_CATEGORY_NAMES = [
    '__background__', 'person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus',
    'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'N/A', 'stop sign',
    'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow',
    'elephant', 'bear', 'zebra', 'giraffe', 'N/A', 'backpack', 'umbrella', 'N/A', 'N/A',
    'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball',
    'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket',
    'bottle', 'N/A', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl',
    'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza',
    'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'N/A', 'dining table',
    'N/A', 'N/A', 'toilet', 'N/A', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone',
    'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'N/A', 'book',
    'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush'
]

Downloading: "https://download.pytorch.org/models/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth" to /root/.cache/torch/hub/checkpoints/fasterrcnn_resnet50_fpn_coco-258fb6c6.pth


  0%|          | 0.00/160M [00:00<?, ?B/s]

In [3]:
class Object_Detector:
    """Detect COCO objects in an image and draw them onto it.

    The class holds no state of its own: it uses the module-level ``model`` and
    ``COCO_INSTANCE_CATEGORY_NAMES`` defined earlier in the notebook.
    """

    def __init__(self):
        pass

    def get_prediction(self, img, confidence):
        """Run the detector on one image and keep the confident detections.

        Args:
            img: Image to analyse, in a form ``torchvision.transforms.ToTensor``
                accepts. The pre-trained model expects RGB channel order.
            confidence: Score threshold; only detections whose score is
                strictly greater than this value are returned.

        Returns:
            Tuple ``(pred_boxes, pred_class)``. ``pred_boxes`` is a list of
            ``[(x1, y1), (x2, y2)]`` corner pairs and ``pred_class`` is the
            list of matching category names. Both lists are empty when no
            detection clears the threshold.
        """
        # Convert the input image to the tensor format the model consumes.
        tensor = T.ToTensor()(img)

        # Inference only: skip autograd bookkeeping to save memory and time.
        with torch.no_grad():
            pred = model([tensor])[0]

        labels = pred['labels'].detach().cpu().numpy()
        boxes = pred['boxes'].detach().cpu().numpy()
        scores = pred['scores'].detach().cpu().numpy()

        # Filter each detection on its own score. This avoids the IndexError the
        # old "last index above threshold" lookup raised when nothing qualified,
        # and it no longer drops detections that share an identical score.
        pred_boxes = []
        pred_class = []
        for label, box, score in zip(labels, boxes, scores):
            if score > confidence:
                pred_boxes.append([(box[0], box[1]), (box[2], box[3])])
                pred_class.append(COCO_INSTANCE_CATEGORY_NAMES[label])

        return pred_boxes, pred_class

    def detect_object(self, img, confidence=0.5):
        """Detect objects in ``img`` and draw a box and label for each one.

        Args:
            img: Image to annotate. It is drawn on with OpenCV, so a NumPy
                array in OpenCV's BGR channel order is expected.
            confidence: Score threshold forwarded to ``get_prediction``.

        Returns:
            The same ``img`` object, modified in place with green rectangles
            and class-name labels.
        """
        # OpenCV frames are BGR, while the pre-trained model was trained on RGB
        # images: hand the model a channel-swapped copy and keep drawing on the
        # original frame. Inputs that are not 3-channel arrays pass unchanged.
        model_input = img
        if isinstance(img, np.ndarray) and img.ndim == 3 and img.shape[2] == 3:
            model_input = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

        boxes, pred_cls = self.get_prediction(model_input, confidence)

        for (top_left, bottom_right), label in zip(boxes, pred_cls):
            # OpenCV drawing functions need integer pixel coordinates.
            x1, y1 = int(top_left[0]), int(top_left[1])
            x2, y2 = int(bottom_right[0]), int(bottom_right[1])
            # Draw a rectangle around the object.
            cv2.rectangle(img, (x1, y1), (x2, y2), color=(0, 255, 0), thickness=3)
            # Put the class name at the box's top-left corner.
            cv2.putText(img, label, (x1, y1), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), thickness=2)

        # Return the image with the bounding boxes and labels added.
        return img
    

In [4]:
# Open the input video and fail fast if it cannot be read; otherwise the loop
# below would silently produce an empty img_array.
vidcap = cv2.VideoCapture('/kaggle/input/car-video/video.mp4')
if not vidcap.isOpened():
    vidcap.release()
    raise IOError("Could not open input video: /kaggle/input/car-video/video.mp4")

# Initialize variables for the loop.
success = True
count = 0
img_array = []

# The detector holds no per-frame state, so one instance serves every frame.
OD = Object_Detector()

try:
    # Loop over each frame in the video.
    while success:
        # Read the next frame from the video.
        success, image = vidcap.read()

        # Stop at end of stream or on a failed read.
        if not success or image is None:
            break

        # Detect objects in the frame and keep the annotated frame.
        img_array.append(OD.detect_object(image, confidence=0.7))

        # Save selected frames as JPEG files.
        # NOTE: frame 400 is never reached while the loop stops at frame 300 below.
        if count in (12, 200, 400):
            cv2.imwrite("frame%d.jpg" % count, img_array[-1])     # save frame as JPEG file

        # Cap the work: stop after frame index 300 (301 frames processed).
        if count == 300:
            break

        # Increment the frame count.
        count += 1
finally:
    # Always free the capture handle, even if detection raises mid-loop.
    vidcap.release()


In [5]:
# Writing needs at least one processed frame to determine the output size.
if not img_array:
    raise ValueError("img_array is empty: no frames were processed, nothing to write")

# Determine the frame size and initialize a VideoWriter for the output file.
# The frame rate is fixed at 15 fps; it is not read from the input video.
height, width, layers = img_array[0].shape
size = (width, height)
out = cv2.VideoWriter('project_YOLO.avi', cv2.VideoWriter_fourcc(*'DIVX'), 15, size)

try:
    # Write each annotated frame to the output video file.
    for frame in img_array:
        out.write(frame)
finally:
    # Release the writer even if a write fails, so the file is finalized and closed.
    out.release()