<a href="https://colab.research.google.com/github/mongoq/thesis/blob/main/video_bb_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import cv2
import numpy as np
np.set_printoptions(suppress=True)
from urllib.request import Request, urlopen
import urllib
import time
import os
import traceback
import pafy

In [None]:
"""
The Detection class contains basically everything useful in this script. Its main methods (class functions) are 
configure_net, detect, draw_img, save_img and keyboard. Those methods share a lot of parameters. 
Instead of returning 20 parameters from one function and passing them into the next function, 
I decided to make a class that stores them as variables of a class instance.
In __init__ I just wanted to give a heads-up about which variables exist and will be used by the methods, 
but the majority of them are filled in not by __init__ but by one of those methods.
"""
class Detection:
    font = cv2.FONT_HERSHEY_PLAIN
    colors = ((255,0,0), (0,255,0), (0,0,255), (255,255,0), (0,255,255), (255,0,255), (128,0,0)) 
    
    def __init__(self, model_name = 'yolov3', input_height=416, path_yolo_classes='net/coco.txt', 
                 output_height=800, is_scale_output = True, MIN_confidence=0.5, IOU_threshold=0.6, 
                 is_blob_aspect_ratio=True, anchor_box_show=False, grid_show=False, show_text_on_box=True, 
                 is_recording=False, show_text_left=True):
        self.model_name = model_name
        self.net = None
        self.anchors = None
        
        self.grids_per_height = round(input_height/32)
        self.grids_per_width = self.grids_per_height
        self.input_height = self.grids_per_height * 32
        self.input_width = self.input_height
        if input_height%32:
            print('''Value of input_height={} is indivisible by 32, 
input_height={} will be used instead. 
Choose input_height that is an integer multiple of 32(eg.320,416,620,...).'''.format(input_height, 
                                                                                self.input_height))
            
        with open(path_yolo_classes, 'r') as f:
            self.classes = f.read().splitlines()
            
        self.anchor_box_show = anchor_box_show
        self.grid_show = grid_show   
        self.show_text_on_box = show_text_on_box
        self.show_text_left = show_text_left
        self.is_recording = is_recording
        self.is_any_frame_recorded = False
        
        self.MIN_confidence = MIN_confidence
        self.IOU_threshold = IOU_threshold
        self.FPS = 0.0
       
        self.img = None
        self.img_name = None
        self.img_with_drawings = None
        self.img_height, self.img_width = None, None
        self.boxes = None
        self.confidences = None
        self.best_class_ids = None
        self.grid_cells = None
        self.anchor_boxes = None
        self.bounding_box_centers = None
        self.detection_outputs = None
        
        self.is_blob_aspect_ratio = is_blob_aspect_ratio
        
        self.is_scale_output = is_scale_output
        self.output_height = output_height
          
        if cv2.cuda.getCudaEnabledDeviceCount():    
            self.is_cuda = True
            print('GPU is enabled.')
        else:
            self.is_cuda = False
            print('GPU is NOT enabled. OpenCV-{} will use CPU instead.'.format(cv2.__version__))
            
    
    """
    In this method we read in a specific model and set up parameters for it.
    """
    def configure_net(self, model_name=None):
        if model_name is None:
            model_name = self.model_name    
        files = os.listdir(r'./net')
        if '{}.weights'.format(model_name) not in files or '{}.cfg'.format(model_name) not in files:
            print('''"{mn}.weights" or "{mn}.cfg" not found in "/net" folder. 
Check if the file is there. '''.format(mn = model_name))
        else:
            self.model_name = model_name

            path_weights = 'net/{}.weights'.format(self.model_name)
            path_cfg = 'net/{}.cfg'.format(self.model_name)
            self.net = cv2.dnn.readNet(path_weights, path_cfg)

            if self.is_cuda:
                self.net.setPreferableBackend(cv2.dnn.DNN_BACKEND_CUDA)
                self.net.setPreferableTarget(cv2.dnn.DNN_TARGET_CUDA)
                
            """ Fetching anchor boxes values from .cfg file. """
            with open('net/{}.cfg'.format(self.model_name), 'r') as f:
                cfg = f.read().splitlines()
                anchors_line = [line for line in cfg if 'anchors' in line][0].replace('anchors', '').replace('=', '')
                anchors = np.array([int(num) for num in anchors_line.split(',')])
                anchors = anchors.reshape(int(len(anchors)/6), 3, 2)[::-1]
                self.anchors = anchors
                
    
    """
    This method runs detection on the image. It saves all parameters of the detection as class instance variables.
    It saves ALL the detections above 0.1% confidence from ALL grids. 
    It does not take into account the given confidence and IOU thresholds. 
    That happens in the draw_img method (via 'cv2.dnn.NMSBoxes').
    Thanks to that we only need to run detection on an image once. So changing any parameter from the keyboard, 
    besides size and model, on an image or paused video results in an instant change on the image, 
    because we already have all the information from the detection; we just need to draw it.
    """    
    def detect(self, img):
        """Run the network on img and cache every raw detection on the instance.

        All detections whose confidence (objectness * best class score)
        exceeds 0.001 are stored; MIN_confidence and IOU_threshold are not
        applied here. The (optionally resized) frame is stored in self.img
        and the box coordinates are expressed in pixels of that frame.

        Args:
            img: Image array with height and width as its first two
                dimensions (presumably BGR as produced by OpenCV).

        Raises:
            ValueError: If img is None (e.g. a failed read).
        """
        if img is None:
            raise ValueError('detect() received no image (None); check the image or video source.')

        if self.is_blob_aspect_ratio:
            ratio_width2height = img.shape[1] / img.shape[0]
            # At least one grid column, even for extremely narrow images.
            self.grids_per_width = max(1, round((self.input_height * ratio_width2height) / 32))
            self.input_width = self.grids_per_width * 32
        else:
            self.input_width = self.input_height
            self.grids_per_width = self.grids_per_height

        # The blob is the original image rescaled to the network input size.
        blob = cv2.dnn.blobFromImage(img, 1 / 255, (self.input_width, self.input_height),
                                     (0, 0, 0), swapRB=True, crop=False)

        self.net.setInput(blob)
        output_layers_names = self.net.getUnconnectedOutLayersNames()
        self.layerOutputs = self.net.forward(output_layers_names)

        # The blob was taken from the original frame; only now is the frame
        # resized to the requested display height and stored.
        if self.is_scale_output:
            img = image_resize(img, height = self.output_height)
        self.img = img
        # shape[:2] also works for images without a channel axis.
        self.img_height, self.img_width = self.img.shape[:2]

        boxes = []
        confidences = []
        best_class_ids = []
        grid_cells = []
        anchor_boxes = []
        bounding_box_centers = []
        detection_outputs = []

        # Each network output is one grid resolution (e.g. 13x13, 26x26, ...).
        for output_idx, output in enumerate(self.layerOutputs):
            # yolov4 delivers its outputs from the finest grid to the
            # coarsest, the other models the other way round, so the first
            # and third outputs are swapped for it.
            scale = output_idx
            if self.model_name == 'yolov4' and output_idx in (0, 2):
                scale = 2 - output_idx

            grids_w = self.grids_per_width * 2 ** scale

            # Confidence of every row at once; only the few rows above the
            # cut-off are then visited in Python.
            class_ids = np.argmax(output[:, 5:], axis=1)
            row_confidences = output[:, 4] * output[np.arange(len(output)), class_ids + 5]

            for j in np.flatnonzero(row_confidences > 0.001):
                j = int(j)
                detection = output[j]
                # Rows come in groups of three per grid cell: j % 3 picks the
                # anchor box, j // 3 is the running cell index (row-major).
                anchor_box = self.anchors[scale][j % 3]
                grid_cell = [(j // 3) % grids_w, j // (grids_w * 3)]

                # detection[0:4] are used as fractions of the frame size.
                center_x = round(detection[0] * self.img_width)
                center_y = round(detection[1] * self.img_height)
                w = round(detection[2] * self.img_width)
                h = round(detection[3] * self.img_height)
                x = round(center_x - w / 2)
                y = round(center_y - h / 2)

                boxes.append([x, y, w, h])
                confidences.append(float(row_confidences[j]))
                best_class_ids.append(class_ids[j])
                grid_cells.append(grid_cell)
                anchor_boxes.append(anchor_box)
                bounding_box_centers.append((center_x, center_y))
                # Remembered so draw_img can scale to the right grid.
                detection_outputs.append(scale)

        self.boxes = boxes
        self.confidences = confidences
        self.best_class_ids = best_class_ids
        self.grid_cells = grid_cells
        self.anchor_boxes = anchor_boxes
        self.bounding_box_centers = bounding_box_centers
        self.detection_outputs = detection_outputs
        
    """
    This method draws all accessories according to parameters.
    """
    def draw_img(self):
        """Draw the cached detections on a copy of self.img and show it.

        Non-maximum suppression with MIN_confidence and IOU_threshold picks
        which cached boxes are drawn. Depending on the show/hide flags the
        box label, the grid cell, the anchor box and the status text in the
        top-left corner are added. The result is stored in
        self.img_with_drawings and displayed in the 'Detection' window.
        """
        img = self.img.copy()
        if self.show_text_left:
            cv2.putText(img, "IOU:  {0:.0%}".format(self.IOU_threshold), (20, 40), self.font, 3, (0, 0, 255), 3)
            cv2.putText(img, "CONF: {0:.0%}".format(self.MIN_confidence), (20, 80), self.font, 3, (255, 0, 0), 3)

        # NMSBoxes returns the indexes of the boxes to show. The container
        # type varies (empty tuple, Nx1 or flat array), so it is normalised.
        indexes = cv2.dnn.NMSBoxes(self.boxes, self.confidences, self.MIN_confidence, self.IOU_threshold)
        kept = np.array(indexes).flatten() if len(indexes) > 0 else ()

        # 'c' numbers the shown boxes, 'i' indexes the cached detections.
        for c, i in enumerate(kept):
            x, y, w, h = self.boxes[i]
            label = str(self.classes[self.best_class_ids[i]])
            confidence = self.confidences[i]
            color = self.colors[c % len(self.colors)]
            scale = self.detection_outputs[i]
            # Thinner lines for finer grids: 4, 2, 1 for scale 0, 1, 2.
            thickness = int(4 / 2 ** scale)

            cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
            if self.show_text_on_box:
                cv2.putText(img, '{}.{} {:.0%}'.format(c+1, label, confidence), (x+2, y-5),
                            self.font, 2, color, 3)
            elif self.show_text_left:
                cv2.putText(img, '{}'.format(c+1), (x+2, y-5), self.font, 2, color, 3)

            # Grid cell of this detection in frame pixels (top-left corner
            # plus size), always converted to plain ints for OpenCV.
            num_of_grids_w = self.grids_per_width * 2 ** scale
            num_of_grids_h = self.grids_per_height * 2 ** scale
            grid_corner_x = int(round(self.grid_cells[i][0] * self.img_width / num_of_grids_w))
            grid_corner_y = int(round(self.grid_cells[i][1] * self.img_height / num_of_grids_h))
            grid_w = int(round(self.img_width / num_of_grids_w))
            grid_h = int(round(self.img_height / num_of_grids_h))
            if self.grid_show:
                cv2.rectangle(img, (grid_corner_x, grid_corner_y),
                              (grid_corner_x + grid_w, grid_corner_y + grid_h), color, thickness)
                cv2.circle(img, (self.bounding_box_centers[i]), 3, color, 4)

            if self.anchor_box_show:
                # The anchor box is centred on the grid cell and scaled from
                # network-input pixels to frame pixels.
                ab_center_x = round(grid_corner_x + grid_w * 0.5)
                ab_center_y = round(grid_corner_y + grid_h * 0.5)
                ab_width = self.anchor_boxes[i][0] * self.img_width / self.input_width
                ab_height = self.anchor_boxes[i][1] * self.img_height / self.input_height
                top_left = (int(round(ab_center_x - 0.5 * ab_width)),
                            int(round(ab_center_y - 0.5 * ab_height)))
                bottom_right = (int(round(ab_center_x + 0.5 * ab_width)),
                                int(round(ab_center_y + 0.5 * ab_height)))
                # Coloured outline with a thin white line on top of it.
                cv2.rectangle(img, top_left, bottom_right, color, thickness)
                cv2.rectangle(img, top_left, bottom_right, (255,255,255), 1)
                text = '{}:{} {:.0%} {}({})'.format(c+1, label, confidence, self.anchor_boxes[i], scale)
            else:
                text = '{}:{} {:.0%}'.format(c+1, label, confidence)
            if self.show_text_left:
                cv2.putText(img, text, (20, 210 + 30 * c), self.font, 2, color, 3)

        if self.show_text_left:
            cv2.putText(img, '{mn} {iw}x{ih}'.format(mn=self.model_name, iw=self.input_width,
                        ih=self.input_height), (20, 110), self.font, 2, (255,0,255), 3)
            cv2.putText(img, 'FPS: {:.2f}'.format(self.FPS), (20, 140),
                        self.font, 2, (255,0,255), 3)
            cv2.putText(img, 'REC:{}'.format('ON' if self.is_recording else "OFF"), (20, 170),
                        self.font, 2, ((0, 255, 0) if self.is_recording else (0, 0, 255)), 3)

        self.img_with_drawings = img
        cv2.imshow('Detection', img)
        
    """ 
    When 'p' is pressed, the image with drawings is saved as a .png file in the 'detections' folder.
    """
    def save_img(self):
        """Write self.img_with_drawings to 'detections/<name>_<time>.png'.

        <name> is self.img_name, or 'camera' when no name is set (None or
        empty string). <time> is the local day-of-month and time of day.
        The 'detections' folder is created if it does not exist yet.
        """
        # img_name defaults to None in __init__, so test truthiness, not len().
        img_name = self.img_name if self.img_name else 'camera'
        time_of_save = time.strftime('%d_%H%M%S', time.localtime())
        # Without the folder cv2.imwrite would fail without raising.
        os.makedirs('detections', exist_ok=True)
        cv2.imwrite('detections/{}_{}.png'.format(img_name, time_of_save),
                    self.img_with_drawings)
    
    """ 
    Keyboard input handling. 
    """
    def keyboard(self, key, img):
        """React to one key code and redraw or re-detect as needed.

        Keys:
            q            -> returns 'quit'
            space        -> returns 'pause-unpause'
            3 # 4 $      -> load yolov3 / yolov3-tiny / yolov4 / yolov4-tiny,
                            re-detect, update FPS, redraw
            ] [          -> network input height +/- one 32-pixel grid row
                            (kept within 1..1024 rows), re-detect, redraw
            r            -> toggle aspect-ratio blob, re-detect, redraw
            t T g b      -> toggle box text / left text / grid / anchor box,
                            redraw
            p            -> save the current image
            v            -> toggle the recording flag
            w s / W S    -> IOU threshold +/- 0.01 / 0.1, redraw
            d a / D A    -> confidence threshold +/- 0.01 / 0.1, redraw

        Args:
            key: Key code as returned by cv2.waitKey.
            img: Frame to run detection on when a key requires it.

        Returns:
            'quit', 'pause-unpause', or None for every other key.
        """
        if key == ord('q'):
            return 'quit'
        if key == ord(' '):
            return 'pause-unpause'

        # Model switches: reload the net and time a fresh detection.
        model_for_key = {
            ord('3'): 'yolov3',
            ord('#'): 'yolov3-tiny',
            ord('4'): 'yolov4',
            ord('$'): 'yolov4-tiny',
        }
        if key in model_for_key:
            self.configure_net(model_for_key[key])
            began = time.time()
            self.detect(img)
            self.FPS = 1/(time.time() - began)
            self.draw_img()
            return None

        # Network input size: needs a new detection.
        if key == ord(']') or key == ord('['):
            if key == ord(']'):
                self.grids_per_height = min(self.grids_per_height + 1, 1024)
            else:
                self.grids_per_height = max(self.grids_per_height - 1, 1)
            self.input_height = self.grids_per_height * 32
            self.detect(img)
            self.draw_img()
            return None

        if key == ord('r'):
            self.is_blob_aspect_ratio = not self.is_blob_aspect_ratio
            self.detect(img)
            self.draw_img()
            return None

        # Pure display switches: flip the flag and redraw the cached result.
        flag_for_key = {
            ord('t'): 'show_text_on_box',
            ord('T'): 'show_text_left',
            ord('g'): 'grid_show',
            ord('b'): 'anchor_box_show',
        }
        if key in flag_for_key:
            flag = flag_for_key[key]
            setattr(self, flag, not getattr(self, flag))
            self.draw_img()
            return None

        if key == ord('p'):
            self.save_img()
            return None

        if key == ord('v'):
            self.is_recording = not self.is_recording
            return None

        # Threshold steps: (attribute, raise?, step size).
        step_for_key = {
            ord('w'): ('IOU_threshold', True, 0.01),
            ord('s'): ('IOU_threshold', False, 0.01),
            ord('d'): ('MIN_confidence', True, 0.01),
            ord('a'): ('MIN_confidence', False, 0.01),
            ord('W'): ('IOU_threshold', True, 0.1),
            ord('S'): ('IOU_threshold', False, 0.1),
            ord('D'): ('MIN_confidence', True, 0.1),
            ord('A'): ('MIN_confidence', False, 0.1),
        }
        if key in step_for_key:
            attr, increase, amount = step_for_key[key]
            current = getattr(self, attr)
            if increase:
                setattr(self, attr, min(current + amount, 1))
            else:
                setattr(self, attr, max(current - amount, 0))
            self.draw_img()
        return None
 
""" Resizing image. """
def image_resize(image, width = None, height = None, inter = cv2.INTER_AREA):
    """Resize image to the given width or height, keeping the aspect ratio.

    Args:
        image: Array whose first two dimensions are height and width.
        width: Target width in pixels. Takes precedence when both width and
            height are given.
        height: Target height in pixels, used only when width is None.
        inter: Interpolation flag forwarded to cv2.resize.

    Returns:
        The unchanged image when neither width nor height is given,
        otherwise the resized image.
    """
    (h, w) = image.shape[:2]
    if width is None and height is None:
        return image
    if width is None:
        r = height / float(h)
        # int() truncates; keep at least one pixel so that cv2.resize does
        # not reject an empty size for very elongated images.
        dim = (max(1, int(w * r)), height)
    else:
        r = width / float(w)
        dim = (width, max(1, int(h * r)))
    return cv2.resize(image, dim, interpolation = inter)

def video_recording(self, output_path):
    """Create an MJPG cv2.VideoWriter for output_path and store it on self.

    The writer uses self.FPS as frame rate and (self.img_width,
    self.img_height) as frame size, and is kept in self.video_record.

    NOTE(review): this takes 'self' but is defined at module level, not
    inside a class -- call it as video_recording(det, path); confirm intent.
    """
    codec = cv2.VideoWriter_fourcc(*'MJPG')
    frame_size = (self.img_width, self.img_height)
    writer = cv2.VideoWriter(output_path, codec, self.FPS, frame_size)
    self.video_record = writer

In [None]:
""" Run detection on video from your computer or camera. """
Det = Detection(model_name='yolov4-tiny', input_height=320, output_height=700, is_blob_aspect_ratio=False)
Det.configure_net()

# For capturing camera video use 0, or 1, 2, 3, ... if several cameras
# are connected to the computer.
path_to_folder_input = r'data'
video_source_name = 'NY.wmv'

# Resolve the direct stream URL of the web video. The stream itself is only
# opened further down, and only if it is the selected source (previously a
# capture on it was opened here and never used or released).
url   = "https://www.youtube.com/watch?v=kf4brQ2g5FI"
web_video = pafy.new(url)
best  = web_video.getbest(preftype="mp4")
""" CHOOSE """
#video_source_path = r'{}/{}'.format(path_to_folder_input, video_source_name) # video file
#video_source_path = best.url # web video
video_source_path = 0 # CAMERA

video = cv2.VideoCapture(video_source_path)

print('Video image size {}x{}.'.format(int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
                                 int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))))
if not int(video.get(cv2.CAP_PROP_FRAME_WIDTH)):
    print("""You probably misspelled the file name or the file does not exist.
Or you forgot to plug your camera in.""")


# Set the name you want, or leave it empty to derive it from the source.
video_save_name = ''
if not video_save_name:
    if video_source_path == 0:
        video_save_name = 'camera'
    elif video_source_path == best.url:
        video_save_name = 'web'
    else:
        video_save_name = video_source_name.split('.')[0]

Det.img_name = video_save_name

# A few runs of the net to measure the FPS for the save file (it can also
# be set manually) and to learn the output frame size.
FPS_of_source_video = video.get(cv2.CAP_PROP_FPS)
for _ in range(2):
    start_time = time.time()
    check, img = video.read()
    if not check:
        # Without a frame the writer size below would be unknown.
        video.release()
        raise RuntimeError('Could not read a frame from the video source.')
    Det.detect(img)
    Det.draw_img()
    Det.FPS = 1/(time.time() - start_time)
""" CHOOSE """
FPS_of_save_video = FPS_of_source_video # Not advised when source of the video is camera
#FPS_of_save_video = Det.FPS

time_of_save = time.strftime('%d_%H%M%S', time.localtime())
os.makedirs('detections', exist_ok=True)
output_path = 'detections/{}_{}.mp4'.format(video_save_name, time_of_save)
# NOTE(review): MJPG is written into an .mp4 container here -- confirm that
# the installed OpenCV build accepts this combination.
video_record = cv2.VideoWriter(output_path, cv2.VideoWriter_fourcc(*'MJPG'),
                               FPS_of_save_video, (Det.img_width, Det.img_height))


run_detection = True
try:
    while run_detection:
        start_time = time.time()
        check, img = video.read()
        if not check:
            # End of the file/stream, or the camera stopped delivering frames.
            break
        if isinstance(video_source_path, int):
            # Camera source: mirror the picture.
            img = cv2.flip(img, 1)

        Det.detect(img)
        Det.draw_img()

        if Det.is_recording:
            Det.is_any_frame_recorded = True
            video_record.write(Det.img_with_drawings)

        # cv2.waitKey(1) returns the pressed key, or -1 if none was pressed.
        key = cv2.waitKey(1)
        if key != -1:
            key_response = Det.keyboard(key, img)
            if key_response == 'quit':
                run_detection = False
            elif key_response == 'pause-unpause':
                while True:
                    # cv2.waitKey(0) waits until a key is pressed.
                    key = cv2.waitKey(0)
                    key_response = Det.keyboard(key, img)
                    if key_response == 'quit':
                        run_detection = False
                        break
                    elif key_response == 'pause-unpause':
                        break
                    if cv2.getWindowProperty('Detection', cv2.WND_PROP_VISIBLE) < 1:
                        run_detection = False
                        break

        # Stop when the window has been closed.
        if cv2.getWindowProperty('Detection', cv2.WND_PROP_VISIBLE) < 1:
            run_detection = False
        Det.FPS = 1/(time.time() - start_time)
except Exception:
    print(traceback.format_exc())
finally:
    # Single clean-up path: running it both after an error and after the
    # loop made the second os.remove() fail on the already deleted file.
    video.release()
    video_record.release()
    if not Det.is_any_frame_recorded and os.path.exists(output_path):
        os.remove(output_path)
    cv2.destroyAllWindows()

In [None]:
""" Run detection on image from your computer. """
Det = Detection(model_name='yolov4', input_height=320, output_height=700)
Det.configure_net()
path_to_folder_input = r'data'
img_name_with_extension = 'cat4k.jpg'
# File name without its extension; used as the prefix of saved images.
Det.img_name = os.path.splitext(img_name_with_extension)[0]
try:
    # os.path.join instead of a hard-coded backslash, which only works on Windows.
    img = cv2.imread(os.path.join(path_to_folder_input, img_name_with_extension))
    if img is None:
        # cv2.imread signals a missing or unreadable file by returning None.
        print("You probably misspelled the file name or the file does not exist.")
    else:
        print('Image size {}.'.format(img.shape[:2]))

        start_time = time.time()
        Det.detect(img)
        Det.FPS = 1/(time.time() - start_time)
        Det.draw_img()

        # Not read anywhere in the code visible here; kept for compatibility.
        Det.is_pause = True
        while True:
            # A still image is handled like a paused video: wait for a key.
            key = cv2.waitKey(0)
            if Det.keyboard(key, img) == 'quit':
                break
            if cv2.getWindowProperty('Detection', cv2.WND_PROP_VISIBLE) < 1:
                break
except Exception:
    print(traceback.format_exc())
finally:
    cv2.destroyAllWindows()

In [None]:
""" Run detection on an image from the web. """
Det = Detection(model_name='yolov4', input_height=320, output_height=700)
Det.configure_net()
url_to_img = '''
https://64.media.tumblr.com/91ec579b616e006c1d689c51f763d6ce/tumblr_oklmipESuY1ucobdyo1_500.jpg
'''
# The literal above contains line breaks; remove them once and use the
# cleaned URL both for the request and for deriving the file name.
img_url = url_to_img.replace("\n", "").strip()
url_file_name = img_url.split('/')[-1]

# Set the name you want, or leave it empty to derive it from the URL.
img_save_name = ''
if img_save_name:
    Det.img_name = img_save_name
elif any(char in url_file_name for char in ['#','%','&','*',':','<','>','?']):
    # The last URL segment contains one of these characters, so it is not
    # used as a file name.
    Det.img_name = 'web_img'
else:
    Det.img_name = url_file_name

try:
    req = Request(img_url, headers={'User-Agent': 'Mozilla/5.0'})
    with urlopen(req) as response:
        raw = response.read()
    arr = np.frombuffer(raw, dtype=np.uint8)
    # IMREAD_COLOR always yields a 3-channel image; IMREAD_UNCHANGED (-1)
    # could return grayscale or 4-channel data.
    img = cv2.imdecode(arr, cv2.IMREAD_COLOR)
    if img is None:
        print("The downloaded data could not be decoded as an image.")
    else:
        print('Image size {}.'.format(img.shape[:2]))

        start_time = time.time()
        Det.detect(img)
        Det.FPS = 1/(time.time() - start_time)
        Det.draw_img()

        # Not read anywhere in the code visible here; kept for compatibility.
        Det.is_pause = True
        while True:
            # A still image is handled like a paused video: wait for a key.
            key = cv2.waitKey(0)
            if Det.keyboard(key, img) == 'quit':
                break
            if cv2.getWindowProperty('Detection', cv2.WND_PROP_VISIBLE) < 1:
                break
except urllib.error.HTTPError:
    print("This file probably does not exist(check if you paste it correctly) or it is protected.")
    print(traceback.format_exc())
except Exception:
    print(traceback.format_exc())
finally:
    cv2.destroyAllWindows()