In [3]:
from ultralytics import YOLO
import cv2
from utils import *
from cls_detection import *
# Inference device handed to the YOLO model calls below: GPU when CUDA is
# available, otherwise CPU.
# NOTE(review): `torch` is not imported explicitly in this cell; presumably it
# is re-exported by one of the star imports above — confirm and import it directly.
device = 'cuda' if torch.cuda.is_available() else 'cpu'


In [4]:
def get_detection_box(x1, y1, x2, y2, n_height = 3.5, n_width = 2):
    """
    Scale a box about its own centre and return the enlarged box.

    Args:
        x1, y1, x2, y2: corners of the input box (top-left, bottom-right).
        n_height: factor applied to the box height.
        n_width: factor applied to the box width.

    Returns:
        (new_x1, new_y1, new_x2, new_y2), each rounded with the built-in
        ``round`` and converted to ``int``.
    """
    mid_x, mid_y = (x1 + x2) / 2, (y1 + y2) / 2

    # Half extents of the scaled box
    half_w = (x2 - x1) * n_width / 2
    half_h = (y2 - y1) * n_height / 2

    corners = (mid_x - half_w, mid_y - half_h, mid_x + half_w, mid_y + half_h)
    return tuple(int(round(value)) for value in corners)

def get_entry_box(x1, y1, x2, y2, alpha = 0.2, beta = -0.2):
    """
    Build the box that sits directly on top of the given box.

    The result ends at ``y1`` (the top edge of the input box), is
    ``int(height * (1 + beta))`` tall, and is widened on each side by
    ``int(width * alpha)``.

    Returns:
        (x1, y1, x2, y2) of the entry box.
    """
    side_pad = int((x2 - x1) * alpha)
    rise = int((y2 - y1) * (1 + beta))
    return x1 - side_pad, y1 - rise, x2 + side_pad, y1

def get_exit_box(x1, y1, x2, y2, alpha = 0.2, beta = -0.2):
    """
    Build the box that hangs directly below the given box.

    The result starts at ``y2`` (the bottom edge of the input box), is
    ``int(height * (1 + beta))`` tall, and is widened on each side by
    ``int(width * alpha)``.

    Returns:
        (x1, y1, x2, y2) of the exit box.
    """
    side_pad = int((x2 - x1) * alpha)
    drop = int((y2 - y1) * (1 + beta))
    return x1 - side_pad, y2, x2 + side_pad, y2 + drop

def is_in_box(x_center, y_center, x1, y1, x2, y2):
    """Return True when the point lies inside the box; edges count as inside."""
    return bool(x1 <= x_center <= x2 and y1 <= y_center <= y2)

def get_center(x1, y1, x2, y2):
    """Return the (x, y) midpoint of a box, using floor division."""
    mid_x = (x1 + x2) // 2
    mid_y = (y1 + y2) // 2
    return mid_x, mid_y



In [5]:

# Detect basketballs and hoops in a video in batches, track which zone around
# the hoop the ball is in, count a score on an entry -> hoop/exit transition,
# and write the annotated frames to a new video.
model = YOLO("weights/detect_large.pt")
conf = 0.4
skip_to_sec = 0
batch_size = 64
show_progress = True

# NOTE(review): absolute, user-specific path — move to a configurable data dir.
video_path = "/homedir/ugrad/z/zw2688/bigdata/DL_project/videos/2023_12_17__20230731_Game8.mp4"

cap, fps, frame_width, frame_height, total_frames = initialize_video_capture(video_path=video_path, skip_to_sec = skip_to_sec)
out, output_path = initialize_video_writer(fps = fps,
                                           video_dimension= (frame_width, frame_height),
                                           video_path=video_path,
                                           )

num_batches = math.ceil(total_frames / batch_size)

# Colours are given in the channel order OpenCV draws with (see the names on the right).
colors = {
    "hoop_box": (0, 255, 0),          # Green
    "detection_area": (255, 0, 0), # Blue
    "entry_box": (0, 0, 255),      # Red
    "exit_box": (0, 255, 255),     # Yellow
    "basketball": (255, 255, 0),   # Cyan
    "person": (255, 0, 255)        # Magenta
}

box_containing_ball_prev = None
score = 0
no_relevant_ball = True
if show_progress:
    batch_range = tqdm(range(num_batches))
else:
    batch_range = range(num_batches)

# try/finally so the capture and the writer are released (and the output file
# finalised) even if inference or drawing raises part-way through.
try:
    for _ in batch_range:
        # Read up to batch_size frames; a failed read means the video is exhausted.
        frames = []
        for _ in range(batch_size):
            ret, img = cap.read()
            if not ret:
                break
            frames.append(img)

        if not frames:
            continue

        results = model(frames,
                        stream=False,
                        verbose = False,
                        conf=conf,
                        device=device)

        for frame, r in zip(frames, results):
            boxes = r.boxes

            bounding_boxes = boxes.xyxy.cpu().numpy().astype(int)
            labels = [model.names[int(cls_id)] for cls_id in boxes.cls.cpu().numpy()]

            # Group boxes by class name. Coordinates are converted to plain
            # Python ints so every value later handed to OpenCV is a native int.
            objects = {}
            for box, label in zip(bounding_boxes, labels):
                objects.setdefault(label, []).append(tuple(int(v) for v in box))

            if "basketball" not in objects or "hoop" not in objects:
                out.write(frame)
                continue

            hoop_box = objects["hoop"]
            detection_area = [get_detection_box(*box) for box in hoop_box]
            entry_box = [get_entry_box(*box) for box in hoop_box]
            exit_box = [get_exit_box(*box) for box in hoop_box]
            ball_center = [get_center(*box) for box in objects["basketball"]]

            # A ball is relevant when its centre lies in some hoop's detection area.
            relevant_ball_centers = [center for center in ball_center
                                     if any(is_in_box(*center, *det_area)
                                            for det_area in detection_area)]
            no_relevant_ball = not relevant_ball_centers
            if no_relevant_ball:
                # Fix: this frame used to be skipped without being written, so
                # the output video silently lost every frame where the ball was
                # away from the hoop. The previous-zone state is intentionally
                # left untouched so scoring compares against the last frame in
                # which a relevant ball was seen.
                out.write(frame)
                continue

            for ball_box in objects["basketball"]:
                cv2.circle(frame, get_center(*ball_box), 5, colors["basketball"], -1)

            focus_areas = {
                "detection_area": detection_area,
                "hoop_box": hoop_box,
                "entry_box": entry_box,
                "exit_box": exit_box
            }
            box_containing_ball_cur = None
            # Determine which zone the ball is in. Zones are visited in dict
            # order, so a later matching zone overrides an earlier one.
            for box_name, all_boxes in focus_areas.items():
                for box in all_boxes:
                    ball_inside = any(is_in_box(*center, *box) for center in relevant_ball_centers)
                    if ball_inside:
                        box_containing_ball_cur = box_name
                        draw_color = colors[box_name]
                    else:
                        draw_color = (0, 0, 0)
                    cv2.rectangle(frame, (box[0], box[1]), (box[2], box[3]), draw_color, 2)
                    cv2.putText(frame, box_name, (box[0], box[1] - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, draw_color, 2)

            # A score is an entry_box -> hoop_box/exit_box transition between
            # two consecutive frames that contain a relevant ball.
            if box_containing_ball_prev == "entry_box" and box_containing_ball_cur in ("hoop_box", "exit_box"):
                score += 1

            box_containing_ball_prev = box_containing_ball_cur

            cv2.putText(frame, f"Score: {score}", (20, 80), cv2.FONT_HERSHEY_SIMPLEX, 3, (255, 255, 255), 2)
            out.write(frame)
finally:
    cap.release()
    out.release()

#display_video(output_path, ffmpeg_path = "ffmpeg", width = 1000)

  4%|▎         | 9/247 [00:16<07:03,  1.78s/it]

100%|██████████| 247/247 [07:18<00:00,  1.78s/it]


Scoring detection logic:
1. The ball must first be detected in the entry box.
2. The next time the ball is detected, it must be in either the hoop_box or the exit_box.

5.01 s ± 106 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)

In [None]:
# Show the annotated video in the notebook. `output_path` is whatever the most
# recently run initialize_video_writer(...) call returned, so this cell depends
# on kernel state from an earlier cell.
display_video(output_path, ffmpeg_path = "ffmpeg", width = 1000)

In [19]:
# Leftover of a batch-size benchmark. As written, the three `break`s below stop
# every loop after its first pass, so the only lasting effect is that `frames`
# holds the first (up to) 128 decoded frames of the clip, which the next timing
# cell slices. The timing code itself is commented out at the bottom.
#
# NOTE(review): `batch_range` and `skip_to_sec` are not defined in this cell;
# they are reused from the earlier processing cell (hidden kernel state), and
# `batch_range` was sized for a different video and batch size.
# NOTE(review): `cap` and `out` opened here are never released in the live code
# (the only release call is in the commented-out section), and the video writer
# is created although nothing is written to it.
time_taken = []
for batch_size in [128]:
    time_ = 0
    for i in range(8):
        video_path = "/homedir/ugrad/z/zw2688/bigdata/DL_project/clips/20230720_Game4.mp4"



        cap, fps, frame_width, frame_height, total_frames = initialize_video_capture(video_path=video_path, skip_to_sec = skip_to_sec)
        out, output_path = initialize_video_writer(fps = fps,
                                                video_dimension= (frame_width, frame_height),
                                                video_path=video_path,
                                                )

        num_batches = math.ceil(total_frames / batch_size)
        
        start = time.time()
        # The loop variable `i` is reused at all three nesting levels.
        for i in batch_range:
            frames = []
            for i in range(batch_size):
                ret, img = cap.read()
                if ret:
                    frames.append(img)
                else:
                    break
            break  # stop after the first batch has been read
        break  # stop after the first of the 8 repetitions
    break  # stop after the first batch size
    # Disabled benchmark body: run the model on each batch and average the
    # elapsed time over the 8 repetitions.
    #         if frames:
    #             results = model(frames, 
    #                             stream=False, 
    #                             verbose = False, 
    #                             conf=conf,
    #                             device=device)
    #         else:
    #             continue
    #     time_ += time.time() - start
    # time_taken.append(time_ / 8)
    # print(f"batch size: {batch_size} took {time_ / 8}")
    # cap.release()
    
        
    

In [24]:
# Time one model.predict call on the first `b` frames for each batch size and
# print the elapsed wall-clock time divided by `b`.
# `frames` is the list left in the kernel by the previous cell.
# NOTE(review): `time_taken` was a list in the previous cell and is rebound to
# a float here.
# NOTE(review): there is no warm-up call and each size is timed once, so the
# first figure (batch size 1) may include one-off start-up cost — confirm
# before drawing conclusions from it.
for b in [1, 2, 4, 8, 16, 32, 64, 128]:
    start_time = time.time()
    results = model.predict(frames[:b], verbose = False, device = device)
    time_taken = time.time() - start_time
    print(f"batch size: {b} took {time_taken / b} seconds per image")

batch size: 1 took 0.023992300033569336 seconds per image
batch size: 2 took 0.016234517097473145 seconds per image
batch size: 4 took 0.014271020889282227 seconds per image
batch size: 8 took 0.012939661741256714 seconds per image
batch size: 16 took 0.011395186185836792 seconds per image
batch size: 32 took 0.01194259524345398 seconds per image
batch size: 64 took 0.012118056416511536 seconds per image
batch size: 128 took 0.0121160838752985 seconds per image


In [33]:
from deep_sort_pytorch.utils.parser import get_config
from deep_sort_pytorch.deep_sort import DeepSort

In [76]:
# Group the detections of a single result by class name:
# objects[label] -> list of (x1, y1, x2, y2) tuples with integer coordinates.
# NOTE(review): `boxes` is not defined in this cell; it is whatever a previously
# run cell left in the kernel, so this cell fails on a fresh kernel.
bounding_boxes = boxes.xyxy.cpu().numpy()
bounding_boxes = bounding_boxes.astype(int)  # truncate the float coordinates to ints
labels = [model.names[i] for i in boxes.cls.cpu().numpy()]  # class id -> class name

# One empty list per class present in this result
objects = {label: [] for label in labels}
for box, label in zip(bounding_boxes, labels):
    x1, y1, x2, y2 = box
    objects[label].append((x1, y1, x2, y2))

In [97]:
# Display the grouped detections.
# NOTE(review): the saved output below shows 'hoop' entries as dicts with
# 'hoop'/'entry'/'exit' keys and the other entries as arrays, which the cell
# above does not produce — the output appears to come from code that is no
# longer in the notebook.
objects

{'person': [array([184, 335, 308, 619]),
  array([343, 368, 478, 563]),
  array([830, 336, 933, 546]),
  array([ 899,  293, 1041,  656])],
 'hoop': [{'hoop': array([485, 193, 535, 247]),
   'entry': (475, 150, 545, 193),
   'exit': (475, 247, 545, 290)}],
 'basketball': [array([280, 360, 320, 403]),
  array([1110,  422, 1133,  445]),
  array([713, 432, 735, 454])]}

In [143]:
# Rebind `model` to a different weights file than the one loaded in the earlier
# processing cell; every cell run after this one uses these weights.
model = YOLO("weights/detect_large_v10.pt")
# Class-id -> class-name mapping, read as a global by draw_boxes below.
classNames = model.names
def initialize_deepsort():
    """
    Build a DeepSort tracker from the project's YAML configuration.

    Loads "deep_sort_pytorch/configs/deep_sort.yaml" into a fresh config object
    and forwards every value from its DEEPSORT section to the DeepSort
    constructor. CUDA use is requested unconditionally (use_cuda=True).

    Returns:
        The constructed DeepSort instance.
    """
    cfg_deep = get_config()
    cfg_deep.merge_from_file("deep_sort_pytorch/configs/deep_sort.yaml")
    settings = cfg_deep.DEEPSORT

    # Keyword arguments taken one-to-one from the DEEPSORT config section
    tracker_kwargs = {
        "max_dist": settings.MAX_DIST,
        "min_confidence": settings.MIN_CONFIDENCE,
        "nms_max_overlap": settings.NMS_MAX_OVERLAP,
        "max_iou_distance": settings.MAX_IOU_DISTANCE,
        "max_age": settings.MAX_AGE,
        "n_init": settings.N_INIT,
        "nn_budget": settings.NN_BUDGET,
        "use_cuda": True,
    }
    return DeepSort(settings.REID_CKPT, **tracker_kwargs)
def compute_color_for_labels(label):
    """
    Return a fixed colour for a class id.

    Class ids 0-3 keep their hand-picked colours. Any other id previously left
    `color` unassigned and raised UnboundLocalError; it now gets a
    deterministic colour derived from the id, so a model with more classes
    can still be drawn.

    Args:
        label: integer class id.

    Returns:
        A 3-tuple of ints in the range 0-255.
    """
    fixed_colors = {
        0: (85, 45, 255),
        1: (222, 82, 175),
        2: (0, 204, 255),
        3: (0, 149, 255),
    }
    if label in fixed_colors:
        return fixed_colors[label]

    # Fallback: spread ids over the colour space with three large multipliers.
    palette = (2 ** 11 - 1, 2 ** 15 - 1, 2 ** 20 - 1)
    return tuple(int((p * (label ** 2 - label + 1)) % 255) for p in palette)

def draw_boxes(img, bbox, identities=None, categories=None, names=None, offset=(0,0)):
    """
    Draw labelled bounding boxes on `img` in place.

    Fixes over the previous version:
      * the y coordinates are shifted by offset[1] (they used offset[0]);
      * the label background and text are drawn on `img` (they were drawn on
        the global `frame`, which only worked when the caller passed that same
        array);
      * `names`, when given, is used for the label text instead of being ignored.

    Args:
        img: image array to draw on.
        bbox: iterable of (x1, y1, x2, y2) boxes.
        identities: optional per-box ids; 0 is used when omitted.
        categories: optional per-box class ids; 0 is used when omitted.
        names: optional class-id -> name mapping; defaults to the module-level
            `classNames`.
        offset: (x, y) shift added to every box.

    Returns:
        The same `img` object, with the boxes and labels drawn.
    """
    label_names = names if names is not None else classNames
    for idx, box in enumerate(bbox):
        x1, y1, x2, y2 = [int(v) for v in box]
        x1 += offset[0]
        x2 += offset[0]
        y1 += offset[1]
        y2 += offset[1]
        cat = int(categories[idx]) if categories is not None else 0
        track_id = int(identities[idx]) if identities is not None else 0
        color = compute_color_for_labels(cat)

        # Outline of the detected object
        cv2.rectangle(img, (x1, y1), (x2, y2), color=color, thickness=2, lineType=cv2.LINE_AA)
        label = str(track_id) + ":" + label_names[cat]

        # Filled strip above the box, sized to the label text, with the
        # "<id>:<class name>" label written over it in white
        t_size = cv2.getTextSize(label, cv2.FONT_HERSHEY_SIMPLEX, fontScale=1/2, thickness=1)[0]
        c2 = x1 + t_size[0], y1 - t_size[1] - 3
        cv2.rectangle(img, (x1, y1), c2, color=color, thickness=-1, lineType=cv2.LINE_AA)
        cv2.putText(img, label, (x1, y1-2), 0, 1/2, [255, 255, 255], thickness=1, lineType=cv2.LINE_AA)
    return img

In [146]:

# Run YOLO detection plus DeepSORT tracking on one clip and write the annotated
# frames to "output.mp4".

# Detection confidence threshold. This cell used to set 0.5 here, pass a
# hard-coded 0.2 to the model, and then overwrite `conf` with each detection's
# confidence inside the loop. 0.2 is the value that was actually in effect, so
# it is kept and now used consistently.
# NOTE(review): confirm whether 0.2 or 0.5 was intended.
conf = 0.2

video_path = "video_test_dataset/1/made_15.mp4"
cap, fps, width, height = get_video_info(video_path)
codec = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter("output.mp4", codec, fps, (width, height))

deepsort = initialize_deepsort()

# try/finally so the writer and capture are released even if a frame fails.
try:
    while True:
        ret, frame = cap.read()
        if not ret:
            break

        results = model(frame, conf=conf, verbose = False)
        boxes = results[0].boxes
        confidences = boxes.conf.cpu().numpy().tolist()
        labels = boxes.cls.cpu().numpy().tolist()
        bbox_xyxys = boxes.xyxy.cpu().numpy().tolist()

        # Convert each detection to [center_x, center_y, width, height] built
        # from the integer-truncated corner coordinates.
        xywh_bboxs = []
        confs = []
        oids = []
        for bbox_xyxy, confidence, cls in zip(bbox_xyxys, confidences, labels):
            x1, y1, x2, y2 = (int(v) for v in bbox_xyxy)
            cx, cy = int((x1+x2)/2), int((y1+y2)/2)
            bbox_width = abs(x1-x2)
            bbox_height = abs(y1-y2)
            xywh_bboxs.append([cx, cy, bbox_width, bbox_height])
            confs.append(round(confidence, 3))
            oids.append(int(cls))

        # reshape(-1, 4) keeps the tensor two-dimensional, shape (0, 4), on a
        # frame with no detections; without it the tensor is 1-D with shape (0,).
        xywhs = torch.tensor(xywh_bboxs).reshape(-1, 4)
        confss = torch.tensor(confs)
        outputs = deepsort.update(xywhs, confss, oids, frame)
        if len(outputs) > 0:
            tracked_xyxy = outputs[:, :4]
            # NOTE(review): assumes the last two output columns are
            # (track id, class id) — verify against the DeepSort version in use.
            identities = outputs[:, -2]
            object_id = outputs[:, -1]
            draw_boxes(frame, tracked_xyxy, identities, object_id)
        out.write(frame)
finally:
    out.release()
    cap.release()


In [147]:
# Show the tracking result written to "output.mp4" by the previous cell.
display_video("output.mp4", width = 960, ffmpeg_path="ffmpeg")

In [127]:
# Class-id -> class-name mapping of the currently loaded model.
# NOTE(review): this cell's execution count (127) is lower than that of the
# cell that loads detect_large_v10.pt (143), so the saved output below may
# belong to whichever model was loaded at that earlier point.
model.names

{0: 'basketball', 1: 'hoop', 2: 'made', 3: 'person'}