In [1]:
import cv2
import torch
from ultralytics import YOLO
import numpy as np
import math
from numpy import random
from IPython.display import HTML
import torchvision.models as torch_models
from base64 import b64encode
import os
from IPython.display import Video
from utils import *
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps") if torch.backends.mps.is_available() else device
print(device)

preprocess = transforms.Compose([
    transforms.Resize((112, 112)),
    transforms.ToTensor(),
])

yolo_model_path = "weights/detect_large.pt"
model = YOLO(yolo_model_path)
classNames = ['basketball', 'hoop', 'person']

cuda


In [2]:
def inference_by_batch(model, 
                       cls_model,
                       video_path, 
                       cls_conf_threshold = 0.6,
                       detect_conf_threshold = 0.4,
                       save_result_vid = False, 
                       output_dir = None, 
                       saved_video_name = None,
                       batch_size=128,
                       display_result = False,
                       show_progress = True,
                       skip_to_sec = 0,
                       show_score_prob = False,
                       ):
    cap, fps, frame_width, frame_height = get_video_info(video_path)
    if skip_to_sec > 0:
        cap.set(cv2.CAP_PROP_POS_MSEC, skip_to_sec * 1000)
        
    num_skiped_frames = int(skip_to_sec * fps)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - num_skiped_frames
    
    
    if save_result_vid:
        video_name = video_path.split("/")[-1]
        video_name = video_name.split(".")[0] + ".mp4"
        output_path = "inferenced_" + video_name if output_dir is None else os.path.join(output_dir, "inferenced_" + video_name)
        if saved_video_name is not None:
            compressed_output_path = saved_video_name if output_dir is None else os.path.join(output_dir, saved_video_name)
        else:
            compressed_output_path = "compressed_inferenced_" + video_name if output_dir is None else os.path.join(output_dir, "compressed_inferenced_" + video_name)
        codec = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, codec, fps, (frame_width,frame_height))
    
    num_batches = math.ceil(total_frames / batch_size)

    results = []
    score_timestamps = []
    
    count = 0
    score = 0
    display_prob = [0.0]
    
    if show_progress:
        batch_range = tqdm(range(num_batches))
    else:
        batch_range = range(num_batches)

    for i in batch_range:
        frames = []
        for i in range(batch_size):
            ret, img = cap.read()
            if ret:
                frames.append(img)
            else:
                break

        if frames:
            results = model(frames, 
                            stream=False, 
                            verbose = False, 
                            conf=detect_conf_threshold)
        else:
            continue

        for c, r in enumerate(results):
            img = r.orig_img
            boxes = r.boxes
            cropped_images = []
            count += 1
            for box in boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])  # convert to int values
                confidence = box.conf.item()
                predicted_class = model.names[int(box.cls)] 
                if predicted_class == "hoop":
                    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
                    cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    
                    if x1 > x2 or y1 > y2:
                        continue
                    else:
                        cropped_img = img[y1:y2, x1:x2]
                        cropped_images.append(cropped_img)
                        
                if predicted_class == "basketball":
                    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    
            
            if len(cropped_images) == 0:
                continue
            pred, prob = predict_hoop_box_batch(cropped_images, cls_model,  preprocess, device, threshold=cls_conf_threshold)
            if pred.sum() > 0 and count > 60:
                score += 1
                count = 0
                current_frame = i * batch_size + c
                time_stamp = current_frame / fps
                score_timestamps.append((time_stamp, prob))
                display_prob = prob
        
            cv2.putText(img, f'Score: {score}', (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)
            if show_score_prob:
                cv2.putText(img, f'Prob: {max(display_prob):.3f}', (10, 140), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)
            if save_result_vid:
                out.write(img)
        if not ret:
            break
        
    if save_result_vid:
        out.release()
    cap.release()
    
    if save_result_vid:
        subprocess.run(['ffmpeg', '-y', '-hide_banner',  '-loglevel', 'error', '-i', output_path, '-vcodec', 'libx264', compressed_output_path], check=False)
        os.remove(output_path)
        if display_result:
            display(Video(compressed_output_path, embed=True))
        return score_timestamps, compressed_output_path
    else:
        return score_timestamps

In [3]:
def inference_by_frame(model, 
                       cls_model,
                       video_path, 
                       cls_conf_threshold = 0.6,
                       detect_conf_threshold = 0.4,
                       save_result_vid = False, 
                       output_dir = None, 
                       saved_video_name = None,
                       display_result = False,
                       show_progress = True,
                       skip_to_sec = 0,
                       show_score_prob = False,
                       
                       ):
    cap, fps, frame_width, frame_height = get_video_info(video_path)
    if skip_to_sec > 0:
        cap.set(cv2.CAP_PROP_POS_MSEC, skip_to_sec * 1000)
        
    num_skiped_frames = int(skip_to_sec * fps)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - num_skiped_frames
    
    
    if save_result_vid:
        video_name = video_path.split("/")[-1]
        video_name = video_name.split(".")[0] + ".mp4"
        output_path = "inferenced_" + video_name if output_dir is None else os.path.join(output_dir, "inferenced_" + video_name)
        if saved_video_name is not None:
            compressed_output_path = saved_video_name if output_dir is None else os.path.join(output_dir, saved_video_name)
        else:
            compressed_output_path = "compressed_inferenced_" + video_name if output_dir is None else os.path.join(output_dir, "compressed_inferenced_" + video_name)
        codec = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, codec, fps, (frame_width,frame_height))
    pbar = tqdm(total=total_frames, desc="Processing Frames", unit="frame") if show_progress else None

    score_timestamps = []
    count=0
    score = 0
    display_prob = [0.0]
    
    while True:
        ret, img = cap.read()
        frame_start_time = time.time()
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC)
        count += 1
        if ret:
            results = model(img, stream = False, device = device, conf = detect_conf_threshold, verbose = False)
            
            for r in results:
                boxes = r.boxes

                for box in boxes:
                    x1, y1, x2, y2 = box.xyxy[0]
                    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) # convert to int values
                    confidence = box.conf[0]
                    predicted_class = model.names[int(box.cls)]
                    
                    # If "basketball-hoops" is detected, make a prediction with cls_model
                    if predicted_class == "hoop":
                        # Crop the image and convert to PIL Image
                        # try:
                        if x1 > x2 or y1 > y2:
                            continue
                        else:
                            _, prediction, prob = predict_hoop_box(img, cls_model, x1, y1, x2, y2, preprocess, device, cls_conf_threshold)
                            if prediction == 1 and count > 60:
                                score += 1
                                count = 0
                                display_prob = prob
                                score_timestamps.append((current_time, prob))

                        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
                        cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    
                    if predicted_class == "basketball":
                        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            cv2.putText(img, f'Score: {score}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)
            if show_score_prob:
                cv2.putText(img, f'Prob: {display_prob[0]:.3f}', (10, 140), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)

            if show_progress:
                frame_end_time = time.time()  # End time for frame processing
                time_per_frame = frame_end_time - frame_start_time
                pbar.set_postfix(time_per_frame=f"{time_per_frame:.3f} sec")
                pbar.update(1)

            
        else:
            break
        
        if save_result_vid:
            out.write(img)
            
    if save_result_vid:
        out.release()
    cap.release()
    
    if save_result_vid:
        subprocess.run(['ffmpeg', '-y', '-hide_banner',  '-loglevel', 'error', '-i', output_path, '-vcodec', 'libx264', compressed_output_path], check=False)
        os.remove(output_path)
        if display_result:
            display(Video(compressed_output_path, embed=True))
        return score_timestamps, compressed_output_path
    else:
        return score_timestamps

In [4]:
def test_model(model, cls_model, all_made, all_miss, conf = 0.5):
    true_labels = np.array([1] * len(os.listdir(all_made)) + [0] * len(os.listdir(all_miss)))
    predictions = np.empty(len(true_labels))
    i = 0
    for cls in [all_made, all_miss]:
        for vid in tqdm(os.listdir(cls)):
            vid = cls + f"/{vid}"
            result = inference_by_batch(model,
                                        cls_model,
                                        video_path = vid,
                                        save_result_vid = False,
                                        display_result = False,
                                        batch_size = 128,
                                        show_progress=False,
                                        cls_conf_threshold=conf)
            
            predictions[i] = len(result) > 0
            i += 1
    conf_mat = confusion_matrix(true_labels, predictions)
    
    return true_labels, predictions, conf_mat

In [5]:
test_set_dir = "video_test_dataset"
all_made = test_set_dir + "/1/"
all_miss = test_set_dir + "/0/"
print(len(os.listdir(all_made)) + len(os.listdir(all_miss)))
made_vids = os.listdir(all_made)

resnet_50 = load_resnet50("cls_chkpoint_resnet50/checkpoint_2023-12-18-21-45_lr_0.0001_batch_64/best_model.pth", device=device)
resnet_50.eval();


160


In [6]:
inference_by_batch(model, 
                   resnet_50,
                   video_path = all_made + made_vids[0],
                   save_result_vid = False,
                   display_result = False,
                   batch_size = 128,
                   show_progress=False,
                   cls_conf_threshold=0.3,
                   detect_conf_threshold=0.4,
                   show_score_prob=True)

[(447.41363333333334, [0.010158891789615154, 0.4117841422557831])]

In [7]:
results = test_model(model, resnet_50, all_made, all_miss, conf = 0.3)

100%|██████████| 85/85 [07:49<00:00,  5.52s/it]
100%|██████████| 75/75 [07:13<00:00,  5.79s/it]


In [10]:
accuracy = results[0] == results[1]
accuracy = accuracy.sum() / len(accuracy)
accuracy

0.55