In [32]:
import cv2
import torch
from ultralytics import YOLO
import numpy as np
import math
from numpy import random
from IPython.display import HTML
import torchvision.models as torch_models
from base64 import b64encode
import os
from IPython.display import Video
from utils import *
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
#from cls_detection import *
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device = torch.device("mps") if torch.backends.mps.is_available() else device
print(device)

preprocess = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

yolo_model_path = "weights/detect_large_v2.pt"
model = YOLO(yolo_model_path)
classNames = ['basketball', 'hoop', 'made', 'person']

cuda


In [53]:
cls_predict(cls_model,
            [Image.open("data/classification_dataset_groupby_env_split/train/1/Screenshot-2023-07-24-at-2-15-35-PM-2-_png.rf.f07b3ee9df294a4027b326dc292361c3.jpg")],
            preprocess,
            device)

(array([ True]), array([    0.99987], dtype=float32))

In [63]:
def cls_predict_image(cls_model, img, preprocess, device, threshold = 0.5):
    input_tensor = preprocess(img).unsqueeze(0).to(device)
    with torch.no_grad():
        cls_output = cls_model(input_tensor)
    probability = torch.sigmoid(cls_output.squeeze())

    # prob, predicted_class = torch.max(probability, dim=0)
    # return predicted_class.item(), prob.item()
    return probability[1] > threshold, probability


def cls_predict(cls_model, batch_imgs, preprocess, device, threshold = 0.5):
    # Process and batch images
    # check if batch_imgs is a list.
        

    batch_tensor = torch.stack([preprocess(img) for img in batch_imgs])
    batch_tensor = batch_tensor.to(device)


    # Forward pass for the whole batch
    with torch.no_grad():
        cls_output = cls_model(batch_tensor)

    # Calculate probabilities and predicted classes
    probabilities = torch.sigmoid(cls_output).squeeze(1)
    predictions = probabilities > threshold
    predictions = predictions

    return predictions.cpu().numpy(), probabilities.cpu().numpy()

def predict_hoop_box(img, cls_model,  preprocess, device, threshold = 0.5):
    cropped_imgs_pil = []

    for cropped_img in img:
        cropped_img_pil = Image.fromarray(cv2.cvtColor(cropped_img, cv2.COLOR_RGB2BGR))
        cropped_imgs_pil.append(cropped_img_pil)

    return cls_predict(cls_model, cropped_imgs_pil, preprocess, device, threshold)


def inference_by_batch(model,
                       cls_model,
                       video_path, 
                       cls_conf_threshold = 0.6,
                       detect_conf_threshold = 0.4,
                       save_result_vid = False, 
                       output_dir = None, 
                       saved_video_name = None,
                       batch_size=128,
                       display_result = False,
                       show_progress = True,
                       skip_to_sec = 0,
                       show_score_prob = False,
                       cls_img_size = 112,
                       device = device,
                       ):
    preprocess = transforms.Compose([
                transforms.Resize((cls_img_size, cls_img_size)),
                transforms.ToTensor(),
            ])
    cls_model.to(device)    
    cls_model.eval()
    print("Models loaded!")
    
    cap, fps, frame_width, frame_height = get_video_info(video_path)
    if skip_to_sec > 0:
        cap.set(cv2.CAP_PROP_POS_MSEC, skip_to_sec * 1000)
        
    num_skiped_frames = int(skip_to_sec * fps)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - num_skiped_frames
    
    print("Initializing video capture...")
    if save_result_vid:
        video_name = video_path.split("/")[-1]
        video_name = video_name.split(".")[0] + ".mp4"

        if saved_video_name is not None:
            output_path = saved_video_name if output_dir is None else os.path.join(output_dir, saved_video_name)
        else:
            output_path = "inferenced_" + video_name if output_dir is None else os.path.join(output_dir, "inferenced_" + video_name)
            
        codec = cv2.VideoWriter_fourcc(*'vp09')
        out = cv2.VideoWriter(output_path, codec, fps, (frame_width,frame_height))
    
    num_batches = math.ceil(total_frames / batch_size)

    results = []
    score_timestamps = []
    
    count = 0
    score = 0
    display_prob = [0.0]
    
    if show_progress:
        batch_range = tqdm(range(num_batches))
    else:
        batch_range = range(num_batches)

    for i in batch_range:
        frames = []
        for i in range(batch_size):
            ret, img = cap.read()
            if ret:
                frames.append(img)
            else:
                break

        if frames:
            results = model(frames, 
                            stream=False, 
                            verbose = False, 
                            conf=detect_conf_threshold,
                            device=device)
        else:
            continue
        
        print("Finished detecting objects in batch", i + 1, "out of", num_batches)

        for c, r in tqdm(enumerate(results)):
            #print(c)
            img = r.orig_img
            boxes = r.boxes
            cropped_images = []
            count += 1
            for box in boxes:
                x1, y1, x2, y2 = map(int, box.xyxy[0])  # convert to int values
                confidence = box.conf.item()
                predicted_class = model.names[int(box.cls)] 
                if predicted_class == "hoop":
                    cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
                    cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    
                    if x1 > x2 or y1 > y2:
                        continue
                    else:
                        cropped_img = img[y1:y2, x1:x2]
                        cropped_images.append(cropped_img)
                        
                if predicted_class == "basketball":
                    cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                    cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)
                    
            
            if len(cropped_images) == 0:
                continue
            pred, prob = predict_hoop_box_batch(cropped_images, cls_model,  preprocess, device, threshold=cls_conf_threshold)
            if pred.sum() > 0 and count > 60:
                score += 1
                count = 0
                current_frame = i * batch_size + c
                time_stamp = current_frame / fps
                score_timestamps.append((time_stamp, prob))
                display_prob = prob
        
            cv2.putText(img, f'Score: {score}', (10, 70), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)
            if show_score_prob:
                cv2.putText(img, f'Prob: {max(display_prob):.3f}', (10, 140), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)
            if save_result_vid:
                out.write(img)
        print("finished inferencing with cls")
        if not ret:
            break
        
    if save_result_vid:
        print("releasing video writer")
        out.release()
    print("releasing video capture")
    cap.release()
    

    if display_result:
        display(Video(output_path, embed=True))
        return score_timestamps, output_path
    else:
        return score_timestamps
    
def display_video(input_path, width=640, ffmpeg_path='ffmpeg-git-20231128-amd64-static/ffmpeg'):
    temp_output_path = "temp_" + os.path.basename(input_path)

    try:
        # Use subprocess to safely call FFmpeg
        subprocess.run([ffmpeg_path, '-y', '-i', input_path, '-vcodec', 'libx264', temp_output_path],
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)

        # Overwrite the original file with the compressed one
        shutil.move(temp_output_path, input_path)

        display(Video(input_path, embed=True))
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
        # Clean up the temporary file in case of an error
        if os.path.exists(temp_output_path):
            os.remove(temp_output_path)
    
def inference_by_frame(model, 
                       cls_model,
                       video_path, 
                       cls_conf_threshold = 0.6,
                       detect_conf_threshold = 0.4,
                       save_result_vid = False, 
                       output_dir = None, 
                       saved_video_name = None,
                       display_result = False,
                       show_progress = True,
                       skip_to_sec = 0,
                       show_score_prob = False,
                       device = device,
                       cls_img_size = 224
                       ):
    preprocess = transforms.Compose([
                transforms.Resize((cls_img_size, cls_img_size)),
                transforms.ToTensor(),
            ])
    
    cap, fps, frame_width, frame_height = get_video_info(video_path)
    if skip_to_sec > 0:
        cap.set(cv2.CAP_PROP_POS_MSEC, skip_to_sec * 1000)
        
    num_skiped_frames = int(skip_to_sec * fps)
    total_frames = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) - num_skiped_frames
    
    
    if save_result_vid:
        video_name = video_path.split("/")[-1]
        video_name = video_name.split(".")[0] + ".mp4"

        if saved_video_name is not None:
            output_path = saved_video_name if output_dir is None else os.path.join(output_dir, saved_video_name)
        else:
            output_path = "inferenced_" + video_name if output_dir is None else os.path.join(output_dir, "inferenced_" + video_name)
        codec = cv2.VideoWriter_fourcc(*'mp4v')
        out = cv2.VideoWriter(output_path, codec, fps, (frame_width,frame_height))
    pbar = tqdm(total=total_frames, desc="Processing Frames", unit="frame") if show_progress else None

    score_timestamps = []
    count=0
    score = 0
    display_prob = 0.0
    
    while True:
        ret, img = cap.read()
        frame_start_time = time.time()
        current_time = cap.get(cv2.CAP_PROP_POS_MSEC)
        count += 1
        if ret:
            results = model(img, stream = False, device = device, conf = detect_conf_threshold, verbose = False)
            
            for r in results:
                boxes = r.boxes

                for box in boxes:
                    x1, y1, x2, y2 = box.xyxy[0]
                    x1, y1, x2, y2 = int(x1), int(y1), int(x2), int(y2) # convert to int values
                    confidence = box.conf[0]
                    predicted_class = model.names[int(box.cls)]
                    
                    # If "basketball-hoops" is detected, make a prediction with cls_model
                    if predicted_class == "hoop":
                        # Crop the image and convert to PIL Image
                        # try:
                        if x1 > x2 or y1 > y2:
                            continue
                        else:
                            cropped_img = img[y1:y2, x1:x2]
                            prediction, prob = predict_hoop_box(cropped_img, cls_model, preprocess, device, cls_conf_threshold)
                            print(prediction)
                            if prediction[0] == 1 and count > 60:
                                score += 1
                                count = 0
                                display_prob = prob[0]
                                score_timestamps.append((current_time, prob))

                        cv2.rectangle(img, (x1, y1), (x2, y2), (255, 0, 0), 2)
                        cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (255, 0, 0), 2)
                    
                    if predicted_class == "basketball":
                        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 2)
                        cv2.putText(img, f'{predicted_class}: {confidence:.3f}', (x1, y1 - 10), cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 255, 0), 2)

            cv2.putText(img, f'Score: {score}', (10, 30), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)
            if show_score_prob:
                # print(display_prob)
                cv2.putText(img, f'Prob: {display_prob:.3f}', (10, 140), cv2.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 2)

            if show_progress:
                frame_end_time = time.time()  # End time for frame processing
                time_per_frame = frame_end_time - frame_start_time
                pbar.set_postfix(time_per_frame=f"{time_per_frame:.3f} sec")
                pbar.update(1)

            
        else:
            break
        
        if save_result_vid:
            out.write(img)
            
    if save_result_vid:
        out.release()
    cap.release()
    
    if display_result:
        display_video(output_path)
        return score_timestamps, output_path
    else:
        return score_timestamps


In [61]:
all_pos_vid = "video_test_dataset/all_made.mp4"
all_neg_vid = "video_test_dataset/all_miss.mp4"

all_models = [
    "cls_chkpoint_resnet18/checkpoint_2023-12-24-15-10_lr_0.0001_batch_64/best_model.pth",
    "cls_chkpoint_resnet18/checkpoint_2023-12-24-15-02_lr_0.001_batch_64/best_model.pth",
    "cls_chkpoint_resnet18/checkpoint_2023-12-24-15-27_lr_0.0005_batch_64/best_model.pth",
    ]

batch_size = 128 
conf = 0.4
device = "cuda:1"

for model_path in all_models:
    pos_results = inference_by_batch(yolo_model_path,
                    model_path,
                    video_path = all_pos_vid,
                    save_result_vid = False,
                    display_result = False,
                    batch_size = batch_size,
                    show_progress=True,
                    cls_conf_threshold=conf,
                    device=device,
                    model_type="resnet18"
                    )
    pos_score = len(pos_results)
    
    neg_results = inference_by_batch(yolo_model_path,
                    model_path,
                    video_path = all_neg_vid,
                    save_result_vid = False,
                    display_result = False,
                    batch_size = batch_size,
                    show_progress=True,
                    cls_conf_threshold=conf,
                    device=device,
                    model_type="resnet18"
                    )
    neg_score = len(neg_results)
    print(f"{model_path}: pos_score: {pos_score}, neg_score: {neg_score}")

TypeError: inference_by_batch() got an unexpected keyword argument 'model_type'

In [64]:
modelPath = "weights/cls_chkpoint_resnet18/checkpoint_2023-12-27-13-24_batch128_lr0.0001_optimizerrmsprop_imgsize224/best_model.pth"

batch_size = 128 
conf = 0.2

cls_model = load_resnet18(modelPath)
inference_by_frame(model,
                    cls_model,
                    video_path = "data/video_test_dataset/1/2.mp4",
                    save_result_vid = True,
                    display_result = True,
                    #batch_size = 128 * 2,
                    show_progress=True,
                    cls_conf_threshold=conf,
                    device="cuda",
                    show_score_prob=True
                    )


[A

[False  True  True False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False False  True False False False False False False False False False False False False
 False  True]


ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

In [4]:
display_video("inferenced_2.mp4")

In [11]:
Image.open("classification_dataset_groupby_env/mima/0/2023_06_21_-_Game_2-xJLCPqvNo00_mp4-1_jpg.rf.9a5945913f60f3140f47637565fc1c4e.jpg").size

(100, 97)

In [1]:


from roboflow import Roboflow
rf = Roboflow(api_key="nz2w1UFnyFrM7e73WOzh")
project = rf.workspace("nyu-figsb").project("basketballdetection-cki6r")
dataset = project.version(25).download("yolov8")


loading Roboflow workspace...
loading Roboflow project...
Dependency ultralytics==8.0.196 is required but found version=8.0.228, to fix: `pip install ultralytics==8.0.196`


Downloading Dataset Version Zip in basketballDetection-25 to yolov8:: 100%|██████████| 1718058/1718058 [00:20<00:00, 82707.78it/s]





Extracting Dataset Version Zip to basketballDetection-25 in yolov8:: 100%|██████████| 14206/14206 [04:27<00:00, 53.18it/s]  


In [8]:
video_path = "data/video_test_dataset/1/2.mp4"
cap, fps, frame_width, frame_height = get_video_info(video_path)
codec = cv2.VideoWriter_fourcc(*'mp4v')
out = cv2.VideoWriter("test.mp4", codec, fps, (frame_width,frame_height))

    
while cap.isOpened():
    ret, img = cap.read()
    if ret:
        out.write(img)
    else:
        break
    
out.release()

    

In [15]:
def display_video(input_path, output_path=None, width=640):
    output_path = "compressed_" + input_path if output_path is None else output_path
    #os.remove(output_path) if os.path.exists(output_path) else None
    try:
        # Use subprocess to safely call FFmpeg
        subprocess.run(['ffmpeg-git-20231128-amd64-static/ffmpeg', '-y', '-i', input_path, '-vcodec', 'libx264', output_path], 
                       stdout=subprocess.DEVNULL,
                       stderr=subprocess.DEVNULL,
                       check=True)
        
        return Video(output_path, width=width, embed=True)
    except subprocess.CalledProcessError as e:
        print(f"An error occurred: {e}")
display_video("test.mp4", "test_.mp4")