In [163]:
from ultralytics import YOLO
import cv2
from video_utils import *
import torch
import math
import numpy as np
import time
import torch.nn.functional as F
from PIL import Image
import matplotlib.pyplot as plt
from torchvision import transforms, models
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


In [14]:
model_path = "yolov8l-seg.pt"
video_path = "video_test_dataset/1/3.mp4"

In [141]:
model = YOLO(model_path)
cap, fps, frame_width, frame_height, total_frames = initialize_video_capture(video_path=video_path, skip_to_sec = 0)
out, output_path = initialize_video_writer(fps = fps,
                                           video_dimension= (frame_width, frame_height),
                                           video_path=video_path,
                                           )
all_players = []
while cap.isOpened():
    ret, frame = cap.read()
    if not ret:
        break
    
    # Get segmentation results for the current frame
    results = model(frame, verbose=False, device=device)
    
    # Process each result
    for r in results:
        all_masks = r.masks.data.cpu().numpy() # Assuming this gives you binary masks for each detected object
        all_boxes = r.boxes.xyxy.cpu().numpy().astype(int)
        classes = r.boxes.cls.cpu().numpy().astype(int)
        class_names = [model.names[cls] for cls in classes]
        
        for mask, box, class_name in zip(all_masks, all_boxes, class_names):
            if class_name == "person":
                mask = cv2.resize(mask, (frame_width, frame_height))
                mask = np.stack([mask, mask, mask], axis=2)
                mask = mask.astype(np.uint8)
                masked_img = mask * frame
                
                x1, y1, x2, y2 = box
                masked_img = masked_img[y1:y2, x1:x2]
                masked_img = cv2.cvtColor(masked_img, cv2.COLOR_BGR2RGB)
                all_players.append(masked_img)

In [215]:
# use resnet15 to extract features
# use kmeans to cluster players

resnet = models.mobilenet_v3_large(pretrained=True)
resnet = resnet.to(device)

transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

dataset = torch.stack([transform(img) for img in all_players])
dataset = dataset.to(device)

features = resnet(dataset).detach().cpu().numpy()

Downloading: "https://download.pytorch.org/models/mobilenet_v3_large-8738ca79.pth" to /homedir/ugrad/z/zw2688/.cache/torch/hub/checkpoints/mobilenet_v3_large-8738ca79.pth
100%|██████████| 21.1M/21.1M [00:00<00:00, 257MB/s]


In [216]:
# find cosine similarity between each pair of players to the first player
player_1 = features[0]
cosine_similarities = F.cosine_similarity(torch.tensor(player_1), torch.tensor(features))

# find the top 5 most similar players, and their indices
top_5_indices = cosine_similarities.argsort(descending=True)[:10]

top_5_indices

tensor([  0,   6,  11,  16,  20, 301,  41, 192, 305,  27])