In [11]:
import sys
import torch
import json
from PIL import Image
from torchvision import transforms
import numpy as np
from sklearn_extra.cluster import KMedoids


In [3]:
from ultralytics import YOLO
import os
import json
import sys
model = YOLO("yolov8n.pt", verbose = False)

## YOLO test

In [5]:
txt = "D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat"

In [7]:
def get_yolo_labels(path):

    images = os.listdir(path)
    # sort images by number
    images.sort(key=lambda x: int(x.split(".")[0]))

    res = model(path, conf=0.25)

    result = []
    for ix, r in enumerate(res):
        els = set()
        # If there are no boxes, skip
        try:
            # use r.boxes and not r[0].boxes to get all boxes
            for c in r.boxes.cls:
                els.add(model.names[int(c)])
        except:
            pass
        
        result.append({"frame": ix + 1, "labels": list(els)})

    json_result = json.dumps(result)

    return json_result

In [10]:
get_yolo_labels(txt)


image 1/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\1.jpg: 384x640 1 cat, 18.5ms
image 2/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\10.jpg: 384x640 1 cat, 18.3ms
image 3/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\11.jpg: 384x640 1 cat, 19.3ms
image 4/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\12.jpg: 384x640 1 cat, 1 dog, 18.5ms
image 5/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\13.jpg: 384x640 1 cat, 18.0ms
image 6/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\14.jpg: 384x640 1 cat, 18.4ms
image 7/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\2.jpg: 384x640 1 cat, 18.9ms
image 8/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\3.jpg: 384x640 1 cat, 16.2ms
image 9/14 D:\programming\Multilearning\immersiveaudio\Code\Scripts\output\cat\4.jpg: 384x640 1 cat, 9.0ms
image 10/14 D:\p

'[{"frame": 1, "labels": ["cat"]}, {"frame": 2, "labels": ["cat"]}, {"frame": 3, "labels": ["cat"]}, {"frame": 4, "labels": ["dog", "cat"]}, {"frame": 5, "labels": ["cat"]}, {"frame": 6, "labels": ["cat"]}, {"frame": 7, "labels": ["cat"]}, {"frame": 8, "labels": ["cat"]}, {"frame": 9, "labels": ["cat"]}, {"frame": 10, "labels": ["cat"]}, {"frame": 11, "labels": ["cat"]}, {"frame": 12, "labels": ["cat"]}, {"frame": 13, "labels": ["cat"]}, {"frame": 14, "labels": ["cat"]}]'

In [12]:
model = torch.hub.load('pytorch/vision:v0.10.0', 'resnet18', pretrained=True)
if torch.cuda.is_available():
    print("CUDA is available! Running on gpu")
    model = model.to('cuda')
model.eval()

CUDA is available! Running on gpu


Using cache found in C:\Users\tedoc/.cache\torch\hub\pytorch_vision_v0.10.0


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  

In [29]:
def get_best_frame(path):

    inputPath = path
    frames_number = 14
    video_id = 1


    # Load the images
    images = {}
    for i in range(1,frames_number + 1):
        img_name = inputPath + "/" + str(i) + ".jpg" 
        img = Image.open(img_name)
        images[img_name] = (img)


    activation = None

    def hook(model, input, output):
        nonlocal activation
        activation = output


    preprocess = transforms.Compose([
        transforms.Resize(512),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])
    latent = {}
    for i in images:
        input_tensor = preprocess(images[i])   
        input_batch = input_tensor.unsqueeze(0)

        # move the input and model to GPU for speed if available
        if torch.cuda.is_available():
            input_batch = input_batch.to('cuda')

        model.layer3[1].conv2.register_forward_hook(hook)
        
        with torch.no_grad():
            model(input_batch)
        latent[i] = activation[0]

    #  initial shape 512 x 16 x 29, need to reshape everything to 1 x (512*16*29) then stack to 
    flattened_arrays = [arr.cpu().flatten() for arr in latent.values()]

    flattened_data = np.vstack(flattened_arrays)
    
    med_model = KMedoids(n_clusters=1, random_state=0)
   
    med_model.fit(flattened_data)

    # Trovare il medoide
    medoid_index = med_model.medoid_indices_[0]

    return json.dumps({
        "video_id" : video_id,
        "best_frame" : list(images.keys())[medoid_index],
    })

In [30]:
get_best_frame(txt)

'{"video_id": 1, "best_frame": "D:\\\\programming\\\\Multilearning\\\\immersiveaudio\\\\Code\\\\Scripts\\\\output\\\\cat/10.jpg"}'