In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [44]:
import cv2
import matplotlib.pyplot as plt
import numpy as np
from torchvision.models import mobilenet_v2
from torch import nn
import torch

In [17]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.95-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.14-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.95-py3-none-any.whl (949 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m949.8/949.8 kB[0m [31m28.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.14-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.95 ultralytics-thop-2.0.14


## Needed functions

In [2]:
def extract_patches_from_detections(image, detections, target_size=(224, 224)):
     """
    Extract images lying inside bounding boxs

    Args:
        image: An image or frame.
        detections: the detections obtained by detector for the image.

    Returns:
        list: List of small images inside bbox
    """
    
    patches = []
    
    for det in detections:
        # Lấy tọa độ bounding box: bb_left, bb_top, bb_width, bb_height
        bb_left, bb_top, bb_width, bb_height = det[2:6].astype(np.int64)
            
        # Chuyển sang định dạng (x1, y1, x2, y2)
        x1, y1 = bb_left, bb_top
        x2, y2 = x1 + bb_width, y1 + bb_height
            
        # Giới hạn tọa độ trong kích thước ảnh
        x1 = max(0, x1)
        y1 = max(0, y1)
        x2 = min(image.shape[1] - 1, x2)  # width
        y2 = min(image.shape[0] - 1, y2)  # height
            
        patch = image[y1:y2, x1:x2]
            
        # Resize về target_size
        patch = cv2.resize(patch, target_size[::-1])  
            
        patches.append(patch)
    
    return patches

In [10]:
def extract_feature(model, img, device):
    '''
    Extract feature vector of an image

    Args:
        model: CNNs model for feature extracting
        img: An image or frame.
        device: cuda or cpu
        
    Returns:
        torch.tensor: Feature vector of an image'''
    model.to(device)
    model.eval()
    with torch.no_grad():
        img = torch.tensor(img).permute(2, 0, 1).float().unsqueeze(0).to(device)
        return model(img)

In [16]:
def get_model(device):
    
    model = mobilenet_v2(pretrained=True)
    model.classifier = nn.Sequential(
            nn.Dropout(p=0.2),
            nn.Linear(1280, 128)  # trích xuất vector 128 chiều
        )
    model.eval()
    model = model.to(device)
    return model

In [39]:
def get_feature_file(model, folder_path, detections_path):
    '''
    Args:
        folder_path: folder which contains images or frame of the video
        detections_path: detections file path from the MOT challenge dataset
    '''
    mot_det = np.loadtxt(detections_path, delimiter=',')
    my_features = []
    # Duyệt qua tất cả file trong folder
    frame_id = 1
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(('.png', '.jpg', '.jpeg')):  # Chỉ xử lý các file ảnh
            image_path = os.path.join(folder_path, filename)
            img = cv2.imread(image_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            detections = mot_det[mot_det[:, 0] == frame_id, :]
            cropped_images = extract_patches_from_detections(img, detections,target_size=(224, 224))
            for cropped_image in cropped_images:
                feature = extract_feature(model, cropped_image, device)
                my_features.append(feature.detach().cpu().numpy().ravel())
            print(frame_id)
        frame_id += 1
    my_features = np.array(my_features)
    new_features = np.hstack([mot_det, my_features])
    np.save(folder_path.split('/')[6] + 'det', new_features)

## Reimplement functions for YOLO

In [45]:
def extract_patches_from_detections_yolo(image, detections, target_size=(224, 224)):
    
    patches = []
    
    for det in detections:
        # Lấy tọa độ bounding box: bb_left, bb_top, bb_width, bb_height
        x1, y1, x2, y2, conf = det.astype(np.int64)
            
        # ## Chuyển sang định dạng (x1, y1, x2, y2)
        # x1, y1 = bb_left, bb_top
        # x2, y2 = x1 + bb_width, y1 + bb_height
            
        # Giới hạn tọa độ trong kích thước ảnh
        x1 = max(0, x1)
        y1 = max(0, y1)
        x2 = min(image.shape[1] - 1, x2)  # width
        y2 = min(image.shape[0] - 1, y2)  # height
            
        patch = image[y1:y2, x1:x2]
            
        # Resize về target_size
        patch = cv2.resize(patch, target_size[::-1])  # cv2 dùng (width, height)
            
        patches.append(patch)
    
    return patches

In [62]:
def get_feature_file_yolo_detector(yolo, model, folder_path):
    '''
    Args:
        yolo :light weight yolo model
        model: CNNs model for feature extracting
        folder_path: folder which contains images or frame of the video
    '''
    my_features = []
    my_detections = []
    my_frame_id = []
    frame_id = 1
    for filename in sorted(os.listdir(folder_path)):
        if filename.endswith(('.png', '.jpg', '.jpeg')):  # Chỉ xử lý các file ảnh
            image_path = os.path.join(folder_path, filename)
            # Mở và hiển thị ảnh bằng PIL
            img = cv2.imread(image_path)
            img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            # detections = mot_det[mot_det[:, 0] == frame_id, :]
            results = yolo(img)
            detections = results[0].boxes.data.cpu().numpy()[:, :-1]
            # detections = torch.cat([boxes.xyxy, boxes.conf.reshape(len(boxes.conf), -1)], dim=1).cpu().numpy()
            my_detections.append(detections)
            cropped_images = extract_patches_from_detections_yolo(img, detections,target_size=(224, 224))
            for cropped_image in cropped_images:
                feature = extract_feature(model, cropped_image, device)
                my_features.append(feature.detach().cpu().numpy().ravel())
                my_frame_id.append(frame_id)
            print(frame_id)
        frame_id += 1
    
    my_features = np.array(my_features)
    my_frame_id = np.array(my_frame_id).reshape(len(my_frame_id),-1)
    my_detections = np.vstack(my_detections)
    my_detections[:, 2] -= my_detections[:, 0]
    my_detections[:, 3] -= my_detections[:, 1]
    minus_column = -np.ones((my_frame_id.shape[0], 1))
    minus_matrix = -np.ones((my_frame_id.shape[0], 3))
    result = np.hstack([my_frame_id, minus_column, my_detections, minus_matrix, my_features])
    np.save('yolo' + folder_path.split('/')[6] + 'det', result)

In [77]:
def get_feature_file_from_video(yolo, model, video_path):
    '''
    Args:
        yolo :light weight yolo model
        model: CNNs model for feature extracting
        video_path: path to video from MOT Challenge dataset
    '''
    my_features = []
    my_detections = []
    my_frame_id = []
    
    frame_id = 1
    cap = cv2.VideoCapture(video_path)
    while cap.isOpened():
        ret, frame = cap.read()
        if not ret:
            break
    
        # Phát hiện bounding box bằng YOLOv8
        results = yolo(frame)
        detections = results[0].boxes.data.cpu().numpy()[:, :-1]  # [x_min, y_min, x_max, y_max, conf, cls]
        my_detections.append(detections)
    
        cropped_images = extract_patches_from_detections_yolo(frame, detections,target_size=(224, 224))
    
        for cropped_imgs in cropped_images:
            feature = extract_feature(model, cropped_imgs, device)
            my_features.append(feature.detach().cpu().numpy().ravel())
            my_frame_id.append(frame_id) 
        print(frame_id)
        frame_id += 1
    cap.release()

    my_features = np.array(my_features)
    my_frame_id = np.array(my_frame_id).reshape(len(my_frame_id),-1)
    my_detections = np.vstack(my_detections)
    my_detections[:, 2] -= my_detections[:, 0]
    my_detections[:, 3] -= my_detections[:, 1]
    my_detections *= 2
    minus_column = -np.ones((my_frame_id.shape[0], 1))
    minus_matrix = -np.ones((my_frame_id.shape[0], 3))
    result = np.hstack([my_frame_id, minus_column, my_detections, minus_matrix, my_features])
    np.save('yolo' + video_path.split('/')[3] + 'det', result)

## Implement

In [41]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [40]:
folder_path = '/kaggle/input/mot16-dataset/MOT16/train/MOT16-02/img1'
detections_path = '/kaggle/input/mot16-dataset/MOT16/train/MOT16-02/det/det.txt'

In [42]:
model = get_model(device)

Downloading: "https://download.pytorch.org/models/mobilenet_v2-b0353104.pth" to /root/.cache/torch/hub/checkpoints/mobilenet_v2-b0353104.pth
100%|██████████| 13.6M/13.6M [00:00<00:00, 143MB/s]


In [None]:
get_feature_file(model, folder_path, detections_path)

## YOLO

In [57]:
from ultralytics import YOLO

yolo = YOLO('yolov8n.pt')

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.
Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8n.pt to 'yolov8n.pt'...


100%|██████████| 6.25M/6.25M [00:00<00:00, 123MB/s]


In [75]:
folder_path = '/kaggle/input/mot16-dataset/MOT16/test/MOT16-07/img1'

In [76]:
get_feature_file_yolo_detector(yolo, model, folder_path)


0: 384x640 12 persons, 1 bus, 2 benchs, 5 handbags, 1 suitcase, 10.0ms
Speed: 3.0ms preprocess, 10.0ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)
1

0: 384x640 13 persons, 2 benchs, 7 handbags, 1 suitcase, 7.6ms
Speed: 2.6ms preprocess, 7.6ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
2

0: 384x640 14 persons, 2 benchs, 5 handbags, 6.9ms
Speed: 2.5ms preprocess, 6.9ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
3

0: 384x640 14 persons, 2 benchs, 6 handbags, 6.8ms
Speed: 2.3ms preprocess, 6.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
4

0: 384x640 16 persons, 3 benchs, 6 handbags, 7.0ms
Speed: 2.3ms preprocess, 7.0ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
5

0: 384x640 13 persons, 3 benchs, 6 handbags, 6.8ms
Speed: 2.1ms preprocess, 6.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
6

0: 384x640 12 persons, 2 benchs, 5 handbags, 6.8ms
Speed: 2.3ms pr

In [78]:
video_path = '/kaggle/input/mot16-07/MOT16-07-raw.webm'

In [79]:
get_feature_file_from_video(yolo, model, video_path)


0: 384x640 15 persons, 2 benchs, 6 handbags, 8.4ms
Speed: 2.3ms preprocess, 8.4ms inference, 2.5ms postprocess per image at shape (1, 3, 384, 640)
1

0: 384x640 12 persons, 2 benchs, 4 handbags, 7.4ms
Speed: 3.3ms preprocess, 7.4ms inference, 1.3ms postprocess per image at shape (1, 3, 384, 640)
2

0: 384x640 11 persons, 2 benchs, 4 handbags, 6.6ms
Speed: 3.2ms preprocess, 6.6ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
3

0: 384x640 16 persons, 2 benchs, 5 handbags, 7.5ms
Speed: 2.7ms preprocess, 7.5ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
4

0: 384x640 15 persons, 2 benchs, 6 handbags, 7.3ms
Speed: 2.8ms preprocess, 7.3ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
5

0: 384x640 14 persons, 2 benchs, 6 handbags, 8.8ms
Speed: 2.7ms preprocess, 8.8ms inference, 1.2ms postprocess per image at shape (1, 3, 384, 640)
6

0: 384x640 12 persons, 2 benchs, 5 handbags, 7.0ms
Speed: 2.6ms preprocess, 7.0ms inference, 1.2ms 

Any model from ultralytics can be used as detector instead of YOLO.