In [None]:
!git clone https://github.com/xinyu1205/recognize-anything.git
%cd recognize-anything
%pip install -e .

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import random
import cv2
import torch
import os
from PIL import Image
from ram.models import ram_plus
from ram import inference_ram as inference
from ram import get_transform
from tqdm.notebook import tqdm
from typing import List, Tuple
from torchvision.transforms import Compose

  return torch.cuda.amp.custom_fwd(orig_func)  # type: ignore
  return torch.cuda.amp.custom_bwd(orig_func)  # type: ignore


### Hàm khởi tạo mô hình RAM++ (Recognize Anything Model)

In [2]:
def initialize_model(model_weight: str, image_size: int) -> Tuple[torch.nn.Module, Compose]:
    """
    Khởi tạo mô hình RAM và bộ chuyển đổi hình ảnh.

    Args:
        model_weight (str): Đường dẫn đến file trọng số của mô hình.
        image_size (int): Kích thước ảnh đầu vào cho mô hình.

    Returns:
        Tuple[torch.nn.Module, Compose]: Mô hình đã được khởi tạo và bộ chuyển đổi hình ảnh.
    """
    device: torch.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    # Khởi tạo bộ chuyển đổi hình ảnh
    transform: Compose = get_transform(image_size=image_size)
    
    # Khởi tạo mô hình RAM
    model: torch.nn.Module = ram_plus(
        pretrained=model_weight,
        image_size=image_size,
        vit='swin_l'
    )
    model.eval() # chuyển qua chế dộ đánh giá (evaluation)
    model = model.to(device) # chuyển qua device 
    
    return model, transform

In [21]:
def download_checkpoints():
    if not os.path.exists('pretrained'):
        os.makedirs('pretrained')

    ram_weights_path = 'pretrained/ram_swin_large_14m.pth'
    if not os.path.exists(ram_weights_path):
        !wget https://huggingface.co/xinyu1205/recognize-anything-plus-model/resolve/main/ram_plus_swin_large_14m.pth -O pretrained/ram_swin_large_14m.pth
    else:
        print("RAM weights already downloaded!")

download_checkpoints()
print(model, 'weights are downloaded!')


python(30639) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.


--2024-08-11 22:32:55--  https://huggingface.co/xinyu1205/recognize-anything-plus-model/resolve/main/ram_plus_swin_large_14m.pth
Resolving huggingface.co (huggingface.co)... 18.165.122.120, 18.165.122.101, 18.165.122.11, ...
Connecting to huggingface.co (huggingface.co)|18.165.122.120|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.huggingface.co/repos/1f/c0/1fc0c455d992a58616eaae5f3ce9e34b1d2c49026fede34de9a7a5f3d06373dd/497c178836ba66698ca226c7895317e6e800034be986452dbd2593298d50e87d?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27ram_plus_swin_large_14m.pth%3B+filename%3D%22ram_plus_swin_large_14m.pth%22%3B&Expires=1723663976&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcyMzY2Mzk3Nn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmdmYWNlLmNvL3JlcG9zLzFmL2MwLzFmYzBjNDU1ZDk5MmE1ODYxNmVhYWU1ZjNjZTllMzRiMWQyYzQ5MDI2ZmVkZTM0ZGU5YTdhNWYzZDA2MzczZGQvNDk3YzE3ODgzNmJhNjY2OThjYT

In [6]:
import os
import json
from typing import Dict, List
import torch
from tqdm import tqdm
from PIL import Image


def process_image(image_path: str, transform, model, device: torch.device) -> List[str]:
    """Process a single image and return the list of detected objects."""
    img = Image.open(image_path)
    processed_img = transform(img).unsqueeze(0).to(device)
    res = inference(processed_img, model)
    objects = [obj.strip() for obj in res[0].strip().split("|")]
    return objects


def process_video(
    video_path: str, transform, model, device: torch.device, base_dir: str
) -> Dict[str, List[str]]:
    """Process all images in a video folder and return a dictionary of detected objects and their image paths."""
    results = {}
    image_paths = [
        os.path.join(video_path, f)
        for f in os.listdir(video_path)
        if f.lower().endswith(("jpg", "jpeg", "png", "webp"))
    ]

    for image_path in tqdm(
        image_paths, desc=f"Processing {os.path.relpath(video_path, base_dir)}"
    ):
        objects = process_image(image_path, transform, model, device)
        relative_path = os.path.relpath(image_path, base_dir)

        for obj in objects:
            if obj not in results:
                results[obj] = []
            results[obj].append(relative_path)

    return results


def process_group(
    group_path: str, transform, model, device: torch.device, base_dir: str
) -> Dict[str, List[str]]:
    """Process all videos in a group folder and return a dictionary of detected objects and their image paths."""
    group_results = {}

    for video in os.listdir(group_path):
        video_path = os.path.join(group_path, video)
        if os.path.isdir(video_path):
            video_results = process_video(
                video_path, transform, model, device, base_dir
            )
            for obj, paths in video_results.items():
                if obj not in group_results:
                    group_results[obj] = []
                group_results[obj].extend(paths)

    return group_results


def create_json(
    model: torch.nn.Module,
    transform: Compose,
    base_dir: str = "data/images",
    output_file: str = "results.json",
) -> None:
    """Traverse all video directories, run inference, and create a JSON results file."""
    device: torch.device = next(model.parameters()).device
    results: Dict[str, List[str]] = {}

    for group in os.listdir(base_dir):
        group_path = os.path.join(base_dir, group)
        if os.path.isdir(group_path):
            group_results = process_group(
                group_path, transform, model, device, base_dir
            )
            for obj, paths in group_results.items():
                if obj not in results:
                    results[obj] = []
                results[obj].extend(paths)

    # Write results to JSON file
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    print(f"Results saved to {output_file}")

In [7]:
IMAGE_SIZE: int = 384
MODEL_WEIGHT: str = os.path.join('pretrained', 'ram_swin_large_14m.pth')

# Khởi tạo mô hình và bộ chuyển đổi
model, transform = initialize_model(MODEL_WEIGHT, IMAGE_SIZE)

# Chạy inference và tạo file JSON
create_json(model, transform)

--------------
pretrained/ram_swin_large_14m.pth
--------------
load checkpoint from pretrained/ram_swin_large_14m.pth
vit: swin_l


Processing 1/6: 100%|██████████| 149/149 [02:23<00:00,  1.04it/s]
Processing 1/19: 100%|██████████| 44/44 [00:41<00:00,  1.05it/s]
Processing 2/4: 100%|██████████| 110/110 [01:47<00:00,  1.02it/s]
Processing 2/3: 100%|██████████| 58/58 [00:53<00:00,  1.09it/s]

Results saved to results.json





# Tag list 

Tag list này để cho người dùng chọn và filter

https://github.com/xinyu1205/recognize-anything/blob/main/ram/data/ram_tag_list.txt

https://github.com/xinyu1205/recognize-anything/blob/main/datasets/openimages_rare_200/openimages_rare_200_ram_taglist.txt