In [50]:
from ultralytics import YOLO
import cv2 
from deepface import DeepFace
import numpy as np
import time
import torch
import torch.nn as nn
import torchvision.models as models
import torchvision.transforms as T
from PIL import Image
from urllib.request import urlopen
import timm
model_names = [
    "VGG-Face", "Facenet", "Facenet512", "OpenFace", "DeepFace",
    "DeepID", "ArcFace", "Dlib", "SFace", "GhostFaceNet",
    "Buffalo_L",
]

In [51]:
img_demo = Image.open(urlopen(
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png'
))

model = timm.create_model(
    'vit_small_patch16_224.dino',
    pretrained=True,
    num_classes=0,  # remove classifier nn.Linear
)
model = model.eval()

model.safetensors:   0%|          | 0.00/86.7M [00:00<?, ?B/s]

In [55]:
# get model specific transforms (normalization, resize)
data_config = timm.data.resolve_model_data_config(model)
transforms = timm.data.create_transform(**data_config, is_training=False)

output = model(transforms(img_demo).unsqueeze(0))  # unsqueeze single image into batch of 1

top5_probabilities, top5_class_indices = torch.topk(output.softmax(dim=1) * 100, k=5)
print(top5_class_indices)

tensor([[ 94,  79,  27, 299, 192]])


In [2]:
model = YOLO("yolo11n.pt")  

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

resnet18 = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)
resnet18 = resnet18.to(device)

# Remove the final classification layer → gives 512-d embedding
embedding_model = nn.Sequential(*list(resnet18.children())[:-1])
embedding_model.eval()

# --------------------------------------------------
# 2. Preprocessing transform for OpenCV images
# --------------------------------------------------
transform = T.Compose([
    T.ToPILImage(),
    T.Resize((224, 224)),
    T.ToTensor(),
    T.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])

In [4]:
def facial_verf(select_list, target, model_name):
    target_list=[]
    for i, susp in enumerate(select_list): 
        result = DeepFace.verify(target, susp, enforce_detection=False, model_name=model_name)
        if result["verified"]== True :
            print("Target found")
            target_path = f"target_found_{i}.jpg"
            target_list.append(susp)
            cv2.imwrite(target_path, susp)
    return target_list

In [5]:
def select_img_by_label(img, model, desired_label="person"): 
    results = model(img)  # predict on an image
    select_list=[]
    # Get boxes and labels
    boxes = results[0].boxes.xyxy  # Bounding box coordinates
    labels = results[0].boxes.cls  # Class indices
    class_names = model.names       # Mapping from class index to label name
    
    # Loop through detections
    for i, (box, label_idx) in enumerate(zip(boxes, labels)):
        label_name = class_names[int(label_idx)]
        
        # Only save boxes with the desired label
        if label_name == desired_label:
            x1, y1, x2, y2 = map(int, box)
            #print(f" x1:{x1},x2:{x2},y1:{y1},y1:{y2}")
            cropped_img = img[y1:y2, x1:x2]
            select_list.append(cropped_img)
            cropped_image_path = f"cropped_{desired_label}_{i}.jpg"
            #cv2.imwrite(cropped_image_path, cropped_img)
            #print(f"Saved: {cropped_image_path}")
    return select_list

In [27]:
def extract_label_target(target, model): 
    results = model(target)  # predict on an image
    label_list=[]
    # Get boxes and labels
    boxes = results[0].boxes.xyxy  # Bounding box coordinates
    labels = results[0].boxes.cls  # Class indices
    class_names = model.names       # Mapping from class index to label name
    
    # Loop through detections
    for i, (box, label_idx) in enumerate(zip(boxes, labels)):
        label_name = class_names[int(label_idx)]
        x1, y1, x2, y2 = map(int, box)
        cropped_img = target[y1:y2, x1:x2]
        temp_dict= {"label":label_name,"img": cropped_img}
        #print(temp_dict)
        label_list.append(temp_dict)

    return label_list

In [7]:
def cosine_similarity(a, b):
    return float(a @ b / ( (a**2).sum()**0.5 * (b**2).sum()**0.5 ))

In [8]:
def get_embedding(cv_img):
    # BGR → RGB
    img_rgb = cv2.cvtColor(cv_img, cv2.COLOR_BGR2RGB)

    # Apply transforms
    tensor = transform(img_rgb).unsqueeze(0).to(device)

    # Forward pass
    with torch.no_grad():
        embedding = embedding_model(tensor)

    # flatten from (1, 512, 1, 1) → (512,)
    return embedding.squeeze().cpu().numpy()

In [9]:
def general_label_sim(susp, target): 

# Get embeddings
    emb1 = get_embedding(susp)
    emb2 = get_embedding(target)

# Similarity score
    sim = cosine_similarity(emb1, emb2)
    return sim


In [29]:
def list_general_case(select_list, target, threshold):
    target_list=[]
    print(len(select_list))
    for i, susp in enumerate(select_list): 
        sim = general_label_sim(susp, target)
        print(sim)
        if sim >= threshold:
            print("Target found")
            target_path = f"target_found_{i}.jpg"
            target_list.append(susp)
            cv2.imwrite(target_path, susp)
    return target_list

In [14]:


# Load OpenCV images
#img1 = cv2.imread("bag_1.jpg")
#img2 = cv2.imread("bag_3.jpg")

# Get embeddings
#emb1 = get_embedding(img1)
#emb2 = get_embedding(img2)

# Similarity score
#sim = cosine_similarity(emb1, emb2)

#print("Cosine similarity:", sim)


In [40]:
def analyze_image(img, target_img,threshold=0.8,model_for_detection=YOLO("yolo11n.pt") , model_name_for_face_rec="SFace"):
    """ 
    img  cv_imread: 
        Image which is analyzed 
    target_img cv_imread: 
        Picture of object or person to be identified currently on called if label is person 
    label string 
        Name of a category which detection models categorizes 
    model_for_detection model that can be read by YOLO 
        Model that will categorize image by objects
    model_name_for_face_rec model that can be read by DeepFace 
        Model that will do face verification if label is person
    """
    meta_list=[]
    #TODO: Verify that target image is of the object the label relates to 
    label_list= extract_label_target(target_img, model_for_detection)
    for label_dict in label_list: 
        label_name= label_dict["label"]
        print(f"label is {label_name}")
        label_target= label_dict["img"]
        select_list = select_img_by_label(img, model_for_detection, desired_label=label_name)
        if label_name=="person":
            #print("how got i here")
            target_list=facial_verf(select_list, target_img, model_name_for_face_rec)
            meta_list.append(target_list)
        else:
            #print("i got here at least")
            target_list=list_general_case(select_list, target_img, threshold)
            meta_list.append(target_list)
    return meta_list





In [47]:
label= "person"
img1 = cv2.imread("cup_close2.jpg")
img2 = cv2.imread("cup_scene.jpg")
analyze_image(img1, img2, 0.8, model, model_names[8])


0: 640x480 8 persons, 1 cup, 2 laptops, 1 cell phone, 86.2ms
Speed: 1.6ms preprocess, 86.2ms inference, 1.5ms postprocess per image at shape (1, 3, 640, 480)
label is laptop

0: 640x480 1 person, 1 cup, 1 dining table, 112.9ms
Speed: 3.1ms preprocess, 112.9ms inference, 1.7ms postprocess per image at shape (1, 3, 640, 480)
0
label is cup

0: 640x480 1 person, 1 cup, 1 dining table, 55.2ms
Speed: 2.8ms preprocess, 55.2ms inference, 0.9ms postprocess per image at shape (1, 3, 640, 480)
1
0.4523012340068817
label is person

0: 640x480 1 person, 1 cup, 1 dining table, 39.6ms
Speed: 1.8ms preprocess, 39.6ms inference, 0.4ms postprocess per image at shape (1, 3, 640, 480)
label is laptop

0: 640x480 1 person, 1 cup, 1 dining table, 50.6ms
Speed: 2.1ms preprocess, 50.6ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 480)
0
label is person

0: 640x480 1 person, 1 cup, 1 dining table, 70.3ms
Speed: 2.7ms preprocess, 70.3ms inference, 0.5ms postprocess per image at shape (1, 3, 64

[[], [], [], [], [], [], [], [], [], [], [], []]

In [None]:
pred_img= cv2.imread("top_gear.webp")  
results = model(pred_img)  # predict on an image

desired_label = "person"  # Replace with the label you want
people_list=[]
# Get boxes and labels
boxes = results[0].boxes.xyxy  # Bounding box coordinates
labels = results[0].boxes.cls  # Class indices
class_names = model.names       # Mapping from class index to label name

# Loop through detections
for i, (box, label_idx) in enumerate(zip(boxes, labels)):
    label_name = class_names[int(label_idx)]
    
    # Only save boxes with the desired label
    if label_name == desired_label:
        x1, y1, x2, y2 = map(int, box)
        print(f" x1:{x1},x2:{x2},y1:{y1},y1:{y2}")
        cropped_img = pred_img[y1:y2, x1:x2]
        people_list.append(cropped_img)
        cropped_image_path = f"cropped_{desired_label}_{i}.jpg"
        cv2.imwrite(cropped_image_path, cropped_img)
        print(f"Saved: {cropped_image_path}")

In [None]:
target_img= cv2.imread("matt_le_blanc.jpg") 
print(models[3])
for i, susp in enumerate(people_list): 
    result = DeepFace.verify(target_img, susp, enforce_detection=False, model_name=models[8])
    if result["verified"]== True :
        print("Target found")
        target_path = f"target_found_{i}.jpg"
        cv2.imwrite(target_path, susp)
    print(result)


In [46]:
results = model("cup_close.jpg")  # results list

# Visualize the results
for i, r in enumerate(results):
    # Plot results image
    im_bgr = r.plot()  # BGR-order numpy array
    im_rgb = Image.fromarray(im_bgr[..., ::-1])  # RGB-order PIL image

    # Show results to screen (in supported environments)
    r.show()

    # Save results to disk
    r.save(filename=f"cup_close{i}.jpg")


image 1/1 /home/alex/Documents/EDTH2025/robot_part/cup_close.jpg: 640x480 1 cup, 1 spoon, 2 keyboards, 82.9ms
Speed: 1.6ms preprocess, 82.9ms inference, 0.8ms postprocess per image at shape (1, 3, 640, 480)
