In [2]:
import torch
import torch.nn.functional as F
import json
from ultralytics import YOLO
from torchvision import transforms, models
from PIL import Image
import cv2
import numpy as np

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/Users/pbanavara/Library/Application Support/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [3]:
yolo_model = YOLO("yolov8s.pt")

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov8s.pt to 'yolov8s.pt'...


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 21.5M/21.5M [00:00<00:00, 37.0MB/s]


In [6]:
num_classes = 12  # Change this to match your trained model
efficientnet_model = models.efficientnet_b0(pretrained=False)
efficientnet_model.classifier[1] = torch.nn.Linear(efficientnet_model.classifier[1].in_features, num_classes)  # Ensure same output size
efficientnet_model.load_state_dict(torch.load("efficientnet_multilabel.pth"))  # Load your fine-tuned model
efficientnet_model.eval()  # Set to evaluation mode

EfficientNet(
  (features): Sequential(
    (0): Conv2dNormActivation(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): SiLU(inplace=True)
    )
    (1): Sequential(
      (0): MBConv(
        (block): Sequential(
          (0): Conv2dNormActivation(
            (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), groups=32, bias=False)
            (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (2): SiLU(inplace=True)
          )
          (1): SqueezeExcitation(
            (avgpool): AdaptiveAvgPool2d(output_size=1)
            (fc1): Conv2d(32, 8, kernel_size=(1, 1), stride=(1, 1))
            (fc2): Conv2d(8, 32, kernel_size=(1, 1), stride=(1, 1))
            (activation): SiLU(inplace=True)
            (scale_activation): Sigmoid()
          )
          (2): Conv2dNormActivat

In [7]:
# Define EfficientNet transformations
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [19]:
object_classes = ["plate", "glass", "cup", "pan", "bowl", "fork", "knife", "spoon", "casserole", "dutch oven"]
label_classes = ['burnt', 'casserole', 'coffee', 'cups', 'dirty', 'dutch', 'food', 'oven', 'pan', 'residue', 'stains', 'wok']


In [20]:
def detect_objects(image_path):
    """
    Detect objects in an image using YOLOv8.
    Returns a list of detected objects with bounding boxes.
    """
    results = yolo_model(image_path)
    detections = []

    for result in results:
        for box in result.boxes.data:
            x1, y1, x2, y2, conf, cls = box.cpu().numpy()
            class_name = result.names[int(cls)]
            if class_name in object_classes:
                detections.append({"name": class_name, "bbox": [int(x1), int(y1), int(x2), int(y2)]})
    
    return detections

In [21]:
detect_objects("test_sink_image.png")


image 1/1 /Users/pbanavara/dev/kitchen_robot_agent/test_sink_image.png: 640x480 2 handbags, 1 bottle, 3 cups, 1 spoon, 3 bowls, 2 chairs, 1 sink, 2 books, 79.9ms
Speed: 1.8ms preprocess, 79.9ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 480)


[{'name': 'cup', 'bbox': [1119, 2186, 1406, 2496]},
 {'name': 'bowl', 'bbox': [623, 2539, 1273, 3177]},
 {'name': 'bowl', 'bbox': [2863, 1523, 3021, 1928]},
 {'name': 'bowl', 'bbox': [606, 2208, 1154, 2632]},
 {'name': 'cup', 'bbox': [1718, 1116, 1969, 1498]},
 {'name': 'spoon', 'bbox': [1084, 2773, 1366, 3185]},
 {'name': 'cup', 'bbox': [251, 1255, 669, 1615]}]

In [22]:
def classify_cleanliness(image, bbox):
    """
    Crop the detected object from the image and classify its cleanliness.
    """
    x1, y1, x2, y2 = bbox
    cropped_object = image.crop((x1, y1, x2, y2))
    cropped_object = transform(cropped_object).unsqueeze(0)

    with torch.no_grad():
        logits = efficientnet_model(cropped_object)
        probs = torch.sigmoid(logits).squeeze(0)  # Multi-label classification

    print("Probabilities", probs)
    labels = [label_classes[i] for i, prob in enumerate(probs) if prob > 0.5]
    return labels if labels else ["clean"]  # Default to clean if no strong label


In [23]:
def process_image(image_path):
    """
    Run the full pipeline: Detect objects, classify cleanliness, and return structured output.
    """
    image = Image.open(image_path).convert("RGB")
    detections = detect_objects(image_path)

    final_output = {"objects": []}
    for obj in detections:
        labels = classify_cleanliness(image, obj["bbox"])
        final_output["objects"].append({"name": obj["name"], "labels": labels})

    return final_output

In [24]:
image_path = "test_sink_image.png"  # Replace with your image path
output = process_image(image_path)


image 1/1 /Users/pbanavara/dev/kitchen_robot_agent/test_sink_image.png: 640x480 2 handbags, 1 bottle, 3 cups, 1 spoon, 3 bowls, 2 chairs, 1 sink, 2 books, 75.7ms
Speed: 1.8ms preprocess, 75.7ms inference, 1.1ms postprocess per image at shape (1, 3, 640, 480)
Probabilities tensor([0.0606, 0.0332, 0.1241, 0.2509, 0.5558, 0.0623, 0.0561, 0.0688, 0.0452, 0.0662, 0.9208, 0.0266])
Probabilities tensor([3.7720e-02, 1.6680e-01, 1.1271e-01, 4.0829e-02, 2.0507e-01, 7.7071e-04, 8.4703e-01, 6.9137e-04, 1.6571e-02, 2.5762e-01, 9.8790e-01, 4.7770e-02])
Probabilities tensor([0.2927, 0.3696, 0.5158, 0.2545, 0.3916, 0.0764, 0.1480, 0.0890, 0.7115, 0.2681, 0.0652, 0.1351])
Probabilities tensor([0.2589, 0.1153, 0.0493, 0.0741, 0.1432, 0.0276, 0.8080, 0.0372, 0.3109, 0.2275, 0.7533, 0.1391])
Probabilities tensor([0.7610, 0.0530, 0.0235, 0.0834, 0.0914, 0.0815, 0.8042, 0.0678, 0.3249, 0.7914, 0.6687, 0.4815])
Probabilities tensor([0.0168, 0.6266, 0.7418, 0.1158, 0.5335, 0.0095, 0.8909, 0.0102, 0.0228, 0.1

In [25]:
output

{'objects': [{'name': 'cup', 'labels': ['dirty', 'stains']},
  {'name': 'bowl', 'labels': ['food', 'stains']},
  {'name': 'bowl', 'labels': ['coffee', 'pan']},
  {'name': 'bowl', 'labels': ['food', 'stains']},
  {'name': 'cup', 'labels': ['burnt', 'food', 'residue', 'stains']},
  {'name': 'spoon',
   'labels': ['casserole', 'coffee', 'dirty', 'food', 'stains']},
  {'name': 'cup', 'labels': ['dirty', 'food', 'stains']}]}