In [1]:
from ultralytics import YOLO
import torch
from torchvision.transforms import Compose, Resize, ToTensor
from torchvision import transforms
import cv2
import matplotlib.pyplot as plt

In [8]:
# Load the Ultralytics YOLO weights file "yolo11n-seg.pt" (a segmentation checkpoint, judging by its name).
# NOTE(review): a later cell rebinds the global name `model` to a SAM model, so any cell
# that uses `model` depends on which of the two loading cells ran last.
model = YOLO("yolo11n-seg.pt")

In [10]:
import cv2 as cv

# Run the YOLO model on a single test image and keep the first (only) result.
img = cv.imread("dataset/test.jpg")
if img is None:
    # cv.imread signals a missing/unreadable file by returning None instead of raising.
    raise FileNotFoundError("Could not read image: dataset/test.jpg")
results = model(img)[0]
results.show()  # display to screen
results.save(filename="result.jpg")
# No cv.waitKey() here: no OpenCV window is opened in this cell, so there is
# nothing to wait on and the call would only risk stalling the kernel.


0: 480x640 18 apples, 155.6ms
Speed: 4.7ms preprocess, 155.6ms inference, 28.7ms postprocess per image at shape (1, 3, 480, 640)


In [None]:
!pip install timm torch ultralytics

In [None]:
# Fetch the MiDaS model from torch.hub; `model_type` selects which variant is loaded.
# The resulting `midas` object is used further down to predict a depth map.
model_type = "DPT_Large"
midas = torch.hub.load("intel-isl/MiDaS", model_type)

In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU,
# then move MiDaS there and switch it to eval (inference) mode.
# NOTE(review): `device` is rebound to a plain string ("cuda"/"cpu") in the
# Grounding DINO loading cell further down.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
midas.to(device)
midas.eval()

In [None]:
# Preprocessing for MiDaS: array -> PIL -> fixed 384x384 -> tensor -> per-channel normalisation.
# NOTE(review): the mean/std below should be checked against the transform that the
# MiDaS hub entry ships for the selected model_type - confirm they match.
custom_transform = transforms.Compose([
    transforms.ToPILImage(),
    transforms.Resize((384, 384)),  # or any size multiple of 32
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225])
])

img = cv2.imread("dataset/test.jpg")
if img is None:
    # cv2.imread returns None (it does not raise) when the file cannot be read.
    raise FileNotFoundError("Could not read image: dataset/test.jpg")
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)

# Put the input on the same device as the model's weights so the forward pass
# cannot fail on a CPU/GPU mismatch.
input_tensor = custom_transform(img).unsqueeze(0).to(next(midas.parameters()).device)

with torch.no_grad():
    prediction = midas(input_tensor)

# Resize prediction to the original image size (height, width taken from the
# image itself rather than a hard-coded shape).
prediction = torch.nn.functional.interpolate(
    prediction.unsqueeze(1),
    size=img.shape[:2],
    mode="bicubic",
    align_corners=False
).squeeze()
depth_map = prediction.cpu().numpy()
plt.imshow(depth_map)

Get fruit item detections

In [None]:
def get_index(results, results_2, tol=50):
  """Return indices of segmentation boxes that coincide with a detected box.

  Args:
    results: sequence whose first element is a mapping with a 'boxes' entry,
      a sequence of (x1, y1, x2, y2) detection boxes.
    results_2: sequence whose first element exposes `.boxes.xyxy`, a sequence
      of (x1, y1, x2, y2) segmentation boxes.
    tol: maximum absolute difference allowed on each of the four coordinates
      for two boxes to be considered the same object (default 50).

  Returns:
    List of indices into `results_2[0].boxes.xyxy`, in ascending order. Each
    index appears at most once, even if several detection boxes match it.
  """
  det_boxes = results[0]['boxes']
  seg_boxes = results_2[0].boxes.xyxy

  index = []
  for i in range(len(seg_boxes)):
    x1, y1, x2, y2 = seg_boxes[i]
    for j in range(len(det_boxes)):
      x1_1, y1_1, x2_1, y2_1 = det_boxes[j]
      if abs(x1-x1_1) < tol and abs(x2-x2_1) < tol and abs(y1-y1_1) < tol and abs(y2-y2_1) < tol:
        index.append(i)
        # One match is enough: stop so the same mask index is not reported twice.
        break
  return index

In [None]:
import cv2
import numpy as np
import matplotlib.pyplot as plt


def draw_image_segmentation(results_2,index):
  """Show the original image with the selected masks tinted green.

  Args:
    results_2: sequence whose first element exposes `.orig_img` (a BGR image
      array) and `.masks.data` (per-instance masks, indexable by integer).
    index: iterable of mask indices to highlight. May be empty, in which case
      the untouched image is shown.

  Returns:
    None. The overlay is displayed with matplotlib.
  """
  result = results_2[0]
  img = cv2.cvtColor(result.orig_img.copy(), cv2.COLOR_BGR2RGB)

  H, W = img.shape[:2]
  # Must be boolean from the start: an integer array used as `overlay[mask]`
  # is treated as fancy row indices, not as a mask (this matters when `index`
  # is empty and the loop below never converts it).
  combined_mask = np.zeros((H, W), dtype=bool)

  for idx in index:
      m = result.masks.data[idx].cpu().numpy().astype(np.uint8)
      if m.shape != (H, W):
          # Masks may be at a different resolution than the image; nearest-neighbour
          # keeps them binary while resizing.
          m = cv2.resize(m, (W, H), interpolation=cv2.INTER_NEAREST)
      combined_mask |= m.astype(bool)

  overlay = img.copy()
  color = np.array([0, 255, 0], dtype=np.uint8)
  alpha = 0.5
  # Alpha-blend the highlight colour into the masked pixels only.
  overlay[combined_mask] = (alpha * color + (1 - alpha) * overlay[combined_mask]).astype(np.uint8)

  plt.imshow(overlay)
  plt.axis("off")
  plt.show()


Load the model to do detection

In [None]:
# Authenticate with the Hugging Face Hub using the "HF_TOKEN" secret read from
# Colab's user data, so the token is not hard-coded in the notebook.
# The `google.colab` import means this cell only works inside Google Colab.
from google.colab import userdata
from huggingface_hub import login
login(token=userdata.get("HF_TOKEN"))

In [None]:
import requests

import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection

# Load the Grounding DINO checkpoint used for zero-shot object detection.
# `processor` and `device` are module-level globals that get_detection() reads.
model_id = "IDEA-Research/grounding-dino-base"
# NOTE: this rebinds `device` to a plain string (an earlier cell set it to a torch.device).
device = "cuda" if torch.cuda.is_available() else "cpu"

processor = AutoProcessor.from_pretrained(model_id)
model_dec = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)

In [None]:
import numpy as np
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib.patches import Rectangle
import torch

def show_gd_results(img, results, score_thr=0.25):
    """Plot detection boxes with their label and score on top of an image.

    Args:
        img: image as a file path (str), a torch.Tensor (CHW with 1 or 3
            channels, or already HWC), a PIL image, or a numpy array.
        results: a result mapping with "boxes" and "scores" tensors and
            optionally "text_labels" / "labels", or a list/tuple whose first
            element is such a mapping.
        score_thr: boxes scoring below this value are not drawn.

    Returns:
        None. The figure is displayed with matplotlib.
    """
    if isinstance(img, str):
        img = Image.open(img).convert("RGB")
    elif isinstance(img, torch.Tensor):
        arr = img.detach().cpu()
        if arr.ndim == 3 and arr.shape[0] in (1, 3):
            arr = arr.permute(1, 2, 0)  # CHW -> HWC
        arr = arr.numpy()
        if arr.ndim == 3 and arr.shape[-1] == 1:
            # PIL cannot build an image from (H, W, 1); drop the channel axis.
            arr = arr[..., 0]
        # Values in [0, 1] are treated as normalised floats and scaled to 0-255.
        arr = (arr * 255).astype(np.uint8) if arr.max() <= 1 else arr.astype(np.uint8)
        img = Image.fromarray(arr)
    elif not isinstance(img, Image.Image):
        img = Image.fromarray(img)

    r = results[0] if isinstance(results, (list, tuple)) else results
    boxes  = r["boxes"].detach().cpu().numpy()
    scores = r["scores"].detach().cpu().numpy()
    # Prefer the textual labels (the key get_index_detection also reads); fall
    # back to "labels", and finally to blanks so boxes are still drawn.
    labels = r.get("text_labels")
    if labels is None:
        labels = r.get("labels")
    if labels is None:
        labels = [""] * len(boxes)

    fig, ax = plt.subplots(figsize=(8,8))
    ax.imshow(np.asarray(img))
    for (x1, y1, x2, y2), s, lab in zip(boxes, scores, labels):
        if s < score_thr:
            continue
        ax.add_patch(Rectangle((x1, y1), x2-x1, y2-y1, fill=False, linewidth=2, edgecolor='blue'))
        ax.text(x1, max(0, y1-5), f"{lab} {s:.2f}".strip(),
                fontsize=9, color="white",
                bbox=dict(facecolor="blue", alpha=0.5, pad=2))
    ax.axis("off")
    plt.show()

# Example:
# show_gd_results(original_pil_image, results, score_thr=0.3)


In [None]:
def get_detection(image_path, model_dec,
                  text="banana . broccoli . avocado . tomato . onion . apple ."):
  """Run zero-shot object detection on one image and display the result.

  Relies on the module-level `processor` and `device` created when the
  detection model was loaded.

  Args:
    image_path: path of the image to open.
    model_dec: the zero-shot detection model to call on the processed inputs.
    text: detection prompt. VERY important: queries need to be lowercased and
      each one must end with a dot (" . " separated). Defaults to the six
      produce classes used in this notebook.

  Returns:
    The list returned by `processor.post_process_grounded_object_detection`
    (one entry for the single input image).
  """
  image = Image.open(image_path).convert("RGB")

  inputs = processor(images=image, text=text, return_tensors="pt").to(device)
  with torch.no_grad():
      outputs = model_dec(**inputs)
  results = processor.post_process_grounded_object_detection(
      outputs,
      inputs.input_ids,
      # PIL reports (width, height); the processor expects (height, width).
      target_sizes=[image.size[::-1]]
  )
  show_gd_results(image, results)
  return results

Crop image

In [None]:
import torch
def get_index_detection(results):
  """Group detections by class and compute one enclosing box per class.

  Args:
    results: sequence whose first element is a mapping with
      'text_labels' (one label string per detection) and
      'boxes' (one (x1, y1, x2, y2) box per detection).

  Returns:
    (dic_ind, xyxy) where
      dic_ind maps each known class name to the list of detection indices
        carrying that label (labels outside the known classes are ignored);
      xyxy is a float32 tensor with one row per class, in dic_ind order,
        holding the smallest box that contains all of that class's boxes.
        Classes with no detection keep an all-zero row instead of raising.
  """
  detection = results[0]
  dic_ind = {"banana" : [], "tomato" : [] , "broccoli" : [], "onion" : [], "apple" : [], "avocado" : []}
  for i, label in enumerate(detection['text_labels']):
    if label in dic_ind:
      dic_ind[label].append(i)

  boxes = torch.as_tensor(detection['boxes'], dtype=torch.float32).detach().cpu()
  xyxy = torch.zeros((len(dic_ind), 4), dtype=torch.float32)  # one row per class, 4 coords
  for row, indices in enumerate(dic_ind.values()):
    if not indices:
      # Class not detected: leave the zero row (a degenerate box).
      continue
    group = boxes[indices]
    # Union of the boxes: min over top-left corners, max over bottom-right corners.
    xyxy[row, :2] = group[:, :2].min(dim=0).values
    xyxy[row, 2:] = group[:, 2:].max(dim=0).values
  return dic_ind,xyxy

In [None]:
import cv2
import torch
import numpy as np
import matplotlib.pyplot as plt

def draw_xyxy_on_image(img_path, xyxy, classes, seen=None, out_path=None):
    """Draw labelled boxes on an image, display it, and optionally save it.

    Args:
        img_path: path of the image to read with OpenCV.
        xyxy: tensor of (x1, y1, x2, y2) boxes, one row per class.
        classes: sequence of label strings, indexed like the rows of `xyxy`.
        seen: optional tensor of per-row flags; rows whose flag is falsy are skipped.
        out_path: if given, the annotated image is written to this path.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None (it does not raise) for unreadable paths.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    H, W = img.shape[:2]

    boxes = xyxy.detach().to('cpu').round().to(torch.int64).numpy()

    # Keep coordinates inside the image bounds.
    boxes[:, [0,2]] = boxes[:, [0,2]].clip(0, W)
    boxes[:, [1,3]] = boxes[:, [1,3]].clip(0, H)

    for i, box in enumerate(boxes):
        if seen is not None and (not bool(seen[i].item())):
            continue
        # Plain Python ints: OpenCV point arguments are picky about numpy scalar types.
        x1, y1, x2, y2 = (int(v) for v in box)
        if x2 <= x1 or y2 <= y1:
            continue  # degenerate / empty box (e.g. a class with no detection)
        cv2.rectangle(img, (x1, y1), (x2, y2), (0, 255, 0), 3)
        label = classes[i]
        cv2.putText(img, label, (x1, max(0, y1 - 8)),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.9, (0, 255, 0), 2, cv2.LINE_AA)

    if out_path:
        cv2.imwrite(out_path, img)
    plt.figure(figsize=(8,8))
    plt.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.show()



In [None]:
import cv2
import torch
import os
import matplotlib.pyplot as plt

def crop_xyxy_regions(img_path, xyxy, classes, seen=None, save_dir="crops", show=True):
    """Crop one region per box out of an image, save each crop, and optionally show them.

    Args:
        img_path: path of the image to read with OpenCV.
        xyxy: tensor of (x1, y1, x2, y2) boxes, one row per class.
        classes: sequence of label strings, indexed like the rows of `xyxy`;
            each crop is saved as "<label>_crop.jpg".
        seen: optional tensor of per-row flags; rows whose flag is falsy are skipped.
        save_dir: directory the crops are written to (created if missing).
        show: when True and at least one crop exists, display the crops in a grid.

    Returns:
        (names, crops): the saved file names and the corresponding BGR arrays.

    Raises:
        FileNotFoundError: if the image cannot be read.
    """
    names = []
    crops = []
    img = cv2.imread(img_path)
    if img is None:
        # cv2.imread returns None (it does not raise) for unreadable paths.
        raise FileNotFoundError(f"Could not read image: {img_path}")
    H, W = img.shape[:2]
    os.makedirs(save_dir, exist_ok=True)
    boxes = xyxy.detach().to('cpu').round().to(torch.int64).numpy()

    for i, box in enumerate(boxes):
        if seen is not None and not bool(seen[i].item()):
            continue
        x1, y1, x2, y2 = (int(v) for v in box)
        # Clamp first, then test for emptiness: a box lying outside the image
        # only becomes degenerate after clamping, and an empty crop cannot be saved.
        x1 = max(0, min(W, x1))
        x2 = max(0, min(W, x2))
        y1 = max(0, min(H, y1))
        y2 = max(0, min(H, y2))
        if x2 <= x1 or y2 <= y1:
            continue

        crop = img[y1:y2, x1:x2]
        filename = f"{classes[i]}_crop.jpg"
        names.append(filename)
        crops.append(crop)

        cv2.imwrite(os.path.join(save_dir, filename), crop)
        print(f"Saved {filename}")

    if show and crops:
        # Lay the crops out in a grid of at most three columns.
        cols = min(3, len(crops))
        rows = (len(crops) + cols - 1) // cols
        plt.figure(figsize=(5*cols, 5*rows))
        for j, crop in enumerate(crops):
            plt.subplot(rows, cols, j+1)
            plt.imshow(cv2.cvtColor(crop, cv2.COLOR_BGR2RGB))
            plt.title(names[j])
            plt.axis("off")
        plt.show()

    return names, crops


Segment the whole image

In [None]:
from ultralytics import SAM

# Load a SAM model through Ultralytics.
# Checkpoint names by family: SAM=sam_b.pt, SAM2=sam2_b.pt, SAM2.1=sam2.1_b.pt
# NOTE(review): this rebinds the global `model`, which an earlier cell set to the
# YOLO segmentation model; cells using `model` depend on which cell ran last.
model = SAM("sam2.1_l.pt")

model.info()  # Display model information (optional)

In [None]:
# Run inference (image or video)
def get_whole_photo_seg(image_path, model, conf=0.1, iou=1, batch=10, max_det=600, rect=False, show=True):
  """Segment an entire image with `model` and return the prediction results.

  Args:
    image_path: source handed to `model.predict`.
    model: object exposing `predict(source, **kwargs)`.
    conf, iou, batch, max_det, rect: forwarded unchanged to `model.predict`;
      the defaults are the values this notebook has always used.
    show: when True (default), call `.show()` on the first result.

  Returns:
    Whatever `model.predict` returns (indexable; the first item is displayed).
  """
  results_2 = model.predict(
      image_path, conf=conf, iou=iou, batch=batch, max_det=max_det, rect=rect
  )
  if show:
    results_2[0].show()  # Display results
  return results_2