#### load dependencies

In [1]:
import cv2
import os
import pandas as pd 
import sys
from collections import defaultdict
from tqdm import tqdm

from autodistill_grounded_sam import GroundedSAM
from autodistill.detection import CaptionOntology



#### configuration

- set your working directory to the root of the repo
- append the scripts folder to the python path, so you can load the utility script 

In [2]:
## repo root: defaults to the author's checkout; set the GROUNDED_SAM_INTRO_ROOT
## environment variable to point at your own clone without editing this cell
repo_root = os.environ.get("GROUNDED_SAM_INTRO_ROOT", r"D:\git-repos\mluerig\grounded-sam-intro")
os.chdir(repo_root)

## make the utility module importable
## - absolute path: the import keeps working if the working directory changes later
## - membership check: re-running this cell does not append duplicate entries
scripts_dir = os.path.abspath("scripts")
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)
from utils import model_helpers

pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.


#### load the model

- initialize GroundedSAM model with your search ontology, in this case, for butterfly detection
- set text and box thresholds for detection sensitivity (0.1 = everything above 10% detection confidence will be included)
- I'd recommend not setting these thresholds too high; instead, filter out unwanted masks yourself later
- you can add multiple labels

In [3]:
## detection sensitivity: both the text and the box threshold are set to 0.1
detection_threshold = 0.1

## search ontology: maps the caption prompt to the class label it is stored under
search_ontology = CaptionOntology({
    "butterfly": "butterfly",
    # "label": "label", ## example of adding more classes
    # "ruler": "ruler", ## example of adding more classes
})

## build the model from the ontology and the two thresholds
base_model = GroundedSAM(
    ontology=search_ontology,
    text_threshold=detection_threshold,
    box_threshold=detection_threshold,
)


trying to load grounding dino directly


torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at C:\actions-runner\_work\pytorch\pytorch\pytorch\aten\src\ATen\native\TensorShape.cpp:4319.)


final text_encoder_type: bert-base-uncased


#### collect all images

- this works with nested folders

In [None]:
## point to image root dir (joined per component so the separators match the platform)
root_dir = os.path.join("data_raw", "input_imgs", "butterflies")

## fail early with a clear message instead of silently collecting nothing
if not os.path.isdir(root_dir):
    raise FileNotFoundError(
        f"Image root dir not found: {os.path.abspath(root_dir)} (is the working directory the repo root?)"
    )

## loop through all subfolders; entries are grouped by file name, so files that
## share a name in different subfolders end up in the same list
dict_imgs = defaultdict(list)
for root, dirs, files in tqdm(list(os.walk(root_dir))):
    for file_name in files:
        dict_imgs[file_name].append({
            "image_name": file_name,
            "image_path": os.path.join(root, file_name),
        })

## make a df: one row per file, in the grouped order of dict_imgs
## (explicit columns keep the frame usable downstream even if no files were found)
flattened_data = [dict(info) for infos in dict_imgs.values() for info in infos]
data_imgs = pd.DataFrame(flattened_data, columns=["image_name", "image_path"])


100%|██████████| 1/1 [00:00<?, ?it/s]


#### set up how the results are stored

In [5]:
## output locations: a folder for the mask images and a csv file for the results table
path_mask_root_dir = "data_raw/segmentation_masks-butterflies/"
path_seg_data = "data/segmentation_results-butterflies.csv"

## create the mask folder and the csv's parent folder if they do not exist yet
for out_dir in (path_mask_root_dir, os.path.dirname(path_seg_data)):
    os.makedirs(out_dir, exist_ok=True)

## container that collects the per-mask / per-image result entries
dict_results = dict()

#### run segmentation

- depending on the size of your dataset, and whether you use a GPU or not, this can take a while
- this replicates the nested folder structure of your input folder, if it exists
- use a sensible minimum area cutoff to not include small, likely noisy detections 

In [6]:
## Segment every image listed in data_imgs and collect one entry per mask in dict_results.
## - masks with area > min_area are passed to model_helpers.filter_mask and written as png
##   into a folder that mirrors the image's subfolder below root_dir
## - images that cannot be read or that yield no masks get a single placeholder entry,
##   keyed by the image's base name
min_area = 10000 ## min area in px

## the context manager closes the bar even if an image raises midway
with tqdm(total=len(data_imgs), position=0, leave=False, desc="Segmenting images") as pbar:
    for idx1, row in data_imgs.iterrows():

        ## pull info from df
        image_name = row["image_name"]
        image_path = row["image_path"]
        base_image_name = os.path.splitext(image_name)[0]

        ## path management, preserve nested structure
        rel_dir = os.path.dirname(os.path.relpath(image_path, root_dir))
        mask_dir = os.path.join(path_mask_root_dir, rel_dir)

        ## load image; cv2.imread signals an unreadable file by returning None,
        ## a raised error is treated the same way (best effort, keep going)
        try:
            image = cv2.imread(image_path)
        except Exception:
            image = None
        if image is None:
            dict_results[base_image_name] = {"result": "no_image", "image_name": image_name}
            pbar.update(1)
            continue

        ## do prediction
        result = base_model.predict(image)

        ## no detections: add a placeholder keyed by the image itself
        ## (mask_name is only defined inside the mask loop below, so it must not be used here)
        ## NOTE(review): result.mask is presumably None when nothing was detected - hence the None guard
        if result.mask is None or len(result.mask) == 0:
            dict_results[base_image_name] = {"result": "no_detection", "image_name": image_name}
            pbar.update(1)
            continue

        for idx2, (area, mask) in enumerate(zip(result.area, result.mask)):

            ## save mask if area > min_area
            mask_name = base_image_name + f"_{idx2+1}.png"
            if area > min_area:
                ## presumably info carries the mask geometry (bbox, center, diameter) - verify in model_helpers
                roi, info = model_helpers.filter_mask(image, mask, min_area)
                os.makedirs(mask_dir, exist_ok=True)
                mask_path = os.path.join(mask_dir, mask_name)
                if not cv2.imwrite(mask_path, roi):
                    ## imwrite reports failure through its return value, not an exception
                    tqdm.write(f"Could not write mask: {mask_path}")
            else:
                info = {}

            ## store info (small masks are recorded too, just without a saved png)
            info["confidence"] = result.confidence[idx2]
            info["area"] = area
            info["mask_idx"] = idx2 + 1
            info["image_name"] = image_name
            dict_results[mask_name] = info

        ## exactly one progress tick per image
        pbar.update(1)

Segmenting images:   0%|          | 0/8 [00:00<?, ?it/s]The `device` argument is deprecated and will be removed in v5 of Transformers.
torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. Starting in PyTorch 2.9, calling checkpoint without use_reentrant will raise an exception. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.
None of the inputs have requires_grad=True. Gradients will be None
`torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.
                                                                

#### save results to csv
- this saves the results of your dictionary to a table
- use it to inspect the detected masks (e.g., confidence, size, etc)

In [7]:
## to df: one row per dict_results entry, written to path_seg_data as csv
result_columns = ["image_name", "mask_idx", "mask_name", "confidence_seg", "area", "bbox", "center", "diameter"]
int_columns = ["mask_idx", "area", "diameter"]

data_results = (
    pd.DataFrame.from_dict(dict_results, orient="index")
    .reset_index()
    .rename(columns={"index": "mask_name", "confidence": "confidence_seg"})
    ## reindex guarantees every expected column exists, so a run in which no mask
    ## passed the area filter produces an empty table instead of a KeyError on 'bbox'
    .reindex(columns=result_columns)
    ## keep only entries that have a bbox; this drops the placeholder rows
    ## (no_image / no_detection) and the masks below the area cutoff
    .loc[lambda df: df["bbox"].notna()]
    .astype({col: "int" for col in int_columns})
    .sort_values(by=["image_name", "mask_idx"])
)
data_results.to_csv(path_seg_data, index=False)