In [53]:
import os
import torch
import json
import pandas as pd
from IPython.display import Image, clear_output

In [6]:
print(f"Setup complete. Using torch {torch.__version__} " \
      f"({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

Setup complete. Using torch 2.6.0+cu124 (NVIDIA GeForce RTX 3090)


In [49]:
ds_path = "/SKU110K_fixed/"

In [29]:
# annotations = pd.read_csv("/SKU110K_fixed/annotations/annotations_test.csv")
# annotations.columns=['image_name','x1','y1','x2','y2','class','image_width','image_height']
# annotations.head(5)

Unnamed: 0,image_name,x1,y1,x2,y2,class,image_width,image_height
0,test_0.jpg,727,2269,862,2376,object,2448,3264
1,test_0.jpg,463,2274,715,2434,object,2448,3264
2,test_0.jpg,158,2290,283,2444,object,2448,3264
3,test_0.jpg,0,2290,154,2456,object,2448,3264
4,test_0.jpg,1995,2032,2131,2199,object,2448,3264


In [42]:
raw_data = {
    'train': None,
    'val': None,
    'test': None
}

for split in raw_data.keys():
    annotations = pd.read_csv(f"/SKU110K_fixed/annotations/annotations_{split}.csv")
    annotations.columns=['image_name','x1','y1','x2','y2','class','image_width','image_height']
    raw_data[split] = annotations
    print(f"{split} split: {len(annotations)} annotations, {len(annotations.groupby('image_name'))} samples")


train split: 1208481 annotations, 8219 samples
val split: 90967 annotations, 588 samples
test split: 431545 annotations, 2936 samples


In [55]:
ds = {}

for split, data in raw_data.items():
    
    data = data.reset_index()
    images = set(data['image_name'])

    ds[split] = []

    for i, image_name in enumerate(list(images)):
        df = data[data['image_name'] == image_name]

        img_path = os.path.join(ds_path,image_name)
        bboxes = []

        for idx, ann in df.iterrows():
            bbox = [ann['x1'],ann['x2'],ann['y1'],ann['y2']]
            bboxes.append(bbox)        

        ds[split].append({"image_path": img_path, "bboxes": bboxes})
        if i%50 == 0:
            print(f"{100*i/len(images):.1f}% of {split} split processed")


with open('sku110_dataset.json', 'w') as f: 
    json.dump(ds, f)

0.0% of train split processed
0.6% of train split processed
1.2% of train split processed
1.8% of train split processed
2.4% of train split processed
3.0% of train split processed
3.7% of train split processed
4.3% of train split processed
4.9% of train split processed
5.5% of train split processed
6.1% of train split processed
6.7% of train split processed
7.3% of train split processed
7.9% of train split processed
8.5% of train split processed
9.1% of train split processed
9.7% of train split processed
10.3% of train split processed
11.0% of train split processed
11.6% of train split processed
12.2% of train split processed
12.8% of train split processed
13.4% of train split processed
14.0% of train split processed
14.6% of train split processed
15.2% of train split processed
15.8% of train split processed
16.4% of train split processed
17.0% of train split processed
17.6% of train split processed
18.3% of train split processed
18.9% of train split processed
19.5% of train split proc

In [59]:
len(ds['train'])

8219

In [36]:
def generate_mask_from_box(image, bbox, grabcut_iter=5):
    """
    Given a single bounding box [x1, y1, x2, y2] and an image (H x W x 3),
    return a binary mask (H x W) indicating the object's region.
    
    We use GrabCut initialized by a rectangle around the bounding box.
    This is a naive example of turning a box into a (rough) segmentation mask.
    """
    x1, y1, x2, y2 = bbox
    
    # Ensure coordinates are within image bounds
    h, w = image.shape[:2]
    x1 = max(0, min(x1, w - 1))
    x2 = max(0, min(x2, w - 1))
    y1 = max(0, min(y1, h - 1))
    y2 = max(0, min(y2, h - 1))
    
    # Create initial mask for GrabCut
    # Possible values in 'mask':
    #    cv2.GC_BGD    = 0 (definite background)
    #    cv2.GC_FGD    = 1 (definite foreground)
    #    cv2.GC_PR_BGD = 2 (probable background)
    #    cv2.GC_PR_FGD = 3 (probable foreground)
    mask = np.zeros((h, w), np.uint8)
    
    # GrabCut requires 2 background/foreground models
    bgd_model = np.zeros((1, 65), np.float64)
    fgd_model = np.zeros((1, 65), np.float64)

    # Rectangle for the bounding box
    rect = (x1, y1, x2 - x1, y2 - y1)
    
    # Initialize GrabCut with the bounding box
    cv2.grabCut(
        image,        # input image
        mask,         # mask to be modified
        rect,         # rectangle that includes the foreground object
        bgd_model,    # temporary array for background model
        fgd_model,    # temporary array for foreground model
        grabcut_iter, # number of iterations
        mode=cv2.GC_INIT_WITH_RECT
    )
    
    # After GrabCut, mask is in {0,1,2,3}, so let's create a binary mask
    # "where definite foreground OR probable foreground => object"
    bin_mask = np.where(
        (mask == cv2.GC_FGD) | (mask == cv2.GC_PR_FGD), 
        255, 
        0
    ).astype('uint8')
    
    return bin_mask


In [37]:
def box2mask_synthetic(annotations, output_dir='synthetic_masks'):
    """
    Loop over images & bounding boxes, produce synthetic masks, and save them.
    Each bounding box yields one mask image (with the same resolution).
    
    :param annotations: list of dicts
                       [ { 'image_path': ..., 'bboxes': [[x1,y1,x2,y2], ...] }, ...]
    :param output_dir: Where to store the mask images
    """
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    
    for ann_idx, ann in enumerate(annotations):
        image_path = ann['image_path']
        bboxes = ann['bboxes']
        
        # Read image
        image = cv2.imread(image_path)
        if image is None:
            print(f"Warning: Could not read image {image_path}. Skipping.")
            continue
        
        base_name = os.path.splitext(os.path.basename(image_path))[0]
        
        # For each bounding box, generate a mask
        for i, bbox in enumerate(bboxes):
            mask = generate_mask_from_box(image, bbox)
            
            # Save the mask as a PNG (or any format)
            mask_filename = f"{base_name}_box{i}.png"
            output_path = os.path.join(output_dir, mask_filename)
            cv2.imwrite(output_path, mask)
            
        print(f"Processed {len(bboxes)} boxes for {image_path} => masks saved to {output_dir}/")


In [None]:
box2mask_synthetic(annotations, output_dir=os.path.join(ds_path, 'synthetic_masks_output'))