**Counting Weed Instances per Image**

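This script analyzes a dataset of paired semantic and instance segmentation masks and counts the weed instances in each image. An instance is counted as a weed when at least half of its pixels carry a weed label in the semantic mask. The per-image counts are written to a JSON file, sorted from fewest to most weed instances, which can support targeted augmentation or training.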

In [None]:
import os
import cv2
import numpy as np
from tqdm import tqdm
import json

# Directory whose files are enumerated as the semantic masks to analyse.
semantic_dir = "semantics"
# Directory searched for an instance mask with the same filename as each semantic mask.
instance_dir = "plant_instances"
# Path of the JSON file that receives the per-image weed counts.
output_json = "weed_instance_counts.json"

def _load_mask(source):
    """Return *source* as a mask array, reading it from disk if it is a path.

    NumPy arrays are passed through untouched. Anything else is handed to
    ``cv2.imread`` with ``IMREAD_UNCHANGED``; that call yields ``None`` when
    the file cannot be read.
    """
    if isinstance(source, np.ndarray):
        return source
    return cv2.imread(source, cv2.IMREAD_UNCHANGED)


def count_weed_instances(semantic_path, instance_path,
                         weed_labels=frozenset({2, 4}), threshold=0.5):
    """Count the instances in one image that are predominantly weed.

    Every positive id in the instance mask is one instance. An instance is
    counted when the fraction of its pixels whose semantic label is in
    ``weed_labels`` is at least ``threshold``.

    Args:
        semantic_path: Path to the semantic mask image, or an already-loaded
            mask as a NumPy array.
        instance_path: Path to the instance mask image, or an already-loaded
            mask as a NumPy array. Ids ``<= 0`` are treated as background.
        weed_labels: Iterable of semantic label values regarded as weed.
        threshold: Minimum weed-pixel fraction (inclusive) for an instance
            to be counted.

    Returns:
        The number of weed instances as an ``int``, or ``None`` if either
        mask could not be read.

    Raises:
        ValueError: If the two masks do not have the same shape, since the
            pixels could then not be paired one-to-one.
    """
    sem_mask = _load_mask(semantic_path)
    inst_mask = _load_mask(instance_path)

    if sem_mask is None or inst_mask is None:
        return None

    # Pixels are paired element-wise below, so the masks must line up exactly.
    if sem_mask.shape != inst_mask.shape:
        raise ValueError(
            f"semantic mask shape {sem_mask.shape} does not match "
            f"instance mask shape {inst_mask.shape}"
        )

    inst_flat = inst_mask.ravel()
    foreground = inst_flat > 0
    if not foreground.any():
        return 0

    # Map every foreground pixel to a dense 0..k-1 index of its instance so
    # that all instances are tallied in one pass instead of rescanning the
    # whole mask once per instance id.
    _, inst_index = np.unique(inst_flat[foreground], return_inverse=True)
    inst_index = inst_index.ravel()
    is_weed = np.isin(sem_mask.ravel()[foreground], list(weed_labels))

    pixels_per_instance = np.bincount(inst_index)
    weed_pixels_per_instance = np.bincount(
        inst_index[is_weed], minlength=pixels_per_instance.size
    )

    # Every dense index occurs at least once, so the divisor is never zero.
    weed_ratio = weed_pixels_per_instance / pixels_per_instance
    return int(np.count_nonzero(weed_ratio >= threshold))

# Collect a weed-instance count for every semantic mask that has a readable,
# same-named instance mask, then write the counts to `output_json`.

# Mask files are recognised by extension. The filename is lowercased before
# the comparison so that files such as "IMG_001.PNG" are not ignored.
mask_extensions = ('.png', '.jpg', '.tif')

weed_counts = {}
skipped = []  # (filename, reason) for every mask that could not be counted
filenames = sorted(os.listdir(semantic_dir))

for filename in tqdm(filenames, desc="Analysing images"):
    if not filename.lower().endswith(mask_extensions):
        continue

    semantic_path = os.path.join(semantic_dir, filename)
    instance_path = os.path.join(instance_dir, filename)

    if not os.path.exists(instance_path):
        skipped.append((filename, "no matching instance mask"))
        continue

    # count_weed_instances returns None when a mask could not be read.
    weed_count = count_weed_instances(semantic_path, instance_path)
    if weed_count is None:
        skipped.append((filename, "mask could not be read"))
        continue
    weed_counts[filename] = weed_count

# Report what was left out so that gaps in the output are not silent.
if skipped:
    print(f"Skipped {len(skipped)} image(s):")
    for skipped_name, reason in skipped:
        print(f"  {skipped_name}: {reason}")

# Order by ascending weed count. The sort is stable and filenames were
# processed in sorted order, so ties stay in filename order.
sorted_counts = dict(sorted(weed_counts.items(), key=lambda item: item[1]))

with open(output_json, 'w') as f:
    json.dump(sorted_counts, f, indent=2)