IMPLEMENTING IMAGE SEGMENTATION, COUNTING PEOPLE AND SHOWING BOUNDING BOXES

NOTE: 
FastSAM (like most “segment anything” models) does not segment only people out of the box.
By default, these models generate masks for every salient object in the image, not just people.

Why Does This Happen?
	•	FastSAM and similar models are category-agnostic by default: they segment everything that looks like an object.
	•	The mask(s) you get often include people, but also objects, animals, backgrounds, etc.


How to Remove Everything Except People

1. Use a People/Person Class Mask
	•	You need a model that knows what a “person” is, i.e., a class-aware model such as a semantic segmentation model or an object detector.
	•	Option 1: Use a dedicated human segmentation model or person detector.
	•	Option 2: Filter FastSAM’s masks using a person detector (e.g., YOLO, Faster R-CNN, or segment_anything + detectron2/GroundingDINO).


2. Practical Solution: Two-Stage Pipeline

A. Detect people first (bounding boxes), then keep only the segmentation masks that match them
	1.	Run an object detector (like YOLOv8, YOLOv5, or any with a “person” class) to get bounding boxes for people.
	2.	For each mask from FastSAM, check if its bounding box overlaps with a detected “person” box.
	•	If yes, keep the mask.
	•	If no, discard the mask.
	3.	Combine only the “person” masks to create your final mask.

B. Use a people-specific segmentation model (recommended if only people matter)
	•	Models like DeepLabv3+ with “person” class or Selfie Segmentation from MediaPipe are designed for this.



In [None]:
# YOU DO NOT NEED ALL THESE PACKAGES, BUT I DO BECAUSE OF ALL THE OTHER LITTLE STEPS I WANTED TO RUN
# I AM RUNNING python==3.10.15

import os
import cv2
import torch
import roboflow
from roboflow import Roboflow
import base64
import supervision as sv
import numpy as np
from fastsam import FastSAM, FastSAMPrompt
from segment_anything import sam_model_registry, SamAutomaticMaskGenerator, SamPredictor
import random

from matplotlib.backends.backend_agg import FigureCanvasAgg
import matplotlib.pyplot as plt

from ultralytics.nn.tasks import SegmentationModel
from ultralytics import YOLO

print("DONE")

In [None]:
#LOAD FAST_SAM MODEL

# Use the first CUDA GPU when PyTorch can see one, otherwise fall back to CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda:0')
else:
    DEVICE = torch.device('cpu')
print(f"DEVICE = {DEVICE}")

# Load the FastSAM checkpoint; the path is relative to the working directory.
fast_sam = FastSAM("FastSAM.pt")

In [None]:
# SELECT AND FORMAT PATHS TO FILE TO BE USED, AS WELL AS NAME/LOCATION TO BE SAVED

# Input image, output file stem, and the folder the input is read from.
file_n = "construction3.jpg"  # image to be worked on
file2_n = "constr3"           # stem (no extension) of the generated file
img_r = "./images/"           # folder that holds the input image

# Full input path and the .jpg path the generated image is saved under.
IMAGE_PATH = img_r + file_n
output_p = "./output" + "/" + file2_n + ".jpg"
print(IMAGE_PATH, output_p)

In [None]:
# Run the FastSAM segmentation model on the image located at IMAGE_PATH

# Run FastSAM on the image at IMAGE_PATH. conf and iou are the confidence and
# IoU thresholds handed to the model; imgsz is the inference image size.
results = fast_sam(
    source=IMAGE_PATH,
    device=DEVICE,
    retina_masks=True,
    imgsz=1024,
    conf=0.4,
    iou=0.9,
)

# FastSAMPrompt wraps the raw predictions so they can be queried and plotted.
prompt_process = FastSAMPrompt(IMAGE_PATH, results, device=DEVICE)

# "Everything" prompt: every segmented object, with no class filtering.
fastsam_masks = prompt_process.everything_prompt()

# Save the visualisation to output_p rather than rebuilding the same path by
# hand, so the configured output location has a single source of truth.
prompt_process.plot(annotations=fastsam_masks, output_path=output_p)
print("done with this")

In [None]:
# Converts masks to boolean (True/False)

def masks_to_bool(masks):
    """Return ``masks`` as a boolean NumPy array.

    Args:
        masks: A NumPy array, a torch tensor (CPU or GPU), or any array-like
            NumPy can convert, such as a (possibly empty) nested list.

    Returns:
        A boolean ``np.ndarray`` of the same shape; non-zero entries are True.
    """
    if hasattr(masks, "detach"):
        # torch tensor: drop any autograd history before converting.
        masks = masks.detach()
    if hasattr(masks, "cpu"):
        # Tensor-like object: copy to host memory, then expose as NumPy.
        masks = masks.cpu().numpy()
    return np.asarray(masks).astype(bool)


print("done with this too")

In [None]:
# Takes an image file path (image_path) and a set of segmentation masks (masks), and returns a NumPy array (the annotated image).

def annotate_image(image_path: str, masks: np.ndarray) -> np.ndarray:
    """Overlay segmentation masks on the image stored at ``image_path``.

    Args:
        image_path: Path of the image to read with OpenCV.
        masks: Masks to draw; non-zero means foreground. Expected as a stack
            of per-object masks (presumably (N, H, W) at the image's size --
            confirm against the caller). A single 2-D mask is treated as a
            stack of one.

    Returns:
        A copy of the loaded image with the masks drawn on it, each mask
        coloured by its index. With no masks the copy is returned unchanged.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")

    masks = np.asarray(masks).astype(bool)
    if masks.size == 0:
        # Nothing to draw; avoid handing an empty, shapeless array downstream.
        return image.copy()
    if masks.ndim == 2:
        masks = masks[np.newaxis, ...]

    xyxy = sv.mask_to_xyxy(masks=masks)
    detections = sv.Detections(xyxy=xyxy, mask=masks)
    mask_annotator = sv.MaskAnnotator(color_lookup=sv.ColorLookup.INDEX)
    return mask_annotator.annotate(scene=image.copy(), detections=detections)


print("done")

In [None]:
# Mask and annotate

# Turn the raw FastSAM output into boolean masks, draw them over the source
# image, and preview the annotated picture.
masks = masks_to_bool(fastsam_masks)
annotated_image = annotate_image(IMAGE_PATH, masks)
sv.plot_image(annotated_image, size=(6, 4))
print('done')

In [None]:
# REMOVE BACKGROUND AND THEN REPLACE WITH WHITE

def remove_background(image_path: str, mask: np.ndarray) -> np.ndarray:
    """Return the image as BGRA with everything outside ``mask`` transparent.

    Args:
        image_path: Path of the image to read with OpenCV.
        mask: One 2-D mask or a stack of masks along axis 0; non-zero means
            foreground. A stack is merged into a single mask. torch tensors
            are accepted.

    Returns:
        A 4-channel BGRA image whose alpha channel is 255 on the foreground
        and 0 elsewhere.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    image = cv2.imread(image_path, cv2.IMREAD_UNCHANGED)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    if image.ndim == 2:
        # Greyscale file: expand so the BGR -> BGRA conversion has 3 channels.
        image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR)
    h, w = image.shape[:2]

    if hasattr(mask, 'detach'):
        mask = mask.detach().cpu().numpy()
    mask = np.asarray(mask)

    # Merge a stack of masks BEFORE resizing: cv2.resize reads a 3-D array as
    # (rows, cols, channels), so resizing an (N, H, W) stack scrambles its axes.
    if mask.ndim > 2:
        mask = np.any(mask, axis=0)

    # Binarise so 0/1, 0/255 and boolean masks all behave the same and the
    # multiplication below cannot wrap around in uint8.
    mask = (mask > 0).astype(np.uint8)
    if mask.shape != (h, w):
        mask = cv2.resize(mask, (w, h), interpolation=cv2.INTER_NEAREST)

    # Alpha channel: foreground = 255, background = 0.
    alpha = mask * 255
    bgr = image[..., :3]
    rgba = cv2.cvtColor(bgr, cv2.COLOR_BGR2BGRA)
    rgba[..., 3] = alpha
    return rgba

# Merge the per-object masks into one foreground mask; a mask that is not
# 3-D is passed through untouched.
if masks.ndim == 3:
    final_mask = np.any(masks, axis=0).astype(np.uint8)
else:
    final_mask = masks

# Transparent-background version of the image.
rgba_result = remove_background(image_path=IMAGE_PATH, mask=final_mask)
#cv2.imwrite(output_p, rgba_result)

#sv.plot_image(rgba_result, size=(8, 8))

def remove_background_white(image_path: str, mask) -> np.ndarray:
    """Return the image with everything outside ``mask`` painted white.

    Args:
        image_path: Path of the image to read with OpenCV.
        mask: One 2-D mask or a stack of masks along axis 0; non-zero means
            foreground. A stack is merged into a single mask. torch tensors
            are accepted.

    Returns:
        A BGR image of the same shape as the loaded one, with foreground
        pixels copied from it and all other pixels set to 255.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    # Convert torch tensor to numpy if needed.
    if hasattr(mask, 'detach'):
        mask = mask.detach().cpu().numpy()
    mask = np.asarray(mask)

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    h, w = image.shape[:2]

    # Collapse a stack of masks into one, then binarise so that 0/1, 0/255 and
    # boolean masks are all treated alike (an ``== 1`` test would drop 255s).
    if mask.ndim > 2:
        mask = np.any(mask, axis=0)
    mask = (mask > 0).astype(np.uint8)

    # Resize mask to match the image; cv2.resize takes (width, height).
    if mask.shape != (h, w):
        mask = cv2.resize(mask, (w, h), interpolation=cv2.INTER_NEAREST)

    # Start from an all-white canvas and copy the foreground pixels onto it.
    foreground = mask.astype(bool)
    result = np.full_like(image, 255)
    result[foreground] = image[foreground]
    return result

# White-background composite built from the full set of FastSAM masks.
whiteBG = remove_background_white(image_path=IMAGE_PATH, mask=masks)
#cv2.imwrite(output_p, whiteBG)

#sv.plot_image(whiteBG, size=(6, 4))
print('done')

In [None]:
# Code Outline: FastSAM + YOLOv8 for “person” filtering

# Step 1. Detect people with YOLO

# Load YOLOv8 or v5 model
# Load the detector checkpoint.
yolo_model = YOLO("yolov8n.pt")  # or yolov5s.pt if you prefer

# NOTE: this rebinds `results`, which previously held the FastSAM output.
results = yolo_model(IMAGE_PATH)

# Keep the [x1, y1, x2, y2] box of every detection whose class id is 0,
# the 'person' class in COCO.
person_bboxes = [
    box
    for box, cls in zip(
        results[0].boxes.xyxy.cpu().numpy(),
        results[0].boxes.cls.cpu().numpy(),
    )
    if int(cls) == 0
]

print("done")

In [None]:
# Step 2. Filter FastSAM masks

def bbox_overlap(mask_bbox, person_bboxes, iou_threshold=0.2):
    """Tell whether ``mask_bbox`` overlaps any person box strongly enough.

    Boxes are ``[x1, y1, x2, y2]``; widths and heights are computed as
    ``x2 - x1 + 1`` and ``y2 - y1 + 1``. The result is truthy when the IoU
    between ``mask_bbox`` and at least one entry of ``person_bboxes`` is
    greater than ``iou_threshold``; an empty ``person_bboxes`` gives False.
    """
    m_x1, m_y1, m_x2, m_y2 = (mask_bbox[k] for k in range(4))

    people = np.asarray(person_bboxes)
    if people.size == 0:
        # Give the empty case a column layout so the slicing below works.
        people = people.reshape(0, 4)
    p_x1, p_y1, p_x2, p_y2 = (people[:, k] for k in range(4))

    # Width and height of each intersection rectangle, clamped at zero.
    inter_w = np.maximum(0, np.minimum(m_x2, p_x2) - np.maximum(m_x1, p_x1) + 1)
    inter_h = np.maximum(0, np.minimum(m_y2, p_y2) - np.maximum(m_y1, p_y1) + 1)
    intersection = inter_w * inter_h

    mask_area = (m_x2 - m_x1 + 1) * (m_y2 - m_y1 + 1)
    person_areas = (p_x2 - p_x1 + 1) * (p_y2 - p_y1 + 1)

    # The small epsilon keeps the division well-defined.
    ious = intersection / (mask_area + person_areas - intersection + 1e-6)
    return np.any(ious > iou_threshold)

# Filter masks
def compute_bbox_from_mask(mask):
    """Return the tight bounding box ``[x1, y1, x2, y2]`` of a 2-D mask.

    Args:
        mask: 2-D mask as a NumPy array, a torch tensor (CPU or GPU), or a
            nested list; values greater than zero count as foreground.

    Returns:
        ``[x_min, y_min, x_max, y_max]`` over the foreground pixels, or
        ``[0, 0, 0, 0]`` when the mask has no foreground.
    """
    if hasattr(mask, 'detach'):
        # torch tensor, possibly on the GPU: NumPy cannot read it directly.
        mask = mask.detach().cpu().numpy()
    ys, xs = np.nonzero(np.asarray(mask) > 0)
    if xs.size == 0:
        return [0, 0, 0, 0]
    return [xs.min(), ys.min(), xs.max(), ys.max()]

# Keep only the FastSAM masks whose bounding box overlaps a detected person.
# The masks are converted to boolean NumPy arrays first: fastsam_masks may be
# a torch tensor living on DEVICE, which the NumPy-based helpers cannot read
# when that device is a GPU. An empty FastSAM result yields no person masks.
candidate_masks = masks_to_bool(fastsam_masks) if len(fastsam_masks) else []
person_masks = [
    mask
    for mask in candidate_masks
    if bbox_overlap(compute_bbox_from_mask(mask), person_bboxes)
]

print("done")

In [None]:
# Display the bounding boxes and masks corresponding to people
#	•	Draws each YOLO person bounding box in red.
#	•	Overlays each mask classified as “human” in a translucent color (blue or random).
#	•	Draws a cyan dashed bounding box around each accepted mask for clarity.

def draw_bboxes_and_masks(image_path, person_bboxes, person_masks):
    """Plot the image with person boxes and the accepted masks drawn on top.

    Args:
        image_path: Path of the image to read with OpenCV.
        person_bboxes: Iterable of ``[x1, y1, x2, y2]`` boxes, drawn in red.
        person_masks: Iterable of 2-D masks (NumPy arrays or torch tensors).
            Each is tinted with a random bluish colour and outlined with a
            dashed cyan box.

    Raises:
        FileNotFoundError: If OpenCV cannot read ``image_path``.
    """
    import cv2
    import numpy as np
    import matplotlib.pyplot as plt
    import random

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread reports failure by returning None instead of raising.
        raise FileNotFoundError(f"Could not read image: {image_path}")
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    h, w = image.shape[:2]

    plt.figure(figsize=(10, 10))
    plt.imshow(image_rgb)
    ax = plt.gca()

    # Draw YOLO person bboxes in red.
    for x1, y1, x2, y2 in person_bboxes:
        ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
                                   edgecolor='red', facecolor='none'))

    # RGBA overlay that is fully transparent except on mask pixels. A
    # 3-channel overlay drawn with one global alpha would also blend black
    # over every unmasked pixel and darken the whole picture.
    overlay = np.zeros((h, w, 4), dtype=np.uint8)
    mask_alpha = int(0.35 * 255)
    for mask in person_masks:
        # Convert torch tensor to numpy if needed.
        if hasattr(mask, 'detach'):
            mask = mask.detach().cpu().numpy()
        mask = np.asarray(mask).astype(bool)
        overlay[mask] = (random.randint(0, 255), random.randint(0, 255), 255, mask_alpha)
        x1, y1, x2, y2 = compute_bbox_from_mask(mask)
        ax.add_patch(plt.Rectangle((x1, y1), x2 - x1, y2 - y1, linewidth=2,
                                   edgecolor='cyan', facecolor='none', linestyle='--'))

    plt.imshow(overlay)
    plt.title("People Bounding Boxes (red) and Accepted Masks (blue/cyan)")
    plt.axis('off')
    plt.show()

# Example usage: show the YOLO person boxes together with the kept masks.
draw_bboxes_and_masks(
    image_path=IMAGE_PATH,
    person_bboxes=person_bboxes,
    person_masks=person_masks,
)

In [None]:
# A. For Images With or Without Alpha Channel (General Case)

def show_image(image, title="Result"):
    """Display an OpenCV-ordered image with matplotlib.

    Args:
        image: Greyscale ``(H, W)`` or ``(H, W, 1)``, BGR ``(H, W, 3)`` or
            BGRA ``(H, W, 4)`` array.
        title: Text shown above the figure.
    """
    cmap = None
    if image.ndim == 2:
        # Single-channel image: there is no channel axis to reorder.
        display, cmap = image, 'gray'
    elif image.shape[2] == 1:
        display, cmap = image[..., 0], 'gray'
    elif image.shape[2] == 4:
        # OpenCV stores BGRA; matplotlib expects RGBA.
        display = cv2.cvtColor(image, cv2.COLOR_BGRA2RGBA)
    else:
        # OpenCV stores BGR; matplotlib expects RGB.
        display = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    plt.figure(figsize=(6, 4))
    plt.imshow(display, cmap=cmap)
    plt.axis('off')
    plt.title(title)
    plt.show()

# Usage: overlay only the person masks and display the result.
# Each mask is copied to host memory first, because np.array cannot ingest
# GPU tensors; an empty selection is reported instead of being passed on.
person_mask_arrays = [
    m.detach().cpu().numpy() if hasattr(m, 'detach') else np.asarray(m)
    for m in person_masks
]
if person_mask_arrays:
    annotated_img = annotate_image(IMAGE_PATH, np.array(person_mask_arrays))
    show_image(annotated_img, "People Segmentation Result")
else:
    print("No person masks to display.")

In [None]:
# B. For White Background Results Only (3-channel BGR image)

# Composite only the person masks onto white so the figure matches its title.
# whiteBG was built from every FastSAM mask, so it would show all segmented
# objects rather than people only.
person_mask_arrays = [
    m.detach().cpu().numpy() if hasattr(m, 'detach') else np.asarray(m)
    for m in person_masks
]
if person_mask_arrays:
    # Union of all person masks as a single 0/1 mask.
    people_mask = np.any(np.stack(person_mask_arrays) > 0, axis=0).astype(np.uint8)
    people_white_bg = remove_background_white(IMAGE_PATH, people_mask)

    plt.figure(figsize=(6, 6))
    plt.imshow(cv2.cvtColor(people_white_bg, cv2.COLOR_BGR2RGB))
    plt.axis('off')
    plt.title("People Segmentation Result")
    plt.show()
else:
    print("No person masks to display.")