In [89]:
import os
from pathlib import Path

# Resolve the repository root regardless of which of the two supported
# directories the notebook kernel was started in.
current_directory = Path.cwd()
if current_directory.name == "semantic_segmentation":
    # Started inside the `semantic_segmentation` sub-directory: the root is one level up.
    root_directory = current_directory.parent
elif current_directory.name == "f3loc":
    # Started in the repository root itself.
    root_directory = current_directory
else:
    # Fail early with a clear message instead of a NameError on `root_directory` below.
    raise RuntimeError(
        f"Unexpected working directory {current_directory}; start the notebook from "
        "the 'f3loc' root or its 'semantic_segmentation' sub-directory."
    )

# All later relative paths are resolved against the repository root.
os.chdir(root_directory)
root_directory

PosixPath('/cluster/home/wueestm/f3loc')

In [90]:
import numpy as np
import cv2

def gravity_align_segmentation(
    seg_map,
    r,
    p,
    K=np.array([[240, 0, 320], [0, 240, 240], [0, 0, 1]]).astype(np.float32),
    mode=0
):
    """
    Warp a multi-channel segmentation map into the gravity-aligned view.

    Every channel is warped independently with the same homography
    ``K @ R_gc @ inv(K)`` built from the given roll and pitch.

    Input:
        seg_map: array of shape (N, H, W); N is the number of channels.
        r: roll angle in radians.
        p: pitch angle in radians (negated internally).
        K: 3x3 camera intrinsics.
        mode: interpolation for the warp; 1 selects 'nearest', anything else 'linear'.

    Output:
        aligned_seg_map: array with the shape and dtype of seg_map.

    Raises:
        ValueError: if seg_map is not 3-dimensional.
    """
    if seg_map.ndim != 3:
        raise ValueError("Segmentation map must be a 3D array with shape (N, H, W).")

    _, height, width = seg_map.shape

    # Robot and camera pitch axes point in opposite directions, hence the sign flip.
    pitch = -p
    cos_r, sin_r = np.cos(r), np.sin(r)
    cos_p, sin_p = np.cos(pitch), np.sin(pitch)

    # Elementary rotations: pitch about x, roll about z.
    rot_pitch = np.array([[1, 0, 0], [0, cos_p, sin_p], [0, -sin_p, cos_p]])
    rot_roll = np.array([[cos_r, sin_r, 0], [-sin_r, cos_r, 0], [0, 0, 1]])

    # R_cg = roll @ pitch; its transpose is R_gc.
    R_gc = (rot_roll @ rot_pitch).T

    # Homography induced by the rotation.
    homography = K @ R_gc @ np.linalg.inv(K)

    interpolation = cv2.INTER_NEAREST if mode == 1 else cv2.INTER_LINEAR

    # Same shape/dtype as the input; filled channel by channel.
    aligned_seg_map = np.zeros_like(seg_map)
    for idx, channel in enumerate(seg_map):
        aligned_seg_map[idx] = cv2.warpPerspective(
            channel, homography, (width, height), flags=interpolation
        )

    return aligned_seg_map

# Example usage on synthetic data
if __name__ == "__main__":
    # Random 150-channel map with H=150, W=200
    seg_map = np.random.randint(0, 10, size=(150, 150, 200), dtype=np.uint8)

    # Camera attitude, both angles in radians
    roll, pitch = 0.1, 0.2

    # Warp every channel into the gravity-aligned view
    aligned_seg_map = gravity_align_segmentation(seg_map, r=roll, p=pitch)


In [91]:
def gravity_align(
    img,
    r,
    p,
    K=np.array([[240, 0, 320], [0, 240, 240], [0, 0, 1]]).astype(np.float32),
    mode=0,
):
    """
    Warp an image into the gravity-aligned view.

    Input:
        img: input image; its first two dimensions are taken as (height, width)
        r: roll
        p: pitch (negated internally)
        K: 3x3 camera intrinsics
        mode: interpolation for the warp; 1 selects 'nearest', anything else 'linear'
    Output:
        aligned_img: gravity aligned image with the same width and height as img
    """
    # Rotation order: gravity -> camera is yaw->pitch->roll,
    # camera -> gravity is roll->pitch->yaw.
    # Robot and camera pitch axes point in opposite directions, hence the sign flip.
    pitch = -p
    cos_r, sin_r = np.cos(r), np.sin(r)
    cos_p, sin_p = np.cos(pitch), np.sin(pitch)

    # Elementary rotations: pitch about x, roll about z.
    rot_pitch = np.array([[1, 0, 0], [0, cos_p, sin_p], [0, -sin_p, cos_p]])
    rot_roll = np.array([[cos_r, sin_r, 0], [-sin_r, cos_r, 0], [0, 0, 1]])

    # R_cg = roll @ pitch; its transpose is R_gc.
    R_gc = (rot_roll @ rot_pitch).T

    height, width = img.shape[:2]

    # Homography induced by the rotation.
    homography = K @ R_gc @ np.linalg.inv(K)

    if mode == 1:
        interpolation = cv2.INTER_NEAREST
    else:
        interpolation = cv2.INTER_LINEAR

    return cv2.warpPerspective(img, homography, (width, height), flags=interpolation)

In [None]:
# NOTE(review): this looks like a fragment pasted from a class method. It references
# `self.K` and `ref_img` (neither is defined in the visible cells) and the continuation
# lines keep their original indentation, so the cell is not runnable as-is.
roll = ref_euler_angles[0]
        pitch = ref_euler_angles[1]
        ref_img = gravity_align(ref_img, r=pitch, p=-(roll+np.pi/2),  mode=1, K=self.K)

In [None]:
# from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
# from PIL import Image
# import requests
# 
# image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
# model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
# 
# url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# image = Image.open(requests.get(url, stream=True).raw)
# 
# inputs = image_processor(images=image, return_tensors="pt")
# outputs = model(**inputs)
# logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)
# list(logits.shape)


In [None]:
# Show `image` inline.
# NOTE(review): above this cell `image` is only assigned inside commented-out code, so on a
# fresh kernel it is first defined by a later cell.
image

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
from PIL import Image

# Load the image processor and model.
# Larger checkpoints that can be swapped in here: nvidia/segformer-b1-, -b2-, -b4-finetuned-ade-512-512
# and nvidia/segformer-b5-finetuned-ade-640-640.
image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

image_path_stem = "/cluster/home/wueestm/f3loc/metric3d/data/hge_customized_complete/non-aligned/rgb/"
# Other frames used so far: 00000-0, 00090-0, 00120-0, 00173-0, 00334-0, 00342-0 (all .jpg)
image_path_ls = [image_path_stem + "00120-0.jpg"]

# Everything below up to the loop does not depend on the image, so it is built once.

# Manually defined ADE20K label mapping (class id as string -> class name)
ade20k_labels = {
    "0": "wall", "1": "building", "2": "sky", "3": "floor", "4": "tree",
    "5": "ceiling", "6": "road", "7": "bed", "8": "windowpane", "9": "grass",
    "10": "cabinet", "11": "sidewalk", "12": "person", "13": "earth", "14": "door",
    "15": "table", "16": "mountain", "17": "plant", "18": "curtain", "19": "chair",
    "20": "car", "21": "water", "22": "painting", "23": "sofa", "24": "shelf",
    "25": "house", "26": "sea", "27": "mirror", "28": "rug", "29": "field",
    "30": "armchair", "31": "seat", "32": "fence", "33": "desk", "34": "rock",
    "35": "wardrobe", "36": "lamp", "37": "bathtub", "38": "railing", "39": "cushion",
    "40": "base", "41": "box", "42": "column", "43": "signboard", "44": "chest of drawers",
    "45": "counter", "46": "sand", "47": "sink", "48": "skyscraper", "49": "fireplace",
    "50": "refrigerator", "51": "grandstand", "52": "path", "53": "stairs", "54": "runway",
    "55": "case", "56": "pool table", "57": "pillow", "58": "screen door", "59": "stairway",
    "60": "river", "61": "bridge", "62": "bookcase", "63": "blind", "64": "coffee table",
    "65": "toilet", "66": "flower", "67": "book", "68": "hill", "69": "bench",
    "70": "countertop", "71": "stove", "72": "palm", "73": "kitchen island", "74": "computer",
    "75": "swivel chair", "76": "boat", "77": "bar", "78": "arcade machine", "79": "hovel",
    "80": "bus", "81": "towel", "82": "light", "83": "truck", "84": "tower",
    "85": "chandelier", "86": "awning", "87": "streetlight", "88": "booth", "89": "television receiver",
    "90": "airplane", "91": "dirt track", "92": "apparel", "93": "pole", "94": "land",
    "95": "bannister", "96": "escalator", "97": "ottoman", "98": "bottle", "99": "buffet",
    "100": "poster", "101": "stage", "102": "van", "103": "ship", "104": "fountain",
    "105": "conveyer belt", "106": "canopy", "107": "washer", "108": "plaything", "109": "swimming pool",
    "110": "stool", "111": "barrel", "112": "basket", "113": "waterfall", "114": "tent",
    "115": "bag", "116": "minibike", "117": "cradle", "118": "oven", "119": "ball",
    "120": "food", "121": "step", "122": "tank", "123": "trade name", "124": "microwave",
    "125": "pot", "126": "animal", "127": "bicycle", "128": "lake", "129": "dishwasher",
    "130": "screen", "131": "blanket", "132": "sculpture", "133": "hood", "134": "sconce",
    "135": "vase", "136": "traffic light", "137": "tray", "138": "ashcan", "139": "fan",
    "140": "pier", "141": "crt screen", "142": "plate", "143": "monitor", "144": "bulletin board",
    "145": "shower", "146": "radiator", "147": "glass", "148": "clock", "149": "flag"
}

# Classes to keep and display in color; every other class stays black
classes_to_display = [0, 1, 3, 5, 8, 12, 14, 15, 19, 42, 53, 59, 97, 104, 132, 147]
class_names = {i: ade20k_labels[str(i)] for i in classes_to_display}

# Manually specified distinct RGB colors for each displayed class
class_colors = {
    0: [255, 0, 0],        # Red for 'wall'
    1: [0, 255, 0],        # Green for 'building'
    3: [0, 0, 255],        # Blue for 'floor'
    5: [255, 255, 0],      # Yellow for 'ceiling'
    8: [0, 255, 255],      # Cyan for 'windowpane'
    12: [255, 0, 255],     # Magenta for 'person'
    14: [128, 0, 128],     # Purple for 'door'
    15: [128, 128, 0],     # Olive for 'table'
    19: [0, 128, 128],     # Teal for 'chair'
    42: [90, 90, 90],      # Dark gray for 'column'
    53: [192, 192, 192],   # Light gray for 'stairs'
    59: [255, 165, 0],     # Orange for 'stairway'
    97: [255, 105, 180],   # Pink for 'ottoman'
    104: [255, 20, 147],   # Deep pink for 'fountain'
    132: [255, 69, 0],     # Red orange for 'sculpture'
    147: [139, 69, 19]     # Saddle brown for 'glass'
}

# Segment and visualise each image
for image_path in image_path_ls:
    # Force three channels so grayscale / RGBA files are handled like RGB ones
    image = Image.open(image_path).convert("RGB")

    # Preprocess the image and forward it through the model.
    # Inference only: no_grad() avoids building the autograd graph.
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)

    # Get the predicted class for each pixel
    predicted_class = logits.argmax(dim=1).squeeze().cpu().numpy()

    # Resize the predicted class indices to match the original image size.
    # Nearest-neighbour keeps the values valid class ids; ids go up to 149, so uint8 is enough.
    predicted_class_resized = Image.fromarray(predicted_class.astype(np.uint8)).resize(image.size, resample=Image.NEAREST)
    predicted_class_resized = np.array(predicted_class_resized)

    # Create a blank segmentation map (all black)
    segmentation_map = np.zeros((*predicted_class_resized.shape, 3), dtype=np.uint8)

    # Fill in the segmentation map with colors only for the specified classes
    for class_id, color in class_colors.items():
        class_mask = predicted_class_resized == class_id
        segmentation_map[class_mask] = color

    # Convert to image format
    segmentation_image = Image.fromarray(segmentation_map)

    # Display the original image and the segmentation map side by side
    plt.figure(figsize=(12, 8))

    plt.subplot(1, 2, 1)
    plt.title("Original Image")
    plt.imshow(image)
    plt.axis("off")

    plt.subplot(1, 2, 2)
    plt.title("Segmentation Map")
    plt.imshow(segmentation_image)
    plt.axis("off")

    # Build the legend from proxy patches so nothing extra is drawn on top of the image axes
    legend_handles = [
        plt.Rectangle((0, 0), 1, 1, color=np.array(class_colors[class_id]) / 255.0)
        for class_id in class_names
    ]
    plt.legend(
        legend_handles,
        list(class_names.values()),
        loc='center left',
        bbox_to_anchor=(1, 0.5),
        title="Classes",
        title_fontsize='13',
    )
    plt.tight_layout()
    plt.show()


In [None]:
# Re-open the file at `image_path` (set by the preceding cell) so it can be inspected.
image = Image.open(image_path)

In [None]:
# Show the image inline.
image

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
from PIL import Image

# Load the image processor and model
image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

image_path_stem = "/cluster/home/wueestm/f3loc/metric3d/data/hge_customized_complete/non-aligned/rgb/"
image_path = image_path_stem + "00120-0.jpg"

# Keep the raw image and a copy that is guaranteed to have three channels
image = Image.open(image_path)
image_rgb = image.convert('RGB')

# Preprocess the image and forward it through the model.
# Inference only: no_grad() avoids building the autograd graph.
inputs = image_processor(images=image_rgb, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)

# Get the predicted class for each pixel
predicted_class = logits.argmax(dim=1).squeeze().cpu().numpy()

# Resize the predicted class indices to match the original image size.
# Nearest-neighbour keeps the values valid class ids.
predicted_class_resized = Image.fromarray(predicted_class.astype(np.uint8)).resize(image.size, resample=Image.NEAREST)
predicted_class_resized = np.array(predicted_class_resized)



In [None]:
# `image_rgb` is a PIL image, which has no `.shape`; `.size` gives (width, height).
image_rgb.size

In [None]:
# Convert the PIL image to a NumPy array to inspect its array shape.
image_np = np.array(image_rgb)
image_np.shape

In [92]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
from PIL import Image

# Load the image processor and model
image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

image_path_stem = "/cluster/home/wueestm/f3loc/metric3d/data/hge_customized_complete/non-aligned/rgb/"
image_path = image_path_stem + "00120-0.jpg"

# Raw image, three-channel copy, and the same pixels as a NumPy array
image = Image.open(image_path)
image_rgb = image.convert('RGB')
rgb_origin = np.array(image_rgb)


# Preprocess the image and forward it through the model.
# Inference only: no_grad() avoids building the autograd graph.
inputs = image_processor(images=rgb_origin, return_tensors="pt")
#inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)

# Get the logits for the first image in the batch
logits = logits.squeeze(0)  # Remove batch dimension, shape (num_labels, height/4, width/4)

# Get the dimensions of the original image (PIL reports width first)
original_width, original_height = image.size

# Integer upscaling factors (rows, cols); informational only, not used by the resize below
resize_factor = (original_height // logits.shape[1], original_width // logits.shape[2])

# Resize the logits (not the argmax) to the original image size so they can be warped
# and re-normalised later
logits_resized = torch.nn.functional.interpolate(
    logits.unsqueeze(0),  # Add batch dimension back for interpolation
    size=(original_height, original_width),
    mode='bilinear',
    align_corners=False
).squeeze(0)  # Remove batch dimension

# Convert the resized logits to a NumPy array
logits_np = logits_resized.detach().cpu().numpy()  # shape (num_labels, original_height, original_width)

print("Resized logits shape:", logits_np.shape)

Resized logits shape: (150, 1920, 1440)


In [None]:
# Inspect the array shape of the RGB image.
rgb_origin.shape

In [None]:
# First two dimensions only, i.e. without the trailing channel axis.
rgb_origin.shape[:2]

In [None]:
# # Compute the predicted class for each pixel
# predicted_class = logits_np.argmax(axis=0)  # shape (original_height, original_width)
# 
# # Define a color map for visualization
# def get_color_map(num_classes):
#     cmap = plt.get_cmap('tab20', num_classes)
#     return (cmap(np.arange(num_classes)) * 255).astype(np.uint8)
# 
# # Get a color map for the classes
# num_classes = logits_np.shape[0]
# color_map = get_color_map(num_classes)
# 
# # Create a blank segmentation map (all black)
# segmentation_map = np.zeros((*predicted_class.shape, 3), dtype=np.uint8)
# 
# # Fill in the segmentation map with colors only for the specified classes
# for class_id in range(num_classes):
#     mask = (predicted_class == class_id)
#     segmentation_map[mask] = color_map[class_id][:3]  # Ensure we use only the RGB channels
# 
# # Convert to image format
# segmentation_image = Image.fromarray(segmentation_map)
# 
# # Display the original image and segmentation map
# plt.figure(figsize=(12, 8))
# 
# plt.subplot(1, 2, 1)
# plt.title("Original Image")
# plt.imshow(image)
# plt.axis("off")
# 
# plt.subplot(1, 2, 2)
# plt.title("Segmentation Map")
# plt.imshow(segmentation_image)
# plt.axis("off")
# 
# plt.show()

In [93]:
# Reference camera orientation. Other cells in this notebook read index 0 as roll and
# index 1 as pitch; presumably radians, since np.pi/2 is added to it — TODO confirm.
#ref_euler_angles = [-2.05386654, 0.05212421, -1.59732346]
ref_euler_angles = [-1.78765814, -0.00908211, 1.51359217]
# Camera intrinsics: fx = fy = 1596, principal point (cx, cy) = (960, 720).
# NOTE(review): the resized logits are 1920 rows x 1440 columns, whereas this principal point
# would be centred for an image 1920 wide x 1440 high — confirm K matches the image orientation.
K = np.array([[1596, 0, 960], [0, 1596, 720], [0, 0, 1]])

In [94]:
# Same angle mapping as the `gravity_align(ref_img, r=pitch, p=-(roll+np.pi/2), ...)` call
# found elsewhere in this notebook: element 1 goes in as `r`, element 0 is shifted by pi/2
# and negated before going in as `p`.
r=ref_euler_angles[1]
p=-(ref_euler_angles[0]+np.pi/2)
# Warp every logit channel with the homography built from (r, p, K).
aligned_logits = gravity_align_segmentation(logits_np, r, p, K)
aligned_logits.shape

(150, 1920, 1440)

In [None]:
# Build a validity mask: warp an all-ones image (same H x W as the logits) with the same
# angles and intrinsics, using nearest-neighbour interpolation (mode=1).
mask = np.ones(list(aligned_logits.shape[1:3]))
#mask = gravity_align(mask, r, p, visualize=False, mode=1)
mask = gravity_align(mask, r, p, mode=1, K=K)
# Everything that did not come back as 1 is marked invalid (presumably the border area the
# warp could not fill from the source image — TODO confirm OpenCV's default border value).
mask[mask < 1] = 0

In [None]:
# Zero the logits of every channel at pixels flagged invalid by `mask` (in place).
aligned_logits[:, mask == 0] = 0

In [None]:
def softmax(logits):
    """
    Turn per-pixel logits into per-pixel class probabilities.

    Input:
        logits: array of shape (N, H, W), where N is the number of channels.

    Output:
        probabilities: float32 array of shape (N, H, W); the softmax is taken
        over the channel axis (axis 0).
    """
    # Work in float32 and subtract the per-pixel maximum before exponentiating
    # so that large logits cannot overflow.
    shifted = logits.astype(np.float32)
    shifted = shifted - shifted.max(axis=0, keepdims=True)

    numerators = np.exp(shifted)
    return numerators / numerators.sum(axis=0, keepdims=True)


In [None]:
# Inspect the shape of the validity mask.
mask.shape

In [None]:
# Scratch check: evaluates to negative infinity.
-float('inf')

In [None]:
import torch
import torch.nn.functional as F
import numpy as np

def softmax_pytorch(logits, mask=None, device=None):
    """
    Apply softmax to the logits to get probabilities using PyTorch.

    Input:
        logits: input segmentation map of shape (N, H, W), where N is the number of channels.
        mask: optional (H, W) mask; pixels where it equals 0 get probability 0 in every channel.
        device: device to run on ('cpu', 'cuda', or a torch.device). When None,
            'cuda' is used if available, otherwise 'cpu'.

    Output:
        probabilities: NumPy array of shape (N, H, W) with the softmax taken over
        the channel axis (dim 0).

    Raises:
        RuntimeError: if `device` is not a valid or available torch device.
    """
    # Honour an explicitly requested device; only auto-select when none was given.
    if device is None:
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
    device = torch.device(device)

    # Convert logits to a float32 tensor on the target device
    logits_t = torch.as_tensor(logits, dtype=torch.float32).to(device)

    # Apply softmax along the channel dimension (dim=0); this allocates a new tensor,
    # so the masking below never writes into the caller's data.
    probabilities = F.softmax(logits_t, dim=0)

    # Apply mask if provided
    if mask is not None:
        invalid = torch.as_tensor(mask).to(device) == 0
        probabilities[:, invalid] = 0

    return probabilities.detach().cpu().numpy()

In [None]:
# Compute per-pixel class probabilities with the NumPy softmax
probabilities = softmax(aligned_logits)
# Invalid pixels carry no prediction: force every class probability there to 0
probabilities[:, mask == 0] = 0
print(probabilities.shape)

In [None]:
# Compute per-pixel class probabilities with the PyTorch softmax (no mask passed in)
probabilities = softmax_pytorch(aligned_logits)
# Invalid pixels carry no prediction: force every class probability there to 0
probabilities[:, mask == 0] = 0
print(probabilities.shape)

In [None]:
# Compute the predicted class for each pixel
predicted_class = probabilities.argmax(axis=0)  # shape (original_height, original_width)

# Define a color map for visualization
def get_color_map(num_classes):
    """Return a (num_classes, 4) uint8 RGBA table sampled from matplotlib's 'tab20' colormap."""
    cmap = plt.get_cmap('tab20', num_classes)
    return (cmap(np.arange(num_classes)) * 255).astype(np.uint8)

# Get a color map for the classes
num_classes = logits_np.shape[0]
color_map = get_color_map(num_classes)

# Create a blank segmentation map (all black)
segmentation_map = np.zeros((*predicted_class.shape, 3), dtype=np.uint8)

# Colourise with one palette lookup instead of one full-image boolean mask per class.
# Only the RGB columns are used; ids outside the palette stay black.
palette = np.ascontiguousarray(color_map[:, :3])
in_palette = predicted_class < num_classes
segmentation_map[in_palette] = palette[predicted_class[in_palette]]

# Black out pixels that lie outside the warped image
segmentation_map[mask == 0, :] = 0

# Convert to image format
segmentation_image = Image.fromarray(segmentation_map)

# Display the original image and segmentation map
plt.figure(figsize=(12, 8))

plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(image)
plt.axis("off")

plt.subplot(1, 2, 2)
plt.title("Segmentation Map")
plt.imshow(segmentation_image)
plt.axis("off")

plt.show()

In [None]:
# Compute the predicted class for each pixel
# NOTE(review): `aligned_seg_map` is the array produced by the synthetic "Example usage"
# cells, not `aligned_logits` — confirm this is the intended input.
predicted_class = aligned_seg_map.argmax(axis=0)  # shape (height, width) of aligned_seg_map

# Define a color map for visualization
def get_color_map(num_classes):
    """Return a (num_classes, 4) uint8 RGBA table sampled from matplotlib's 'tab20' colormap."""
    cmap = plt.get_cmap('tab20', num_classes)
    return (cmap(np.arange(num_classes)) * 255).astype(np.uint8)

# Get a color map for the classes
num_classes = logits_np.shape[0]
color_map = get_color_map(num_classes)

# Create a blank segmentation map (all black)
segmentation_map = np.zeros((*predicted_class.shape, 3), dtype=np.uint8)

# Colourise with one palette lookup. This also avoids rebinding the notebook-level `mask`
# (the validity mask other cells rely on) to a per-class boolean array.
# Only the RGB columns are used; ids outside the palette stay black.
palette = np.ascontiguousarray(color_map[:, :3])
in_palette = predicted_class < num_classes
segmentation_map[in_palette] = palette[predicted_class[in_palette]]

# Convert to image format
segmentation_image = Image.fromarray(segmentation_map)

# Display the original image and segmentation map
plt.figure(figsize=(12, 8))

plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(image)
plt.axis("off")

plt.subplot(1, 2, 2)
plt.title("Segmentation Map")
plt.imshow(segmentation_image)
plt.axis("off")

plt.show()

In [None]:
# Inspect the shape of the resized logits array.
logits_np.shape

In [None]:
# Example usage on synthetic data
if __name__ == "__main__":
    # Random 150-channel map with H=150, W=200
    seg_map = np.random.randint(0, 10, size=(150, 150, 200), dtype=np.uint8)

    # Camera attitude, both angles in radians
    roll, pitch = 0.1, 0.2

    # Warp every channel into the gravity-aligned view
    aligned_seg_map = gravity_align_segmentation(seg_map, r=roll, p=pitch)

# Overwrites the example `roll` / `pitch` values above with the reference Euler angles.
roll = ref_euler_angles[0]
pitch = ref_euler_angles[1]
# NOTE(review): `ref_img` and `self` are not defined in the visible notebook cells; this line
# looks pasted from a class method and will fail at notebook level until both are supplied.
ref_img = gravity_align(ref_img, r=pitch, p=-(roll+np.pi/2),  mode=1, K=self.K)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
from PIL import Image

# Load the image processor and model
image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

# Path to the image
image_path_stem = "/cluster/home/wueestm/f3loc/metric3d/data/hge_customized_complete/non-aligned/rgb/"
image_path = image_path_stem + "00120-0.jpg"

# Load the image; force three channels so grayscale / RGBA files are handled like RGB ones
image = Image.open(image_path).convert("RGB")

# Preprocess and run the model. Inference only: no_grad() avoids building the autograd graph.
inputs = image_processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)

# Get the logits for the first image in the batch
logits = logits.squeeze(0)  # Remove batch dimension, shape (num_labels, height/4, width/4)

# Get the dimensions of the original image (PIL reports width first)
original_width, original_height = image.size

# Integer upscaling factors (rows, cols); informational only, not used by the resize below
resize_factor = (original_height // logits.shape[1], original_width // logits.shape[2])

# Resize the logits to the original image size
logits_resized = torch.nn.functional.interpolate(
    logits.unsqueeze(0),  # Add batch dimension back for interpolation
    size=(original_height, original_width),
    mode='bilinear',
    align_corners=False
).squeeze(0)  # Remove batch dimension

# Convert the resized logits to a NumPy array
logits_np = logits_resized.detach().cpu().numpy()  # shape (num_labels, original_height, original_width)

# Compute the predicted class for each pixel
predicted_class = logits_np.argmax(axis=0)  # shape (original_height, original_width)

# Manually defined ADE20K label mapping (class id as string -> class name)
ade20k_labels = {
    "0": "wall", "1": "building", "2": "sky", "3": "floor", "4": "tree",
    "5": "ceiling", "6": "road", "7": "bed", "8": "windowpane", "9": "grass",
    "10": "cabinet", "11": "sidewalk", "12": "person", "13": "earth", "14": "door",
    "15": "table", "16": "mountain", "17": "plant", "18": "curtain", "19": "chair",
    "20": "car", "21": "water", "22": "painting", "23": "sofa", "24": "shelf",
    "25": "house", "26": "sea", "27": "mirror", "28": "rug", "29": "field",
    "30": "armchair", "31": "seat", "32": "fence", "33": "desk", "34": "rock",
    "35": "wardrobe", "36": "lamp", "37": "bathtub", "38": "railing", "39": "cushion",
    "40": "base", "41": "box", "42": "column", "43": "signboard", "44": "chest of drawers",
    "45": "counter", "46": "sand", "47": "sink", "48": "skyscraper", "49": "fireplace",
    "50": "refrigerator", "51": "grandstand", "52": "path", "53": "stairs", "54": "runway",
    "55": "case", "56": "pool table", "57": "pillow", "58": "screen door", "59": "stairway",
    "60": "river", "61": "bridge", "62": "bookcase", "63": "blind", "64": "coffee table",
    "65": "toilet", "66": "flower", "67": "book", "68": "hill", "69": "bench",
    "70": "countertop", "71": "stove", "72": "palm", "73": "kitchen island", "74": "computer",
    "75": "swivel chair", "76": "boat", "77": "bar", "78": "arcade machine", "79": "hovel",
    "80": "bus", "81": "towel", "82": "light", "83": "truck", "84": "tower",
    "85": "chandelier", "86": "awning", "87": "streetlight", "88": "booth", "89": "television receiver",
    "90": "airplane", "91": "dirt track", "92": "apparel", "93": "pole", "94": "land",
    "95": "bannister", "96": "escalator", "97": "ottoman", "98": "bottle", "99": "buffet",
    "100": "poster", "101": "stage", "102": "van", "103": "ship", "104": "fountain",
    "105": "conveyer belt", "106": "canopy", "107": "washer", "108": "plaything", "109": "swimming pool",
    "110": "stool", "111": "barrel", "112": "basket", "113": "waterfall", "114": "tent",
    "115": "bag", "116": "minibike", "117": "cradle", "118": "oven", "119": "ball",
    "120": "food", "121": "step", "122": "tank", "123": "trade name", "124": "microwave",
    "125": "pot", "126": "animal", "127": "bicycle", "128": "lake", "129": "dishwasher",
    "130": "screen", "131": "blanket", "132": "sculpture", "133": "hood", "134": "sconce",
    "135": "vase", "136": "traffic light", "137": "tray", "138": "ashcan", "139": "fan",
    "140": "pier", "141": "crt screen", "142": "plate", "143": "monitor", "144": "bulletin board",
    "145": "shower", "146": "radiator", "147": "glass", "148": "clock", "149": "flag"
}

# Classes to keep and display in color; every other class stays black
classes_to_display = [0, 1, 3, 5, 8, 12, 14, 15, 19, 42, 53, 59, 97, 104, 132, 147]
class_names = {i: ade20k_labels[str(i)] for i in classes_to_display}

# Manually specified distinct RGB colors for each displayed class
class_colors = {
    0: [255, 0, 0],        # Red for 'wall'
    1: [0, 255, 0],        # Green for 'building'
    3: [0, 0, 255],        # Blue for 'floor'
    5: [255, 255, 0],      # Yellow for 'ceiling'
    8: [0, 255, 255],      # Cyan for 'windowpane'
    12: [255, 0, 255],     # Magenta for 'person'
    14: [128, 0, 128],     # Purple for 'door'
    15: [128, 128, 0],     # Olive for 'table'
    19: [0, 128, 128],     # Teal for 'chair'
    42: [90, 90, 90],      # Dark gray for 'column'
    53: [192, 192, 192],   # Light gray for 'stairs'
    59: [255, 165, 0],     # Orange for 'stairway'
    97: [255, 105, 180],   # Pink for 'ottoman'
    104: [255, 20, 147],   # Deep pink for 'fountain'
    132: [255, 69, 0],     # Red orange for 'sculpture'
    147: [139, 69, 19]     # Saddle brown for 'glass'
}

# Create a blank segmentation map (all black)
segmentation_map = np.zeros((*predicted_class.shape, 3), dtype=np.uint8)

# Fill in the segmentation map with colors only for the specified classes.
# A dedicated name is used so the notebook-level validity `mask` is not overwritten.
for class_id, color in class_colors.items():
    class_mask = (predicted_class == class_id)
    segmentation_map[class_mask] = color

# Convert to image format
segmentation_image = Image.fromarray(segmentation_map)

# Display the original image and segmentation map
plt.figure(figsize=(12, 8))

plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(image)
plt.axis("off")

plt.subplot(1, 2, 2)
plt.title("Segmentation Map")
plt.imshow(segmentation_image)
plt.axis("off")

# Build the legend from proxy line handles, one per displayed class
legend_elements = [plt.Line2D([0], [0], color=np.array(color) / 255.0, lw=4) for color in class_colors.values()]
plt.legend(legend_elements, [class_names[class_id] for class_id in class_colors.keys()], loc='center left', bbox_to_anchor=(1, 0.5), title="Classes", title_fontsize='13')

plt.tight_layout()
plt.show()


In [None]:
# PIL reports the size as (width, height).
image.size

In [None]:
# Inspect the shape of the current `logits` tensor.
logits.shape

In [None]:
# `logits` is a torch tensor of per-class scores: it has no `.astype`, and PIL cannot build an
# image from a multi-channel float array. Reduce it to a class-index map first; dim=-3 is the
# class axis for both (C, H, W) and (B, C, H, W) layouts.
predicted_class_lowres = logits.argmax(dim=-3).squeeze().cpu().numpy()
# Nearest-neighbour resize keeps the values valid class ids.
predicted_class_resized = Image.fromarray(predicted_class_lowres.astype(np.uint8)).resize(image.size, resample=Image.NEAREST)

In [None]:
import torch
import numpy as np
import matplotlib.pyplot as plt
from transformers import AutoImageProcessor, SegformerForSemanticSegmentation
from PIL import Image

# Load the image processor and model
image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
model = SegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")

# Define the path to the images
image_path_stem = "/cluster/home/wueestm/f3loc/metric3d/data/hge_customized_complete/non-aligned/rgb/"
image_path_ls = [image_path_stem + "00120-0.jpg"]

# The lookup tables below do not depend on the image, so they are built once
# here instead of being rebuilt on every loop iteration.

# Manually define the ADE20K label mapping (class index as string -> class name)
ade20k_labels = {
    "0": "wall", "1": "building", "2": "sky", "3": "floor", "4": "tree",
    "5": "ceiling", "6": "road", "7": "bed", "8": "windowpane", "9": "grass",
    "10": "cabinet", "11": "sidewalk", "12": "person", "13": "earth", "14": "door",
    "15": "table", "16": "mountain", "17": "plant", "18": "curtain", "19": "chair",
    "20": "car", "21": "water", "22": "painting", "23": "sofa", "24": "shelf",
    "25": "house", "26": "sea", "27": "mirror", "28": "rug", "29": "field",
    "30": "armchair", "31": "seat", "32": "fence", "33": "desk", "34": "rock",
    "35": "wardrobe", "36": "lamp", "37": "bathtub", "38": "railing", "39": "cushion",
    "40": "base", "41": "box", "42": "column", "43": "signboard", "44": "chest of drawers",
    "45": "counter", "46": "sand", "47": "sink", "48": "skyscraper", "49": "fireplace",
    "50": "refrigerator", "51": "grandstand", "52": "path", "53": "stairs", "54": "runway",
    "55": "case", "56": "pool table", "57": "pillow", "58": "screen door", "59": "stairway",
    "60": "river", "61": "bridge", "62": "bookcase", "63": "blind", "64": "coffee table",
    "65": "toilet", "66": "flower", "67": "book", "68": "hill", "69": "bench",
    "70": "countertop", "71": "stove", "72": "palm", "73": "kitchen island", "74": "computer",
    "75": "swivel chair", "76": "boat", "77": "bar", "78": "arcade machine", "79": "hovel",
    "80": "bus", "81": "towel", "82": "light", "83": "truck", "84": "tower",
    "85": "chandelier", "86": "awning", "87": "streetlight", "88": "booth", "89": "television receiver",
    "90": "airplane", "91": "dirt track", "92": "apparel", "93": "pole", "94": "land",
    "95": "bannister", "96": "escalator", "97": "ottoman", "98": "bottle", "99": "buffet",
    "100": "poster", "101": "stage", "102": "van", "103": "ship", "104": "fountain",
    "105": "conveyer belt", "106": "canopy", "107": "washer", "108": "plaything", "109": "swimming pool",
    "110": "stool", "111": "barrel", "112": "basket", "113": "waterfall", "114": "tent",
    "115": "bag", "116": "minibike", "117": "cradle", "118": "oven", "119": "ball",
    "120": "food", "121": "step", "122": "tank", "123": "trade name", "124": "microwave",
    "125": "pot", "126": "animal", "127": "bicycle", "128": "lake", "129": "dishwasher",
    "130": "screen", "131": "blanket", "132": "sculpture", "133": "hood", "134": "sconce",
    "135": "vase", "136": "traffic light", "137": "tray", "138": "ashcan", "139": "fan",
    "140": "pier", "141": "crt screen", "142": "plate", "143": "monitor", "144": "bulletin board",
    "145": "shower", "146": "radiator", "147": "glass", "148": "clock", "149": "flag"
}

# Classes to keep and display in color
classes_to_display = [0, 1, 3, 5, 8, 12, 14, 15, 19, 42, 53, 59, 97, 104, 132, 147]
class_names = {i: ade20k_labels[str(i)] for i in classes_to_display}

# Manually specify distinct colors (RGB, 0-255) for each displayed class
class_colors = {
    0: [255, 0, 0],        # Red for 'wall'
    1: [0, 255, 0],        # Green for 'building'
    3: [0, 0, 255],        # Blue for 'floor'
    5: [255, 255, 0],      # Yellow for 'ceiling'
    8: [0, 255, 255],      # Cyan for 'windowpane'
    12: [255, 0, 255],     # Magenta for 'person'
    14: [128, 0, 128],     # Purple for 'door'
    15: [128, 128, 0],     # Olive for 'table'
    19: [0, 128, 128],     # Teal for 'chair'
    42: [90, 90, 90],      # Dark gray for 'column'
    53: [192, 192, 192],   # Light gray for 'stairs'
    59: [255, 165, 0],     # Orange for 'stairway'
    97: [255, 105, 180],   # Pink for 'ottoman'
    104: [255, 20, 147],   # Deep pink for 'fountain'
    132: [255, 69, 0],     # Red orange for 'sculpture'
    147: [139, 69, 19]     # Saddle brown for 'glass'
}

# Legend entries as proxy patches: they are handed to the legend only and are
# never added to an axes, so nothing is drawn on top of the segmentation image.
legend_handles = [
    plt.Rectangle((0, 0), 1, 1, color=np.array(class_colors[class_id]) / 255.0, label=class_name)
    for class_id, class_name in class_names.items()
]

# Load and process each image
for image_path in image_path_ls:
    image = Image.open(image_path)

    # Preprocess the image and forward it through the model.
    # Inference only: no_grad avoids building the autograd graph.
    inputs = image_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits  # shape (batch_size, num_labels, height/4, width/4)

    # Get the predicted class for each pixel (argmax over the class dimension)
    predicted_class = logits.argmax(dim=1).squeeze().cpu().numpy()

    # Resize the predicted class indices to match the original image size.
    # Nearest-neighbour resampling keeps the values valid class indices.
    predicted_class_resized = Image.fromarray(predicted_class.astype(np.uint8)).resize(image.size, resample=Image.NEAREST)
    predicted_class_resized = np.array(predicted_class_resized)

    # Create a blank segmentation map (all black)
    segmentation_map = np.zeros((*predicted_class_resized.shape, 3), dtype=np.uint8)

    # Fill in the segmentation map with colors only for the specified classes;
    # pixels of every other class stay black
    for class_id, color in class_colors.items():
        mask = predicted_class_resized == class_id
        segmentation_map[mask] = color

    # Convert to image format
    segmentation_image = Image.fromarray(segmentation_map)

    # Display the original image, segmentation map, and legend
    plt.figure(figsize=(12, 8))

    plt.subplot(1, 2, 1)
    plt.title("Original Image")
    plt.imshow(image)
    plt.axis("off")

    plt.subplot(1, 2, 2)
    plt.title("Segmentation Map")
    plt.imshow(segmentation_image)
    plt.axis("off")

    # Attach the legend to the segmentation subplot, just outside its right edge
    plt.legend(handles=legend_handles, loc='center left', bbox_to_anchor=(1, 0.5), title="Classes", title_fontsize='13')
    plt.tight_layout()
    plt.show()


In [None]:
# Summarise which classes dominate the prediction stored in `predicted_class`.
# NOTE(review): `Counter` is not imported in the visible part of the notebook;
# confirm an earlier cell provides `from collections import Counter`.

# Flatten the array to count occurrences of each class
flat_predicted_class = predicted_class.flatten()

# Count the occurrences of each class
class_counts = Counter(flat_predicted_class)

# Get the top 5 most common classes
top5_classes = class_counts.most_common(5)

# Print the top 5 most common classes with their names and counts
print("Top 5 most common classes and their counts:")
for class_idx, count in top5_classes:
    # The label table is keyed by strings, so the index is stringified for the
    # lookup; indices without an entry are reported as "Unknown"
    class_name = ade20k_labels.get(str(class_idx), "Unknown")
    print(f"Class '{class_name}' (Index {class_idx}): {count} occurrences")

In [None]:
# Scratch cell: bare integer literals evaluated as expression statements, so
# only the last value (147) becomes the cell output.
# NOTE(review): these look like the class indices chosen for the coloured
# visualisation; index 42 ('column') is not in this list - confirm whether
# that is intentional.
0
1
3
5
8
12
14
15
19
53
59
97
104
132
147

In [None]:
# Manually define the ADE20K label mapping: class index (as a string) -> class name.
# Names carry no leading or trailing whitespace so they print cleanly and compare
# equal to the same names used elsewhere in the notebook.
ade20k_labels = {
    "0": "wall", "1": "building", "2": "sky", "3": "floor", "4": "tree",
    "5": "ceiling", "6": "road", "7": "bed", "8": "windowpane", "9": "grass",
    "10": "cabinet", "11": "sidewalk", "12": "person", "13": "earth", "14": "door",
    "15": "table", "16": "mountain", "17": "plant", "18": "curtain", "19": "chair",
    "20": "car", "21": "water", "22": "painting", "23": "sofa", "24": "shelf",
    "25": "house", "26": "sea", "27": "mirror", "28": "rug", "29": "field",
    "30": "armchair", "31": "seat", "32": "fence", "33": "desk", "34": "rock",
    "35": "wardrobe", "36": "lamp", "37": "bathtub", "38": "railing", "39": "cushion",
    "40": "base", "41": "box", "42": "column", "43": "signboard", "44": "chest of drawers",
    "45": "counter", "46": "sand", "47": "sink", "48": "skyscraper", "49": "fireplace",
    "50": "refrigerator", "51": "grandstand", "52": "path", "53": "stairs", "54": "runway",
    "55": "case", "56": "pool table", "57": "pillow", "58": "screen door", "59": "stairway",
    "60": "river", "61": "bridge", "62": "bookcase", "63": "blind", "64": "coffee table",
    "65": "toilet", "66": "flower", "67": "book", "68": "hill", "69": "bench",
    "70": "countertop", "71": "stove", "72": "palm", "73": "kitchen island", "74": "computer",
    "75": "swivel chair", "76": "boat", "77": "bar", "78": "arcade machine", "79": "hovel",
    "80": "bus", "81": "towel", "82": "light", "83": "truck", "84": "tower",
    "85": "chandelier", "86": "awning", "87": "streetlight", "88": "booth", "89": "television receiver",
    "90": "airplane", "91": "dirt track", "92": "apparel", "93": "pole", "94": "land",
    "95": "bannister", "96": "escalator", "97": "ottoman", "98": "bottle", "99": "buffet",
    "100": "poster", "101": "stage", "102": "van", "103": "ship", "104": "fountain",
    "105": "conveyer belt", "106": "canopy", "107": "washer", "108": "plaything", "109": "swimming pool",
    "110": "stool", "111": "barrel", "112": "basket", "113": "waterfall", "114": "tent",
    "115": "bag", "116": "minibike", "117": "cradle", "118": "oven", "119": "ball",
    "120": "food", "121": "step", "122": "tank", "123": "trade name", "124": "microwave",
    "125": "pot", "126": "animal", "127": "bicycle", "128": "lake", "129": "dishwasher",
    "130": "screen", "131": "blanket", "132": "sculpture", "133": "hood", "134": "sconce",
    "135": "vase", "136": "traffic light", "137": "tray", "138": "ashcan", "139": "fan",
    "140": "pier", "141": "crt screen", "142": "plate", "143": "monitor", "144": "bulletin board",
    "145": "shower", "146": "radiator", "147": "glass", "148": "clock", "149": "flag"
}


In [None]:
# Top-5 class summary for `predicted_class`, looked up in the current `ade20k_labels`.
# NOTE(review): this cell repeats an earlier cell of the notebook verbatim;
# consider keeping only one copy.
# NOTE(review): `Counter` is not imported in the visible part of the notebook;
# confirm an earlier cell provides `from collections import Counter`.

# Flatten the array to count occurrences of each class
flat_predicted_class = predicted_class.flatten()

# Count the occurrences of each class
class_counts = Counter(flat_predicted_class)

# Get the top 5 most common classes
top5_classes = class_counts.most_common(5)

# Print the top 5 most common classes with their names and counts
print("Top 5 most common classes and their counts:")
for class_idx, count in top5_classes:
    # The label table is keyed by strings, so the index is stringified for the
    # lookup; indices without an entry are reported as "Unknown"
    class_name = ade20k_labels.get(str(class_idx), "Unknown")
    print(f"Class '{class_name}' (Index {class_idx}): {count} occurrences")


In [None]:
# Show the raw (class index, count) pairs produced by `most_common(5)`
top5_classes

#### HuggingFace Tutorial: Image Segmentation

In [None]:
from transformers import pipeline
from PIL import Image
import requests


In [None]:
# Alternative input, currently disabled: a frame addressed by absolute path.
#image_path = "/cluster/home/wueestm/f3loc/metric3d/data/hge_customized_complete/non-aligned/rgb/00000-0.jpg"
#image = Image.open(image_path)
# Active input. The path is relative, so it is resolved against the current
# working directory (the first cell of the notebook changes it with os.chdir).
image = Image.open("segmentation_input.jpg")
# Bare last expression so the notebook renders the image as the cell output
image

In [None]:
# Absolute path to one snapshot directory inside the local Hugging Face hub cache;
# the directory name identifies the checkpoint as
# nvidia/segformer-b1-finetuned-cityscapes-1024-1024.
# NOTE(review): presumably a local path is used so the pipeline loads without
# network access; it is user- and machine-specific - confirm before sharing.
model_path = "/cluster/home/wueestm/.cache/huggingface/hub/models--nvidia--segformer-b1-finetuned-cityscapes-1024-1024/snapshots/ec86afeba68e656629ccf47e0c8d2902f964917b"
# Build an image-segmentation pipeline backed by that checkpoint
semantic_segmentation = pipeline("image-segmentation", model=model_path)

In [None]:
# Run the segmentation pipeline on the loaded image
results = semantic_segmentation(image)
# Bare last expression: display the pipeline output as the cell result
results


In [None]:
# Display the "mask" entry of the last element of the pipeline output
results[-1]["mask"]

In [None]:
import time

# Measure inference time.
# perf_counter() is a monotonic, high-resolution clock meant for timing short
# intervals; unlike time.time() it cannot jump when the system clock is adjusted.
start_time = time.perf_counter()
results = semantic_segmentation(image)
end_time = time.perf_counter()

# Elapsed wall-clock time of the single pipeline call, in seconds
inference_time = end_time - start_time
print(f"Inference time: {inference_time:.4f} seconds")

#### OneFormer

In [None]:
from transformers import OneFormerProcessor, OneFormerForUniversalSegmentation
from PIL import Image
import requests
import torch

# load OneFormer fine-tuned on ADE20k for universal segmentation
# NOTE: this rebinds `model` and `image`, which earlier cells of the notebook
# also assign; cells below this point operate on the OneFormer objects.
processor = OneFormerProcessor.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny")
model = OneFormerForUniversalSegmentation.from_pretrained("shi-labs/oneformer_ade20k_swin_tiny")

url = (
    "https://huggingface.co/datasets/hf-internal-testing/fixtures_ade20k/resolve/main/ADE_val_00000001.jpg"
)
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() turns an HTTP error page into a clear exception instead of
# an obscure image-decoding failure.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)



In [None]:
!pip install transformers

In [None]:
# Semantic Segmentation
# Preprocess the image together with the task token "semantic"; the result is
# returned as PyTorch tensors (return_tensors="pt").
inputs = processor(image, ["semantic"], return_tensors="pt")


In [None]:
# Forward pass without gradient tracking (inference only)
with torch.no_grad():
    outputs = model(**inputs)

In [None]:
# model predicts class_queries_logits of shape `(batch_size, num_queries)`
# and masks_queries_logits of shape `(batch_size, num_queries, height, width)`
# NOTE(review): the shape quoted for class_queries_logits has no class
# dimension - verify against the transformers OneFormer output documentation.
# Both tensors are only extracted here; the post-processing below takes `outputs`.
class_queries_logits = outputs.class_queries_logits
masks_queries_logits = outputs.masks_queries_logits

# you can pass them to processor for semantic postprocessing
# image.size is reversed to build the target size (PIL reports width first);
# [0] selects the map of the single image in the batch.
predicted_semantic_map = processor.post_process_semantic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]
# Bare last expression: the formatted string is shown as the cell output
f"👉 Semantic Predictions Shape: {list(predicted_semantic_map.shape)}"

In [None]:
# Largest value present in the predicted semantic map
predicted_semantic_map.max()

In [None]:
# Smallest value present in the predicted semantic map
predicted_semantic_map.min()

In [None]:
# Instance Segmentation
# Same image and model as before, but preprocessed with the "instance" task token.
inputs = processor(image, ["instance"], return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
# Per-query class scores and per-query mask logits of the instance run.
# They are only extracted here; the post-processing below takes `outputs`.
class_queries_logits = outputs.class_queries_logits
masks_queries_logits = outputs.masks_queries_logits

# you can pass them to processor for instance postprocessing
# image.size is reversed to build the target size (PIL reports width first);
# [0] selects the result of the single image in the batch.
predicted_instance_map = processor.post_process_instance_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]["segmentation"]
# print() is required here: a bare string in the middle of a cell is evaluated
# and discarded, only the last expression of a cell is displayed.
print(f"👉 Instance Predictions Shape: {list(predicted_instance_map.shape)}")

# Panoptic Segmentation
# Same flow again with the "panoptic" task token; `inputs`, `outputs` and the
# two logits variables are overwritten by this run.
inputs = processor(image, ["panoptic"], return_tensors="pt")

with torch.no_grad():
    outputs = model(**inputs)
class_queries_logits = outputs.class_queries_logits
masks_queries_logits = outputs.masks_queries_logits

# you can pass them to processor for panoptic postprocessing
predicted_panoptic_map = processor.post_process_panoptic_segmentation(
    outputs, target_sizes=[image.size[::-1]]
)[0]["segmentation"]
print(f"👉 Panoptic Predictions Shape: {list(predicted_panoptic_map.shape)}")