In [2]:
import os
import torch
import h5py
# Set the PyTorch CUDA allocator configuration (expandable segments) through
# its environment variable.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
# Disable gradient tracking globally for everything that follows.
torch.set_grad_enabled(False)
import open_clip
import pickle
# Device handles for the CPU and for CUDA.
cpu_device = torch.device('cpu')
gpu_device = torch.device('cuda')

ModuleNotFoundError: No module named 'h5py'

In [None]:
# Build the OpenCLIP ViT-H-14 model with the laion2b_s32b_b79k weights, keep
# it on the CPU in eval mode, and fetch the tokenizer for the same architecture.

model, _, preprocess = open_clip.create_model_and_transforms(
    "ViT-H-14",
    pretrained="laion2b_s32b_b79k",
)
model.cpu().eval()
tokenizer = open_clip.get_tokenizer("ViT-H-14")

In [3]:
# Load the pickled `outputs` dict. Other cells index it with 'xyz_pts',
# 'rgb_pts' and 'segmentation_pts'; the ground-truth bounding boxes are
# derived from those entries rather than stored in the file directly.
# NOTE(review): pickle.load can execute arbitrary code while deserializing;
# only open files that come from a trusted source.
import pickle

with open('outputs.pkl', 'rb') as f:
    outputs = pickle.load(f)

In [76]:
# Names of the segmented objects for which a ground-truth box is built.
objects = ['apple', 'milk', 'cereal', 'bread', 'banana', 'bin', 'ur5e', 'panda']
# Maps object name -> [(min_x, min_y, min_z), (max_x, max_y, max_z)].
bboxes = {}

# The loop variable is deliberately not called `object`: that would shadow the
# builtin for the rest of the kernel session.
for object_name in objects:
    # Select the points that belong to this object using its segmentation index/mask.
    object_points = outputs['xyz_pts'][outputs['segmentation_pts'][object_name]]
    # Axis-aligned box: per-axis minimum and maximum over columns 0, 1, 2 (x, y, z).
    lower_corner = tuple(object_points[:, axis].min() for axis in range(3))
    upper_corner = tuple(object_points[:, axis].max() for axis in range(3))
    bboxes[object_name] = [lower_corner, upper_corner]

In [91]:
# print(bboxes['apple'])
# bboxes['apple'] = [(-0.7769833, 0.3524605, 0.15017936), (-0.68268924, 0.4545155, 0.2483119)]

# print(bboxes['banana'])
# bboxes['banana'] = [(-0.37267345, 0.36484203, 0.15037148), (-0.13995615, 0.4852845, 0.19382282)]

# print(bboxes['milk'])
# bboxes['milk'] = [(-0.0752123, 0.5250703, 0.15051995), (0.0596368, 0.62811373, 0.34507185)]

# print(bboxes['cereal'])
# bboxes['cereal'] = [(-0.1052123, 0.3250703, 0.15051995), (0.0396368, 0.40811373, 0.34507185)]

# print(bboxes['bin'])
# bboxes['bin'] = [(0.0596368, 0.2750703, 0.15055764), (0.8581135, 0.707657, 0.2649597)]

[(0.0596368, 0.2750703, 0.15055764), (0.8981135, 0.707657, 0.2649597)]


In [92]:
import open3d as o3d
import numpy as np

# Downsample the point cloud by drawing point indices at random without
# replacement. The sample size is capped at the number of available points,
# because np.random.choice raises ValueError when asked for more unique
# indices than exist.
num_points = outputs['xyz_pts'].shape[0]
downsampled_indices = np.random.choice(num_points, min(100000, num_points), replace=False)
downsampled_points = outputs['xyz_pts'][downsampled_indices]
# Colors are divided by 255 (presumably 8-bit RGB -> [0, 1]; confirm against the data source).
downsampled_colors = outputs['rgb_pts'][downsampled_indices] / 255

def create_bounding_box_points_and_lines(bbox, color):
    """Return the eight corner points of an axis-aligned box plus one color per corner.

    Args:
        bbox: ``[min_corner, max_corner]``; each corner is indexed as
            ``[0]``, ``[1]``, ``[2]`` for x, y, z.
        color: color value assigned to every corner point.

    Returns:
        ``(bbox_points, bbox_colors)``, both ``o3d.utility.Vector3dVector``
        with eight entries. Despite the function name, no line geometry is
        produced; only the corner points are returned.
    """
    lo, hi = bbox[0], bbox[1]

    # The four (x, y) corners of the footprint, in the order they are emitted.
    footprint = [
        (lo[0], lo[1]),
        (lo[0], hi[1]),
        (hi[0], hi[1]),
        (hi[0], lo[1]),
    ]

    # Bottom face (min z) first, then top face (max z).
    corners = np.array([[x, y, z] for z in (lo[2], hi[2]) for x, y in footprint])

    # Wrap the corners and a matching list of colors as Open3D vectors.
    bbox_points = o3d.utility.Vector3dVector(corners)
    bbox_colors = o3d.utility.Vector3dVector([color] * corners.shape[0])

    return bbox_points, bbox_colors

# Corner points of the 'bin' ground-truth box, colored (1, 1, 0).
gt_bbox_points, gt_bbox_colors = create_bounding_box_points_and_lines(
    bboxes['bin'],
    color=[1, 1, 0],
)

# Append the box corners (and their colors) to the downsampled cloud.
combined_points = np.concatenate(
    [np.asarray(downsampled_points), np.asarray(gt_bbox_points)],
    axis=0,
)
combined_colors = np.concatenate(
    [np.asarray(downsampled_colors), np.asarray(gt_bbox_colors)],
    axis=0,
)

point_cloud = o3d.geometry.PointCloud()
point_cloud.points = o3d.utility.Vector3dVector(combined_points)
point_cloud.colors = o3d.utility.Vector3dVector(combined_colors)

# Write the merged cloud to disk as a .ply file.
o3d.io.write_point_cloud("bbox_test.ply", point_cloud)

True

In [None]:
# Read the saved map from disk: point coordinates and per-point embeddings.

with h5py.File("/home/owenburns88/concept-fusion/examples/saved-map/pointclouds/pc_points.h5", "r") as points_file:
    xyz_pts = torch.from_numpy(points_file["pc_points"][:])

with h5py.File("/home/owenburns88/concept-fusion/examples/saved-map/pointclouds/pc_embeddings.h5", "r") as embeddings_file:
    pixelwise_features = torch.from_numpy(embeddings_file["pc_embeddings"][:])

# with h5py.File("/home/owenburns88/concept-fusion/examples/saved-map/pointclouds/pc_colors.h5", "r") as f:
#     rgb_pts = torch.from_numpy(f["pc_colors"][:]).float().numpy() / 255.0

# Normalize the embeddings along dim 1, then make sure the result lives on the CPU.
map_embeddings_norm = torch.nn.functional.normalize(pixelwise_features, dim=1)
map_embeddings_norm = map_embeddings_norm.cpu()

In [None]:
# Tokenize the text query and encode it with the CLIP text tower.

text = tokenizer(["red object"])
# Normalize the text embedding along its last dim, then add a leading axis
# with unsqueeze(0).
textfeat = torch.nn.functional.normalize(
    model.encode_text(text.cpu()),
    dim=-1,
).unsqueeze(0)

In [None]:
# Similarity search between the map embeddings and the text prompt.

cosine_similarity = torch.nn.CosineSimilarity(dim=-1).to('cpu')

# Cosine similarity of every map embedding against the prompt embedding.
similarity = cosine_similarity(map_embeddings_norm, textfeat)

# Cosine similarity is normally in [-1, 1]; shift it into [0, 1] so it can
# serve as a colormap weight.
similarity_shifted = (similarity + 1.0) / 2.0

# Min-max rescale to [0, 1]; the 1e-12 term keeps the divisor non-zero when
# all values are equal.
lowest = similarity_shifted.min()
value_range = similarity_shifted.max() - lowest + 1e-12
similarity_rel = (similarity_shifted - lowest) / value_range

# Copy with every weight below 0.6 set to zero; similarity_rel itself is untouched.
similarity_rel_thresholded = similarity_rel.masked_fill(similarity_rel < 0.6, 0.0)

In [None]:
# compute the IoU score

# Ground-truth box the prediction is scored against.
bounding_box = bboxes['apple']

# Keep the map points whose thresholded similarity is at least 0.6
# (row 0 of the similarity tensor).
selected_mask = similarity_rel_thresholded[0, :] >= 0.6
model_chosen_points = xyz_pts[selected_mask]

# Axis-aligned box around the kept points: per-axis minimum and maximum over
# columns 0, 1, 2.
model_bounding_box = [
    tuple(model_chosen_points[:, axis].min() for axis in range(3)),
    tuple(model_chosen_points[:, axis].max() for axis in range(3)),
]

def iou(boxA, boxB):
    """Intersection-over-union of two axis-aligned boxes.

    Each box is ``[min_corner, max_corner]``, where both corners are
    sequences of per-axis coordinates. Every axis the two boxes have in
    common is taken into account, so 2D boxes yield an area ratio and 3D
    boxes yield a volume ratio. Looking only at x and y would make 3D boxes
    that differ solely in z appear identical.

    Args:
        boxA: first box, ``[(min_0, min_1, ...), (max_0, max_1, ...)]``.
        boxB: second box, same layout.

    Returns:
        The intersection measure divided by the union measure. ``0.0`` is
        returned when the boxes have no axes or the union has no extent,
        instead of dividing by zero.
    """
    # Local imports keep this definition self-contained within its cell.
    from functools import reduce
    from operator import mul

    num_axes = min(len(boxA[0]), len(boxA[1]), len(boxB[0]), len(boxB[1]))
    if num_axes == 0:
        return 0.0

    overlaps = []
    extents_a = []
    extents_b = []
    for axis in range(num_axes):
        # Bounds of the intersection along this axis.
        lower = max(boxA[0][axis], boxB[0][axis])
        upper = min(boxA[1][axis], boxB[1][axis])
        # A negative span means the boxes do not overlap on this axis.
        overlaps.append(max(0, upper - lower))
        extents_a.append(boxA[1][axis] - boxA[0][axis])
        extents_b.append(boxB[1][axis] - boxB[0][axis])

    # Area (2D) / volume (3D) of the intersection and of each box.
    inter = reduce(mul, overlaps)
    size_a = reduce(mul, extents_a)
    size_b = reduce(mul, extents_b)

    # Union = A + B - intersection.
    union = float(size_a + size_b - inter)
    if union <= 0.0:
        return 0.0
    return inter / union

# Score the predicted box against the ground-truth box and print the result.
iou_score = iou(bounding_box, model_bounding_box)
print(iou_score)