In [1]:
import h5py
import torch
torch.set_grad_enabled(False)
import open3d as o3d
import open3d.ml as _ml3d
import open3d.ml.torch as ml3d
import open_clip
gpu_device = torch.device('cuda')

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
import pickle
import numpy as np

with open('outputs.pkl', 'rb') as f:
    outputs = pickle.load(f)

In [3]:
import matplotlib.pyplot as plt
from matplotlib.backends.backend_agg import FigureCanvasAgg
from scipy.spatial import KDTree
from tqdm import tqdm
from PIL import Image
import io
import tempfile

def visualize_points(pts, colors):

    # reduce the size of the point cloud
    if pts.shape[0] > 100000:
        idx = np.random.choice(pts.shape[0], 100000, replace=False)
        pts = pts[idx]
        colors = colors[idx]

    # normalize colors
    colors = colors.astype(float) / 255.0

    # remove outliers
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(pts)
    pcd.colors = o3d.utility.Vector3dVector(colors)
    pcd, _ = pcd.remove_statistical_outlier(nb_neighbors=100, std_ratio=2.0)

    # compute camera orientation
    pcd.estimate_normals(
                search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.04, max_nn=30)
            )
    pcd.orient_normals_consistent_tangent_plane(100)
    normals = np.asarray(pcd.normals)
    average_normal = np.mean(np.asarray(normals), axis=0)
    average_normal /= np.linalg.norm(average_normal)

    elevation = np.arcsin(average_normal[2])
    azimuth = np.arctan2(average_normal[1], average_normal[0])

    elevation_deg = np.degrees(elevation)
    azimuth_deg = np.degrees(azimuth)

    # create figure for rendering
    fig = plt.figure(figsize=(6, 6), dpi=160) # 160
    ax = fig.add_subplot(111, projection="3d")

    # Set the camera view
    ax.view_init(elev=elevation_deg, azim=azimuth_deg) # +8.4 + 2.3 # +5.5, -4.4

    filtered_pts = np.asarray(pcd.points)
    filtered_colors = np.asarray(pcd.colors)

    ax.scatter(
        filtered_pts[:, 0],
        filtered_pts[:, 1],
        filtered_pts[:, 2],
        c=filtered_colors,
        s=1,  # type: ignore
    )

    ax.axis('off')
    ax.grid(False)

    plt.savefig("temp.png", transparent=True, format='png', bbox_inches='tight', pad_inches=-0.4)
    plt.close()

    img = Image.open("temp.png")
    img = np.array(img)

    return img

# idx = np.random.choice(outputs['xyz_pts'].shape[0], 500000, replace=False)
# idx = outputs['segmentation_pts']['bin']
# image = visualize_points(outputs['xyz_pts'][idx], outputs['rgb_pts'][idx])

In [4]:
model, _, preprocess = open_clip.create_model_and_transforms("ViT-H-14", "laion2b_s32b_b79k", device=gpu_device)
model.eval()

CLIP(
  (visual): VisionTransformer(
    (conv1): Conv2d(3, 1280, kernel_size=(14, 14), stride=(14, 14), bias=False)
    (patch_dropout): Identity()
    (ln_pre): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
    (transformer): Transformer(
      (resblocks): ModuleList(
        (0-31): 32 x ResidualAttentionBlock(
          (ln_1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (attn): MultiheadAttention(
            (out_proj): NonDynamicallyQuantizableLinear(in_features=1280, out_features=1280, bias=True)
          )
          (ls_1): Identity()
          (ln_2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)
          (mlp): Sequential(
            (c_fc): Linear(in_features=1280, out_features=5120, bias=True)
            (gelu): GELU(approximate='none')
            (c_proj): Linear(in_features=5120, out_features=1280, bias=True)
          )
          (ls_2): Identity()
        )
      )
    )
    (ln_post): LayerNorm((1280,), eps=1e-05, elementwi

In [5]:
import cv2

# get the scene overview
scene_img = visualize_points(outputs['xyz_pts'], outputs['rgb_pts'])
scene_img = cv2.resize(scene_img, (512, 512))
_scene_img = preprocess(Image.fromarray(scene_img)).unsqueeze(0).cuda().float()
imgfeat = model.encode_image(_scene_img)
global_feat = imgfeat.half().cuda()
global_feat = torch.nn.functional.normalize(global_feat, dim=-1)

FEAT_DIM = global_feat.shape[-1]

In [6]:
# split the points up to save memory

segmented = {}

max_points = 5000000
starting_points = outputs['xyz_pts'].shape[0]

discarded = []

for key in outputs['segmentation_pts']:

    object_indices = np.where(outputs['segmentation_pts'][key] == True)[0]
    num_to_keep = int((object_indices.shape[0] / starting_points) * max_points)

    indices_to_keep = np.random.choice(object_indices.shape[0], num_to_keep, replace=False)
    reduced_object_indices = object_indices[indices_to_keep]

    segmented[key] = {
        "points": outputs['xyz_pts'][reduced_object_indices],
        "colors": outputs['rgb_pts'][reduced_object_indices],
    }

    discarded.extend(object_indices[~np.isin(object_indices, reduced_object_indices)])


# handle the unsegmented points
outputs['xyz_pts'] = np.delete(outputs['xyz_pts'], discarded, axis=0)
outputs['rgb_pts'] = np.delete(outputs['rgb_pts'], discarded, axis=0)

num_to_keep = int((outputs['xyz_pts'].shape[0] / starting_points) * max_points)
indices_to_keep = np.random.choice(outputs['xyz_pts'].shape[0], num_to_keep, replace=False)

segmented['unsegmented'] = {
    "points": outputs['xyz_pts'][indices_to_keep],
    "colors": outputs['rgb_pts'][indices_to_keep],
}

In [7]:
# get the per-object features
cosine_similarity = torch.nn.CosineSimilarity(dim=-1)
feat_per_obj = []
obj_sim_per_unit_area = []
views = []
for key in segmented:
    obj_img = visualize_points(segmented[key]['points'], segmented[key]['colors'])
    obj_img = cv2.resize(obj_img, (512, 512))
    _obj_img= preprocess(Image.fromarray(obj_img)).unsqueeze(0).cuda().float()
    obj_feat = model.encode_image(_obj_img).half().cuda()
    feat_per_obj.append(obj_feat)

    # calculate the cosine similarity between the global feature vector and the feature vector for the object and save that as well
    _sim = cosine_similarity(global_feat, obj_feat)
    obj_sim_per_unit_area.append(_sim)

    torch.cuda.empty_cache()

    # save the key so we know the order in which these were processed
    views.append(key)


In [8]:
# Compute the inter-object similarity to determine the relevance of each

scores = torch.cat(obj_sim_per_unit_area).to(gpu_device)
feat_per_obj = torch.cat(feat_per_obj, dim=0).to(gpu_device)

# get the cosine simixlarity between the features of each object. This will be a square matrix where the (i, j)th entry is the cosine similarity between the ith and jth objects
mask_sim_mat = torch.nn.functional.cosine_similarity(
    feat_per_obj[:, :, None], feat_per_obj.t()[None, :, :]
)
mask_sim_mat_untouched = mask_sim_mat.clone()
mask_sim_mat.fill_diagonal_(0.0) # set the diagonal to 0 because we don't want to consider the similarity between the same object
mask_sim_mat = mask_sim_mat.mean(1)  # avg sim of each mask with each other mask
softmax_scores = scores.cuda() - mask_sim_mat # subtracting the object-object relevance (which can be thought of as the relevance of the object in context of the other objects) object-scene similarity (which is kind of like global relevance) gives how much more or less important that object is than all the other objects
softmax_scores = torch.nn.functional.softmax(softmax_scores, dim=0) # apply softmax to get the final scores

In [9]:
# obtain pixel aligned features

for objidx in range(len(views)):
    _weighted_feat = (
        softmax_scores[objidx] * global_feat + (1 - softmax_scores[objidx]) * feat_per_obj[objidx]
    )
    _weighted_feat = torch.nn.functional.normalize(_weighted_feat, dim=-1).half().cpu().numpy()
    repeated = np.tile(_weighted_feat, (segmented[views[objidx]]['points'].shape[0], 1))
    segmented[views[objidx]]['features'] = repeated
    
# stack up the point cloud
points = np.concatenate([segmented[key]['points'] for key in segmented], axis=0)
colors = np.concatenate([segmented[key]['colors'] for key in segmented], axis=0)
features = np.concatenate([segmented[key]['features'] for key in segmented], axis=0)

In [10]:
print(points.shape)
print(colors.shape)
print(features.shape)
np.save("points.npy", points)
np.save("colors.npy", colors)
np.save("features.npy", features)

(5489779, 3)
(5489779, 3)
(5489779, 1024)


In [13]:
import open3d.ml.torch as ml3d
import torch
torch.set_grad_enabled(False)
import numpy as np

points = np.load("points.npy")
features = np.load("features.npy")

# downsample further
idx = np.random.choice(points.shape[0], 500000, replace=False)
points = points[idx]
features = features[idx]

# voxel aggregation to blend features
points = torch.tensor(points).float().cpu()
features = torch.tensor(features).float().cpu()

i = 0
agg_rate = 0.005
while points.shape[0] > 20000:
    points, features = ml3d.ops.voxel_pooling(points, features, agg_rate, position_fn='nearest_neighbor', feature_fn='nearest_neighbor')
    i += 1

    print(f"Completed voxel aggregation with {points.shape[0]} points")

    agg_rate += 0.005

print(f"Completed voxel aggregation with {points.shape[0]} points")

# save the point cloud
torch.save(features, "clip_feaures.pt")
torch.save(points, "xyz_pts.pt")

Completed voxel aggregation with 55810 points
Completed voxel aggregation with 19208 points
Completed voxel aggregation with 19208 points
