In [1]:
import os
import torch
import numpy as np
from glob import glob
from tqdm import tqdm
from os.path import join, exists
import open3d as o3d
import clip
# CLIP checkpoint used to embed the text queries further down. `clip.load`
# returns the model together with a second object, bound to `preprocess`,
# which is not used in the cells shown in this notebook.
CLIP_MODEL_NAME = "ViT-L/14@336px"
model, preprocess = clip.load(CLIP_MODEL_NAME)

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.


In [2]:
# Input files for one pre-processed sample scene, grouped by data variant.
# Every variant needs three files:
#   raw       - the pre-processed 3D scene (.pth)
#   distilled - the distilled 3D features (.npy); these must already exist
#   fused     - the fused 2D features (.pt)
SCENE_PATHS = {
    "original": {
        "raw": "/mnt/project/AT3DCV_Data/Preprocessed_OpenScene/data/scannet_3d/example/scene0000_00_vh_clean_2.pth",
        "distilled": "/mnt/project/AT3DCV_Data/3D_features/scene0000_00_vh_clean_2_openscene_feat_distill.npy",
        "fused": "/mnt/project/AT3DCV_Data/Preprocessed_OpenScene/data/test_whole/scene0000_00_0.pt",
    },
    "augmented": {
        "raw": "/mnt/project/AT3DCV_Data/Preprocessed_OpenScene/data/augmented/scannet_3d/example/scene0000_00_vh_clean_2.pth",
        "distilled": "/mnt/project/AT3DCV_Data/Preprocessed_OpenScene/data/augmented/features_3D/scene0000_00_vh_clean_2_openscene_feat_distill.npy",
        "fused": "/mnt/project/AT3DCV_Data/Preprocessed_OpenScene/data/augmented/fused/scene0000_00_0.pt",
    },
}

# Variant loaded by the rest of the notebook. Switch it here rather than
# assigning both sets of paths one after the other, where the first set is
# silently overwritten and never used.
SCENE_VARIANT = "augmented"

raw_path = SCENE_PATHS[SCENE_VARIANT]["raw"]
distilled_feature_path = SCENE_PATHS[SCENE_VARIANT]["distilled"]
fused_feature_path = SCENE_PATHS[SCENE_VARIANT]["fused"]

In [3]:
# --- original sample ---------------------------------------------------------
# The sample is indexed positionally: entry 0 is used as the point coordinates
# and entry 1 as the per-point colours when the point cloud is built later on.
# NOTE(review): torch.load unpickles the file -- only point it at trusted data.
raw_sample = torch.load(raw_path)
raw_sample_points = raw_sample[0]
raw_sample_colors = raw_sample[1]

# --- fused 2D features -------------------------------------------------------
fused_f_d = torch.load(fused_feature_path)
fused_f = fused_f_d["feat"]

# Indices where "mask_full" is True. Only the trailing dimension of the
# nonzero() result is squeezed, so a mask with a single True entry still gives
# a 1-D index tensor (a bare squeeze() would collapse it to a 0-d scalar and
# the row indexing below would then drop the point axis).
inds_reverse = torch.nonzero(fused_f_d["mask_full"]).squeeze(-1)

# --- distilled 3D features ---------------------------------------------------
distilled = np.load(distilled_feature_path)
# cast to half precision
distilled_f = torch.from_numpy(distilled).half()
# keep only the rows selected by the mask so they line up with the fused 2D features
distilled_f = distilled_f[inds_reverse, :]
# L2-normalise every feature vector
distilled_f = distilled_f / distilled_f.norm(p=2, dim=-1, keepdim=True)

In [5]:
# Text queries to look for in the scene -- edit this list to search for something else.
query = ["table"]


def _embed_query(prompt):
    """Return one L2-normalised CLIP text embedding for `prompt`.

    The prompt is tokenised, moved to the GPU and passed through the text
    encoder of `model`. The encoder output is normalised along its last axis,
    averaged over its first axis, and the average is normalised again.
    """
    tokens = clip.tokenize(prompt).cuda()
    encoded = model.encode_text(tokens)
    encoded = encoded / encoded.norm(dim=-1, keepdim=True)
    pooled = encoded.mean(dim=0)
    return pooled / pooled.norm()


# No gradients are needed for inference. The embeddings are stacked along
# dim=1, so each query ends up as one column of `all_text_embeddings`.
with torch.no_grad():
    all_text_embeddings = torch.stack(
        [_embed_query(category) for category in tqdm(query)],
        dim=1,
    )


100%|█████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:00<00:00,  1.40it/s]


In [6]:
# Device that holds the text embeddings; the similarity products are computed there.
text_device = all_text_embeddings.device

# Per-point similarity of each feature set to every text query. The features
# are normalised first; the small epsilon keeps all-zero rows from dividing by zero.
pred_fusion = (fused_f / (fused_f.norm(dim=-1, keepdim=True) + 1e-5)).half().to(text_device) @ all_text_embeddings

pred_distill = (distilled_f / (distilled_f.norm(dim=-1, keepdim=True) + 1e-5)).half().to(text_device) @ all_text_embeddings

# Ensemble: start from the distilled 3D features ...
feat_ensemble = distilled_f.clone().half()

# ... and switch to the fused 2D feature wherever its best query score beats
# the best distilled score. The scores live on `text_device`, while the feature
# tensors being indexed may live elsewhere (they are loaded without a device
# move), so the mask is brought to the features' device before it is used as an
# index -- a boolean index on a different device than the indexed tensor is an
# error in recent PyTorch versions.
mask_ = (pred_distill.max(dim=-1)[0] < pred_fusion.max(dim=-1)[0]).to(feat_ensemble.device)

feat_ensemble[mask_] = fused_f.to(feat_ensemble.device)[mask_]

# After mixing, feat_ensemble has to be normalised again: the points to
# highlight are picked with a threshold on the similarity, so the scores must be
# comparable with the 2D-only and 3D-only variants. When evaluating with max
# logits and labels instead, this normalisation is not needed.
similarity_matrix = (feat_ensemble / (feat_ensemble.norm(dim=-1, keepdim=True) + 1e-5)).to(text_device) @ all_text_embeddings

In [None]:
# Fraction of the best similarity score a point has to exceed to be highlighted.
# Set higher to increase the certainty (not always correct).
threshold_percentage = 0.9
cap = similarity_matrix.max().item()

# Row (point) index of every entry above the threshold. With as_tuple=True,
# nonzero() returns one 1-D index tensor per dimension, so the result stays a
# 1-D tensor even when exactly one point (or none) passes. The former
# `.squeeze().T[0]` chain collapsed to a 0-d scalar in the single-hit case, and
# `.tolist()` then produced a bare int instead of a list.
found_indices = torch.nonzero(similarity_matrix > cap * threshold_percentage, as_tuple=True)[0]

# Build the point cloud from the points selected by inds_reverse, i.e. the same
# rows the features were restricted to.
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(np.asarray(raw_sample_points[inds_reverse, :]))
pcd.colors = o3d.utility.Vector3dVector(np.asarray(raw_sample_colors[inds_reverse, :]))

# Split the cloud into the matching region (painted red) and everything else.
highlighted = found_indices.tolist()
found_region = pcd.select_by_index(highlighted)
found_region.paint_uniform_color([1.0, 0, 0])
rest = pcd.select_by_index(highlighted, invert=True)
o3d.visualization.draw_geometries([rest, found_region])

In [9]:
# Number of points that passed the similarity threshold.
found_indices.size()

torch.Size([1551])

# experimental

Here we create a zero matrix with the same shape as the distilled features, because the distilled features contain a feature vector for every original point of the raw sample. We then fill this matrix, at the indices given by the 3D–2D correspondence mask, with the 2D fused features to obtain the extended fused feature tensor. Some rows of this extended tensor remain zero, because the corresponding points are missing from the 2D data. However, the base feature tensor of the ensemble comes from the 3D distilled features, so those zero rows are replaced by the distilled features. In this way, only the 3D distilled features are considered for the points that are missing from the 2D data, and we are able to visualize all of the original points of the 3D data.

In [9]:
# --- original sample ---------------------------------------------------------
# Reloaded from disk so this experiment starts from the full, unmasked data.
# Entry 0 is used as the point coordinates and entry 1 as the per-point colours
# when the point cloud is built later on.
# NOTE(review): torch.load unpickles the file -- only point it at trusted data.
raw_sample = torch.load(raw_path)
raw_sample_points = raw_sample[0]
raw_sample_colors = raw_sample[1]

# --- fused 2D features -------------------------------------------------------
fused_f_d = torch.load(fused_feature_path)
fused_f = fused_f_d["feat"]

# Indices where "mask_full" is True. Only the trailing dimension of the
# nonzero() result is squeezed, so a mask with a single True entry still gives
# a 1-D index tensor instead of a 0-d scalar.
inds_reverse = torch.nonzero(fused_f_d["mask_full"]).squeeze(-1)

# --- distilled 3D features ---------------------------------------------------
distilled = np.load(distilled_feature_path)
# cast to half precision; no masking here, every row of the file is kept
distilled_f = torch.from_numpy(distilled).half()
# all-zero container with the same shape/dtype as the distilled features; it is
# filled with the fused 2D features in a later cell
fused_extended = torch.zeros_like(distilled_f)
# L2-normalise every distilled feature vector
distilled_f = distilled_f / distilled_f.norm(p=2, dim=-1, keepdim=True)

In [10]:
# Write the fused 2D features into the rows selected by inds_reverse; every
# other row of fused_extended keeps its zero initialisation.
fused_extended[inds_reverse] = fused_f

In [11]:
# Device that holds the text embeddings; the similarity products are computed there.
text_device = all_text_embeddings.device

# Per-point similarity of each feature set to every text query. The features
# are normalised first; the epsilon keeps the all-zero rows of fused_extended
# from dividing by zero (their similarity comes out as 0).
fused_norm = fused_extended.norm(dim=-1, keepdim=True)
pred_fusion = (fused_extended / (fused_norm + 1e-5)).half().to(text_device) @ all_text_embeddings

pred_distill = (distilled_f / (distilled_f.norm(dim=-1, keepdim=True) + 1e-5)).half().to(text_device) @ all_text_embeddings

# Ensemble: start from the distilled 3D features ...
feat_ensemble = distilled_f.clone().half()

# ... and switch to the fused 2D feature wherever its best query score beats
# the best distilled score. The comparison happens on `text_device`; the result
# is moved to the features' device before it is used as an index, because a
# boolean index on a different device than the indexed tensor is an error in
# recent PyTorch versions.
mask_ = (pred_distill.max(dim=-1)[0] < pred_fusion.max(dim=-1)[0]).to(feat_ensemble.device)

# Rows of fused_extended that were never filled are all-zero and score exactly
# 0, which would "win" against any negative distilled score and overwrite a
# valid distilled feature with zeros. Restrict the switch to rows that actually
# carry a fused feature, so points without one always keep the distilled feature.
has_fused = (fused_norm.squeeze(-1) > 0).to(feat_ensemble.device)
mask_ = mask_ & has_fused

feat_ensemble[mask_] = fused_extended.to(feat_ensemble.device)[mask_]

# Normalise the mixed features again so the thresholded scores are comparable.
similarity_matrix = (feat_ensemble / (feat_ensemble.norm(dim=-1, keepdim=True) + 1e-5)).to(text_device) @ all_text_embeddings

In [12]:
# Rows are points, columns are text queries.
similarity_matrix.size()

torch.Size([81369, 1])

In [13]:
# Fraction of the best similarity score a point has to exceed to be highlighted.
# Set higher to increase the certainty (not always correct).
threshold_percentage = 0.4
cap = similarity_matrix.max().item()

# Row (point) index of every entry above the threshold. With as_tuple=True,
# nonzero() returns one 1-D index tensor per dimension, so the result stays a
# 1-D tensor even when exactly one point (or none) passes. The former
# `.squeeze().T[0]` chain collapsed to a 0-d scalar in the single-hit case, and
# `.tolist()` then produced a bare int instead of a list.
found_indices = torch.nonzero(similarity_matrix > cap * threshold_percentage, as_tuple=True)[0]

# Build the point cloud from all points of the raw sample (no masking here).
pcd = o3d.geometry.PointCloud()
pcd.points = o3d.utility.Vector3dVector(np.asarray(raw_sample_points))
pcd.colors = o3d.utility.Vector3dVector(np.asarray(raw_sample_colors))

# Split the cloud into the matching region (painted red) and everything else.
highlighted = found_indices.tolist()
found_region = pcd.select_by_index(highlighted)
found_region.paint_uniform_color([1.0, 0, 0])
rest = pcd.select_by_index(highlighted, invert=True)
o3d.visualization.draw_geometries([rest, found_region])

In [14]:
# Number of points that passed the similarity threshold on the full scene.
found_indices.size()

torch.Size([5254])