In [1]:
import os
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
import copy
import time
import cv2 
import numpy as np 
import random

from copy import deepcopy 
from collections import deque, defaultdict
from typing import Any, Dict, List, Optional, Set, Tuple, Union
from numpy.typing import ArrayLike, NDArray
from pydantic import dataclasses, validator

import dm_control
from dm_control import mujoco as dm_mujoco
from dm_control.utils.transformations import mat_to_quat, quat_to_euler
import mujoco
from rocobench.envs import SortOneBlockTask, CabinetTask, MoveRopeTask, SweepTask, MakeSandwichTask, PackGroceryTask, MujocoSimEnv, SimRobot, visualize_voxel_scene
import torch
import psutil
from lavis.models.eva_vit import create_eva_vit_g
from lavis.common.registry import registry
from omegaconf import OmegaConf
from PIL import Image
from tqdm import tqdm
import gc

Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
basic imports
local modules


  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
def print_mem_stats():
    mem = psutil.virtual_memory()
    total_system_memory = mem.total / (1024 ** 2)
    used_system_memory = mem.used / (1024 ** 2)
    total_gpu_memory = torch.cuda.get_device_properties(0).total_memory / (1024 ** 2)
    reserved_gpu_memory = torch.cuda.memory_reserved(0) / (1024 ** 2)
    allocated_gpu_memory = torch.cuda.memory_allocated(0) / (1024 ** 2)
    percent_gpu_total = (allocated_gpu_memory / total_gpu_memory)*100
    percent_gpu_reserved = (reserved_gpu_memory / total_gpu_memory)*100
    percent_cpu_total = (used_system_memory / total_system_memory)*100
    print(f"mem used gpu: {allocated_gpu_memory:.2f} MB, reserved gpu: {reserved_gpu_memory:.2f}MB -> {percent_gpu_total:.2f}% of total, {percent_gpu_reserved:.2f}% reserved")
    print(f"mem used cpu: {used_system_memory:.2f} MB -> {percent_cpu_total:.2f}% of total")

print_mem_stats()

mem used gpu: 0.00 MB, reserved gpu: 0.00MB -> 0.00% of total, 0.00% reserved
mem used cpu: 3774.97 MB -> 7.83% of total


In [2]:
tracked = ['sceneshotcam', 'apple', 'milk', 'cereal', 'bread', 'banana', 'bin', 'ur5e', 'panda']
env = PackGroceryTask(
    render_freq=2000,
    image_hw=(400,400), # Potentially important for getting RGBD images later on
    sim_forward_steps=300, # number of time steps forward that mujoco simulates before deciding that the llms need to pick an easier to optimize plan
    error_freq=30,
    error_threshold=1e-5,
    randomize_init=True,
    render_point_cloud=0, # Potentially useful for speeding up point fusion
    render_cameras=["face_panda","face_ur5e","teaser",],
    point_feature_cameras=tracked,
    one_obj_each=True, # TODO: Understand this
)
# gpu_device = torch.device("cuda")
# cpu_device = torch.device("cpu")

in init loading visual encoder
loading physics
rendering cameras
face_panda


In [4]:
# load visual encoder onto cpu for storage
visual_encoder = create_eva_vit_g(512, precision='fp32').to(cpu_device)
print_mem_stats()

Position interpolate from 16x16 to 36x36


NameError: name 'cpu_device' is not defined

In [None]:
# load 3D-LLM onto cpu for storage
ckpt_path = "checkpoints/pretrain_blip2_sam_flant5xl_v2.pth"
model_cfg = {
    "arch": "blip2_t5",
    "model_type": "pretrain_flant5xl",
    "use_grad_checkpoint": False,
}
model_cfg = OmegaConf.create(model_cfg)
model = registry.get_model_class("blip2_t5").from_pretrained(model_type="pretrain_flant5xl")
checkpoint = torch.load(ckpt_path, map_location="cpu")
model.load_state_dict(checkpoint["model"], strict=False)
model.eval()
processor_cfg = {"name": "blip_question", "prompt": ""}
processor_cfg = OmegaConf.create(processor_cfg)
text_processor = registry.get_processor_class(processor_cfg.name).from_config(processor_cfg)

In [None]:
outputs = env.render_feature_cameras()

In [None]:
test_img = Image.fromarray(outputs['sceneshotcam'][1])
test_img.save("sceneshotcam.png")

In [None]:
SCENE_BOUNDS=((-1.4, -0.2, -0.1), (1.7, 1.2, 1.1))
point_clouds = [
    sensor_output[0].point_cloud.filter_bounds(bounds=SCENE_BOUNDS) 
        for sensor_output in outputs.values()
]
global_point_cloud = sum(point_clouds[1:], start=point_clouds[0])

In [None]:
torch.set_grad_enabled(False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
visual_encoder = create_eva_vit_g(512, precision='fp32').to(device)
print_mem_stats()

In [None]:
cosine_similarity = torch.nn.CosineSimilarity(dim=-1)

# calculate the global feature vector
_, scene_img = outputs["sceneshotcam"]
scene_img = cv2.resize(scene_img, (512, 512))
scene_tensor = torch.tensor(scene_img[:512,:512]).permute(2, 0, 1)
scene_tensor = scene_tensor.unsqueeze(0).float().to(device)
output = visual_encoder(scene_tensor)
global_feat = output
global_feat = global_feat.half().to(device)
global_feat = global_feat.mean(1)
global_feat = torch.nn.functional.normalize(global_feat, dim=-1)
FEAT_DIM = global_feat.shape[-1]
print_mem_stats()

In [None]:
pixelwise_features = torch.zeros(global_point_cloud.xyz_pts.shape[0], FEAT_DIM, dtype=torch.half)
print_mem_stats()

In [None]:
def tensor_memory_size_in_gb(tensor):
    element_size = tensor.element_size()  # Gets size of each element in bytes
    total_elements = tensor.numel()        # Gets total number of elements in the tensor
    memory_size_in_bytes = element_size * total_elements
    memory_size_in_gb = memory_size_in_bytes / (1024 ** 3)  # Convert bytes to gigabytes
    return memory_size_in_gb

print(tensor_memory_size_in_gb(scene_tensor))
print(tensor_memory_size_in_gb(global_feat))

In [None]:
point_feature_cameras = ['sceneshotcam', 'apple', 'milk', 'cereal', 'bread', 'banana', 'bin', 'ur5e', 'panda']
specific_views = list(set(point_feature_cameras) - {"sceneshotcam"})
feat_per_obj = []
obj_sim_per_unit_area = []
for view in tqdm(specific_views):
    # crop the image to the bounding box and run it through the visual encoder to get the feature vector for the object
    _, obj_img = outputs[view]
    roi = torch.ones((512, 512, 3))
    img_roi = torch.tensor(obj_img[:512,:512])
    roi[:img_roi.shape[0], :img_roi.shape[1]] = img_roi
    img_roi = roi.permute(2, 0, 1).unsqueeze(0).to(device)
    roifeat = visual_encoder(img_roi)
    roifeat = roifeat.half().cuda()
    roifeat = roifeat.mean(1)
    roifeat = torch.nn.functional.normalize(roifeat, dim=-1)
    feat_per_obj.append(roifeat)

    # calculate the cosine similarity between the global feature vector and the feature vector for the object and save that as well
    _sim = cosine_similarity(global_feat, roifeat)
    obj_sim_per_unit_area.append(_sim)

    torch.cuda.empty_cache()

print_mem_stats()

In [None]:
scores = torch.cat(obj_sim_per_unit_area).to(device)
feat_per_obj = torch.cat(feat_per_obj, dim=0).to(device)

# get the cosine simixlarity between the features of each object. This will be a square matrix where the (i, j)th entry is the cosine similarity between the ith and jth objects
mask_sim_mat = torch.nn.functional.cosine_similarity(
    feat_per_obj[:, :, None], feat_per_obj.t()[None, :, :]
)
mask_sim_mat.fill_diagonal_(0.0) # set the diagonal to 0 because we don't want to consider the similarity between the same object
mask_sim_mat = mask_sim_mat.mean(1)  # avg sim of each mask with each other mask
softmax_scores = scores.cuda() - mask_sim_mat # subtracting the object-object relevance (which can be thought of as the relevance of the object in context of the other objects) object-scene similarity (which is kind of like global relevance) gives how much more or less important that object is than all the other objects
softmax_scores = torch.nn.functional.softmax(softmax_scores, dim=0) # apply softmax to get the final scores
print_mem_stats()

In [None]:
pixelwise_features = pixelwise_features.to(device)
for objidx in range(len(specific_views)):
    _weighted_feat = (
        softmax_scores[objidx] * global_feat + (1 - softmax_scores[objidx]) * feat_per_obj[objidx]
    )
    _weighted_feat = torch.nn.functional.normalize(_weighted_feat, dim=-1)
    pixelwise_features[global_point_cloud.segmentation_pts[specific_views[objidx]], :] += _weighted_feat
    pixelwise_features[global_point_cloud.segmentation_pts[specific_views[objidx]], :] = torch.nn.functional.normalize(
        pixelwise_features[global_point_cloud.segmentation_pts[specific_views[objidx]], :],
        dim=-1,
    ).half()
print_mem_stats()

In [None]:
outfeat = pixelwise_features.unsqueeze(0).float().to('cpu')  # interpolate is not implemented for float yet in pytorch
xyz_pts = torch.tensor(global_point_cloud.xyz_pts).unsqueeze(0).float()
print_mem_stats()

In [None]:
print_mem_stats()

In [None]:
visual_encoder = None
pixelwise_features = None
mask_sim_mat = None
softmax_scores = None
feat_per_obj = None
scores = None
global_point_cloud = None
outputs = None
env = None
del visual_encoder
del pixelwise_features
del mask_sim_mat
del softmax_scores
del feat_per_obj
del scores
del outputs
del env
del global_point_cloud
gc.collect()
print_mem_stats()

In [None]:
torch.cuda.empty_cache()
print_mem_stats()

In [None]:
num_entries = outfeat.size(1)
num_to_keep = int(num_entries * 0.6)

# Generate a random permutation of indices and select the first 80%
indices = torch.randperm(num_entries)[:num_to_keep]

# Use the selected indices to downsample the tensors
outfeat_downsampled = outfeat[:, indices, :]
xyz_pts_downsampled = xyz_pts[:, indices, :]
print_mem_stats()

In [None]:
outfeat = None
xyz_pts = None
del outfeat
del xyz_pts
gc.collect()
print_mem_stats()

In [None]:
ckpt_path = "checkpoints/pretrain_blip2_sam_flant5xl_v2.pth"
model_cfg = {
    "arch": "blip2_t5",
    "model_type": "pretrain_flant5xl",
    "use_grad_checkpoint": False,
}
model_cfg = OmegaConf.create(model_cfg)
print_mem_stats()

In [None]:
model = registry.get_model_class("blip2_t5").from_pretrained(model_type="pretrain_flant5xl")
print_mem_stats()

In [None]:
checkpoint = torch.load(ckpt_path, map_location="cpu")
print_mem_stats()

In [None]:
model.load_state_dict(checkpoint["model"], strict=False)
print_mem_stats()

In [None]:
model.eval()
print_mem_stats()

In [None]:
processor_cfg = {"name": "blip_question", "prompt": ""}
processor_cfg = OmegaConf.create(processor_cfg)
text_processor = registry.get_processor_class(processor_cfg.name).from_config(processor_cfg)
print_mem_stats()

In [None]:
print(model.device)

In [None]:
prompt = text_processor("What items do you see on the table?")
model_inputs = {"text_input": prompt, "pc_feat": outfeat_downsampled, "pc": xyz_pts_downsampled}
model_outputs = model.predict_answers(
    samples=model_inputs,
    max_len=50,
    length_penalty=1.2,
    repetition_penalty=1.5,
)
model_outputs = model_outputs[0]
print(model_outputs)
print_mem_stats()

In [32]:
model = None
text_processor = None
outfeat_downsampled = None
xyz_pts_downsampled = None
checkpoint = None
del model
del text_processor
del outfeat_downsampled
del xyz_pts_downsampled
del checkpoint
gc.collect()
torch.cuda.empty_cache()
print_mem_stats()

mem used gpu: 21.10 MB, reserved gpu: 42.00MB -> 0.09% of total, 0.19% reserved
mem used cpu: 15368.57 MB -> 31.89% of total
