<a href="https://colab.research.google.com/github/OmdenaAI/IECO/blob/seife/LAM3D.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%cd /content/drive/MyDrive/Colab Notebooks/

/content/drive/MyDrive/Colab Notebooks


In [3]:
!mkdir LAM3d

mkdir: cannot create directory ‘LAM3d’: File exists


In [4]:
%cd LAM3d

/content/drive/MyDrive/Colab Notebooks/LAM3d


In [6]:
from IPython.display import clear_output
!pip install trimesh PyMCubes
!pip install transformers
clear_output()

In [8]:
from huggingface_hub import notebook_login
notebook_login()


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [9]:
import torch
import numpy as np
import cv2
import json
import torch.nn as nn
from torchvision import transforms
import trimesh
import mcubes
import transformers
  # Replaced mcubes with pymcubes
from transformers import AutoFeatureExtractor, Dinov2Model as DINOModel


In [10]:
# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class PointCloudTransformer(nn.Module):
    def __init__(self, input_dim=3, latent_dim=256):
        super(PointCloudTransformer, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 256)
        self.fc3 = nn.Linear(256, latent_dim)
        self.relu = nn.ReLU()

    def forward(self, point_cloud):
        B, N, _ = point_cloud.shape
        point_cloud_flat = point_cloud.view(B * N, -1)
        x = self.relu(self.fc1(point_cloud_flat))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        tri_planes = x.view(B, N, -1)
        return tri_planes

class TriPlaneDecoder(nn.Module):
    def __init__(self, latent_dim=256, output_dim=1):
        super(TriPlaneDecoder, self).__init__()
        self.fc1 = nn.Linear(latent_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, output_dim)
        self.relu = nn.ReLU()

    def forward(self, tri_plane):
        x = self.relu(self.fc1(tri_plane))
        x = self.relu(self.fc2(x))
        sdf_values = self.fc3(x)
        return sdf_values

class DiffusionModel(nn.Module):
    def __init__(self, latent_dim=256, timesteps=1000, img_feature_dim=768):
        super(DiffusionModel, self).__init__()
        self.timesteps = timesteps
        self.fc1 = nn.Linear(latent_dim + img_feature_dim, 128)
        self.fc2 = nn.Linear(128, latent_dim)
        self.relu = nn.ReLU()

    def forward(self, x, img_features, t):
        img_features = img_features.repeat_interleave(x.shape[1] // img_features.shape[1], dim=1)
        diff = x.shape[1] - img_features.shape[1]
        if diff > 0:
            padding = torch.zeros(img_features.shape[0], diff, img_features.shape[2], device=img_features.device)
            img_features = torch.cat([img_features, padding], dim=1)
        elif diff < 0:
            img_features = img_features[:, :x.shape[1], :]

        x = torch.cat([x, img_features], dim=-1)
        x = self.relu(self.fc1(x))
        x = self.fc2(x)
        return x

    def sample_noise(self, shape):
        return torch.randn(shape, device=device)

    def forward_diffusion(self, x0, t):
        alpha = 1 - 0.02 * t / self.timesteps
        noise = self.sample_noise(x0.shape)
        xt = alpha * x0 + (1 - alpha) * noise
        return xt, noise

In [None]:
def load_image(image_path):
    image = cv2.imread(image_path)
    image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    return image

def load_point_cloud(json_path):
    with open(json_path, 'r') as f:
        point_cloud = np.array(json.load(f))
    return point_cloud

def extract_image_features(image, feature_extractor, model):
    transform = transforms.Compose([
        transforms.ToPILImage(),
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
    ])
    image_tensor = transform(image).unsqueeze(0).to(device)
    with torch.no_grad():
        features = model(image_tensor).last_hidden_state
    return features

def process_point_cloud(point_cloud, transformer_model):
    point_cloud_tensor = torch.tensor(point_cloud, dtype=torch.float32).unsqueeze(0).to(device)
    latent_tri_plane = transformer_model(point_cloud_tensor)
    return latent_tri_plane

def align_image_point_cloud(image_features, latent_tri_plane, diffusion_model, timesteps=1000):
    aligned_features = latent_tri_plane.clone()
    for t in range(timesteps, 0, -1):
        noisy_latent, noise = diffusion_model.forward_diffusion(aligned_features, t)
        denoised_latent = diffusion_model(noisy_latent, image_features, t)
        aligned_features = denoised_latent - noise
    return aligned_features

def reconstruct_mesh(aligned_features, decoder):
    sdf_values = decoder(aligned_features)
    sdf_values = sdf_values.view(-1, sdf_values.shape[-1])
    vertices, triangles = mcubes.marching_cubes(sdf_values.detach().cpu().numpy(), 0)
    mesh = trimesh.Trimesh(vertices=vertices, faces=triangles)
    return mesh

def save_mesh(mesh, output_path):
    mesh.export(output_path)

def main(image_path, json_path, output_mesh_path):
    image = load_image(image_path)
    point_cloud = load_point_cloud(json_path)

    feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/dino-vitb16")
    dino_model = DINOModel.from_pretrained("facebook/dino-vitb16").to(device)

    point_cloud_transformer = PointCloudTransformer().to(device)
    tri_plane_decoder = TriPlaneDecoder().to(device)
    diffusion_model = DiffusionModel().to(device)

    image_features = extract_image_features(image, feature_extractor, dino_model)
    latent_tri_plane = process_point_cloud(point_cloud, point_cloud_transformer)
    aligned_features = align_image_point_cloud(image_features, latent_tri_plane, diffusion_model)
    mesh = reconstruct_mesh(aligned_features, tri_plane_decoder)

    save_mesh(mesh, output_mesh_path)
    print(f"Mesh saved to {output_mesh_path}")

if __name__ == "__main__":
    image_path = "/content/drive/MyDrive/Colab Notebooks/ee3cf4dc3889801f14774a529a7ff2c02251a6d5/google.jpg"
    json_path = "/content/drive/MyDrive/Colab Notebooks/ee3cf4dc3889801f14774a529a7ff2c02251a6d5/dsm.json"
    output_mesh_path = "/content/drive/MyDrive/Colab Notebooks/ee3cf4dc3889801f14774a529a7ff2c02251a6d5/output/mesh.obj"

    main(image_path, json_path, output_mesh_path)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/244 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

You are using a model of type vit to instantiate a model of type dinov2. This is not supported for all configurations of models and can yield errors.


pytorch_model.bin:   0%|          | 0.00/343M [00:00<?, ?B/s]

Some weights of Dinov2Model were not initialized from the model checkpoint at facebook/dino-vitb16 and are newly initialized: ['embeddings.mask_token', 'encoder.layer.0.layer_scale1.lambda1', 'encoder.layer.0.layer_scale2.lambda1', 'encoder.layer.0.mlp.fc1.bias', 'encoder.layer.0.mlp.fc1.weight', 'encoder.layer.0.mlp.fc2.bias', 'encoder.layer.0.mlp.fc2.weight', 'encoder.layer.0.norm1.bias', 'encoder.layer.0.norm1.weight', 'encoder.layer.0.norm2.bias', 'encoder.layer.0.norm2.weight', 'encoder.layer.1.layer_scale1.lambda1', 'encoder.layer.1.layer_scale2.lambda1', 'encoder.layer.1.mlp.fc1.bias', 'encoder.layer.1.mlp.fc1.weight', 'encoder.layer.1.mlp.fc2.bias', 'encoder.layer.1.mlp.fc2.weight', 'encoder.layer.1.norm1.bias', 'encoder.layer.1.norm1.weight', 'encoder.layer.1.norm2.bias', 'encoder.layer.1.norm2.weight', 'encoder.layer.10.layer_scale1.lambda1', 'encoder.layer.10.layer_scale2.lambda1', 'encoder.layer.10.mlp.fc1.bias', 'encoder.layer.10.mlp.fc1.weight', 'encoder.layer.10.mlp.fc2.