# **General**

In [1]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
!pip install open3d

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-dssp3en4
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-dssp3en4
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=7f00c68dd77e5be257ba5f87dd0bb889bd08c876fa7a03b1d1efdb1461b3539a
  Stored in directory: /tmp/pip-ephem-wheel-cache-g_ap67c5/wheels/da/2b/4c/d6691fa9597aac8bb

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
%cd /content/drive/MyDrive/AML Project: 3D Affordance/Affordance_Highlighting_Project_2024-main

/content/drive/.shortcut-targets-by-id/1YTt044XOkry2sxZchKykdntCTnmMraQo/AML Project: 3D Affordance/Affordance_Highlighting_Project_2024-main


# **Model Functions**

In [None]:

import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision

from itertools import permutations, product
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh

class NeuralHighlighter(nn.Module):
    def __init__(self, depth, width, out_dim, input_dim=3, positional_encoding=True, sigma=5.0):
        """
        Inizializza il modello Neural Highlighter.

        :param depth: Numero di layer della rete neurale.
        :param width: Numero di neuroni in ogni layer.
        :param out_dim: Dimensione dell'output (ad esempio, numero di classi).
        :param input_dim: Dimensione dell'input (default 3, per vertici 3D).
        :param positional_encoding: Se attivato, usa la codifica posizionale.
        :param sigma: Parametro per la trasformazione Fourier (se la codifica posizionale è attiva).
        """
        super(NeuralHighlighter, self).__init__()

        layers = []

        layers.append(nn.Linear(input_dim, width))  # Solo una rete a partire dall'input originale
        layers.append(nn.ReLU())
        layers.append(nn.LayerNorm([width]))

        # Aggiungi i layer successivi, il numero di layer è determinato dal parametro 'depth'
        for _ in range(depth):
            layers.append(nn.Linear(width, width))  # Linear layer
            layers.append(nn.ReLU())  # Funzione di attivazione ReLU
            layers.append(nn.LayerNorm([width]))  # Normalizzazione del layer

        # Layer finale
        layers.append(nn.Linear(width, out_dim))  # Ultimo layer per ottenere l'output
        layers.append(nn.Softmax(dim=1))  # Softmax per normalizzare l'output delle classi

        # Assegna la lista di layer al modulo
        self.mlp = nn.ModuleList(layers)

    def forward(self, x):
        #raise NotImplementedError("Implement the Neural Highlighter Model as described in the project instructions")
        for layer in self.mlp:
            x = layer(x)  # Applica ogni layer del modello
        return x

def get_clip_model(clipmodel):
    #raise NotImplementedError("Load the clip model from the clip module")
    #Load CLIP model
    clip_model, preprocess = clip.load(clipmodel, device)
    return clip_model, preprocess

# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)
        # for renders
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)
        sampled_mesh = mesh

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        colors = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, sampled_mesh, colors)
        rendered_images, _, _ = render.render_views(sampled_mesh, num_views=5,
                                                                        show=False,
                                                                        center_azim=0,
                                                                        center_elev=0,
                                                                        std=4,
                                                                        return_views=True,
                                                                        lighting=True,
                                                                        background=background)
        # for mesh
        final_color = torch.zeros(vertices.shape[0], 3).to(device)
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')



def clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model):
    if n_augs == 0:
        clip_image = clip_transform(rendered_images)
        encoded_renders = clip_model.encode_image(clip_image)
        encoded_renders = encoded_renders / encoded_renders.norm(dim=1, keepdim=True)

        if encoded_text.shape[0] > 1:
            loss = torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                            torch.mean(encoded_text, dim=0), dim=0)
        else:
            loss = torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                            encoded_text)
    elif n_augs > 0:
        loss = 0.0
        for _ in range(n_augs):
            augmented_image = augment_transform(rendered_images)
            encoded_renders = clip_model.encode_image(augmented_image)

            if encoded_text.shape[0] > 1:
                loss -= torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                                torch.mean(encoded_text, dim=0), dim=0)
            else:
                loss -= torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                                encoded_text)
    return 1 + (loss/n_augs)


def save_renders(dir, i, rendered_images, name=None):
    if name is not None:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, name))
    else:
        torchvision.utils.save_image(rendered_images, os.path.join(dir, 'renders/iter_{}.jpg'.format(i)))


In [None]:


# Choose affordance class
affordance_class = 'wrap_grasp'



# HYPERPARAMETERS
# Choose hyper-parameters
LR = 0.0001
augs = 7
views = 5
dep = 4





### Our model Highlithed Mesh

In [None]:
# Constrain most sources of randomness
# (some torch backwards functions within CLIP are non-determinstic)
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


render_res = 224
learning_rate = LR
n_iter = 2500
res = 224
obj='jagermeister'
obj_path = f'/content/drive/MyDrive/AML Project: 3D Affordance/mesh_create/{obj}.obj'
# Output path
output_dir_custom = f'./output_estenzione/{obj}_openable'
n_augs = augs
output_dir = output_dir_custom
clip_model = 'ViT-B/32'

Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
MeshNormalizer(mesh)()

# Initialize variables
background = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir

#parametri NeuralHighlighter
depth = dep
width = 256
n_classes=2
input_dim=3
positional_encoding=True
sigma=5.0

# CLIP and Augmentation Transforms
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
clip_transform = transforms.Compose([
    transforms.Resize((res, res)),
    clip_normalizer
])
augment_transform = transforms.Compose([
              transforms.RandomResizedCrop(res, scale=(1, 1)),
              transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
              clip_normalizer
          ])




# MLP Settings
mlp = NeuralHighlighter(depth, width, n_classes, input_dim, positional_encoding, sigma).to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)


# list of possible colors
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)


# --- Prompt ---
# encode prompt with CLIP
clip_model, preprocess = get_clip_model(clip_model)

# encode prompt with CLIP
prompt = "A 3D render of a gray {} with the {} area highlighted".format('bottle', 'openable')
with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = clip_model.encode_text(prompt_token)
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)


vertices = copy.deepcopy(mesh.vertices)
n_views = views

losses = []

# Optimization loop
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)
    # color and render mesh
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=4,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=background)

    # Calculate CLIP Loss
    loss = clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model)
    loss.backward(retain_graph=True)

    optim.step()

    # update variables + record loss
    with torch.no_grad():
        losses.append(loss.item())

    # report results
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

name='prova'
# save results
save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background)

# Save prompts
with open(os.path.join(output_dir, prompt), "w") as f:
    f.write('')

  0%|          | 1/2500 [00:00<25:03,  1.66it/s]

Last 100 CLIP score: 0.70849609375


  4%|▍         | 101/2500 [00:50<19:20,  2.07it/s]

Last 100 CLIP score: 0.668642578125


  8%|▊         | 201/2500 [01:40<21:49,  1.76it/s]

Last 100 CLIP score: 0.6596044921875


 12%|█▏        | 301/2500 [02:29<17:35,  2.08it/s]

Last 100 CLIP score: 0.654482421875


 16%|█▌        | 401/2500 [03:20<17:23,  2.01it/s]

Last 100 CLIP score: 0.65201171875


 20%|██        | 501/2500 [04:10<20:32,  1.62it/s]

Last 100 CLIP score: 0.6510107421875


 24%|██▍       | 601/2500 [05:00<14:59,  2.11it/s]

Last 100 CLIP score: 0.64943359375


 28%|██▊       | 701/2500 [05:50<14:44,  2.03it/s]

Last 100 CLIP score: 0.648671875


 32%|███▏      | 801/2500 [06:41<15:20,  1.85it/s]

Last 100 CLIP score: 0.6524072265625


 36%|███▌      | 901/2500 [07:31<13:42,  1.94it/s]

Last 100 CLIP score: 0.647802734375


 40%|████      | 1001/2500 [08:21<12:21,  2.02it/s]

Last 100 CLIP score: 0.6480322265625


 44%|████▍     | 1101/2500 [09:11<12:09,  1.92it/s]

Last 100 CLIP score: 0.6478955078125


 48%|████▊     | 1201/2500 [10:01<10:21,  2.09it/s]

Last 100 CLIP score: 0.64623046875


 52%|█████▏    | 1301/2500 [10:52<10:14,  1.95it/s]

Last 100 CLIP score: 0.6502783203125


 56%|█████▌    | 1401/2500 [11:41<09:01,  2.03it/s]

Last 100 CLIP score: 0.646865234375


 60%|██████    | 1501/2500 [12:31<08:03,  2.07it/s]

Last 100 CLIP score: 0.64966796875


 64%|██████▍   | 1601/2500 [13:21<07:46,  1.93it/s]

Last 100 CLIP score: 0.6496337890625


 68%|██████▊   | 1701/2500 [14:10<06:22,  2.09it/s]

Last 100 CLIP score: 0.6466943359375


 72%|███████▏  | 1801/2500 [15:02<05:39,  2.06it/s]

Last 100 CLIP score: 0.6494384765625


 76%|███████▌  | 1901/2500 [15:51<05:29,  1.82it/s]

Last 100 CLIP score: 0.647490234375


 80%|████████  | 2001/2500 [16:40<03:58,  2.10it/s]

Last 100 CLIP score: 0.649814453125


 84%|████████▍ | 2101/2500 [17:30<03:14,  2.05it/s]

Last 100 CLIP score: 0.6502587890625


 88%|████████▊ | 2201/2500 [18:20<02:40,  1.86it/s]

Last 100 CLIP score: 0.64708984375


 92%|█████████▏| 2301/2500 [19:09<01:36,  2.07it/s]

Last 100 CLIP score: 0.646044921875


 96%|█████████▌| 2401/2500 [19:59<00:49,  2.02it/s]

Last 100 CLIP score: 0.648681640625


100%|██████████| 2500/2500 [20:48<00:00,  2.00it/s]
