In [None]:
!pip install git+https://github.com/openai/CLIP.git
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-0dga7hza
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-0dga7hza
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=fd3ba5a713b1a34742c1b914f3e8c80464f88622745754c36bb20bef259f6108
  Stored in directory: /tmp/pip-ephem-wheel-cache-xpb2kzch/wheels/da/2b/4c/d6691fa9597aac8bb

In [None]:
# Mount Google Drive inside the Colab runtime (force_remount=True re-mounts even
# if a mount already exists).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)



# Switch the working directory to the project folder on Drive.
# Presumably required so that the project-local imports (mesh, render, utils,
# Normalization) and relative paths such as 'data/horse.obj' resolve — confirm.
%cd /content/drive/MyDrive/Affordance_Highlighting_Project_2024-main


Mounted at /content/drive
/content/drive/.shortcut-targets-by-id/1OpRkGfgt1qt8itV_2dMAkMw82cSVCok4/Affordance_Highlighting_Project_2024-main


In [None]:

import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision

from itertools import permutations, product
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh

class NeuralHighlighter(nn.Module):
    """MLP that maps each input point to a probability distribution over classes.

    Layout: an input stage ``Linear(input_dim, width) -> ReLU -> LayerNorm``,
    then ``depth`` further ``Linear(width, width) -> ReLU -> LayerNorm`` stages,
    and finally ``Linear(width, out_dim)`` followed by a softmax over dimension 1.
    """

    def __init__(self, depth, width, out_dim, input_dim=3, positional_encoding=True, sigma=5.0):
        """
        Build the layer stack.

        :param depth: Number of hidden stages added after the input stage.
        :param width: Number of features in every hidden layer.
        :param out_dim: Size of the output (e.g. number of classes).
        :param input_dim: Size of each input point (default 3, for 3D vertices).
        :param positional_encoding: Accepted for interface compatibility but
            currently ignored: no Fourier-feature stage is built, the network
            always starts directly from the raw input.
        :param sigma: Scale intended for the Fourier-feature transform;
            currently ignored for the same reason.
        """
        super().__init__()

        def stage(in_features):
            # One stage: affine map, ReLU non-linearity, then layer normalisation.
            return [nn.Linear(in_features, width), nn.ReLU(), nn.LayerNorm([width])]

        # Input stage operating on the raw coordinates.
        stack = stage(input_dim)

        # 'depth' additional hidden stages of constant width.
        for _ in range(depth):
            stack.extend(stage(width))

        # Output head: project to the class scores and normalise them with softmax.
        stack.append(nn.Linear(width, out_dim))
        stack.append(nn.Softmax(dim=1))

        # Register the layers as sub-modules.
        self.mlp = nn.ModuleList(stack)

    def forward(self, x):
        """Apply every layer in order to ``x`` and return the softmax output."""
        out = x
        for module in self.mlp:
            out = module(out)
        return out

def get_clip_model(clipmodel):
    """Load the CLIP model named ``clipmodel`` onto the project-wide ``device``.

    :param clipmodel: Model name understood by ``clip.load`` (e.g. 'ViT-B/32').
    :return: The ``(model, preprocess)`` pair produced by ``clip.load``.
    """
    model, image_preprocess = clip.load(clipmodel, device=device)
    return model, image_preprocess

# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Render and export the final hard (argmax) highlight predicted by ``mlp``.

    Writes ``<log_dir>/<name>.ply`` (mesh with per-vertex colours) and
    ``<log_dir>/final_render.jpg`` (5 rendered views).

    :param log_dir: Directory that receives both output files.
    :param name: Base name, without extension, of the exported PLY file.
    :param mesh: Mesh to colour, render and export. It is passed to ``color_mesh``,
        which presumably recolours it in place — confirm against utils.
    :param mlp: Trained highlighter; it is switched to eval mode and left there.
    :param vertices: Per-vertex input fed to ``mlp``.
    :param colors: Unused. Kept for call compatibility; the highlight/gray palette
        is rebuilt locally below.
    :param render: Renderer object exposing ``render_views``.
    :param background: Background colour forwarded to the renderer.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        # Index of the most likely class for every vertex, kept as a column vector.
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # For the renders: replace the soft probabilities with a one-hot assignment.
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)

        # Class 0 is the highlight colour, class 1 is gray.
        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        palette = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=4,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # For the exported mesh: per-vertex colour on the 0-255 integer scale.
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


def clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model):
    """Return ``1 - cosine similarity`` between the rendered views and the text prompt.

    The image embeddings of all views are averaged before being compared with the
    text embedding (itself averaged when several prompts are given).

    :param n_augs: Number of augmented passes. ``0`` means a single pass through
        ``clip_transform``; a positive value averages the similarity over that
        many passes through ``augment_transform``.
    :param rendered_images: Batch of rendered views fed to the transforms.
    :param encoded_text: Text embedding(s), one row per prompt.
    :param clip_transform: Transform used only when ``n_augs == 0``.
    :param augment_transform: Transform used only when ``n_augs > 0``.
    :param clip_model: Object exposing ``encode_image``.
    :return: Loss tensor; lower means the renders match the prompt better.
    :raises ValueError: If ``n_augs`` is negative.
    """
    if n_augs < 0:
        raise ValueError(f"n_augs must be >= 0, got {n_augs}")

    def view_similarity(encoded_renders):
        # Cosine similarity between the view-averaged image embedding and the text.
        if encoded_text.shape[0] > 1:
            return torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                           torch.mean(encoded_text, dim=0), dim=0)
        return torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                       encoded_text)

    if n_augs == 0:
        clip_image = clip_transform(rendered_images)
        encoded_renders = clip_model.encode_image(clip_image)
        encoded_renders = encoded_renders / encoded_renders.norm(dim=1, keepdim=True)
        # Single pass: no averaging over augmentations, hence no division by n_augs
        # (which would be a division by zero here).
        return 1 - view_similarity(encoded_renders)

    # Accumulate the negated similarity over the augmented passes.
    # Note: embeddings are deliberately not normalised before averaging in this
    # path, matching the existing training behaviour.
    loss = 0.0
    for _ in range(n_augs):
        augmented_image = augment_transform(rendered_images)
        encoded_renders = clip_model.encode_image(augmented_image)
        loss -= view_similarity(encoded_renders)
    return 1 + (loss/n_augs)


def save_renders(dir, i, rendered_images, name=None):
    """Save ``rendered_images`` as a single image file under ``dir``.

    :param dir: Output directory.
    :param i: Iteration index, used only for the default file name.
    :param rendered_images: Image batch handed to ``torchvision.utils.save_image``.
    :param name: Explicit file name; when ``None`` the file is written to
        ``renders/iter_<i>.jpg`` inside ``dir``.
    """
    filename = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    torchvision.utils.save_image(rendered_images, os.path.join(dir, filename))


Warp 1.5.0 initialized:
   CUDA Toolkit 12.6, Driver 12.2
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   Kernel cache:
     /root/.cache/warp/1.5.0


In [None]:
# Constrain most sources of randomness
# (some torch backwards functions within CLIP are non-deterministic)
seed = 0
# Seed torch (CPU and every CUDA device), Python's random module and NumPy.
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
# Turn off cuDNN benchmarking and request deterministic cuDNN behaviour.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# ---- Run configuration ----
render_res = 224  # side length passed to Renderer(dim=...)
learning_rate = 0.00001
n_iter = 2500  # number of optimisation steps
res = 224  # side length used by the Resize / RandomResizedCrop transforms below
#res = 336
obj_path = 'data/horse.obj'
#obj_path = 'data/candle.obj'
n_augs = 5  # number of augmented passes per step inside clip_loss
output_dir = './output_horse_hat_LR00001/'
# NOTE(review): this name holds the CLIP model *name* here; it is rebound to the
# loaded model object later, when get_clip_model(clip_model) is called.
clip_model = 'ViT-B/32'

# Create the output directory together with its 'renders' sub-folder.
Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

# NOTE(review): objbase / extension are not used again in the visible code.
objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
# Presumably normalises the mesh in place (the return value is discarded) — confirm.
MeshNormalizer(mesh)()

# Initialize variables
background = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir

# NeuralHighlighter parameters
# NOTE(review): positional_encoding and sigma are passed to the model, but the
# NeuralHighlighter defined in this notebook does not use them (its Fourier-feature
# stage is disabled).
depth = 4
width = 256
n_classes=2
input_dim=3
positional_encoding=True
sigma=5.0

# CLIP and Augmentation Transforms
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
# Plain path: resize to (res, res) and normalise.
clip_transform = transforms.Compose([
    transforms.Resize((res, res)),
    clip_normalizer
])
# Augmented path: random resized crop and random perspective (p=0.8), then normalise.
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(1, 1)),
    transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
    clip_normalizer
])

# MLP Settings
mlp = NeuralHighlighter(depth, width, n_classes, input_dim, positional_encoding, sigma).to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# list of possible colors
# NOTE(review): rgb_to_color and color_to_rgb are not used again in the visible code;
# only `colors` (built from full_colors) is used later.
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)


# --- Prompt ---
# load the CLIP model; this rebinds clip_model from the model name to the model object
clip_model, preprocess = get_clip_model(clip_model)

# encode prompt with CLIP
prompt = "A 3D render of a gray {} with highlighted {}".format('horse', 'hat')
# The text embedding is a fixed target, so it is computed without gradient tracking.
with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = clip_model.encode_text(prompt_token)
    # Scale each text embedding to unit L2 norm.
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)


# Independent copy of the mesh vertices, used as the network input at every step.
vertices = copy.deepcopy(mesh.vertices)
n_views = 7  # number of views rendered per optimisation step

losses = []  # loss value recorded at every iteration

# Optimization loop
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)

    # color and render mesh
    # (sampled_mesh is an alias of mesh, not a copy)
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=4,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=background)

    # Calculate CLIP Loss
    loss = clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model)
    # NOTE(review): retain_graph=True keeps the autograd graph alive after backward;
    # nothing visible here runs a second backward pass over it — confirm it is needed.
    loss.backward(retain_graph=True)

    optim.step()

    # update variables + record loss
    with torch.no_grad():
        losses.append(loss.item())

    # report results
    # Every 100 iterations: print the mean of (up to) the last 100 recorded losses,
    # save the current renders and append a line to training_info.txt.
    # The value reported as "CLIP score" is the loss returned by clip_loss.
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

# Base name of the exported .ply file.
name='prova'
# save results
save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background)

# Save prompts
# Creates an empty file in output_dir whose file name is the prompt text itself.
with open(os.path.join(output_dir, prompt), "w") as f:
    f.write('')

100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 103MiB/s]
  0%|          | 1/2500 [00:02<1:46:44,  2.56s/it]

Last 100 CLIP score: 0.68701171875


  4%|▍         | 101/2500 [00:40<16:36,  2.41it/s]

Last 100 CLIP score: 0.6596240234375


  8%|▊         | 201/2500 [01:18<16:27,  2.33it/s]

Last 100 CLIP score: 0.65318359375


 12%|█▏        | 301/2500 [01:57<13:51,  2.64it/s]

Last 100 CLIP score: 0.6505517578125


 16%|█▌        | 401/2500 [02:36<13:20,  2.62it/s]

Last 100 CLIP score: 0.6512255859375


 20%|██        | 501/2500 [03:15<12:45,  2.61it/s]

Last 100 CLIP score: 0.6489306640625


 24%|██▍       | 601/2500 [03:53<13:19,  2.38it/s]

Last 100 CLIP score: 0.6489697265625


 28%|██▊       | 701/2500 [04:31<11:04,  2.71it/s]

Last 100 CLIP score: 0.6486572265625


 32%|███▏      | 801/2500 [05:09<10:29,  2.70it/s]

Last 100 CLIP score: 0.6496728515625


 36%|███▌      | 901/2500 [05:47<10:23,  2.57it/s]

Last 100 CLIP score: 0.647734375


 40%|████      | 1001/2500 [06:25<10:26,  2.39it/s]

Last 100 CLIP score: 0.6488427734375


 44%|████▍     | 1101/2500 [07:02<08:39,  2.69it/s]

Last 100 CLIP score: 0.6475


 48%|████▊     | 1201/2500 [07:41<09:11,  2.36it/s]

Last 100 CLIP score: 0.6464111328125


 52%|█████▏    | 1301/2500 [08:19<07:49,  2.55it/s]

Last 100 CLIP score: 0.648486328125


 56%|█████▌    | 1401/2500 [08:57<07:38,  2.40it/s]

Last 100 CLIP score: 0.648447265625


 60%|██████    | 1501/2500 [09:34<06:09,  2.70it/s]

Last 100 CLIP score: 0.6483251953125


 64%|██████▍   | 1601/2500 [10:12<05:29,  2.73it/s]

Last 100 CLIP score: 0.6486376953125


 68%|██████▊   | 1701/2500 [10:50<05:10,  2.57it/s]

Last 100 CLIP score: 0.6483251953125


 72%|███████▏  | 1801/2500 [11:28<04:44,  2.46it/s]

Last 100 CLIP score: 0.6456201171875


 76%|███████▌  | 1901/2500 [12:05<03:39,  2.73it/s]

Last 100 CLIP score: 0.64720703125


 80%|████████  | 2001/2500 [12:43<03:03,  2.71it/s]

Last 100 CLIP score: 0.6466162109375


 84%|████████▍ | 2101/2500 [13:22<02:32,  2.61it/s]

Last 100 CLIP score: 0.649072265625


 88%|████████▊ | 2201/2500 [13:59<02:04,  2.40it/s]

Last 100 CLIP score: 0.647236328125


 92%|█████████▏| 2301/2500 [14:37<01:12,  2.73it/s]

Last 100 CLIP score: 0.6488818359375


 96%|█████████▌| 2401/2500 [15:15<00:36,  2.69it/s]

Last 100 CLIP score: 0.6481982421875


100%|██████████| 2500/2500 [15:53<00:00,  2.62it/s]


In [None]:
# Copy the results folder to the shared Drive directory.
# NOTE(review): the source here is ./output, whereas the training cell writes to
# output_dir = './output_horse_hat_LR00001/'; the destination name also says
# "horse_glasses" while the prompt highlights 'hat' — confirm both paths before running.
!cp -r ./output /content/drive/MyDrive/AMLProject:3DAffordance/outputs_our_code/output_18_12_horse_glasses