# **General**

In [1]:
# Install the third-party dependencies used by the cells below.
# CLIP is installed directly from the OpenAI GitHub repository (no version pin).
!pip install git+https://github.com/openai/CLIP.git
# Kaolin 0.17.0, resolved against the wheel index built for torch 2.5.1 / CUDA 12.1.
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html
# Open3D (no version pin); used later for point-cloud -> mesh reconstruction.
!pip install open3d

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-y57902q8
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-y57902q8
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Downloading ftfy-6.3.1-py3-none-any.whl (44 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.8/44.8 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: clip
  Building wheel for clip (setup.py) ... [?25l[?25hdone
  Created wheel for clip: filename=clip-1.0-py3-none-any.whl size=1369489 sha256=c57e81f518940d89be5c2840828c6360d726f545f5962000352c876b6a1aa164
  Stored in directory: /tmp/pip-ephem-wheel-cache-mgtazpc2/wheels/3f/7c/a4/9b490845988bf7a4d

In [None]:
# Mount Google Drive under /content/drive; the project folder and the dataset
# referenced by the following cells live there.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Make the project folder the working directory. Later cells import local
# modules (dataset, mesh, render, utils, Normalization) and use relative
# output paths - presumably all resolved from this folder.
%cd /content/drive/MyDrive/AML Project: 3D Affordance/Affordance_Highlighting_Project_2024-main

/content/drive/MyDrive/AML Project: 3D Affordance/Affordance_Highlighting_Project_2024-main


# **Load dataset**

In [None]:
# Load the training split with the project's own loader.
from dataset import load_data

# `data` is indexed by integer position further down; each entry is read as a
# dict with a 'full_shape' key (see pc_to_mesh).
# NOTE(review): absolute Drive path is hard-coded - consider deriving it from
# the working directory set in the previous cell.
data = load_data('/content/drive/MyDrive/AML Project: 3D Affordance/Affordance_Highlighting_Project_2024-main/full_shape_train_data.pkl')

Type: <class 'list'>


# **Model Functions**

In [None]:

import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision

from itertools import permutations, product
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh

class NeuralHighlighter(nn.Module):
    """Per-point MLP classifier producing a probability over ``out_dim`` classes.

    Layer stack, in order:
      * input block:  ``Linear(input_dim, width) -> ReLU -> LayerNorm(width)``
      * ``depth`` hidden blocks: ``Linear(width, width) -> ReLU -> LayerNorm(width)``
      * head: ``Linear(width, out_dim) -> Softmax(dim=1)``
    """

    def __init__(self, depth, width, out_dim, input_dim=3, positional_encoding=True, sigma=5.0):
        """
        Build the Neural Highlighter network.

        :param depth: Number of hidden blocks added after the input block.
        :param width: Number of units in every hidden layer.
        :param out_dim: Size of the output (number of classes).
        :param input_dim: Size of each input sample (default 3, i.e. 3D vertices).
        :param positional_encoding: Accepted for interface compatibility only; no
            Fourier-feature layer is built, so this flag currently has no effect.
        :param sigma: Intended Fourier-feature scale; currently unused for the
            same reason as ``positional_encoding``.
        """
        super().__init__()

        def _block(in_features):
            # One "Linear -> ReLU -> LayerNorm" unit mapping in_features -> width.
            return [nn.Linear(in_features, width), nn.ReLU(), nn.LayerNorm([width])]

        # Input block followed by `depth` hidden blocks, created in forward order.
        stack = _block(input_dim)
        for _ in range(depth):
            stack.extend(_block(width))

        # Classification head; softmax over dim 1 expects a 2-D (samples, classes) tensor.
        stack.append(nn.Linear(width, out_dim))
        stack.append(nn.Softmax(dim=1))

        # Register every layer as a sub-module.
        self.mlp = nn.ModuleList(stack)

    def forward(self, x):
        """Run ``x`` through every layer in order and return the class probabilities."""
        out = x
        for module in self.mlp:
            out = module(out)
        return out

def get_clip_model(clipmodel):
    """Load a CLIP model onto the shared ``device``.

    :param clipmodel: Model name forwarded to ``clip.load`` (e.g. ``'ViT-B/32'``).
    :return: The ``(model, preprocess)`` pair produced by ``clip.load``.
    """
    model, image_preprocess = clip.load(clipmodel, device)
    return model, image_preprocess

# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the final hard-labelled mesh and a multi-view render of it.

    The MLP is switched to eval mode, every vertex is assigned to its most
    probable class, and two artefacts are written into ``log_dir``:
    ``<name>.ply`` (mesh with per-vertex colours) and ``final_render.jpg``.

    :param log_dir: Directory receiving the output files.
    :param name: Base file name (without extension) of the exported mesh.
    :param mesh: Mesh object; it is re-coloured in place via ``color_mesh`` and
        must provide ``export``.
    :param mlp: Trained highlighter network; left in eval mode on return.
    :param vertices: Vertex tensor fed to ``mlp``.
    :param colors: Kept for interface compatibility. The function always uses its
        own highlight/gray palette, exactly as before, so this value is not read.
    :param render: Renderer exposing ``render_views``.
    :param background: Background colour forwarded to the renderer.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # Hard one-hot assignment used for the renders.
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)

        # Palette: class 0 -> highlight, class 1 -> gray (0-255 integers here,
        # rescaled to [0, 1] for the renderer).
        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        palette = torch.stack((highlight/255, gray/255)).to(device)

        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=4,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # Per-vertex colours for the exported mesh; max_idx is (N, 1) and
        # broadcasts against the 3-channel colours.
        final_color = torch.where(max_idx == 0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


'''
def clip_loss():

    raise NotImplementedError("\
    Implement the Neural Highlighter Model as described in the project instructions \
    Pass to this function the language embedding, the rendered images and the clip model\
    return the calculatd loss     \
    ")
'''

def clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model):
    """CLIP-based loss: ``1 - cosine_similarity(renders, text)``.

    The render embeddings are averaged over the view dimension and compared with
    the text embedding (averaged too when several prompts are given).

    * ``n_augs == 0``: the renders go through ``clip_transform`` once and their
      embeddings are L2-normalised before averaging. Previously this path divided
      by ``n_augs`` (zero) and returned ``inf``; it now returns ``1 - similarity``.
    * ``n_augs > 0``: the renders go through ``augment_transform`` ``n_augs``
      times and the loss is ``1 - mean(similarity)`` (unchanged behaviour;
      embeddings are not normalised before averaging on this path).

    :param n_augs: Number of augmentation passes (non-negative integer).
    :param rendered_images: Batch of rendered views.
    :param encoded_text: Text embedding(s), first dimension = number of prompts.
    :param clip_transform: Transform applied when no augmentation is used.
    :param augment_transform: Random transform applied on each augmentation pass.
    :param clip_model: Object exposing ``encode_image``.
    :return: Loss tensor (0-dim when several prompts are given, shape ``(1,)`` otherwise).
    :raises ValueError: If ``n_augs`` is negative.
    """
    if n_augs < 0:
        raise ValueError(f"n_augs must be >= 0, got {n_augs}")

    def _similarity(encoded_renders):
        # Cosine similarity between the view-averaged render embedding and the text.
        if encoded_text.shape[0] > 1:
            return torch.cosine_similarity(torch.mean(encoded_renders, dim=0),
                                           torch.mean(encoded_text, dim=0), dim=0)
        return torch.cosine_similarity(torch.mean(encoded_renders, dim=0, keepdim=True),
                                       encoded_text)

    if n_augs == 0:
        clip_image = clip_transform(rendered_images)
        encoded_renders = clip_model.encode_image(clip_image)
        encoded_renders = encoded_renders / encoded_renders.norm(dim=1, keepdim=True)
        return 1 - _similarity(encoded_renders)

    loss = 0.0
    for _ in range(n_augs):
        augmented_image = augment_transform(rendered_images)
        encoded_renders = clip_model.encode_image(augmented_image)
        loss -= _similarity(encoded_renders)
    return 1 + (loss/n_augs)


def save_renders(dir, i, rendered_images, name=None):
    """Save ``rendered_images`` as an image file below ``dir``.

    :param dir: Base output directory.
    :param i: Iteration index, used for the default file name.
    :param rendered_images: Image tensor handed to ``torchvision.utils.save_image``.
    :param name: Explicit file name relative to ``dir``; when ``None`` the image
        is written to ``renders/iter_<i>.jpg``.
    """
    relative_path = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    torchvision.utils.save_image(rendered_images, os.path.join(dir, relative_path))


# **Mesh from point cloud**

In [None]:
import plotly.graph_objects as go
import open3d as o3d
from scipy.spatial import cKDTree
from render import Renderer
from utils import device, color_mesh

**Point cloud to mesh function**

In [None]:
def pc_to_mesh(data, selected_obj_index):
    """Reconstruct a triangle mesh from one point-cloud entry of ``data``.

    The point cloud is built in memory (no ``coordinates.txt`` is written to the
    working directory any more), normals are estimated and consistently
    oriented, and Poisson surface reconstruction is run on the result.

    :param data: Indexable dataset; ``data[i]['full_shape']`` must provide the
        keys ``'coordinate'`` and ``'label'``.
    :param selected_obj_index: Index of the entry to reconstruct.
    :return: ``(mesh, coordinates, labels)`` - the Open3D triangle mesh, and the
        entry's coordinates and labels exactly as stored in ``data``.
    """
    full_shape = data[selected_obj_index]['full_shape']
    coordinates = full_shape['coordinate']
    labels = full_shape['label']

    # Point cloud without normals. Vector3dVector expects an (N, 3) float64
    # array, so convert explicitly; `coordinates` itself is returned untouched.
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(np.asarray(coordinates, dtype=np.float64))

    # Adding normals to the point cloud
    pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamHybrid(radius=0.1, max_nn=20))
    pcd.orient_normals_consistent_tangent_plane(k=20)

    # Creating mesh with Poisson surface reconstruction (per-vertex densities are not needed).
    mesh_from_pc, _ = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(pcd, depth=9)
    return mesh_from_pc, coordinates, labels




**Visualizing function**

In [None]:
def show_mesh(mesh, coordinates, th, labels = None):
    """Display ``mesh`` as an interactive Plotly figure, optionally highlighted.

    :param mesh: Object exposing ``vertices`` and ``triangles`` (convertible with ``np.asarray``).
    :param coordinates: Point-cloud coordinates; used to find, for every mesh
        vertex, the nearest labelled point.
    :param th: Threshold; label values not strictly greater than ``th`` are shown as 0.
    :param labels: Per-point label values aligned with ``coordinates``. When
        ``None`` the mesh is drawn in a flat grey without highlighting.
    """
    # Colors
    base_color = 'rgb(169, 169, 169)'  # grey
    highlithed_color = 'rgb(255, 255, 0)'  # yellow

    # Mesh geometry as arrays
    verts = np.asarray(mesh.vertices)
    tris = np.asarray(mesh.triangles)

    if labels is None:
        print("Mesh without Highlighting")
        # Flat colour, no per-vertex intensity
        appearance = dict(color=base_color)
    else:
        print("Mesh with Highlighting")
        # For each mesh vertex, index of the closest point in the point cloud
        nearest = cKDTree(coordinates).query(verts)[1]
        # Label value carried over from that closest point
        per_vertex = labels[nearest]
        appearance = dict(
            # Keep the label value only where it exceeds the threshold
            intensity=np.where(per_vertex > th, per_vertex, 0),
            colorscale=[
                [0, base_color],        # grey (low)
                [1, highlithed_color],  # yellow (high)
            ],
            colorbar=dict(title='Density'),
        )

    trace = go.Mesh3d(
        x=verts[:, 0], y=verts[:, 1], z=verts[:, 2],
        i=tris[:, 0], j=tris[:, 1], k=tris[:, 2],
        opacity=1,
        **appearance,
    )
    fig = go.Figure(data=[trace])

    def _hidden_axis():
        # Axis with grid, tick labels, background and title all switched off.
        return dict(showgrid=False, showticklabels=False, showbackground=False, title='')

    # Title, hidden axes and data-driven aspect ratio
    fig.update_layout(
        title='Highlithed Mesh' if labels is not None else '3D Mesh from Point Cloud',
        scene=dict(
            xaxis=_hidden_axis(),
            yaxis=_hidden_axis(),
            zaxis=_hidden_axis(),
            aspectmode='data',
        ),
    )

    fig.show()



# **Evaluation pipeline**

**Choose settings**

In [None]:
# Dataset indices available for evaluation.
eval_indexes = [7368, 7370, 7371, 7374, 7375, 7376]

# Choose specific object
index = eval_indexes[3]
# Choose affordance class (key into the per-object label dict)
affordance_class = 'wrap_grasp'

# Choose threshold for density: label values above `th` count as highlighted
th = 0.2

# HYPERPARAMETERS
# Choose hyper-parameters
LR = 0.0001   # Adam learning rate
augs = 7      # augmentation passes per iteration (n_augs in clip_loss)
views = 5     # rendered views per iteration
dep = 4       # number of hidden blocks in NeuralHighlighter

# Choose augmentation transforms
extra_augs = False #True if you want additional augmentation transforms, False if you want std augmentation transforms

# Choose Learning Rate schedule
add_sheduler = False #True if you want a stepLR schedule
# Scheduler settings
step_size = 1000  # How often the learning rate is reduced (in iterations/epochs)
gamma = 0.1  # The learning rate is multiplied by 0.1 every step_size


# Output path
# NOTE(review): the 'bottle' object name and the 'LR00001' suffix are typed by
# hand; they do not follow `LR` or the selected object automatically.
output_dir_custom = f'./output_val/bottle_{index}/output_LR00001'




### GT Highlighted Mesh

In [None]:
# Creating mesh: Poisson reconstruction of the selected object's point cloud.
# `pcd` receives the raw point coordinates and `labels` the per-affordance label dict.
mesh0, pcd, labels = pc_to_mesh(data, index)
# Saving mesh as OBJ in the working directory; the training cell reloads it from this file.
o3d.io.write_triangle_mesh(f"mesh_{index}.obj", mesh0)
# Visualizing mesh with the ground-truth labels of the chosen affordance, thresholded at `th`.
show_mesh(mesh0, pcd, th, labels[affordance_class].flatten())

### Our model Highlighted Mesh

In [None]:
# Constrain most sources of randomness
# (some torch backwards functions within CLIP are non-deterministic)
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)
np.random.seed(seed)
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True


# Run configuration, mostly copied from the settings cell.
render_res = 224   # renderer output resolution (square)
learning_rate = LR
n_iter = 2500      # optimisation iterations
res = 224          # resolution fed to the CLIP/augmentation transforms

# Mesh written by the ground-truth cell.
obj_path = f'mesh_{index}.obj'

n_augs = augs
output_dir = output_dir_custom
# Model name; NOTE: `clip_model` is rebound to the loaded model object further down.
clip_model = 'ViT-B/32'

# save_renders writes into <output_dir>/renders, so create it up front.
Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

# NOTE(review): objbase/extension are not used anywhere else in this cell.
objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
# Presumably normalises the mesh in place (see Normalization.py) - TODO confirm.
MeshNormalizer(mesh)()

# Initialize variables
# White background for the renders.
background = torch.tensor((1., 1., 1.)).to(device)

log_dir = output_dir

# NeuralHighlighter parameters
depth = dep
width = 256
n_classes=2
input_dim=3
# NOTE(review): NeuralHighlighter as defined in this notebook ignores
# positional_encoding and sigma (its Fourier-feature branch is commented out).
positional_encoding=True
sigma=5.0

# CLIP and Augmentation Transforms
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
clip_transform = transforms.Compose([
    transforms.Resize((res, res)),
    clip_normalizer
])

# Standard augmentation (full-size crop + random perspective) or the extended set.
if extra_augs == False:
          augment_transform = transforms.Compose([
              transforms.RandomResizedCrop(res, scale=(1, 1)),
              transforms.RandomPerspective(fill=1, p=0.8, distortion_scale=0.5),
              clip_normalizer
          ])
else:
          augment_transform = transforms.Compose([
              transforms.RandomResizedCrop(res, scale=(0.8, 1.0)), # Random resize and crop, keeping 80% - 100% of the scale
              transforms.RandomPerspective(fill=1, p=0.5, distortion_scale=0.3), # Mild perspective change to simulate slight distortions

              transforms.RandomHorizontalFlip(p=0.5), # Random horizontal flip to increase variety
              transforms.RandomRotation(degrees=15), # Small rotations to simulate different angles (max +/-15 degrees)
              transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.05), # Small changes of brightness, contrast, saturation and hue
              transforms.RandomAffine(degrees=10, translate=(0.05, 0.05), scale=(0.9, 1.1)), # Limited translation, rotation and scaling to increase spatial variety

              clip_normalizer
          ])


# MLP Settings
mlp = NeuralHighlighter(depth, width, n_classes, input_dim, positional_encoding, sigma).to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

if add_sheduler:
    scheduler = torch.optim.lr_scheduler.StepLR(optim, step_size=step_size, gamma=gamma)

# list of possible colors
# Index 0 = highlighter, index 1 = gray; this order defines which MLP output is the highlight class.
# NOTE(review): rgb_to_color / color_to_rgb are not read anywhere else in this cell.
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)


# --- Prompt ---
# Load CLIP; from here on `clip_model` is the model object, not the name string.
clip_model, preprocess = get_clip_model(clip_model)

# encode prompt with CLIP
# NOTE(review): object ('bottle') and affordance wording ('grasped and wrapped') are
# hard-coded here and do not follow `index` / `affordance_class` from the settings cell.
prompt = "A 3D render of a gray {} with the {} area highlighted".format('bottle', 'grasped and wrapped')
with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = clip_model.encode_text(prompt_token)
    # L2-normalise the text embedding.
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)


# Private copy of the mesh vertices (taken after MeshNormalizer ran); MLP input.
vertices = copy.deepcopy(mesh.vertices)
n_views = views

losses = []

# Optimization loop
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)
    # color and render mesh
    # `sampled_mesh` is the same object as `mesh` (no copy is made).
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=4,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=background)

    # Calculate CLIP Loss
    loss = clip_loss(n_augs, rendered_images, encoded_text, clip_transform, augment_transform, clip_model)
    # NOTE(review): backward is called once per iteration, so retain_graph=True
    # looks unnecessary here - confirm before removing.
    loss.backward(retain_graph=True)

    optim.step()

    # The scheduler is stepped once per iteration, so step_size counts iterations.
    if add_sheduler:
          scheduler.step()

    # update variables + record loss
    with torch.no_grad():
        losses.append(loss.item())

    # report results
    # Every 100 iterations (including i == 0, where the average covers a single value).
    if i % 100 == 0:
        print("Last 100 CLIP score: {}".format(np.mean(losses[-100:])))
        save_renders(log_dir, i, rendered_images)
        # Appends to the log, so re-running the cell keeps earlier entries.
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {np.mean(losses[-100:])}, CLIP score {losses[-1]}\n")

name='prova'
# save results
save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background)

# Save prompts
# Creates an empty file whose *name* is the prompt text, as a marker of the prompt used.
with open(os.path.join(output_dir, prompt), "w") as f:
    f.write('')

# Calculate metric mIOU

In [None]:
def compute_iou_single_class(pred, gt):
    """
    Compute the Intersection-over-Union of the positive class (label 1).

    Inputs are converted with ``np.asarray`` first, so plain Python lists work
    too (previously a list compared as a whole against ``1`` and the function
    silently returned 1.0).

    Args:
        pred (array-like): Binary predictions (0 or 1).
        gt (array-like): Binary ground truth (0 or 1), same shape as ``pred``.

    Returns:
        float: IoU of the positive class; 1.0 when neither ``pred`` nor ``gt``
        contains a positive element.

    Raises:
        ValueError: If ``pred`` and ``gt`` have different shapes (avoids a
        silently broadcast, meaningless result).
    """
    pred_pos = np.asarray(pred) == 1
    gt_pos = np.asarray(gt) == 1
    if pred_pos.shape != gt_pos.shape:
        raise ValueError(
            f"pred and gt must have the same shape, got {pred_pos.shape} and {gt_pos.shape}"
        )

    # Union: elements where pred or gt is 1
    union = int(np.count_nonzero(pred_pos | gt_pos))
    if union == 0:
        return 1.0  # nothing predicted and nothing to find: perfect agreement

    # Intersection: elements where both pred and gt are 1
    intersection = int(np.count_nonzero(pred_pos & gt_pos))
    return intersection / union

In [None]:
# Binary prediction per mesh vertex: 1 where the highlight class (column 0)
# is more probable than gray (column 1).
# NOTE(review): `pred_class` is whatever the last training iteration left behind,
# i.e. it was computed before that iteration's final optimizer step.
dominance = (pred_class[:, 0] > pred_class[:, 1]).long()
# Nearest-neighbour structure over the original labelled point cloud
kdtree = cKDTree(pcd)
# Find the index of the closest point-cloud point for every mesh vertex
# NOTE(review): `vertices` was copied after MeshNormalizer ran, while `pcd` holds
# the raw dataset coordinates; if the normaliser changes scale/offset, the
# matching may pair the wrong points - TODO confirm both are in the same frame.
_, idx = kdtree.query(vertices.cpu())
# Ground-truth label value of each vertex, taken from its nearest labelled point
vertex_colors = labels[affordance_class].flatten()[idx]
# Convert to numpy
pred=dominance.cpu().numpy()
print(pred)
# Ground truth is positive where the label value exceeds the threshold
gt = (vertex_colors > th).astype(int)
print(gt)

iou = compute_iou_single_class(pred, gt)
print("")
print("IoU:", iou)
