# Octree Transformer Hands on Tutorial

### Some Text for the introduction, motivation and author

In [32]:
from torch.utils.data import Dataset
import os
import numpy as np
from torch import nn
import torch
import itertools
import math
from tqdm import tqdm
from tqdm.auto import trange
import pytorch_lightning as pl
import ignite as ig
from torch.utils.data import DataLoader
from torch.optim import AdamW
from torch.optim.lr_scheduler import LambdaLR
from torch.nn.utils.rnn import pad_sequence


In [33]:
import sys
sys.path.append('../')
from utils.load_hsp import load_hsp
from utils.save_obj import save_obj

In [34]:
PADDING_VALUE = 0
NUM_VOCAB = 3+1
RESOLUTION = 64
SPATIAL_DIM = 3

## Load the Data

Get the data from the ShapeNet data set

# Octree Data Structure
The Octree data structure is a hierarchical representation of 3D voxel data. It starts with a root node that encompasses the entire object. This root node is then recursively split into eight children, forming an octagonal subdivision. Each child node can further be divided into eight children of its own, and this subdivision process continues until a certain resolution is reached. The resolution determines the maximum level of subdivision, and it impacts the number of leaf nodes in the Octree. Specifically, an Octree with a resolution of R has a maximum of 8^(ld(R))=R³ leaf nodes. Leaf nodes represent the smallest subvolumes in the Octree, containing voxel data.

![Octree Structure visualized](../images/octree_explained.png)

<small>Picture by [The Infinite Loop](https://geidav.wordpress.com/2014/07/18/advanced-octrees-1-preliminaries-insertion-strategies-and-max-tree-depth/)</small>


To optimize the Octree's efficiency, we utilize modifications to the traditional Octree structure. One key modification is the representation of subvolumes with the same value for all their voxels. In such cases, instead of representing each individual voxel, we can replace them with a single node that represents the common value. This pruning technique significantly reduces the number of nodes in the Octree, resulting in a more compact representation of the voxel data. 

Implementation Steps:

    Determine the node value:
        1: If all child elements are empty
        2: If there are at least two child elements with different values
        3: If all child elements are occupied
    Prune the tree if the node value is 1 or 3:
        Remove all child nodes of the current node and replace them with a single node representing the value of the current node.
    Recursively split the tree if the node value is 2:
        Create eight child nodes representing the subvolumes within the current node.
        Repeat this splitting process for each child node until the desired resolution is reached.

In [35]:
def linearize(array: np.ndarray,
                    max_resolution: int = 8096):
    #max resolution rauswerfen?
    """ Performs a quick linearisation of given voxel array into value, depth and position sequences.

    Args:
        array (np.ndarray): Numpy array holding pixels/voxels of a discretized shape.
        pos_encoding (optional, str): Defines position encoding. Defaults to "centered".
        max_resolution (optional, int): Parses voxels only until 'max_resolution'.

    Returns:
        tuple(np.ndarray, np.ndarray, np.ndarray): Linearised value, depth and position sequences.
    """
    def recursive_linearise(array: np.ndarray, pos: np.ndarray, dep: int = 1):
        """ Recursive internal function to linearise given voxel array.

        Note: Uses variables of parent function to store values.

        Args:
            array (np.ndarray): Numpy array (or subarray) holding pixels/voxels of a discretized shape.
            pos (np.ndarray): Position of parent array
            dep (int, optional): Current recursion depth. Defaults to 0.
        """
        # split input into an octree/quadtree
        subarrays = split(array)
        num_subarrays = len(subarrays)

        # initialize dictionary only on first pass
        if dep not in value:
            value[dep] = []
            depth[dep] = []
            position[dep] = []

        # compute values for each subarray
        for idx, sub in enumerate(subarrays):
            value[dep] += [1] if np.max(sub) == 0 else [3] if np.min(sub) > 0 else [2]
            depth[dep] += [dep]
            position[dep] += [2 * pos + dirs[idx]]

        # process each subarray recursivelly
        for idx, sub in enumerate(subarrays):
            cur_idx = -num_subarrays + idx
            if value[dep][cur_idx] == 2 and dep < max_dep:
                recursive_linearise(sub, position[dep][cur_idx], dep + 1)

    # initialise memory
    value = {}
    depth = {}
    position = {}
    dirs = np.array(list(itertools.product([1, 2], repeat=array.ndim)))
    init_pos = np.array(array.ndim * [0])
    max_dep = int(math.log2(max_resolution))

    # call function recursivelly
    recursive_linearise(array, init_pos)

    # flatten dictionaries
    value = np.array(list(itertools.chain(*value.values())))
    depth = np.array(list(itertools.chain(*depth.values())))
    position = np.array(list(itertools.chain(*position.values())))

    return value, depth, position

We have to implement some further function...

In [36]:
def split(array: np.ndarray) -> np.ndarray:
    """ Splits the given array along each axis in half.

    Args:
        elements (np.ndarray): Numpy array of arbitary dimension.

    Returns:
        np.ndarray: Array of splited elements with an additional dimension along the first axis.
    """
    ndim = array.ndim
    array = np.expand_dims(array, axis=0)
    for i in range(ndim, 0, -1):
        array = np.concatenate(np.split(array, indices_or_sections=2, axis=i), axis=0)
    return array

In [37]:
class kdTree():
    """ Implements a kd-tree data structure for volumetric/spatial objects. Works with arrays of spatial data as well
    as linearised token sequence representations.

    This class allows to transform array with spatial elements into kd-trees, where k can by any natural number. Each
    node represents mixed elements, which can be split in its branches. Each leaf represents a final element which is
    either completly empty or completly occupied. These structure can be than linearized as a sequence of tokens, which
    is equivalent to the kd-tree. In the same way, as arrays with spatial elements can be transformed into kd-trees,
    token sequences can be transformed into kd-trees. This allows to seamlessly transform arrays of spatial data into
    token sequences and vice versa.
    """
    def __init__(self):
        """ Initializes the kd-tree for the right spatial dimensionality.

        Args:
            SPATIAL_DIM: Defines the spatial dimensionality of the kd-tree, e.g. '2' for images/pixels and '3' for
                volumes/voxels.
            pos_encoding: Defines the positional encoding of positions. It uses either a centered position,
                where each position relates to the center of all pixels/voxels or an intertwined encoding, where each
                layer uses an ascending, axis aligned enumeration, thus the position values are intertwined.
        """
        super().__init__()
        self.SPATIAL_DIM = 3
        self.dirs = np.array(list(itertools.product([1, 2], repeat=self.SPATIAL_DIM)))


    def concat(self, array: np.ndarray) -> np.ndarray:
        """ Concats elements of the array along each dimension, where each subarray is given in the first axis.

        Args:
            array (np.ndarray): Numpy array, holding subarrays in the first axis.

        Return:
            np.ndarray: Array of elements with concatenated subarrays along each axis.
        """
        for i in range(1, self.SPATIAL_DIM + 1):
            array = np.concatenate(np.split(array, indices_or_sections=2, axis=0), axis=i)
        return np.squeeze(array, axis=0)

    def insert_element_array(self, elements, max_depth=float('Inf'), depth=0, pos=None):
        """ Inserts an array of element values which is converted into a kd-tree.

        Args:
            elements: A numpy array of element values, with the dimensionality of the kd-tree.
            max_depth: The maximum depth of the resulting kd-tree. All nodes at `max_depth` are marked as final.
            depth: The current depth of the kd-tree. Used to recursively define the tree depth.
            pos: Defines the mean position of all elements at the current node.

        Return:
            The current node containing inserted values. The returned node should be the root node of the kd-tree.
        """
        self.depth = depth
        self.resolution = np.array(elements.shape[0])
        self.final = True
        self.pos = np.array(self.SPATIAL_DIM * [0]) if pos is None else pos
        # '1' - all elements are empty
        # '2' - elements are empty and occupied
        # '3' - all elements are occupied
        self.value = 1 if np.max(elements) == 0 else 3 if np.min(elements) > 0 else 2

        # input has a resolution of 1 and cannot be splitt anymore
        if self.resolution <= 1:
            return self

        # splitt only when elements are mixed and we are not at maximum depth
        if self.value == 2 and depth <= max_depth:
            self.final = False

            # split elements into subarrays
            sub_elements = split(elements)

            # compute new positions for future nodes
            # layerwise intertwined_positions
            new_pos = [2 * self.pos + d for d in self.dirs]

            # create child nodes
            self.child_nodes = [
                kdTree().insert_element_array(e, max_depth, depth + 1, p)
                for e, p in zip(sub_elements, new_pos)
            ]

        return self

    def get_element_array(self, depth=float('Inf'), mode='occupancy'):
        """ Converts the kd-tree into an array of elements.

        Args:
            depth: Defines the maximum depth of the children nodes, of which the value will be returned in the array.
            mode: Defines how the value of each node should be represented in the returned array. `occupancy` - returns
                all padding and empty values as '0' and all mixed and occupied values as '1'. `value` - return the
                exact value stored in the node. `color` - returns the values based on a colormap defined in `_cmap`,
                where the stored value is subtracted by 1 and the padding value is returned as '0'. `depth` - returns
                the current depth of the node as value in the array. `random` - returns a random number in the range of
                [0, 19] for each node.

        Return:
            A numpy array with the dimensionality of the kd-tree, which hold values defined by `mode`.

        """
        res = self.SPATIAL_DIM * [self.resolution]
        if self.final or self.depth == depth:
            # return empty array if all elements are empty
            if self.value == 1:
                return np.tile(0, res)
            # else return value based on `mode`
            elif mode == 'occupancy':
                return np.tile(1, res)

        return self.concat(np.array([node.get_element_array(depth, mode) for node in self.child_nodes]))

    def insert_token_sequence(self, value, resolution, max_depth=float('Inf')):
        """ Inserts a token sequence which is parsed into a kd-tree.

        Args:
            value: A token sequence representing a spatial object. The values should consist only of '1', '2' and '3'.
                The sequence can be eiter a string or an array of strings or integers.
            resolution: The resolution of the token sequence. This value should be a power of 2.
            max_depth: The maximum depth up to which the token sequence will be parsed.
            autorepair_errors: Select if the parser should try to automatically repair malformed input sequenced by
                adding padding tokens up to a required length. Each node with a value of '2' should have
                2**`SPATIAL_DIM` children nodes.
            silent: Select if errors and warnings should be printed into the output console.

        Return:
            A node which represents the given token sequence. The returned node should be the root node of the kd-tree.
        """
        # fail-fast: malformed input sequence
        all_tokens_valid = all(str(c) in '123' for c in value)
        if not all_tokens_valid:
            raise ValueError(
                "ERROR: Input sequence consists of invalid tokens. Check token values and array type." +
                f"Valid tokens consist of 1 (white), 2 (mixed) and 3 (black). Sequence: {value}."
            )

        # initialize self
        self.value = 0
        self.depth = 0
        self.pos = np.array(self.SPATIAL_DIM * [0])
        self.resolution = np.array(resolution)
        self.final = False

        # initialize parser
        depth = 1
        final_layer = False
        resolution = resolution // 2

        # initialize first nodes
        open_set = []
        self.child_nodes = [kdTree() for _ in range(2**self.SPATIAL_DIM)]
        open_set.extend(self.child_nodes)
        node_counter = len(open_set)

        # compute new positions for future nodes
        # layerwise intertwined_positions
        pos_set = [2 * self.pos + d for d in self.dirs]

        while len(value) > 0 and depth <= max_depth and len(open_set) > 0:
            # consume first token of sequence
            head = int(value[0])
            value = value[1:] if len(value) > 0 else value
            node_counter -= 1

            # get next node that should be populated
            node = open_set.pop(0)

            # assign values to node
            node.value = head
            node.depth = depth
            node.pos = pos_set.pop(0)
            node.resolution = np.array(resolution)

            # final node:
            # - head is '1' or '3', thus all elements have the same value
            # - the resolution is 1, thus the elements cannot be split anymore
            # - we are in the last depth layer, thus all nodes are final
            node.final = head in (1, 3) or np.array_equal(resolution, [1]) or final_layer
            if not node.final:
                node.child_nodes = [kdTree() for _ in range(2**self.SPATIAL_DIM)]
                open_set.extend(node.child_nodes)

                # TODO: add 'intertwined' position encoding
                # compute new positions for future nodes - center of all pixels
                pos_set.extend([node.pos + node.resolution // 2 * d for d in self.dirs])

            # update depth
            if node_counter <= 0:
                depth += 1
                resolution = np.array(resolution // 2)
                # return if the resolution becomes less than 1 - no visible elements
                if resolution < 1:
                    return self

                #TODO delete?
                node_counter = len(open_set)
                # fail-fast: malformed input sequence
                if len(value) < node_counter:
                    # perform simple sequence repair by appending missing tokens
                    value = np.append(value, [0 for _ in range(node_counter - len(value))])
                    if not silent:
                        print(
                            f"WARNING: Resolved error - Modified input sequence: {value}, " +
                            f"Current length: {len(value)}"
                        )

                if len(value) == node_counter:
                    final_layer = True

        return self

    def get_token_sequence(self, depth=float('Inf'), return_depth=False, return_pos=False):
        """ Returns a linearised sequence representation of the kd-tree.

        Args:
            depth: Defines the maximum depth of the nodes, up to which the tree is parsed.
            return_depth: Selects if the corresponding depth sequence should be returned.
            return_pos: Selects if the corresponding position sequence should be returned.

        Return
            A numpy array consisting of integer values representing the linearised kd-tree. Returns additionally the
            corresponding depth and position sequence if specified in `return_depth` or `return_pos`. The values are
            returned in the following order: (value, depth, position).
        """
        seq_value = []
        seq_depth = []
        seq_pos = []
        open_set = []

        # start with root node
        open_set.extend(self.child_nodes)

        while len(open_set) > 0:
            node = open_set.pop(0)

            # reached sufficient depth - return sequence so far
            if node.depth > depth:
                break

            seq_value += [node.value]
            seq_depth += [node.depth]
            seq_pos += [node.pos]

            if not node.final:
                open_set += node.child_nodes

        seq_value = np.asarray(seq_value)
        seq_depth = np.asarray(seq_depth)
        seq_pos = np.asarray(seq_pos)

        # output format depends in flags 'return_depth' and 'return_pos'
        output = [seq_value]
        if return_depth:
            output += [seq_depth]
        if return_pos:
            output += [seq_pos]
        return output


After obtaining the Octree representation of the voxel data, the Octree has to be linearized to be processed by the Transformer. This is done by traversing the tree in a depth-first manner and storing the node values in a list.

Now we are able to create an Octree for a given voxel data. The next step is to create a custom data set, which we can use for training the model.

Hier eventuell einen Octree und ein Voxel ausgeben (den Octree linearized) aber ist wahrscheinlich nicht so anschaulich #TODO

## Data Set
To create a pytorch data set, we can inheret from the torch.utils.data.Dataset class. This class requires us to implement the __len__ and __getitem__ methods. The __len__ method returns the number of samples in the data set and the __getitem__ method returns a sample from the data set at a given index.
In our case, before returning an item from the data set, we first have to create an Octree from the voxel data and linearize it.

In [38]:
class ShapeNet(Dataset):
    # Hardcoden
    def __init__(self, shape_dir, transform=None, target_transform=None):
        self.shape_dir = shape_dir
        self.transform = transform
        self.target_transform = target_transform
        # Get the list of files in shape_dir
        self.file_list = os.listdir(shape_dir)
        # Store the paths of objects in a list
        self.path = [os.path.join(shape_dir, file) for file in self.file_list]
        
    def __len__(self):
        return len(self.path)

    def __getitem__(self, idx):
        voxels = np.load(self.path[idx])
        #octree = kdTree().insert_element_array(voxels)
        #seq = octree.get_token_sequence(
            #depth=math.log2(RESOLUTION),
            #return_depth=True,
            #return_pos=True,
        #)
        seq = linearize(voxels)
        return seq

An dieser Stelle eventuell noch data augmentation auf dem voxel druchführen, für später. In Blender die data augmentation darstellen. #TODO

In [39]:
class EncoderOnlyCollate():
    """ Creates a collate module, which pads batched sequences to equal length with the padding token '0'. """

    def __call__(self, batch):
        """ Pads and packs a list of samples for the 'encoder_only' architecture. """
        # pad batched sequences with '0' to same length
        seq = pad_batch(batch)
        # return as (sequence, class)
        #TODO hier müsste doppelt seq returnt werden 
        # update : müsste doch stimmen? das eine ist unser input gebatch das andere die labels nicht im batch oder seq[-1]
        return seq, seq[0]

def to_sequences(batch):
    """ Transform a list on numpy arrays into sequences of pytorch tensors. """
    batch = [(torch.tensor(v), torch.tensor(d), torch.tensor(p)) for v, d, p in batch]

    # unpack batched sequences
    return zip(*batch)


def pad_batch(batch):
    """ Unpack batch and pad each sequence to a tensor of equal length. """
    val, dep, pos = to_sequences(batch)

    # pad each sequence
    val_pad = pad_sequence(val, batch_first=True, padding_value=PADDING_VALUE)
    dep_pad = pad_sequence(dep, batch_first=True, padding_value=PADDING_VALUE)
    pos_pad = pad_sequence(pos, batch_first=True, padding_value=PADDING_VALUE)

    return val_pad, dep_pad, pos_pad

Compression implementieren #TODO

# Get the Model

## Embedding
When working with tokens, we have to use a prober embedding for the tokens. Furthermore, we will use learned positional encodings.
The learned positional encoding represent the position of the token in the Octree. They are learned and can thereby be trained as part of the model. We create the positional encoding by adding the encoded value of the x,y,z coordinates.

In [40]:
class PositionalEncoding(nn.Module):
    def __init__(self, resolution, embed_dim):
        super(PositionalEncoding, self).__init__()
        # *2 weil die Sequence length maximal 2*resolution ist
        self.x_encoding = nn.Embedding(2*resolution,embed_dim,padding_idx=PADDING_VALUE)
        self.y_encoding = nn.Embedding(2*resolution,embed_dim,padding_idx=PADDING_VALUE)
        self.z_encoding = nn.Embedding(2*resolution,embed_dim,padding_idx=PADDING_VALUE)

    def forward(self, position):
        x = self.x_encoding(position[:,:,0])
        y = self.y_encoding(position[:,:,1])
        z = self.z_encoding(position[:,:,2])
        return x+y+z

Now we can create the whole embedding layer for the input using the positional encoding layer and another embedding for the values of the nodes.
Finally, we will add both outputs and return the result.

In [41]:
class EmbeddingLayer(nn.Module):
    def __init__(self,num_vocab, embed_dim, resolution):
        #num_vocab should be 1,2,3 and then add 1 for padding = 0
        super(EmbeddingLayer, self).__init__()
        self.positional_encoding = PositionalEncoding(resolution,embed_dim)
        self.value_embedding = nn.Embedding(num_vocab,embed_dim,padding_idx=PADDING_VALUE)

    def forward(self, value, depht, position):
        # this should work... eventuell noch mal maske invertieren 1 -> 0?
        self.mask = (value == PADDING_VALUE)
        pos = self.positional_encoding(position)
        val = self.value_embedding(value)
        return pos+val
    
    def get_masks(self):
        return self.mask

There is still one problem, the sie of the linearized tree grwos exponentially, thereby we have to compress some of the information in the lower levels of the tree. This is done by applying convolutions over the embeddes sequence.

In [42]:
class ConvolutionEmbedding(nn.Module):
    def __init__(self,num_vocab, embed_dim, resolution, conv_size):
        super(ConvolutionEmbedding,self).__init__()
        self.num_vocab = num_vocab
        self.embed_dim = embed_dim
        self.resolution = resolution
        self.chunk_size = conv_size
        self.embedding = EmbeddingLayer(num_vocab, embed_dim, resolution)
        self.conv = nn.Conv1d(embed_dim,embed_dim,kernel_size=conv_size, stride=conv_size)
    
    def forward(self, value, depth, position):
        #brauche ich das?? #TODO
        self.mask = padding_mask(value[:, ::self.chunk_size])
        embedded_seq = self.embedding(value, depth, position)
        return self.conv(embedded_seq.permute(0,2,1)).permute(0,2,1)
    
    def padding_mask(self):
        return self.mask

Finally we can create the embedding layer for our sequence, in which we use greater compression on the lower levels of the tree. Note, that convolution of size 1 does not compress any information and is just used for consistency.

In [43]:
class SequenceEmbedding(nn.Module):
    
    def __init__(self, num_vocab, embed_dim, resolution):
        super(SequenceEmbedding,self).__init__()
        self.embeddings = [
            ConvolutionEmbedding(num_vocab, embed_dim, resolution, conv_size=1),
            ConvolutionEmbedding(num_vocab, embed_dim, resolution, conv_size=1),
            ConvolutionEmbedding(num_vocab, embed_dim, resolution, conv_size=1),
            ConvolutionEmbedding(num_vocab, embed_dim, resolution, conv_size=4),
            ConvolutionEmbedding(num_vocab, embed_dim, resolution, conv_size=8),
            ConvolutionEmbedding(num_vocab, embed_dim, resolution, conv_size=8),
        ]

    def forward(self, value, depth, position):
        batch_depth = torch.max(depth)
        batch_size = len(value)

        batch = []
        padding_mask = []
        # extract value, depth and position sequence of current sample
        #Aus dem batch holen
        val, dep, pos = value[0], depth[0], position[0]
        b_emb = torch.tensor([], device=value.device)

        # embed layerwise
        for layer_idx, embedding in enumerate(self.embeddings):
            layer_depth = layer_idx + 1

            val_seq = val[dep == layer_depth]
            dep_seq = dep[dep == layer_depth]
            pos_seq = pos[dep == layer_depth]

            if(val_seq.shape[0] == 0):
                break

            # compute layer embedding
            layer_emb = embedding(
                val_seq.unsqueeze(0),
                dep_seq.unsqueeze(0),
                pos_seq.unsqueeze(0),
            )[0]
            b_emb = torch.cat([b_emb, layer_emb])

        # append embedding
        sequence = b_emb.unsqueeze(0)
        # create padding mask
        #TODO delete?! Padding 1 because 1=True=not masked
        padding_mask = [torch.zeros(b_emb.shape[0], dtype=torch.bool)]
        self.mask = pad_sequence(padding_mask, batch_first=True, padding_value=1)
        # pad embedding sequence
        return pad_sequence(sequence, batch_first=True, padding_value=0.0)

    def get_mask(self):
        """ Returns a padding mask, where padding tokens '0' of the value sequence are masked out. """
        return self.mask

In [44]:
def padding_mask(input_sequence):
    """ Create a padding mask for the given input.

        Always assumens '0' as a padding value. `input_sequence` has the shape (N, S).
    """
    return torch.zeros_like(input_sequence).masked_fill(input_sequence == 0, 1).bool()


In [45]:
lin = nn.Linear(4,3)
test = torch.tensor([[[1,2,3,4.],[5,6,7,8.],[5,6,7,8.]],[[1,2,3,4.],[5,6,7,8.],[5,6,7,8.]]])
#test = test.unsqueeze(0)
print(test.shape)
out = lin(test)
print(out.shape)

torch.Size([2, 3, 4])
torch.Size([2, 3, 3])


## Generative Head

The next step is to create a generative head that generates the respective logits for each node. For the generative head we utilize a linear layer which operatoes on the output of the trasnformer stack and another layer to get positional encodigns for the logits.


In [46]:
class LinearHead(nn.Module):
    def __init__(self, num_vocab,embed_dim,resolution):
        super().__init__()
        print(f"lin: {embed_dim}->{num_vocab}")
        self.linear = nn.Linear(embed_dim, num_vocab)
        self.pos_enc = PositionalEncoding(resolution, num_vocab)
        self.activation = nn.GELU()

    def forward(self, x, value, depth, pos):
        x = self.activation(x)
        x = self.linear(x)
        pos_enc = self.pos_enc(pos)
        x = x + pos_enc
        return x

Because we shortened the sequence in the beginning with the compression scheme, we now have to generate more tokens, depending on the current layer.

In [47]:
class ConvolutionalHead(nn.Module):
    def __init__(self, num_vocab, embed_dim, head_dim,resolution, conv_size):
        super(ConvolutionalHead, self).__init__()
        self_conv_size = conv_size

        self.activation = nn.GELU()
        print(f"Deconv: {embed_dim} -> {head_dim}")
        self.deconvolution = nn.ConvTranspose1d(embed_dim, head_dim, conv_size, stride=conv_size)
        print(f"BlockConv: {head_dim} -> {head_dim}")
        self.convolution = BlockConvolution(head_dim, head_dim, conv_size)
        print(f"Embed: {num_vocab} -> {head_dim}")
        self.embed = EmbeddingLayer(num_vocab, head_dim,resolution)
        print(f"Linear: {head_dim} -> {num_vocab}")
        self.linear = nn.Linear(head_dim, num_vocab)

    def forward(self,x,value,depth,pos):

        x = self.activation(x)
        x = self.deconvolution(x.transpose(1,2)).transpose(1,2)
        embed = self.embed(value,depth,pos)
        embed = self.activation(x)
        embed = self.convolution(embed[:, :x.shape[1]])
        x = x+embed
        
        x = self.activation(x)
        return self.linear(x)



In [48]:
class BlockConvolution(nn.Module):
    def __init__(self, source_dim, target_dim, block_size):
        """ Performs masked blockwise convolution on an input sequence.
            The mask is always an upper right triangle matrix with zeros on the diagonal.

        Args:
            source_dim: Defines the embedding dimension of the input sequence.
            target_dim: Defines the embedding dimension of the output sequence.
            block_size: Defines the size of the block over which we convolute.
        """
        super(BlockConvolution, self).__init__()

        self.block_size = block_size
        self.convolution = nn.Conv1d(source_dim, target_dim, (1,), bias=False)
        sigma = math.sqrt(1. / (block_size * source_dim))
        self.bias = nn.Parameter(torch.empty(block_size))
        nn.init.uniform_(self.bias, -sigma, sigma)

    def forward(self, seq_vector):
        """ Convolute tokens to reduce sequence length

        Args:
            seq_vector: Sequence vector with elements of the shape [N, S, E].

        Return:
            Sequence vector with the same length and target embedding dimension [N, S, E']
        """
        #basic convolution with some transpose for dimension fit
        features = self.convolution(seq_vector.transpose(1, 2)).transpose(1, 2)
        #vecor in the from of sqvec filled wtih 0
        out = torch.zeros_like(seq_vector)
        #add the learned bias to every block
        out[:, ::self.block_size] += self.bias[0]
        for i in range(self.block_size):
            #loop over block element
            for j in range(i):
                #add to one element the feature vector of all previous elements but not the elemnts infron of it
                out[:, i::self.block_size] += features[:, j::self.block_size]
            #add bias i to every following block elemnt on pos i
            out[:, i::self.block_size] += self.bias[i]

        return out

To finish the generative head, let's put it all together

In [49]:
class GenerativeHead(nn.Module):
    def __init__(self, num_vocab, embed_dim, head_dim, resolution):
        super().__init__()
        self.num_vocab = num_vocab
        self.embed_dim = embed_dim
        self.head_dim = head_dim
        self.resolution = resolution
        self.fc = nn.Linear(embed_dim, num_vocab)
        self.heads = [
            LinearHead(num_vocab, embed_dim, resolution),
            LinearHead(num_vocab, embed_dim, resolution),
            LinearHead(num_vocab, embed_dim, resolution),
            ConvolutionalHead(num_vocab, embed_dim, head_dim,resolution, conv_size=4),
            ConvolutionalHead(num_vocab, embed_dim, head_dim, resolution, conv_size=8),
            ConvolutionalHead(num_vocab, embed_dim, head_dim,  resolution, conv_size=8)
        ]
        self.reduction_factor = {
            1: 1,
            2: 1,
            3: 1,
            4: 4,
            5: 8,
            6: 8
        }
    def forward(self, x, value, depth, position):
        """ Transforms the output of the transformer target value logits.

        Args:
            x: Output of the transformer, the latent vector [N, T, E].
            value: Target value token sequence [N, T].
            depth: Target depth token sequence [N, T].
            position: Target position token sequence [N, T, A].
            last_only: Flag to switch processing, to decode only last depth layer.

        Return
            Logits of target value sequence.
        """
        batch_depth = torch.max(depth)
        out = []

        # process each sample individually N=1 also eigentlich redundant ein mal squeeze?
        for latent_vec, val, dep, pos in zip(x, value, depth, position):

            logits = torch.tensor([], device=x.device)
            vector_idx = 0

            # compute logits layerwise
            for layer_idx, head in enumerate(self.heads):
                layer_depth = layer_idx + 1
                # get value, depth and position sequence of current layer
                layer_val = val[dep == layer_depth]
                layer_dep = dep[dep == layer_depth]
                layer_pos = pos[dep == layer_depth]
                if(layer_pos.shape[0]==0):
                    break
                # compute number of vectors in latent vector of current layer
                # because we might have X tokens but only X/red_factor many feature vectors because of reduction
                num_vectors = torch.sum(dep == layer_depth) // self.reduction_factor[layer_depth]

                # filter latent vector of current layer
                layer_vec = latent_vec[vector_idx:vector_idx + num_vectors]
                # compute layer logits
                layer_logits = head(
                    layer_vec.unsqueeze(0),
                    layer_val.unsqueeze(0),
                    layer_dep.unsqueeze(0),
                    layer_pos.unsqueeze(0),
                )[0]
                logits = torch.cat([logits, layer_logits])

                # discard processed tokens
                vector_idx += num_vectors
            out += [logits]

        # pad embedding sequence
        return pad_sequence(out, batch_first=True, padding_value=0.0)



## Transformer Stack

The next step is to create the transformer stack
explain forward function and mask etc.  #TODO

In [50]:
class Transformer(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout, num_layers):
        super().__init__()
        self.sos = nn.Parameter(torch.zeros(embed_dim))
        nn.init.normal_(self.sos)

        encoder_layer = nn.TransformerEncoderLayer(
            d_model=embed_dim,
            nhead=num_heads,
            dim_feedforward=4 * embed_dim,
            dropout=dropout,
            activation='gelu',
            batch_first=True
        )
        self.transformer = nn.TransformerEncoder(
            encoder_layer, 
            num_layers=num_layers,
            norm=nn.LayerNorm(embed_dim))
    #cls raus
    def forward(self, input_seq, padding_mask):
        # Für was brauche ich die cls??? -> eventuell weg
        #create tensor of shape batch_sizex1(=seq_len)xembed_dim filled with sos token
        sos = self.sos.unsqueeze(0).unsqueeze(0).repeat(input_seq.shape[0],1,1)
        batch_size,seq_len,_ = input_seq.shape
        input_seq = torch.cat([sos, input_seq[:,:-1]], dim=1)
        mask = get_mask(seq_len)
        # process input sequence by the Transformer stack, get output sequence
        #Weil in forward schon von batch size = 1 ausgegangen wird, sollte man auch hier src_key_padding mask verwerfen und oben gar nicht speichern TODO
        output_seq = self.transformer(
            src=input_seq,  
            mask=mask,  # [L, L]
            src_key_padding_mask=padding_mask,  # [N, L]
        ) 
        return output_seq


implement get_mask and explain the reason for masks

In [51]:

def get_mask(seq_len, device=None):
    """ Creates a diagonal mask, which prevents the self-attention to look ahead. """
    attn_mask = torch.full((seq_len, seq_len), -float("Inf"), device=device)
    return torch.triu(attn_mask, diagonal=1)

## Octree Transfromer

After creating all relevant parts of the model, we can now combine them to create the Octree Transformer.

In [52]:
class OctreeTransformer(nn.Module):
    def __init__(self, embed_dim, num_heads, dropout, num_layers, num_vocab,head_dim, resolution):
        print("please note that num_vocab should include the padding index")
        super().__init__()
        self.embedding = SequenceEmbedding(num_vocab, embed_dim, resolution)
        self.transformer = Transformer(embed_dim, num_heads, dropout, num_layers)
        self.head = GenerativeHead(num_vocab,embed_dim,head_dim, resolution)

    def forward(self, sequence):
        #sequence should be tuple value depth position
        #hier sequence entpacken
        value, depth, position = sequence
        x = self.embedding(value, depth, position)
        x = self.transformer(x,self.embedding.get_mask())
        x = self.head(x,value,depth,position)
        return x

## Temp: Test the Model

In [53]:

# Set the device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the input sequence
values = torch.tensor([[2, 3, 1, 0]])  # Example values sequence
depths = torch.tensor([[2, 3, 1, 1]])  # Example depths sequence
positions = torch.tensor([[[1, 1, 1], [2, 2, 2], [0, 0, 0], [0, 0, 0]]])  # Example positions sequence
sequence = (values, depths, positions)


# Define the model parameters
embed_dim = 64
num_heads = 4
dropout = 0.1
num_layers = 2
num_vocab = 3+1
resolution = 64
head_dim = 16

# Create an instance of the OctreeTransformer model
model = OctreeTransformer(embed_dim, num_heads, dropout, num_layers, num_vocab, head_dim,resolution).to(device)

# Set the model to evaluation mode
model.eval()

# Perform the forward pass
with torch.no_grad():
    sequence = (values.to(device), depths.to(device), positions.to(device))
    output = model(sequence)

# Print the output
print("Output:")
print(output.shape)

please note that num_vocab should include the padding index
lin: 64->4
lin: 64->4
lin: 64->4
Deconv: 64 -> 16
BlockConv: 16 -> 16
Embed: 4 -> 16
Linear: 16 -> 4
Deconv: 64 -> 16
BlockConv: 16 -> 16
Embed: 4 -> 16
Linear: 16 -> 4
Deconv: 64 -> 16
BlockConv: 16 -> 16
Embed: 4 -> 16
Linear: 16 -> 4
Output:
torch.Size([1, 4, 4])


  return torch._transformer_encoder_layer_fwd(


## Train the Model
To train our model we use pytorch lightning. Pytorch lightning is a lightweight wrapper for pytorch that simplifies the training process. It provides a lot of useful features like automatic checkpointing, logging, and distributed training. To use pytorch lightning, we have to create a pytorch lightning module. This module is similar to a pytorch module, but it requires us to implement the training step and configure optimizer function. After implementing these functions we can comforably train our model.

In [54]:
class OctreeTransformer_pl(pl.LightningModule):
    def __init__(self, embed_dim, num_heads, dropout, num_layers, num_vocab, resolution):
        super().__init__()     
        print("please note that num_vocab should include the padding index")
        self.embedding = SequenceEmbedding(num_vocab, embed_dim, resolution)
        self.transformer = Transformer(embed_dim, num_heads, dropout, num_layers)
        self.head = GenerativeHead(num_vocab,embed_dim,head_dim, resolution)

    def forward(self, sequence):
        #sequence should be tuple value depth position
        #hier sequence entpacken
        value, depth, position = sequence
        embeddings = self.embedding(value,depth, position)
        encoder_output = self.transformer(embeddings ,self.embedding.get_mask())
        x = self.head(encoder_output,value,depth,position)
        return x
    
    def step(self, batch, batch_idx):
        sequence, target = batch
        output = self(sequence)
        loss = self.calculate_loss(output, target)
        return loss

    def training_step(self, batch, batch_idx):
        loss = self.step(batch, batch_idx)
        self.log('train_loss', loss)
        return loss

    def validation_step(self, batch, batch_idx):
        loss = self.step(batch, batch_idx)
        self.log('val_loss', loss)

    def test_step(self, batch, batch_idx):
        loss = self.step(batch, batch_idx)
        self.log('test_loss', loss)

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01)
        lr_scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=10)
        warm_up = ig.handlers.param_scheduler.create_lr_scheduler_with_warmup(lr_scheduler,0,1000)
        return [optimizer], [warm_up]
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01)
        total_steps = 10  # Total number of training steps
        warmup_steps = 1000  # Number of warm-up steps
        lr_scheduler = create_custom_lr_scheduler(optimizer, total_steps, warmup_steps)
        return [optimizer], [lr_scheduler]

    def calculate_loss(self, output, target):
        loss_function = nn.CrossEntropyLoss(ignore_index=PADDING_VALUE)
        # batch size x class x d_1 x ... x d_n expected for cross entropy loss
        output = output.permute(0,2,1)
        return loss_function(output, target)

Cite the author of this code #TODO

In [55]:
def create_custom_lr_scheduler(optimizer, total_steps, warmup_steps):
    def lr_lambda(current_step):
        if current_step < warmup_steps:
            return float(current_step) / warmup_steps
        else:
            return max(
                0.0, 0.5 * (1.0 + math.cos(math.pi * (current_step - warmup_steps) / (total_steps - warmup_steps)))
            )

    lr_scheduler = LambdaLR(optimizer, lr_lambda)
    return {
        'scheduler': lr_scheduler,
        'interval': 'step',
        'frequency': 1
    }

After everything is set up, we are finally able to train our model

In [56]:

resolution = 64
embed_dim = 256
num_heads = 4
dropout = 0.1
num_layers = 8
#print all hyperparameters
print(f"Resolution: {resolution}")
print(f"Embedding Dimension: {embed_dim}")
print(f"Number of Heads: {num_heads}")
print(f"Dropout: {dropout}")
print(f"Number of Layers: {num_layers}")

model = OctreeTransformer_pl(embed_dim, num_heads, dropout, num_layers, NUM_VOCAB, resolution)

# Load the dataset
dataset = ShapeNet("../data")
# Create data loaders
train_loader = DataLoader(dataset, collate_fn=EncoderOnlyCollate(), batch_size=1, shuffle=True)

# Initialize Trainer
trainer = pl.Trainer(max_epochs=3)
trainer.fit(model, train_loader)

Resolution: 64
Embedding Dimension: 256
Number of Heads: 4
Dropout: 0.1
Number of Layers: 8
please note that num_vocab should include the padding index
lin: 256->4
lin: 256->4
lin: 256->4
Deconv: 256 -> 16
BlockConv: 16 -> 16
Embed: 4 -> 16
Linear: 16 -> 4
Deconv: 256 -> 16
BlockConv: 16 -> 16
Embed: 4 -> 16
Linear: 16 -> 4
Deconv: 256 -> 16
BlockConv: 16 -> 16
Embed: 4 -> 16
Linear: 16 -> 4


  rank_zero_warn(
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
  rank_zero_warn(

  | Name        | Type              | Params
--------------------------------------------------
0 | embedding   | SequenceEmbedding | 0     
1 | transformer | Transformer       | 6.3 M 
2 | head        | GenerativeHead    | 1.0 K 
--------------------------------------------------
6.3 M     Trainable params
0         Non-trainable params
6.3 M     Total params
25.280    Total estimated model params size (MB)
  rank_zero_warn(
  rank_zero_warn(


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=3` reached.


## Generate Shapes

To generate shapes, we have to create a generator. This generator samples tokens in an autoregressive way with respect to the kernel size which was used for compression. Thereby, it samples kernel size many tokens each time step.

In [57]:
class Generator:
    def __init__(self, model, num_tokens=1, **_):
        """ Create token generator instance which samples 'num_tokens' in one pass.

        Args:
            compute_logits_fn: Pointer to function, which computes logits of given sequence.
            num_tokens: Defines the number of sampled tokens in each step.
        """
        self.model = model
        self.kernel_size = num_tokens

    def __call__(self, val, dep, pos, temperature=1.0):
        """ Sample autoregressive current value token sequence and return updated value sequence.

        Args:
            val: Value token sequence of current layer.
            dep: Depth token sequence of current layer.
            pos: Position token sequence of current layer.
            memory: Latent sequence vector of the previous layer.
            idx: Currently sampled transformer layer index.
            temperature: Defines the randomness of the samples.

        Return:
            Sampled token sequence with values of the current layer.
        """
        # compute indices
        token_idx = 0
        sampled_idx = len(torch.cat(val[:-1])) if len(val) > 1 else 0

        # sample tokens autoregressive
        for _ in trange(len(val[-1]) // self.kernel_size, leave=False, desc="Tokens"):
            for block_idx in range(self.kernel_size):
                # concat layers and slice sequence for speed_up
                seq = (
                    torch.cat(val)[:sampled_idx + token_idx + self.kernel_size].unsqueeze(0),
                    torch.cat(dep)[:sampled_idx + token_idx + self.kernel_size].unsqueeze(0),
                    torch.cat(pos)[:sampled_idx + token_idx + self.kernel_size].unsqueeze(0),
                )

                logits = self.model(seq)[0]

                # retrieve only logits for for current index
                sampled_token_logits = logits[sampled_idx + token_idx + block_idx]

                # compute token probabilities from logits
                sampled_token_logits[0] = -float("Inf")  # 'padding' token
                print(sampled_token_logits)
                probs = torch.nn.functional.softmax(sampled_token_logits / temperature, dim=-1)  # [t, V]
                print(probs)
                # sample next sequence token
                val[-1][token_idx + block_idx] = torch.multinomial(probs, num_samples=1)[0]

            # update indices
            token_idx += self.kernel_size

        return val[-1]

Now we can create a Sampler class. We have a Generator for every layer, to predict more tokens based on the compression we applied earlier.

In [58]:
class Sampler:
    def __init__(self, model, max_resolution, **_):
        """ Provides a basic implementation of the sampler for the 'encoder_only' architecture.

        Args:
            model: Model which is used for sampling.
            head: Generative head type used in the model.
            spatial_dim: The spatial dimensionality of the array of elements.
            max_resolution: Maximum resolution the model is trained on.
            position_encoding: Defines the positional encoding of the data.
            device: Device on which, the data should be stored. Either "cpu" or "cuda" (gpu-support).
        """
        self.generators = [
            Generator(model,1),
            Generator(model,1),
            Generator(model,1), 
            Generator(model,4), 
            Generator(model,8),
            Generator(model,8), 
        ]

        self.max_resolution = max_resolution

    def __call__(self, precondition_resolution, target_resolution, temperature):
        """ Perform an iterative sampling of the given sequence until reaching the end of sequence, the maximum sequence
            length or the desired resolution.

        Args:
            precondition: An array of elements (pixels/voxels) as an numpy array.
            precondition_resolution: Resolution at which the autoencoder will reconstruct the layer.
            target_resolution: Resolution up to which an object should be sampled.
            temperature: Defines the randomness of the samples.
            cls: class label for conditional generation.

        Return:
            A token sequence with values, encoding the final sample.
        """
        #get sample
        val, dep, pos = self.generate_sample(precondition_resolution)

        # compute the number of finished (current) layers and the maximum sampleable layer
        cur_layer = len(val)
        max_layer = int(math.log2(min(target_resolution, self.max_resolution)))

        with torch.no_grad():

            # sample layer-wise
            for idx in tqdm(range(cur_layer, max_layer), initial=cur_layer, total=max_layer, leave=True, desc="Layers"):

                # init sequences for next layer
                next_val, next_dep, next_pos = next_layer_tokens(
                    val, dep, pos, self.max_resolution
                )
                # predict value tokens for current layer
                next_val = self.generators[idx](
                    val=val + [next_val],
                    dep=dep + [next_dep],
                    pos=pos + [next_pos],
                    temperature=temperature
                )

                # append sampled tokens to current sequence
                val += [next_val]
                dep += [next_dep]
                pos += [next_pos]

                if torch.sum(next_val == 2) == 0:
                    break  # early-out, no mixed tokens sampled

        return postprocess(val, target_resolution)
    
    def generate_sample(self, precondition_resolution):

        #generate random numpy array with values between 0-2
        array_size = 3 * [self.max_resolution]
        precondition = torch.randint(low=0, high=2, size=array_size, dtype=torch.long).numpy()

        # convert input array into token sequence
        tree = kdTree()
        tree = tree.insert_element_array(precondition, max_depth=math.log2(precondition_resolution) + 1)
        value, depth, position = tree.get_token_sequence(
            depth=math.log2(precondition_resolution), return_depth=True, return_pos=True
        )

        val = []
        dep = []
        pos = []
        # extract each depth layer separately and convert to PyTorch as a long tensor
        for d in range(1, max(depth) + 1):
            val += [torch.tensor(value[depth == d], dtype=torch.long, device=device)]
            dep += [torch.tensor(depth[depth == d], dtype=torch.long, device=device)]
            pos += [torch.tensor(position[depth == d], dtype=torch.long, device=device)]

        return val, dep, pos
    
    def postprocess(self,value, target_resolution):
        """ Transform sequence of value tokens into an array of elements (voxels/pixels).

        Args:
            value: List of value token sequences for each layer as pytorch tensors.
            target_resolution: Resolution up to which an object should be sampled.
            spatial_dim: The spatial dimensionality of the array of elements.
            pos_encoding: Defines the positional encoding of the data.

        Return:
            An array of elements as a numpy array.
        """
        # concat all layers
        value = torch.cat(value)

        # move value sequence to the cpu and convert to numpy array
        value = value.cpu().numpy()

        # insert the sequence into a kd-tree
        tree = kdTree().insert_token_sequence(
            value,
            resolution=target_resolution
        )

        # retrive pixels/voxels from the kd-tree
        return tree.get_element_array(mode="occupancy")


In [59]:
def next_layer_tokens(value, depth, position, max_resolution):
    """ Creates artificial tokens for the next layer of the value sequence, to match the predefined shape. Precomputes
    corresponding depth and position tokens of the sequence, too.

    Args:
        value: List of value token sequences for each layer as pytorch tensors.
        depth: List of depth token sequences for each layer as pytorch tensors.
        position: List of position token sequences for each layer as pytorch tensors.
        spatial_dim: The spatial dimensionality of the value sequence.
        max_resolution: The maximal resolution the corresponding model is trained for.
        pos_encoding: Defines the positional encoding of the data.

    Return:
        Pre-initialised next layer sequence (value, depth, position).
    """
    cur_device = value[0].device
    dirs = np.array(list(itertools.product([1, 2], repeat=3)))
    num_children = 2**SPATIAL_DIM

    # got an empty input - initialize with default values and return
    if len(value[0]) == 0:
        value = torch.tensor(num_children * [1], device=cur_device, dtype=torch.long)
        depth = torch.tensor(num_children * [1], device=cur_device, dtype=torch.long)
        pos = (
            torch.ones(num_children, SPATIAL_DIM, device=cur_device, dtype=torch.long) *
            torch.tensor(dirs, device=cur_device)
        )
    # compute next layer depth and number of future tokens
    cur_depth = len(value)
    num_future_tokens = num_children * torch.sum(value[-1] == 2)

    # compute future sequence (non padding token) and future depth sequence
    nl_value = torch.tensor([1], device=cur_device, dtype=torch.long).repeat(num_future_tokens)
    nl_depth = torch.tensor([cur_depth + 1], device=cur_device, dtype=torch.long).repeat(num_future_tokens)

    # retrive and copy mixed tokens positions
    pos_token = position[-1][value[-1] == 2]
    nl_pos = torch.repeat_interleave(pos_token, num_children, dim=0)

    # compute position difference and add it to future positions with respect to predefined pattern

    nl_pos = 2 * nl_pos + torch.tensor(dirs, device=cur_device).repeat(pos_token.shape[0], 1)
    return nl_value, nl_depth, nl_pos

In [60]:
def postprocess(value, target_resolution):
    """ Transform sequence of value tokens into an array of elements (voxels/pixels).

    Args:
        value: List of value token sequences for each layer as pytorch tensors.
        target_resolution: Resolution up to which an object should be sampled.
        spatial_dim: The spatial dimensionality of the array of elements.
        pos_encoding: Defines the positional encoding of the data.

    Return:
        An array of elements as a numpy array.
    """
    # concat all layers
    value = torch.cat(value)

    # move value sequence to the cpu and convert to numpy array
    value = value.cpu().numpy()

    # TODO: define trinary transformation based on list of embeddings

    # insert the sequence into a kd-tree
    tree = kdTree().insert_token_sequence(
        value,
        resolution=target_resolution
    )

    # retrive pixels/voxels from the kd-tree
    return tree.get_element_array(mode="occupancy")


Lets write a sample script where we sample a random numpy array, convert it into an octree representation, generate a shape of our given resolution with our trained model. #TODO preprocessing and postprocessing würde kdtree class benötigen! array linearite und dann abschneiden eventuell als precondition? Dann Script schreiben, dass aus linearized octree ein np array macht und das visualisiert

In [61]:
sampler = Sampler(model,max_resolution=RESOLUTION)
sample = sampler(4,8,1)
save_obj(sample, "test")

Layers:  67%|██████▋   | 2/3 [00:00<?, ?it/s]

Tokens:   0%|          | 0/512 [00:00<?, ?it/s]

Layers:  67%|██████▋   | 2/3 [00:00<?, ?it/s]

tensor([-inf, nan, nan, nan])
tensor([nan, nan, nan, nan])





RuntimeError: probability tensor contains either `inf`, `nan` or element < 0

In [None]:
sample

array([[[0, 1, 1, 0, 1, 0, 0, 1],
        [0, 1, 1, 0, 0, 0, 0, 1],
        [0, 1, 1, 0, 1, 1, 1, 1],
        [0, 1, 1, 0, 1, 0, 1, 1],
        [0, 1, 1, 1, 1, 0, 1, 1],
        [0, 1, 1, 1, 0, 0, 1, 1],
        [0, 1, 0, 1, 0, 1, 0, 1],
        [0, 1, 0, 0, 0, 0, 1, 1]],

       [[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 0, 1, 0, 1, 1],
        [0, 1, 1, 0, 1, 1, 1, 1],
        [1, 1, 0, 0, 1, 1, 1, 1],
        [1, 1, 1, 0, 1, 0, 1, 1],
        [1, 0, 0, 0, 1, 0, 0, 1],
        [0, 1, 0, 0, 1, 0, 1, 1],
        [0, 1, 1, 1, 1, 0, 1, 1]],

       [[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 0, 1, 1, 1, 1],
        [0, 1, 1, 1, 1, 0, 1, 0],
        [0, 1, 0, 0, 1, 0, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1]],

       [[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1],
        

In [None]:
from obj2html import obj2html
from IPython.display import display, HTML

In [None]:
obj2html('../samples/shape_0.obj', 'index.html')
display(HTML('index.html'))

In [None]:
# Install k3d library if not already installed
# !pip install k3d

import k3d
import numpy as np
from pywavefront import Wavefront

# Load the OBJ file using pywavefront
# Replace 'path_to_your_obj_file.obj' with the actual path to your OBJ file
obj_file_path = '../samples/shape_0.obj'
mesh = Wavefront(obj_file_path)
print("finish wavefront")

# Extract vertices and faces from the loaded OBJ file
vertices = np.array(mesh.vertices)
faces = np.array(mesh.mesh_list[0].faces)
print("finish extract vertices and faces")
# Create a k3d plot
plot = k3d.plot()

# Create a k3d mesh object
mesh_k3d = k3d.mesh(vertices, faces)
plot += mesh_k3d

# Display the plot
plot.display()



# Visualization of Results