<!-- TABS -->
# Build multimodal embedding models

In [9]:
# <tab: Text>
from pinnacledb.ext.sentence_transformers import SentenceTransformer

# Wrap the pre-trained 'all-MiniLM-L6-v2' sentence transformer as a model component.
pinnaclemodel = SentenceTransformer(
    identifier='all-MiniLM-L6-v2',
)

In [10]:
# <testing>
import numpy as np
from PIL import Image

# Smoke test: run the model on a single sample sentence.
embeddings = pinnaclemodel.predict_one(
    'some text',
)

In [11]:
# <tab: Image>
import torch
import clip
from torchvision import transforms
from pinnacledb.ext.torch import TorchModel

class CLIPVisionEmbedding:
    """Holder for the CLIP ``RN50`` model and its image preprocessing transform.

    ``__init__`` sets three instance attributes:

    * ``device``: ``"cuda"`` when ``torch.cuda.is_available()``, else ``"cpu"``.
    * ``model``: first value returned by ``clip.load("RN50", ...)``.
    * ``preprocess``: second value returned by ``clip.load("RN50", ...)``.
    """

    def __init__(self):
        # Load the CLIP model on the GPU when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # `preprocess` is stored as an instance attribute. A method with the
        # same name would be shadowed by it on every instance (and so could
        # never be called), which is why the batching helper below uses a
        # different name.
        self.model, self.preprocess = clip.load("RN50", device=self.device)

    def preprocess_batch(self, image):
        """Apply ``self.preprocess`` and return a single-item batch on ``self.device``.

        Args:
            image: Input accepted by ``self.preprocess``.
                # presumably a PIL image; confirm against the CLIP transform.

        Returns:
            The result of ``self.preprocess(image)`` after ``unsqueeze(0)``
            and ``.to(self.device)``.
        """
        return self.preprocess(image).unsqueeze(0).to(self.device)
        
# Instantiate the CLIP holder, then register its network as a TorchModel
# configured with forward_method='encode_image'.
model = CLIPVisionEmbedding()
pinnaclemodel = TorchModel(
    identifier='clip-vision',
    object=model.model,
    preprocess=model.preprocess,
    forward_method='encode_image',
)

In [12]:
# <tab: Text-2-Image>

import torch
import clip
from torchvision import transforms
from pinnacledb import Model
from pinnacledb.ext.torch import TorchModel

class CLIPTextEmbedding:
    """Callable that encodes a single text with the CLIP ``RN50`` model.

    ``__init__`` sets ``device`` (``"cuda"`` when ``torch.cuda.is_available()``,
    else ``"cpu"``) and ``model`` (first value returned by ``clip.load``).
    """

    def __init__(self):
        # Load the CLIP model on the GPU when one is available; the image
        # preprocessing transform returned by `clip.load` is not needed here.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, _ = clip.load("RN50", device=self.device)

    def __call__(self, text):
        """Tokenize ``text`` and return ``self.model.encode_text`` of the tokens.

        Args:
            text: The text to encode; it is tokenized as a one-element list.

        Returns:
            Whatever ``self.model.encode_text`` returns for the tokens.
        """
        # The tokens must live on the same device as the model; otherwise
        # encoding fails with a device mismatch when the model is on CUDA.
        tokens = clip.tokenize([text]).to(self.device)
        return self.model.encode_text(tokens)
        
# Instantiate the text encoder and wrap the callable as a generic Model.
model = CLIPTextEmbedding()
pinnaclemodel_text = Model(
    identifier='clip-text',
    object=model,
)

class CLIPVisionEmbedding:
    """Holder for the CLIP ``RN50`` model and its image preprocessing transform.

    ``__init__`` sets three instance attributes:

    * ``device``: ``"cuda"`` when ``torch.cuda.is_available()``, else ``"cpu"``.
    * ``model``: first value returned by ``clip.load("RN50", ...)``.
    * ``preprocess``: second value returned by ``clip.load("RN50", ...)``.
    """

    def __init__(self):
        # Load the CLIP model on the GPU when one is available.
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # `preprocess` is stored as an instance attribute. A method with the
        # same name would be shadowed by it on every instance (and so could
        # never be called), which is why the batching helper below uses a
        # different name.
        self.model, self.preprocess = clip.load("RN50", device=self.device)

    def preprocess_batch(self, image):
        """Apply ``self.preprocess`` and return a single-item batch on ``self.device``.

        Args:
            image: Input accepted by ``self.preprocess``.
                # presumably a PIL image; confirm against the CLIP transform.

        Returns:
            The result of ``self.preprocess(image)`` after ``unsqueeze(0)``
            and ``.to(self.device)``.
        """
        return self.preprocess(image).unsqueeze(0).to(self.device)
        
# Instantiate the CLIP holder, then register its network as a TorchModel
# configured with forward_method='encode_image'.
model = CLIPVisionEmbedding()
pinnaclemodel_image = TorchModel(
    identifier='clip-vision',
    object=model.model,
    preprocess=model.preprocess,
    forward_method='encode_image',
)

In [13]:
# <testing>
import numpy as np
from PIL import Image

# Smoke test: embed a 256x256 three-channel uint8 image filled with ones.
embeddings = pinnaclemodel_image.predict_one(
    Image.fromarray(np.ones((256, 256, 3), dtype=np.uint8))
)

[32m 2024-Mar-27 14:47:07.83[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36mpinnacledb.components.component[0m:[36m344 [0m | [1mInitializing TorchModel : clip-vision[0m
[32m 2024-Mar-27 14:47:07.83[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36mpinnacledb.components.component[0m:[36m347 [0m | [1mInitialized  TorchModel : clip-vision successfully[0m


In [14]:
# <testing>
import numpy as np
from PIL import Image

# Smoke test: run the CLIP text model on a single sample sentence.
embeddings = pinnaclemodel_text.predict_one(
    'some text',
)

[32m 2024-Mar-27 14:47:07.92[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36mpinnacledb.components.component[0m:[36m344 [0m | [1mInitializing ObjectModel : clip-text[0m
[32m 2024-Mar-27 14:47:07.92[0m| [1mINFO    [0m | [36mkartiks-MacBook-Air.local[0m| [36mpinnacledb.components.component[0m:[36m347 [0m | [1mInitialized  ObjectModel : clip-text successfully[0m
