<!-- TABS -->
# Build multimodal embedding models

In [9]:
# <tab: Text>
from pinnacledb.ext.sentence_transformers import SentenceTransformer

# Wrap the pre-trained 'all-MiniLM-L6-v2' sentence-transformer checkpoint
pinnaclemodel = SentenceTransformer(
    identifier='all-MiniLM-L6-v2',
)

In [10]:
# <testing>
import numpy as np
from PIL import Image

# Smoke test: run the text model on a single string
sample_text = 'some text'
embeddings = pinnaclemodel.predict_one(sample_text)

In [11]:
# <tab: Image>
import torch
import clip
from torchvision import transforms
from pinnacledb.ext.torch import TorchModel

class CLIPVisionEmbedding:
    """Hold a CLIP ``RN50`` model together with the image transform loaded with it.

    Instance attributes:

    - ``device``: ``"cuda"`` when ``torch.cuda.is_available()`` is true, else ``"cpu"``.
    - ``model``: the first value returned by ``clip.load``.
    - ``preprocess``: the second value returned by ``clip.load`` (applied to one image).
    """

    def __init__(self):
        # Load the CLIP model
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load("RN50", device=self.device)

    def preprocess_batch(self, image):
        """Apply ``self.preprocess`` to ``image``, then ``unsqueeze(0)`` and move to ``self.device``.

        This helper was previously named ``preprocess``. That name collided with the
        ``self.preprocess`` attribute assigned in ``__init__``: on an instance the
        attribute always took precedence, so the method could never be reached as
        ``obj.preprocess(...)``. ``obj.preprocess`` is unchanged and is still the
        transform returned by ``clip.load``.
        """
        return self.preprocess(image).unsqueeze(0).to(self.device)
        
# Build the CLIP wrapper, then register its network and image transform
# as a TorchModel that calls `encode_image`
model = CLIPVisionEmbedding()
pinnaclemodel = TorchModel(
    identifier='clip-vision',
    object=model.model,
    preprocess=model.preprocess,
    forward_method='encode_image',
)

In [12]:
# <tab: Text-2-Image>

import torch
import clip
from torchvision import transforms
from pinnacledb import Model
from pinnacledb.ext.torch import TorchModel

class CLIPTextEmbedding:
    """Callable that encodes one string with the text encoder of a CLIP ``RN50`` model.

    Instance attributes:

    - ``device``: ``"cuda"`` when ``torch.cuda.is_available()`` is true, else ``"cpu"``.
    - ``model``: the first value returned by ``clip.load``.
    """

    def __init__(self):
        # Load the CLIP model; the second value returned by clip.load is not used here
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, _ = clip.load("RN50", device=self.device)

    def __call__(self, text):
        """Tokenize ``text`` and return ``self.model.encode_text`` of the tokens.

        ``text`` is wrapped in a one-element list before tokenizing, so the
        encoder receives a single-item batch.
        """
        # The model was loaded onto self.device; the tokens must be moved there
        # too, otherwise encoding fails with a device mismatch when CUDA is used.
        tokens = clip.tokenize([text]).to(self.device)
        # Inference only: no autograd graph is needed for an embedding.
        with torch.no_grad():
            return self.model.encode_text(tokens)
        
# Build the callable text encoder and wrap it as a pinnacledb Model
model = CLIPTextEmbedding()
pinnaclemodel_text = Model(
    identifier='clip-text',
    object=model,
)

class CLIPVisionEmbedding:
    """Hold a CLIP ``RN50`` model together with the image transform loaded with it.

    Instance attributes:

    - ``device``: ``"cuda"`` when ``torch.cuda.is_available()`` is true, else ``"cpu"``.
    - ``model``: the first value returned by ``clip.load``.
    - ``preprocess``: the second value returned by ``clip.load`` (applied to one image).
    """

    def __init__(self):
        # Load the CLIP model
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model, self.preprocess = clip.load("RN50", device=self.device)

    def preprocess_batch(self, image):
        """Apply ``self.preprocess`` to ``image``, then ``unsqueeze(0)`` and move to ``self.device``.

        This helper was previously named ``preprocess``. That name collided with the
        ``self.preprocess`` attribute assigned in ``__init__``: on an instance the
        attribute always took precedence, so the method could never be reached as
        ``obj.preprocess(...)``. ``obj.preprocess`` is unchanged and is still the
        transform returned by ``clip.load``.
        """
        return self.preprocess(image).unsqueeze(0).to(self.device)
        
# Build the CLIP wrapper, then register its network and image transform
# as a TorchModel that calls `encode_image`
model = CLIPVisionEmbedding()
pinnaclemodel_image = TorchModel(
    identifier='clip-vision',
    object=model.model,
    preprocess=model.preprocess,
    forward_method='encode_image',
)

In [13]:
# <testing>
import numpy as np
from PIL import Image

# Smoke test: run the image model on an all-ones 256x256x3 uint8 array
dummy_pixels = np.ones((256, 256, 3)).astype(np.uint8)
dummy_image = Image.fromarray(dummy_pixels)
embeddings = pinnaclemodel_image.predict_one(dummy_image)

2024-Mar-27 14:47:07.83| INFO     | kartiks-MacBook-Air.local| pinnacledb.components.component:344  | Initializing TorchModel : clip-vision
2024-Mar-27 14:47:07.83| INFO     | kartiks-MacBook-Air.local| pinnacledb.components.component:347  | Initialized  TorchModel : clip-vision successfully


In [14]:
# <testing>
import numpy as np
from PIL import Image

# Smoke test: run the CLIP text model on a single string
sample_text = 'some text'
embeddings = pinnaclemodel_text.predict_one(sample_text)

2024-Mar-27 14:47:07.92| INFO     | kartiks-MacBook-Air.local| pinnacledb.components.component:344  | Initializing ObjectModel : clip-text
2024-Mar-27 14:47:07.92| INFO     | kartiks-MacBook-Air.local| pinnacledb.components.component:347  | Initialized  ObjectModel : clip-text successfully


In [2]:
# <tab Audio>
!pip install librosa
import librosa
import numpy as np
from pinnacledb import Model

def audio_embedding(audio_file):
    """Return the MFCC features that librosa computes for ``audio_file``.

    The file is read with ``librosa.load`` (no extra arguments), and the samples
    and sample rate it returns are passed to ``librosa.feature.mfcc``.
    """
    waveform, sample_rate = librosa.load(audio_file)
    return librosa.feature.mfcc(y=waveform, sr=sample_rate)
# Wrap the MFCC function as a pinnacledb Model
pinnaclemodel = Model(
    identifier='my-model-audio',
    object=audio_embedding,
)

Collecting librosa
  Downloading librosa-0.10.1-py3-none-any.whl.metadata (8.3 kB)
Collecting audioread>=2.1.9 (from librosa)
  Downloading audioread-3.0.1-py3-none-any.whl.metadata (8.4 kB)
Collecting numba>=0.51.0 (from librosa)
  Downloading numba-0.59.1-cp311-cp311-macosx_11_0_arm64.whl.metadata (2.7 kB)
Collecting soundfile>=0.12.1 (from librosa)
  Downloading soundfile-0.12.1-py2.py3-none-macosx_11_0_arm64.whl.metadata (14 kB)
Collecting soxr>=0.3.2 (from librosa)
  Downloading soxr-0.3.7-cp311-cp311-macosx_11_0_arm64.whl.metadata (5.5 kB)
Collecting lazy-loader>=0.1 (from librosa)
  Downloading lazy_loader-0.3-py3-none-any.whl.metadata (4.3 kB)
Collecting llvmlite<0.43,>=0.42.0dev0 (from numba>=0.51.0->librosa)
  Downloading llvmlite-0.42.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (4.8 kB)
Downloading librosa-0.10.1-py3-none-any.whl (253 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.7/253.7 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m[31m1

In [4]:
# <testing>
import wave
import struct

# Parameters of the synthetic test tone
sample_rate = 44100
duration = 1
frequency = 440
amplitude = 0.5

# Generate the sine wave
num_samples = int(sample_rate * duration)
t = np.linspace(0, duration, num_samples, False)
signal = amplitude * np.sin(2 * np.pi * frequency * t)

# Scale to the signed 16-bit range and truncate toward zero in one vectorised
# step; this yields the same bytes as packing each sample with struct 'h'.
pcm_frames = (signal * (2 ** 15 - 1)).astype(np.int16).tobytes()

# Parameters for the WAV file
output_file = 'dummy_audio.wav'
nchannels = 1  # Mono audio
sampwidth = 2  # Sample width in bytes (2 for 16-bit audio)
framerate = sample_rate
nframes = num_samples

# The context manager closes the file even if writing raises, and a single
# writeframes call replaces one call per sample.
with wave.open(output_file, 'w') as wav_file:
    wav_file.setparams((nchannels, sampwidth, framerate, nframes, 'NONE', 'not compressed'))
    wav_file.writeframes(pcm_frames)

# Test: run the audio model on the WAV file written above. This is left as the
# bare last expression of the cell so its result is shown as the cell output.
pinnaclemodel.predict_one(output_file)