<!-- TABS -->
# Multimodal vector search

<!-- TABS -->
## Configure your production system

:::note
If you would like to use the production features 
of SuperDuperDB, then you should set the relevant 
connections and configurations in a configuration 
file. Otherwise you are welcome to use "development" mode 
to get going with SuperDuperDB quickly.
:::

In [None]:
import os

# Create the local directory for the configuration file (no error if it already exists).
os.makedirs('.pinnacledb', exist_ok=True)
# Record the path of the configuration file in the environment.
os.environ['pinnacleDB_CONFIG'] = '.pinnacledb/config.yaml'

In [None]:
# <tab: MongoDB Community>
# YAML configuration text; every service URI below points at 127.0.0.1.
CFG = '''
data_backend: mongodb://127.0.0.1:27017/documents
artifact_store: filesystem://./artifact_store
cluster:
  cdc:
    strategy: null
    uri: ray://127.0.0.1:20000
  compute:
    uri: ray://127.0.0.1:10001
  vector_search:
    backfill_batch_size: 100
    type: in_memory
    uri: http://127.0.0.1:21000
'''

In [None]:
# <tab: MongoDB Atlas>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the MongoDB Community example (`data_backend`, `compute.uri`).
# A `mongodb+srv://` URI resolves hosts through DNS and must not carry a port.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
        type: native
data_backend: mongodb+srv://<user>:<password>@<mongo-host>/documents
'''

In [None]:
# <tab: SQLite>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the MongoDB Community example (`data_backend`, `compute.uri`).
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: sqlite://<path-to-db>.db
'''

In [None]:
# <tab: MySQL>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the MongoDB Community example (`data_backend`, `compute.uri`).
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mysql://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: Oracle>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the MongoDB Community example (`data_backend`, `compute.uri`).
# NOTE(review): the tab is labelled Oracle but the URI scheme is mssql://
# (Microsoft SQL Server) -- confirm which backend is intended.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: mssql://<user>:<password>@<host>:<port>
'''

In [None]:
# <tab: PostgreSQL>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the MongoDB Community example (`data_backend`, `compute.uri`).
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: postgres://<user>:<password>@<host>:<port>/<database>
'''

In [None]:
# <tab: Snowflake>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the MongoDB Community example (`data_backend`, `compute.uri`).
# A separate SQLite metadata store is configured alongside the data backend.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: snowflake://<user>:<password>@<account>/<database>
'''

In [None]:
# <tab: Clickhouse>
# YAML configuration template; replace every <placeholder> before use.
# Keys follow the MongoDB Community example (`data_backend`, `compute.uri`).
# A separate SQLite metadata store is configured alongside the data backend.
CFG = '''
artifact_store: filesystem://<path-to-artifact-store>
metadata_store: sqlite://<path-to-sqlite-db>.db
cluster:
    compute:
        uri: ray://<ray-host>
    cdc:
        uri: http://<cdc-host>:<cdc-port>
    vector_search:
        uri: http://<vector-search-host>:<vector-search-port>
data_backend: clickhouse://<user>:<password>@<host>:<port>
'''

In [None]:
# Persist the chosen configuration text at the path recorded in the environment.
config_path = os.environ['pinnacleDB_CONFIG']
with open(config_path, 'w') as config_file:
    config_file.write(CFG)

<!-- TABS -->
## Start your cluster

:::note
Starting a SuperDuperDB cluster is useful in production and model development
if you want to enable scalable compute, access to the models by multiple users for collaboration, 
and monitoring.

If you don't need this, then it is simpler to start in development mode.
:::

In [None]:
# <tab: Experimental Cluster>
!python -m pinnacledb local-cluster up

In [None]:
# <tab: Docker-Compose>
!make testenv_image
!make testenv_init

In [None]:
from pinnacledb import pinnacle

# No URI is passed here; presumably the connection settings are taken from
# the configuration file set up earlier -- TODO confirm.
db = pinnacle()

<!-- TABS -->
## Connect to SuperDuperDB

:::note
Note that this is only relevant if you are running SuperDuperDB in development mode.
Otherwise refer to "Configure your production system".
:::

In [None]:
# <tab: MongoDB>
from pinnacledb import pinnacle

# Connect using a MongoDB URI on localhost:27017, database `documents`.
db = pinnacle('mongodb://localhost:27017/documents')

In [None]:
# <tab: SQLite>
from pinnacledb import pinnacle
# Connect using a SQLite URI that names the file `my_db.db`.
db = pinnacle('sqlite://my_db.db')

In [None]:
# <tab: MySQL>
from pinnacledb import pinnacle

# Example credentials and location; replace with the values of your own server.
user = 'pinnacle'
password = 'pinnacle'
port = 3306
host = 'localhost'
database = 'test_db'

# Assemble the connection URI from the parts above and connect.
db = pinnacle(f"mysql://{user}:{password}@{host}:{port}/{database}")

In [None]:
# <tab: Oracle>
from pinnacledb import pinnacle

# NOTE(review): the tab is labelled Oracle, but the `mssql://` scheme, the
# `sa` user and port 1433 look like Microsoft SQL Server -- confirm the
# intended backend.
user = 'sa'
password = 'pinnacle#1'
port = 1433
host = 'localhost'

# Assemble the connection URI from the parts above and connect.
db = pinnacle(f"mssql://{user}:{password}@{host}:{port}")

In [None]:
# <tab: PostgreSQL>
!pip install psycopg2
from pinnacledb import pinnacle

# Example credentials and location; replace with the values of your own server.
user = 'postgres'
password = 'postgres'
port = 5432
host = 'localhost'
database = 'test_db'
db_uri = f"postgres://{user}:{password}@{host}:{port}/{database}"

# The metadata store gets the same URI with the scheme rewritten to
# `postgresql://` -- presumably because it is opened by a different driver
# than the data backend; TODO confirm.
db = pinnacle(db_uri, metadata_store=db_uri.replace('postgres://', 'postgresql://'))

In [None]:
# <tab: Snowflake>
from pinnacledb import pinnacle

# Example credentials; replace with the values of your own account.
user = "pinnacleuser"
password = "pinnaclepassword"
account = "XXXX-XXXX"  # ORGANIZATIONID-USERID
database = "FREE_COMPANY_DATASET/PUBLIC"

snowflake_uri = f"snowflake://{user}:{password}@{account}/{database}"

# Metadata is kept in a separate local SQLite database rather than in Snowflake.
db = pinnacle(
    snowflake_uri, 
    metadata_store='sqlite:///your_database_name.db',
)

In [None]:
# <tab: Clickhouse>
from pinnacledb import pinnacle

# Example credentials and location; replace with the values of your own server.
user = 'default'
password = ''
port = 8123
host = 'localhost'

# Metadata is kept in a separate `mongomock://` store rather than in Clickhouse.
# The metadata URI has no placeholders, so it is a plain string, not an f-string.
db = pinnacle(f"clickhouse://{user}:{password}@{host}:{port}", metadata_store='mongomock://meta')

In [None]:
# <tab: DuckDB>
from pinnacledb import pinnacle

# Connect using a DuckDB URI that names the file `mydb.duckdb`.
db = pinnacle('duckdb://mydb.duckdb')

In [None]:
# <tab: Pandas>
from pinnacledb import pinnacle

# The data backend is given as a list of CSV paths; metadata goes to a
# separate `mongomock://` store. The metadata URI has no placeholders, so it
# is a plain string, not an f-string.
db = pinnacle(['my.csv'], metadata_store='mongomock://meta')

In [None]:
# <tab: MongoMock>
from pinnacledb import pinnacle

# Connect using a `mongomock://` URI with database name `test_db`.
db = pinnacle('mongomock:///test_db')

<!-- TABS -->
## Get useful sample data

In [None]:
# <tab: Text>
# Download the sample file into the working directory.
!curl -O https://pinnacledb-public-demo.s3.amazonaws.com/text.json
import json

# Parse the downloaded JSON into `data`.
with open('text.json', 'r') as f:
    data = json.load(f)

In [None]:
# <tab: PDF>
# Download the archive and unpack it, overwriting existing files (`-o`).
!curl -O https://pinnacledb-public-demo.s3.amazonaws.com/pdfs.zip && unzip -o pdfs.zip
import os

# `data` is the list of relative paths of every entry in ./pdfs.
data = [f'pdfs/{x}' for x in os.listdir('./pdfs')]
data

In [None]:
# <tab: Image>
!curl -O s3://pinnacledb-public-demo/images.zip && unzip images.zip
import os

data = [f'images/{x}' for x in os.listdir('./images')]

In [None]:
# <tab: Video>
!curl -O s3://pinnacledb-public-demo/videos.zip && unzip videos.zip
import os

data = [f'videos/{x}' for x in os.listdir('./videos')]

In [None]:
# <tab: Audio>
!curl -O s3://pinnacledb-public-demo/audio.zip && unzip audio.zip
import os

data = [f'audios/{x}' for x in os.listdir('./audios')]

<!-- TABS -->
## Create datatype

Data types such as "text" or "integer", which are natively supported by your `db.databackend`, don't need a datatype.

In [None]:
datatype = None

Otherwise do one of the following:

In [None]:
# <tab: PDF>
!pip install PyPDF2
from pinnacledb import DataType
# NOTE(review): `File` is imported but not used in this cell.
from pinnacledb.components.datatype import File

# Datatype identified as 'pdf', declared with encodable='file'.
datatype = DataType('pdf', encodable='file')

In [None]:
# <tab: Image>
from pinnacledb.ext.pillow import pil_image
import PIL.Image

# Reuse the ready-made `pil_image` datatype shipped with the pillow extension.
datatype = pil_image

In [None]:
# <tab: Audio>
from pinnacledb.ext.numpy import array
from pinnacledb import DataType
import scipy.io.wavfile
import io


def encoder(data):
    """Serialise audio to WAV-formatted bytes.

    :param data: indexable pair whose item 0 is the sample rate and whose
        item 1 is the sample array, as accepted by ``scipy.io.wavfile.write``.
    :return: the complete WAV file content as ``bytes``.
    """
    sample_rate, samples = data[0], data[1]
    wav_buffer = io.BytesIO()
    # Write into an in-memory buffer so nothing touches the filesystem.
    scipy.io.wavfile.write(wav_buffer, sample_rate, samples)
    return wav_buffer.getvalue()


def decoder(data):
    """Parse WAV-formatted bytes back into audio.

    :param data: the bytes of a WAV file.
    :return: whatever ``scipy.io.wavfile.read`` returns for that content,
        i.e. a ``(sample_rate, samples)`` pair.
    """
    return scipy.io.wavfile.read(io.BytesIO(data))


# Datatype identified as 'wav' that converts values with the `encoder` /
# `decoder` functions defined in this cell and is declared with
# encodable='artifact'.
datatype = DataType(
    'wav',
    encoder=encoder,
    decoder=decoder,
    encodable='artifact',
)

In [None]:
# <tab: Video>
from pinnacledb import DataType

# Create a DataType with the identifier 'video_on_file', declared with encodable='artifact'
datatype = DataType(
    identifier='video_on_file',
    encodable='artifact',
)

<!-- TABS -->
## Setup tables or collections

In [None]:
# <tab: MongoDB>
# Note this is an optional step for MongoDB
# Users can also work directly with `DataType` if they want to add
# custom data
from pinnacledb import Schema, DataType
from pinnacledb.backends.mongodb import Collection

table_or_collection = Collection('documents')
# Set to True to register a schema for field 'x' (only has an effect when
# `datatype` is a `DataType` instance).
USE_SCHEMA = False
# NOTE(review): this rebinds `datatype` to None, discarding any datatype
# chosen in the "Create datatype" section -- confirm that is intended.
datatype = None

if USE_SCHEMA and isinstance(datatype, DataType):
    schema = Schema(fields={'x': datatype})
    db.apply(schema)

In [None]:
# <tab: SQL>
from pinnacledb.backends.ibis import Table
from pinnacledb import Schema, DataType
from pinnacledb.backends.ibis.field_types import dtype

# Type of column 'x': either a type name such as "str" or a `DataType` instance.
datatype = "str"

# A `DataType` is used as the field directly; anything else is passed through `dtype`.
x_field = datatype if isinstance(datatype, DataType) else dtype(datatype)
schema = Schema(identifier="schema", fields={"id": dtype("str"), "x": x_field})

# Declare the 'documents' table with that schema and register it.
table_or_collection = Table('documents', schema=schema)

db.apply(table_or_collection)

<!-- TABS -->
## Insert data

In order to insert data, we need to create a `Schema` for encoding our special `DataType` column(s) in the databackend.

In [None]:
# <tab: MongoDB>
from pinnacledb import Document

def do_insert(data):
    """Insert every item of ``data`` under the key 'x' into ``table_or_collection``.

    Uses the module-level ``db``, ``table_or_collection`` and ``datatype``.
    ``schema`` is fixed to ``None`` here; change it to take the schema branch,
    which inserts the raw values with ``schema='my_schema'``.
    """
    schema = None

    if schema is not None:
        documents = [Document({'x': value}) for value in data]
        query = table_or_collection.insert_many(documents, schema='my_schema')
    else:
        # Without a datatype the values are stored as they are; otherwise each
        # value is wrapped by calling the datatype on it.
        wrap = (lambda value: value) if datatype is None else datatype
        documents = [Document({'x': wrap(value)}) for value in data]
        query = table_or_collection.insert_many(documents)
    db.execute(query)

In [None]:
# <tab: SQL>
from pinnacledb import Document

def do_insert(data):
    """Insert every item of ``data`` into ``table_or_collection``.

    Each row gets its position in ``data`` (as a string) as 'id' and the item
    itself as 'x'. Uses the module-level ``db`` and ``table_or_collection``.
    """
    rows = []
    for position, value in enumerate(data):
        rows.append(Document({'id': str(position), 'x': value}))
    db.execute(table_or_collection.insert(rows))

In [None]:
# Insert everything except roughly the last quarter of `data`; that tail is
# inserted later in the "Check the system stays updated" section.
do_insert(data[:-len(data) // 4])

<!-- TABS -->
## Build simple select queries

In [None]:
# <tab: MongoDB>

# Query with an empty filter, i.e. no restriction on the documents.
select = table_or_collection.find({})

In [None]:
# <tab: SQL>

# Turn the table object into a query object.
select = table_or_collection.to_query()

<!-- TABS -->
## Apply a chunker for search

:::note
Note that applying a chunker is ***not*** mandatory for search.
If your data is already chunked (e.g. short text snippets or audio) or if you
are searching through something like images, which can't be chunked, then this
won't be necessary.
:::

In [None]:
# <tab: Text>
from pinnacledb import objectmodel

CHUNK_SIZE = 200

# NOTE(review): `model_output_dtype` is not defined anywhere in this notebook
# as shown -- confirm where it is expected to come from.
@objectmodel(flatten=True, model_update_kwargs={'document_embedded': False}, datatype=model_output_dtype)
def chunker(text):
    """Split ``text`` into chunks of at most ``CHUNK_SIZE`` whitespace-separated words.

    :param text: the string to split.
    :return: list of strings, each the words of one chunk joined by single spaces.
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), CHUNK_SIZE):
        chunks.append(' '.join(words[start:start + CHUNK_SIZE]))
    return chunks

In [None]:
# <tab: PDF>
!pip install -q "unstructured[pdf]"
from pinnacledb import objectmodel
from unstructured.partition.pdf import partition_pdf

CHUNK_SIZE = 500

# NOTE(review): `model_output_dtype` is not defined anywhere in this notebook
# as shown -- confirm where it is expected to come from.
@objectmodel(flatten=True, model_update_kwargs={'document_embedded': False}, datatype=model_output_dtype)
def chunker(pdf_file):
    """Extract the text of a PDF and cut it into pieces of at most ``CHUNK_SIZE`` characters.

    :param pdf_file: value handed to ``partition_pdf`` to locate the PDF.
    :return: list of strings covering the extracted text in order.
    """
    # One line per extracted element.
    full_text = '\n'.join(element.text for element in partition_pdf(pdf_file))
    pieces = []
    for start in range(0, len(full_text), CHUNK_SIZE):
        pieces.append(full_text[start:start + CHUNK_SIZE])
    return pieces

In [None]:
# <tab: Video>
!pip install opencv-python
import cv2
import tqdm
from PIL import Image
from pinnacledb.ext.pillow import pil_image
# `objectmodel` (the decorator used below) must be imported explicitly;
# importing only the `ObjectModel` class leaves the decorator name undefined.
from pinnacledb import ObjectModel, Schema, objectmodel


@objectmodel(
    flatten=True,
    model_update_kwargs={'document_embedded': False},
    output_schema=Schema(identifier='output-schema', fields={'image': pil_image}),
)
def chunker(video_file):
    """Sample frames from a video file.

    Every 10th frame (starting with the first) is kept.

    :param video_file: value handed to ``cv2.VideoCapture`` to open the video.
    :return: list of dicts with keys 'image' (a PIL image in RGB order) and
        'current_timestamp' (frame index floor-divided by the frame rate
        reported by OpenCV).
    """
    # Keep one frame out of every `sample_freq`
    sample_freq = 10

    cap = cv2.VideoCapture(video_file)
    fps = cap.get(cv2.CAP_PROP_FPS)
    extracted_frames = []
    progress = tqdm.tqdm()
    frame_count = 0

    try:
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames (or the video could not be read)
                break

            if frame_count % sample_freq == 0:
                extracted_frames.append({
                    'image': Image.fromarray(frame[:, :, ::-1]),  # Convert BGR to RGB
                    'current_timestamp': frame_count // fps,
                })
            frame_count += 1
            progress.update(1)
    finally:
        # Release the capture handle and the progress bar even if decoding fails
        cap.release()
        cv2.destroyAllWindows()
        progress.close()

    return extracted_frames

In [None]:
# <tab: Audio>
from pinnacledb import objectmodel, Schema

CHUNK_SIZE = 10  # in seconds

@objectmodel(
    flatten=True,
    model_update_kwargs={'document_embedded': False},
    output_schema=Schema(identifier='output-schema', fields={'audio': datatype}),
)
def chunker(audio):
    """Split one audio clip into consecutive chunks of ``CHUNK_SIZE`` seconds.

    :param audio: indexable pair; item 0 is the sample rate and item 1 the
        sample sequence (presumably what the 'wav' decoder returns -- confirm).
    :return: list of ``(sample_rate, chunk)`` pairs in order; the last chunk
        may be shorter. Empty when there are no samples.
    """
    sample_rate, samples = audio[0], audio[1]
    # CHUNK_SIZE is in seconds: convert it to a number of samples. Keep it at
    # least 1 so the range() step below is always valid.
    samples_per_chunk = max(1, int(CHUNK_SIZE * sample_rate))
    # Iterate over the samples themselves, not over the 2-item `audio` pair.
    return [
        (sample_rate, samples[start:start + samples_per_chunk])
        for start in range(0, len(samples), samples_per_chunk)
    ]

Now we apply this chunker to the data by wrapping the chunker in `Listener`:

In [None]:
from pinnacledb import Listener

# Wrap the chunker in a Listener that reads key 'x' from the data selected by
# `select`.
upstream_listener = Listener(
    model=chunker,
    select=select,
    key='x',
)

# Register the listener with the database.
db.apply(upstream_listener)

<!-- TABS -->
## Build multimodal embedding models

Some embedding models such as [CLIP](https://github.com/openai/CLIP) come in pairs of `model` and `compatible_model`.
Otherwise:

In [None]:
compatible_model = None

In [None]:
# <tab: Text>
from pinnacledb.ext.sentence_transformers import SentenceTransformer
from pinnacledb import vector

# Load the pre-trained sentence transformer model
model = SentenceTransformer(
    identifier='all-MiniLM-L6-v2',
    postprocess=lambda x: x.tolist(),  # Convert the model output to a plain list
    # all-MiniLM-L6-v2 produces 384-dimensional embeddings, so the declared
    # vector shape must be 384.
    datatype=vector(shape=(384,)),
)

In [None]:
# <tab: Image>
from torchvision import transforms
import torch
import torch.nn as nn
import torchvision.models as models

import warnings

# Import custom modules
from pinnacledb.ext.torch import TorchModel, tensor

# Define a series of image transformations using torchvision.transforms.Compose
t = transforms.Compose([
    transforms.Resize((224, 224)),   # Resize the input image to 224x224 pixels (same size as the zero tensor that `preprocess` returns on failure)
    transforms.CenterCrop((224, 224)),  # Perform a center crop on the resized image
    transforms.ToTensor(),  # Convert the image to a PyTorch tensor
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])  # Normalize the tensor with specified mean and standard deviation
])

# Preprocessing step for the image model: apply the transform pipeline `t`,
# falling back to an all-zero tensor when it fails.
def preprocess(x):
    """Return ``t(x)``, or a ``3 x 224 x 224`` tensor of zeros if that raises.

    Any ``Exception`` raised by the transforms is reported through
    ``warnings.warn`` instead of being propagated.
    """
    try:
        transformed = t(x)
    except Exception as error:
        warnings.warn(str(error))
        transformed = torch.zeros(3, 224, 224)
    return transformed

# Load the pre-trained ResNet-50 model from torchvision
# NOTE(review): newer torchvision releases deprecate `pretrained=True` in
# favour of the `weights=` argument -- confirm against the pinned version.
resnet50 = models.resnet50(pretrained=True)

# Extract all layers of the ResNet-50 model except the last one
modules = list(resnet50.children())[:-1]
resnet50 = nn.Sequential(*modules)

# Create a TorchModel instance with the ResNet-50 model, preprocessing function, and postprocessing lambda
model = TorchModel(
    identifier='resnet50',
    preprocess=preprocess,
    object=resnet50,
    postprocess=lambda x: x[:, 0, 0],  # Take index 0 on the last two axes, leaving a 1-D tensor
    # Passed as `datatype=`, the keyword the other models in this notebook use
    datatype=tensor(torch.float, shape=(2048,))  # Declared output type: float tensor of shape (2048,)
)

In [None]:
# <tab: Text+Image>
import clip
from pinnacledb import vector
from pinnacledb.ext.torch import TorchModel

# Load the CLIP model and obtain the preprocessing function
model, preprocess = clip.load("RN50", device='cpu')

# Define a vector with shape (1024,)
e = vector(shape=(1024,))

# Create a TorchModel for text encoding
# (must be built before `model` is rebound to the visual TorchModel below)
compatible_model = TorchModel(
    identifier='clip_text',  # Unique identifier for the model
    object=model,  # CLIP model
    preprocess=lambda x: clip.tokenize(x)[0],  # Model input preprocessing using CLIP
    postprocess=lambda x: x.tolist(),  # Convert the model output to a list
    datatype=e,  # Vector datatype with shape (1024,); same keyword as the other models in this notebook
    forward_method='encode_text',  # Use the 'encode_text' method for forward pass
)

# Create a TorchModel for visual encoding
model = TorchModel(
    identifier='clip_image',  # Unique identifier for the model
    object=model.visual,  # Visual part of the CLIP model
    preprocess=preprocess,  # Visual preprocessing using CLIP
    postprocess=lambda x: x.tolist(),  # Convert the output to a list
    datatype=e,  # Vector datatype with shape (1024,)
)

In [None]:
# <tab: Audio>
!pip install librosa
import librosa
import numpy as np
# `vector` is used below, so it has to be imported in this cell as well.
from pinnacledb import Model, vector


def audio_embedding(audio_file):
    """Return the MFCC features of an audio file.

    :param audio_file: value handed to ``librosa.load`` to locate the audio.
    :return: the output of ``librosa.feature.mfcc`` for the loaded signal.
    """
    # Load the audio file
    y, sr = librosa.load(audio_file)
    mfccs = librosa.feature.mfcc(y=y, sr=sr)
    return mfccs


# NOTE(review): the declared shape (1000,) is one-dimensional; confirm that
# it matches what `audio_embedding` actually returns.
model = Model(identifier='my-model-audio', object=audio_embedding, datatype=vector(shape=(1000,)))

## Select outputs of upstream listener

:::note
This is useful if you have performed a first step, such as pre-computing 
features, or chunking your data. You can use this query to 
operate on those outputs.
:::

In [None]:
# <tab: MongoDB>
from pinnacledb.backends.mongodb import Collection

# Operate on the outputs of the upstream listener instead of the raw data:
# the key comes from `outputs_key`, the collection name from `outputs`.
indexing_key = upstream_listener.outputs_key
select = Collection(upstream_listener.outputs).find()

In [None]:
# <tab: SQL>
# Operate on the outputs of the upstream listener instead of the raw data:
# load the table named by `outputs` and turn it into a query.
indexing_key = upstream_listener.outputs_key
select = db.load("table", upstream_listener.outputs).to_query()

Depending on whether we have chunked the data, 
the indexing key will be different:

In [None]:
# <tab: Chunked Search>
# NOTE(review): this uses `upstream_listener.outputs`, whereas the cells that
# select the listener outputs use `upstream_listener.outputs_key` -- confirm
# which attribute is the right indexing key.
indexing_key = upstream_listener.outputs
# Key read by the compatible model, if one is used.
compatible_key = 'y'

In [None]:
# <tab: Un-chunked Search>
# Index the inserted data directly: 'x' is the key the insert cells write.
indexing_key = 'x'
# Key read by the compatible model, if one is used.
compatible_key = 'y'

## Create vector-index

In [None]:
vector_index_name = 'my-vector-index'

In [None]:
# <tab: 1-Modality>
from pinnacledb import VectorIndex, Listener

# Register the index with `db.apply`, the call this notebook uses for every
# other component (schema, table, listener).
jobs, _ = db.apply(
    VectorIndex(
        vector_index_name,
        indexing_listener=Listener(
            key=indexing_key,      # the `Document` key `model` should ingest to create embedding
            select=select,       # a `Select` query telling which data to search over
            model=model,         # a `_Predictor` how to convert data to embeddings
        )
    )
)

In [None]:
# <tab: 2-Modalities>
from pinnacledb import VectorIndex, Listener

# Register the index with `db.apply`, the call this notebook uses for every
# other component (schema, table, listener).
jobs, _ = db.apply(
    VectorIndex(
        vector_index_name,
        indexing_listener=Listener(
            key=indexing_key,      # the `Document` key `model` should ingest to create embedding
            select=select,       # a `Select` query telling which data to search over
            model=model,         # a `_Predictor` how to convert data to embeddings
        ),
        # Second listener for the other modality: not active and with no
        # select query of its own.
        compatible_listener=Listener(
            key=compatible_key,      # the `Document` key `compatible_model` should ingest to create embedding
            model=compatible_model,         # a `_Predictor` how to convert data to embeddings
            active=False,
            select=None,
        )
    )
)

In [None]:
query_table_or_collection = select.table_or_collection

<!-- TABS -->
## Perform a vector search

In [None]:
from pinnacledb import Document

# Search target: a document holding one datapoint under the indexing key.
# NOTE(review): `sample_datapoint` is not defined anywhere in this notebook as
# shown -- it must be set to the value you want to search with.
item = Document({indexing_key: sample_datapoint})

Once we have this search target, we can execute a search as follows:

In [None]:
# <tab: MongoDB>
# Similarity query for `item` against the named vector index with n=10,
# followed by an unfiltered find.
select = query_table_or_collection.like(item, vector_index=vector_index_name, n=10).find()

In [None]:
# <tab: SQL>
# Name the vector index and result count explicitly, as the MongoDB tab does;
# without `vector_index` the query does not say which index to search.
select = query_table_or_collection.like(item, vector_index=vector_index_name, n=10)

In [None]:
results = db.execute(select)

<!-- TABS -->
## Visualize Results

In [None]:
# <tab: Text>
from IPython.display import Markdown, display

def visualize(item, source):
    """Render ``item`` as Markdown in the notebook; ``source`` is accepted but not used."""
    display(Markdown(item))

In [None]:
# <tab: Image>
from IPython.display import display

def visualize(item, source):
    """Display ``item`` in the notebook; ``source`` is accepted but not used."""
    display(item)        # item is a PIL.Image

In [None]:
# <tab: Audio>
from IPython.display import Audio, display

def visualize(item, source):
    """Show an audio player for ``item``; ``source`` is accepted but not used.

    :param item: indexable pair; item 0 is the sample rate, item 1 the samples.
    """
    # `IPython.display.Audio` takes the sampling rate as `rate=`; it has no
    # `fs` keyword.
    display(Audio(item[1], rate=item[0]))

In [1]:
# <tab: PDF>
from IPython.display import IFrame, display

def visualize(item, source, width=800, height=600):
    """Embed ``item`` (the location of a PDF) in the notebook as an iframe.

    :param item: value used as the iframe ``src``.
    :param source: accepted for interface parity with the other tabs; not used.
    :param width: iframe width handed to ``IFrame``.
    :param height: iframe height handed to ``IFrame``.
    """
    # `IFrame` requires width and height in addition to the source.
    display(IFrame(item, width=width, height=height))

In [None]:
# <tab: Video>
from IPython.display import display, HTML

timestamp = 0     # increment to the frame you want to start at

# NOTE(review): `video` is not defined anywhere in this notebook as shown, and
# unlike the other tabs this cell does not define a `visualize(item, source)`
# function -- confirm how it is meant to be used with the results loop.
# Create HTML code for the video player with a specified source and controls
video_html = f"""
<video width="640" height="480" controls>
    <source src="{video['video'].uri}" type="video/mp4">
</video>
<script>
    // Get the video element
    var video = document.querySelector('video');
    
    // Set the current time of the video to the specified timestamp
    video.currentTime = {timestamp};
    
    // Play the video automatically
    video.play();
</script>
"""

display(HTML(video_html))

If your use case involves chunking, you will want to be able to recover the original rows/documents 
after getting the result of a vector search:

In [None]:
# <tab: MongoDB>
def get_original(_source):
    """Fetch the document whose '_id' equals ``_source`` from ``table_or_collection``.

    :param _source: identifier of the original document.
    :return: the result of executing the ``find_one`` query on ``db``.
    """
    # Use the argument itself rather than a global that happens to be named `source`.
    return db.execute(table_or_collection.find_one({'_id': _source}))

In [None]:
# <tab: SQL>
def get_original(_source):
    """Fetch the row whose ``id`` equals ``_source`` from ``table_or_collection``.

    :param _source: identifier of the original row.
    :return: the first item yielded by executing the filtered, limit-1 query.
    """
    # Use the argument itself rather than a global that happens to be named `source`.
    return next(db.execute(table_or_collection.filter(table_or_collection.id == _source).limit(1)))

In [None]:
# Visualize every search hit. A hit carrying a '_source' entry is replaced by
# the original row/document that `get_original` returns for it, so that its
# 'x' value can be shown.
for result in results:
    source = None
    if '_source' in result:
        source = result['_source']
        result = get_original(source)
    visualize(result['x'], source=source)

## Check the system stays updated

In [None]:
# <tab: Development>

# Insert the tail of `data` that was held back by the first insert.
do_insert(data[-len(data) // 4:])

In [None]:
# <tab: Cluster>

# As an example with MongoDB, we show that inserting to/ updating the DB with a different client (potentially from different source)
# still means that the system stays up-to-date. This should work with any Cluster mode compatible DB (see "Configuring your production system")

# `pymongo` is used directly here, so it has to be imported in this cell.
import pymongo

# The port follows the host after a single colon: mongodb://<host>:<port>/<database>
collection = pymongo.MongoClient('mongodb://<mongo-host>:27017/<database>')['<database>'].documents
# Insert the tail of `data` that was held back by the first insert.
collection.insert_many([{'x': x} for x in data[-len(data) // 4:]])