# Vector Search and RAG function application based on SuperDuperDB

In [1]:
import os
import click
from tqdm import tqdm

import sentence_transformers
from dotenv import load_dotenv
from superduper import (
    Document,
    Listener,
    model,ObjectModel,
    Schema,
    VectorIndex,
    superduper,
    vector
)
# from superduper.backends.mongodb import
import superduper_mongodb
load_dotenv()

  from tqdm.autonotebook import tqdm, trange


True

## Connect to the MongoDB database

In [2]:
# Connection settings are read from the environment (load_dotenv() is called in
# the import cell), falling back to local demo defaults when a variable is unset.
mongodb_uri = os.environ.get("MONGODB_URI", "superduperdb-demo")
artifact_store = os.environ.get("ARTIFACT_STORE", "data/artifact_store")
artifact_store_uri = "filesystem://" + artifact_store

# `db` is the datalayer handle used by every later cell.
db = superduper(mongodb_uri, artifact_store=artifact_store_uri)

2024-Sep-01 02:57:15.24| INFO     | localhost.localdomain| superduper.base.build:56   | Data Client is ready. MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-01 02:57:15.24| INFO     | localhost.localdomain| superduper.base.build:35   | Connecting to Metadata Client with engine:  MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-01 02:57:15.25| INFO     | localhost.localdomain| superduper.backends.local.artifacts:29   | Creating artifact store directory
2024-Sep-01 02:57:15.25| INFO     | localhost.localdomain| superduper.base.build:141  | Connecting to compute client: Compute(uri=None, compute_kwargs={}, _path='superduper.backends.local.compute.LocalComputeBackend')
2024-Sep-01 02:57:15.25| INFO     | localhost.localdomain| superduper.base.datalayer:106  | Building Data Layer
2024-Sep-01 02:57:15.25| INFO     | localhost.locald

In [None]:
 # Inspect the datalayer right after connecting.
 # NOTE(review): presumably lists the components registered in `db` — confirm.
 db.show()

## Parse PDF files and store them in the database

In [3]:
from superduper.ext.unstructured.encoder import unstructured_encoder

# Register the encoder with the datalayer before documents that use it are inserted.
db.add(unstructured_encoder)

pdf_folder = 'pdf-folders'

# os.listdir() returns entries in arbitrary order and also yields non-PDF
# entries (hidden files, sub-directories). Sort for a reproducible insert
# order and keep only *.pdf files so nothing else reaches the encoder.
pdf_paths = [
    os.path.join(pdf_folder, file_name)
    for file_name in sorted(os.listdir(pdf_folder))
    if file_name.lower().endswith(".pdf")
]

# One document per PDF; the file is wrapped by the encoder under the
# "elements" key, which the chunking listener reads later on.
to_insert = [
    Document({"elements": unstructured_encoder(pdf_path)}) for pdf_path in pdf_paths
]
db['source'].insert_many(to_insert).execute()

2024-Sep-01 02:57:19.51| INFO     | localhost.localdomain| superduper.backends.mongodb.data_backend:226  | Table source does not exist, auto creating...


([ObjectId('66d41e7e9d6c6ec6f94998be'), ObjectId('66d41e7e9d6c6ec6f94998bf')],
 None)

In [None]:
# Inspect the datalayer again after the encoder has been added.
# NOTE(review): presumably lists the components registered in `db` — confirm.
db.show()

In [None]:
# Spot-check the ingest: fetch a single document from the 'source' table and
# unpack it for display.
db['source'].find_one().execute().unpack()

## Create a chunking model to split the parsed PDF elements into text chunks

In [5]:
def merge_metadatas(metadatas, return_center=False):
    """Merge the bounding boxes of several element metadatas into one.

    Each metadata must provide ``metadata["coordinates"]["points"]`` as four
    ``(x, y)`` corner points and a ``metadata["page_number"]`` entry.

    Args:
        metadatas: Sequence of metadata dicts to merge.
        return_center: When true, ``"points"`` is the centre of the merged box
            as ``{"x": ..., "y": ...}`` instead of its four corners.

    Returns:
        ``{}`` for empty input, otherwise
        ``{"points": <corners or centre>, "page_number": <page of the last metadata>}``.
    """
    MAX_NUM = 999999999
    if not metadatas:
        return {}
    # Start from an "inverted" box so the first element always replaces it.
    p1, p2, p3, p4 = (MAX_NUM, MAX_NUM), (MAX_NUM, 0), (0, 0), (0, MAX_NUM)
    last_metadata = None
    for metadata in metadatas:
        p1_, p2_, p3_, p4_ = metadata["coordinates"]["points"]
        p1 = (min(p1[0], p1_[0]), min(p1[1], p1_[1]))
        p2 = (min(p2[0], p2_[0]), max(p2[1], p2_[1]))
        p3 = (max(p3[0], p3_[0]), max(p3[1], p3_[1]))
        p4 = (max(p4[0], p4_[0]), min(p4[1], p4_[1]))
        last_metadata = metadata

    # The page number used to be assigned only inside the `return_center`
    # branch, so the default call raised UnboundLocalError. It is now always
    # taken from the last metadata, which is what the centre branch did.
    page_number = last_metadata["page_number"]

    points = (p1, p2, p3, p4)
    if return_center:
        # Centre of the box spanned by the p1 and p3 corners.
        points = {"x": (p1[0] + p3[0]) / 2, "y": (p1[1] + p3[1]) / 2}
    return {"points": points, "page_number": page_number}


def create_chunk_and_metadatas(page_elements, stride=3, window=10):
    """Build overlapping text chunks from the elements of one page.

    A window of up to ``window`` consecutive elements is taken every ``stride``
    elements. The element texts are joined with newlines and the element
    metadatas are merged into a single centre point via ``merge_metadatas``.

    Returns:
        A list of ``{"txt": <chunk text>, "metadata": <merged metadata>}`` dicts.
    """
    chunks = []
    for start in range(0, len(page_elements), stride):
        window_elements = page_elements[start : start + window]
        element_metadatas = [element.metadata.to_dict() for element in window_elements]
        text = "\n".join([element.text for element in window_elements])
        merged = merge_metadatas(element_metadatas, return_center=True)
        chunks.append({"txt": text, "metadata": merged})
    return chunks


@model(flatten=True, model_update_kwargs={'document_embedded': False})
def get_chunks(elements):
    """Split parsed document elements into per-page, overlapping text chunks.

    Elements are grouped by ``element.metadata.page_number`` and each page is
    chunked independently with ``create_chunk_and_metadatas``, so a chunk never
    spans two pages.

    Returns:
        One flat list with the chunk dicts of all pages, in the order the pages
        were first encountered.
    """
    # Kept function-local as in the original.
    # NOTE(review): presumably so the import travels with the function when the
    # model is serialised — confirm.
    from collections import defaultdict

    elements_by_page = defaultdict(list)
    for element in elements:
        elements_by_page[element.metadata.page_number].append(element)

    # Flatten in a single pass; `sum(list_of_lists, [])` copies the growing
    # accumulator on every addition and is quadratic in the number of chunks.
    return [
        chunk
        for page_elements in elements_by_page.values()
        for chunk in create_chunk_and_metadatas(page_elements)
    ]


In [6]:
MODEL_IDENTIFIER_CHUNK = "chunk"

# Listener that runs `get_chunks` on the "elements" field of every document in
# 'source'. The uuid is pinned to the chunk identifier; later cells derive the
# output table and key names from MODEL_IDENTIFIER_CHUNK.
upstream_listener = Listener(
    model=get_chunks,
    select=db['source'].select(),
    key="elements",
    uuid=MODEL_IDENTIFIER_CHUNK,
)
db.apply(upstream_listener)

2024-Sep-01 02:58:49.03| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.chunk with ids: [ObjectId('66d41e7e9d6c6ec6f94998be'), ObjectId('66d41e7e9d6c6ec6f94998bf')]
2024-Sep-01 02:58:49.08| INFO     | localhost.localdomain| superduper.backends.local.compute:58   | Submitting job. function:<function method_job at 0x7fd8bc4b54e0>
2024-Sep-01 02:58:49.09| INFO     | localhost.localdomain| superduper.components.model:672  | Requesting prediction in db - [get_chunks] with predict_id chunk
Using select {'_base': '?source-find', '_builds': {'source-find': {'_path': 'superduper.backends.mongodb.query.parse_query', 'documents': [], 'query': 'source.find()'}}, '_blobs': {}, '_files': {}} and ids [ObjectId('66d41e7e9d6c6ec6f94998be'), ObjectId('66d41e7e9d6c6ec6f94998bf')]
2024-Sep-01 02:58:50.37| INFO     | localhost.localdomain| superduper.components.model:797  | Adding 2 model outputs to `db`
2024-Sep-01 02:58:51.68| SUCCESS  | localhost.localdomain| su

(['listener.chunk'],
 Listener(identifier='chunk', uuid='chunk', upstream=None, plugins=None, key='elements', model=ObjectModel(identifier='get_chunks', uuid='f4fb194f-a6b9-4736-b116-75e7faf18a82', upstream=None, plugins=None, signature='*args,**kwargs', datatype=None, output_schema=None, flatten=True, model_update_kwargs={'document_embedded': False}, predict_kwargs={}, compute_kwargs={}, validation=None, metric_values={}, num_workers=0, object=<function get_chunks at 0x7fd86e5c5080>), select=source.find(), active=True, predict_kwargs={}))

In [7]:
# Show the key under which the chunk listener stores its outputs; the embedding
# cell builds its table and key names to match this value.
upstream_listener.outputs_key

'_outputs.chunk'

In [None]:
# MODEL_IDENTIFIER_CHUNK = "chunk"
# from superduper import ObjectModel
# chunk_model = ObjectModel(
#     identifier=MODEL_IDENTIFIER_CHUNK,
#     object=get_chunks,
#     flatten=True,
#     model_update_kwargs={"document_embedded": False},
#     output_schema=Schema(identifier="myschema", fields={"txt": "string"}),
# )

# db.add(
#     Listener(
#         model=chunk_model,
#         select=select,
#         key="elements",
#     )
# )
# upstream_listener= Listener(
#         model=get_chunks,
#         select=db['source'].find(),
#         key="elements",
#        uuid=MODEL_IDENTIFIER_CHUNK
# )
# db.apply(upstream_listener)

## Embedding all text blocks and building vector indexes

In [9]:
# Identifiers and key paths shared by the embedding, search and QA cells.
# SOURCE_KEY is only referenced by the legacy (commented-out) layouts below.
SOURCE_KEY = "elements"
MODEL_IDENTIFIER_EMBEDDING = "embedding"
VECTOR_INDEX_IDENTIFIER = "vector-index"
# Legacy layout, kept for reference:
# COLLECTION_NAME_CHUNK = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"
# Table that holds the chunk listener's outputs ("_outputs.chunk").
COLLECTION_NAME_CHUNK = f"_outputs.{MODEL_IDENTIFIER_CHUNK}"
# Legacy layout, kept for reference:
# CHUNK_OUTPUT_KEY = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"
# Path of the chunk text inside each output document ("_outputs.chunk.txt").
CHUNK_OUTPUT_KEY = f"_outputs.{MODEL_IDENTIFIER_CHUNK}.txt"
indexing_key = upstream_listener.outputs_key # "_outputs.chunk": the parent of CHUNK_OUTPUT_KEY, not the same path
chunk_collection = db[COLLECTION_NAME_CHUNK]
print(COLLECTION_NAME_CHUNK)
def preprocess(x):
    """Extract the chunk text from a dict input; pass anything else through.

    For a dict, the value stored under the greatest key (by sort order of the
    items) is taken and its ``"txt"`` entry is returned. Not referenced by the
    visible cells: its only use is a commented-out model argument.
    """
    if not isinstance(x, dict):
        return x
    # For model chains, the logic of this key needs to be optimized.
    _, chunk = sorted(x.items())[-1]
    return chunk["txt"]
from superduper_sentence_transformers import SentenceTransformer

# Device for the underlying sentence-transformers model. The original
# hard-coded "cuda", which fails on hosts without a GPU. When
# EMBEDDING_DEVICE is unset, None is passed and sentence-transformers picks
# the device itself (a GPU when one is available).
EMBEDDING_DEVICE = os.getenv("EMBEDDING_DEVICE")

embedding_model = SentenceTransformer(
    identifier=MODEL_IDENTIFIER_EMBEDDING,
    object=sentence_transformers.SentenceTransformer(
        "BAAI/bge-large-en-v1.5", device=EMBEDDING_DEVICE
    ),
    # Declared vector size; must match the embedding width of the model above.
    datatype=vector(shape=(1024,)),
    # Convert the model output to a plain list before it is stored.
    postprocess=lambda x: x.tolist(),
    predict_kwargs={"show_progress_bar": True},
)

# Listener that embeds the text stored under CHUNK_OUTPUT_KEY for every
# document in the chunk table.
embedding_listener = Listener(
    select=chunk_collection.select(),
    key=CHUNK_OUTPUT_KEY,
    model=embedding_model,
)

# Vector index built on top of the embedding listener's outputs.
vector_index = VectorIndex(
    VECTOR_INDEX_IDENTIFIER,
    indexing_listener=embedding_listener,
)
db.apply(vector_index)

_outputs.chunk
2024-Sep-01 03:06:59.90| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.66cec0cb-7944-43fa-8cf6-5e71211850b9 with ids: [ObjectId('66d41ebb9d6c6ec6f94998c4'), ObjectId('66d41ebb9d6c6ec6f94998c5'), ObjectId('66d41ebb9d6c6ec6f94998c6'), ObjectId('66d41ebb9d6c6ec6f94998c7'), ObjectId('66d41ebb9d6c6ec6f94998c8'), ObjectId('66d41ebb9d6c6ec6f94998c9'), ObjectId('66d41ebb9d6c6ec6f94998ca'), ObjectId('66d41ebb9d6c6ec6f94998cb'), ObjectId('66d41ebb9d6c6ec6f94998cc'), ObjectId('66d41ebb9d6c6ec6f94998cd'), ObjectId('66d41ebb9d6c6ec6f94998ce'), ObjectId('66d41ebb9d6c6ec6f94998cf'), ObjectId('66d41ebb9d6c6ec6f94998d0'), ObjectId('66d41ebb9d6c6ec6f94998d1'), ObjectId('66d41ebb9d6c6ec6f94998d2'), ObjectId('66d41ebb9d6c6ec6f94998d3'), ObjectId('66d41ebb9d6c6ec6f94998d4'), ObjectId('66d41ebb9d6c6ec6f94998d5'), ObjectId('66d41ebb9d6c6ec6f94998d6'), ObjectId('66d41ebb9d6c6ec6f94998d7'), ObjectId('66d41ebb9d6c6ec6f94998d8'), ObjectId('66d41ebb9d6c6ec

  return torch.load(io.BytesIO(b))


2024-Sep-01 03:07:02.34| INFO     | localhost.localdomain| superduper.components.model:672  | Requesting prediction in db - [embedding] with predict_id 66cec0cb-7944-43fa-8cf6-5e71211850b9
Using select {'_base': '?-outputs-chunk-find', '_builds': {'-outputs-chunk-find': {'_path': 'superduper.backends.mongodb.query.parse_query', 'documents': [], 'query': '_outputs.chunk.find()'}}, '_blobs': {}, '_files': {}} and ids [ObjectId('66d41ebb9d6c6ec6f94998c4'), ObjectId('66d41ebb9d6c6ec6f94998c5'), ObjectId('66d41ebb9d6c6ec6f94998c6'), ObjectId('66d41ebb9d6c6ec6f94998c7'), ObjectId('66d41ebb9d6c6ec6f94998c8'), ObjectId('66d41ebb9d6c6ec6f94998c9'), ObjectId('66d41ebb9d6c6ec6f94998ca'), ObjectId('66d41ebb9d6c6ec6f94998cb'), ObjectId('66d41ebb9d6c6ec6f94998cc'), ObjectId('66d41ebb9d6c6ec6f94998cd'), ObjectId('66d41ebb9d6c6ec6f94998ce'), ObjectId('66d41ebb9d6c6ec6f94998cf'), ObjectId('66d41ebb9d6c6ec6f94998d0'), ObjectId('66d41ebb9d6c6ec6f94998d1'), ObjectId('66d41ebb9d6c6ec6f94998d2'), ObjectId('

Batches:   0%|          | 0/116 [00:00<?, ?it/s]

2024-Sep-01 03:12:02.57| INFO     | localhost.localdomain| superduper.components.model:797  | Adding 3699 model outputs to `db`
2024-Sep-01 03:12:21.73| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.371ec3ca-1c08-432d-a36e-d2e822d47562 with ids: [ObjectId('66d41ebb9d6c6ec6f94998c4'), ObjectId('66d41ebb9d6c6ec6f94998c5'), ObjectId('66d41ebb9d6c6ec6f94998c6'), ObjectId('66d41ebb9d6c6ec6f94998c7'), ObjectId('66d41ebb9d6c6ec6f94998c8'), ObjectId('66d41ebb9d6c6ec6f94998c9'), ObjectId('66d41ebb9d6c6ec6f94998ca'), ObjectId('66d41ebb9d6c6ec6f94998cb'), ObjectId('66d41ebb9d6c6ec6f94998cc'), ObjectId('66d41ebb9d6c6ec6f94998cd'), ObjectId('66d41ebb9d6c6ec6f94998ce'), ObjectId('66d41ebb9d6c6ec6f94998cf'), ObjectId('66d41ebb9d6c6ec6f94998d0'), ObjectId('66d41ebb9d6c6ec6f94998d1'), ObjectId('66d41ebb9d6c6ec6f94998d2'), ObjectId('66d41ebb9d6c6ec6f94998d3'), ObjectId('66d41ebb9d6c6ec6f94998d4'), ObjectId('66d41ebb9d6c6ec6f94998d5'), ObjectId('66d41ebb9d6c6ec6

3699it [00:00, 70401.82it/s]


Batches:   0%|          | 0/116 [00:00<?, ?it/s]

2024-Sep-01 03:17:43.44| INFO     | localhost.localdomain| superduper.components.model:797  | Adding 3699 model outputs to `db`
2024-Sep-01 03:18:07.40| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.371ec3ca-1c08-432d-a36e-d2e822d47562 with ids: [ObjectId('66d41ebb9d6c6ec6f94998c4'), ObjectId('66d41ebb9d6c6ec6f94998c5'), ObjectId('66d41ebb9d6c6ec6f94998c6'), ObjectId('66d41ebb9d6c6ec6f94998c7'), ObjectId('66d41ebb9d6c6ec6f94998c8'), ObjectId('66d41ebb9d6c6ec6f94998c9'), ObjectId('66d41ebb9d6c6ec6f94998ca'), ObjectId('66d41ebb9d6c6ec6f94998cb'), ObjectId('66d41ebb9d6c6ec6f94998cc'), ObjectId('66d41ebb9d6c6ec6f94998cd'), ObjectId('66d41ebb9d6c6ec6f94998ce'), ObjectId('66d41ebb9d6c6ec6f94998cf'), ObjectId('66d41ebb9d6c6ec6f94998d0'), ObjectId('66d41ebb9d6c6ec6f94998d1'), ObjectId('66d41ebb9d6c6ec6f94998d2'), ObjectId('66d41ebb9d6c6ec6f94998d3'), ObjectId('66d41ebb9d6c6ec6f94998d4'), ObjectId('66d41ebb9d6c6ec6f94998d5'), ObjectId('66d41ebb9d6c6ec6

0it [00:00, ?it/s]

2024-Sep-01 03:18:09.91| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7fd8a80eeb50>.  function:<function method_job at 0x7fd8bc4b54e0> future:1f13e335-3b14-4c09-bc68-b17bffb52e91
2024-Sep-01 03:18:09.91| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.66cec0cb-7944-43fa-8cf6-5e71211850b9 with ids: [ObjectId('66d41ebb9d6c6ec6f94998c4'), ObjectId('66d41ebb9d6c6ec6f94998c5'), ObjectId('66d41ebb9d6c6ec6f94998c6'), ObjectId('66d41ebb9d6c6ec6f94998c7'), ObjectId('66d41ebb9d6c6ec6f94998c8'), ObjectId('66d41ebb9d6c6ec6f94998c9'), ObjectId('66d41ebb9d6c6ec6f94998ca'), ObjectId('66d41ebb9d6c6ec6f94998cb'), ObjectId('66d41ebb9d6c6ec6f94998cc'), ObjectId('66d41ebb9d6c6ec6f94998cd'), ObjectId('66d41ebb9d6c6ec6f94998ce'), ObjectId('66d41ebb9d6c6ec6f94998cf'), ObjectId('66d41ebb9d6c6ec6f94998d0'), ObjectId('66d41ebb9d6c6ec6f94998d1'), ObjectId('66d




2024-Sep-01 03:18:11.47| INFO     | localhost.localdomain| superduper.components.model:672  | Requesting prediction in db - [embedding] with predict_id 66cec0cb-7944-43fa-8cf6-5e71211850b9
Using select {'_base': '?-outputs-chunk-find', '_builds': {'-outputs-chunk-find': {'_path': 'superduper.backends.mongodb.query.parse_query', 'documents': [], 'query': '_outputs.chunk.find()'}}, '_blobs': {}, '_files': {}} and ids [ObjectId('66d41ebb9d6c6ec6f94998c4'), ObjectId('66d41ebb9d6c6ec6f94998c5'), ObjectId('66d41ebb9d6c6ec6f94998c6'), ObjectId('66d41ebb9d6c6ec6f94998c7'), ObjectId('66d41ebb9d6c6ec6f94998c8'), ObjectId('66d41ebb9d6c6ec6f94998c9'), ObjectId('66d41ebb9d6c6ec6f94998ca'), ObjectId('66d41ebb9d6c6ec6f94998cb'), ObjectId('66d41ebb9d6c6ec6f94998cc'), ObjectId('66d41ebb9d6c6ec6f94998cd'), ObjectId('66d41ebb9d6c6ec6f94998ce'), ObjectId('66d41ebb9d6c6ec6f94998cf'), ObjectId('66d41ebb9d6c6ec6f94998d0'), ObjectId('66d41ebb9d6c6ec6f94998d1'), ObjectId('66d41ebb9d6c6ec6f94998d2'), ObjectId('

0it [00:00, ?it/s]

2024-Sep-01 03:18:11.55| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7fd8a80eeb50>.  function:<function method_job at 0x7fd8bc4b54e0> future:4a187623-f1e0-45d7-9eee-26a966c0d416
2024-Sep-01 03:18:11.61| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7fd8a80eeb50>.  function:<function method_job at 0x7fd8bc4b54e0> future:25799f8b-cfc3-4d36-8d89-0379bcbb45c7
2024-Sep-01 03:18:11.66| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7fd8a80eeb50>.  function:<function method_job at 0x7fd8bc4b54e0> future:06385347-843f-4b99-a9ec-72d053bd7e0e





2024-Sep-01 03:18:17.54| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for vector_index.vector-index with ids: [ObjectId('66d41ebb9d6c6ec6f94998c4'), ObjectId('66d41ebb9d6c6ec6f94998c5'), ObjectId('66d41ebb9d6c6ec6f94998c6'), ObjectId('66d41ebb9d6c6ec6f94998c7'), ObjectId('66d41ebb9d6c6ec6f94998c8'), ObjectId('66d41ebb9d6c6ec6f94998c9'), ObjectId('66d41ebb9d6c6ec6f94998ca'), ObjectId('66d41ebb9d6c6ec6f94998cb'), ObjectId('66d41ebb9d6c6ec6f94998cc'), ObjectId('66d41ebb9d6c6ec6f94998cd'), ObjectId('66d41ebb9d6c6ec6f94998ce'), ObjectId('66d41ebb9d6c6ec6f94998cf'), ObjectId('66d41ebb9d6c6ec6f94998d0'), ObjectId('66d41ebb9d6c6ec6f94998d1'), ObjectId('66d41ebb9d6c6ec6f94998d2'), ObjectId('66d41ebb9d6c6ec6f94998d3'), ObjectId('66d41ebb9d6c6ec6f94998d4'), ObjectId('66d41ebb9d6c6ec6f94998d5'), ObjectId('66d41ebb9d6c6ec6f94998d6'), ObjectId('66d41ebb9d6c6ec6f94998d7'), ObjectId('66d41ebb9d6c6ec6f94998d8'), ObjectId('66d41ebb9d6c6ec6f94998d9'), ObjectId('66d41ebb9d6c

Loading vectors into vector-table...: 3699it [00:03, 1119.27it/s]

2024-Sep-01 03:18:28.18| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7fd8a80eeb50>.  function:<function callable_job at 0x7fd8bc4b5580> future:51d842bc-0208-4c35-8d4d-27c15b9d315c





(['listener.66cec0cb-7944-43fa-8cf6-5e71211850b9',
  'vector_index.vector-index'],
 VectorIndex(identifier='vector-index', uuid='cc4a8ecc-da93-4bf5-85ae-d0ef99b26e1d', upstream=None, plugins=None, indexing_listener=Listener(identifier='66cec0cb-7944-43fa-8cf6-5e71211850b9', uuid='66cec0cb-7944-43fa-8cf6-5e71211850b9', upstream=None, plugins=None, key='_outputs.chunk.txt', model=SentenceTransformer(preferred_devices=('cuda', 'mps', 'cpu'), device='cpu', identifier='embedding', uuid='289bd8a5-7052-4476-a7b5-281b6a3020f3', upstream=None, plugins=None, signature='singleton', datatype=DataType(identifier='vector[1024]', uuid='8a264243-c5a5-4db3-9f6d-d726fb9bda47', upstream=None, plugins=None, encoder=None, decoder=None, info=None, shape=(1024,), directory=None, encodable='native', bytes_encoding=<BytesEncoding.BYTES: 'Bytes'>, intermediate_type='bytes', media_type=None), output_schema=None, flatten=False, model_update_kwargs={}, predict_kwargs={'show_progress_bar': True}, compute_kwargs={},

In [None]:
# Sanity check: embed one query string and print the length of the result.
# NOTE(review): expected to equal the declared vector size (1024) — confirm.
print(len(embedding_model.predict("What is superduper")))

## Create vector-index

In [None]:
# jobs, _ = db.apply(
#     VectorIndex(
#         identifier=VECTOR_INDEX_IDENTIFIER,
#         indexing_listener=Listener(
#             select=chunk_collection.find(),
#             key=CHUNK_OUTPUT_KEY,  # Key for the documents
#             model=model,  # Specify the model for processing
#             predict_kwargs={"max_chunk_size": 64},
#         ),
#     )
# )

In [None]:
# Inspect the datalayer after the listeners and the vector index were applied.
# NOTE(review): presumably lists the components registered in `db` — confirm.
db.show()

In [10]:
# Spot-check one chunk document: the chunk text and its metadata live under
# the nested `_outputs` -> `chunk` entry.
db[COLLECTION_NAME_CHUNK].find_one().execute().unpack()


{'_id': ObjectId('66d41ebb9d6c6ec6f94998c4'),
 '_outputs': {'chunk': {'txt': "DEAR VOLVO OWNER,\nCongratulations on your new truck and thank you for your vote of confidence! We hope that you will derive great satisfaction and benefit from your truck for many years to come.\nThis Driver Guide contains information tailored to your particular truck. It describes the truck's equipment, care and maintenance, and gives tips for safe and fuel-efficient driving.\nYou have the Driver Guide app in the side display in your truck. There is also an app that can be downloaded to Android and Apple devices. You find the app at your device's app store. The Driver Guide is available in web format at the following address: https://driverguide.volvotrucks.com\nTo get direct access to the Driver Guide for your truck, scan the QR code.\nIf you have any questions or want to know more about your truck, please contact your authorised Volvo dealer.\n©2012 Volvo Trucks, Göteborg.\nReproduction of the contents of

## Define a vector search function

In [109]:
from pprint import pprint
def vector_search(query, top_k=5):
    """Print the chunks most similar to ``query``, best match first.

    Runs a ``like`` query against the chunk table through the vector index and,
    for every hit, prints the chunk text followed by a dict with its score and
    metadata and a separator line. Returns nothing.

    Args:
        query: Free-text search string.
        top_k: Number of nearest chunks requested from the vector index.
    """
    like_query = db[COLLECTION_NAME_CHUNK].like(
        Document({CHUNK_OUTPUT_KEY: query}),
        vector_index=VECTOR_INDEX_IDENTIFIER,
        n=top_k,
    ).select({})
    out = db.execute(like_query)

    if out:
        out = sorted(out, key=lambda hit: hit['score'], reverse=True)
    for hit in out:
        score = hit['score']
        chunk_data = hit['_outputs.chunk'] # upstream_listener.outputs_key
        chunk_message = {"score": score, "metadata": chunk_data["metadata"]}
        print(chunk_data["txt"])
        print()
        print(chunk_message)
        print("\n\n", '-' * 20)

In [110]:
# Example query; prints the five best-matching chunks (default top_k) with
# their score and page metadata.
vector_search("What is the function of keys 10 to 12 on the left steering wheel keypad?")

2024-Sep-01 09:43:47.96| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

NOTE
For the sake of road safety, it is advised that you primarily use voice control (if available) or the steering wheel keypads when driving.
Steering wheel keypads
Keys 10 and 11 are used for phone calls. The others are used for navigating in the displays and controlling the infotainment system. The function of each key is the following:
1 Navigate left.
AA 338361
2 Navigate up.
3 Navigate right.
4 Navigate down.
5 Select.

{'score': 0.830868562605052, 'metadata': {'points': {'x': 169.35586999999998, 'y': 225.92667}, 'page_number': 267}}


 --------------------
wheel is locked in its new position.
Steering wheel keypads
Left-hand keypad
Keys 10 to 12 control the audio in the truck. The others control cruise control or adaptive cruise control. The function of each key is the following:
1 Resume the previously set speed.
2 Increase speed.
3 Decrease speed.
4 Select current speed as set speed.
5 Switch off cruise control or adaptive
cruise control.

{'score': 0.7987963239782733, 'metad

## Define an LLM model

In [None]:
MODEL_IDENTIFIER_LLM = "llm"

# Prompt sent to the LLM: {context} receives the retrieved chunk texts and
# {input} the user's question.
prompt_template = (
    "The following is a document and question about the volvo user manual\n"
    "Only provide a very concise answer\n"
    "{context}\n\n"
    "Here's the question:{input}\n"
    "answer:"
)

# `VllmModel` is used below, but its import had been commented out, so the
# cell failed with NameError. The import is restored from that commented line.
# NOTE(review): confirm this module path exists in the installed superduper
# version (other integrations in this notebook are imported from separate
# `superduper_*` packages).
from superduper.ext.llm.vllm import VllmModel
# Not used in this cell.
# NOTE(review): looks like an unfinished switch to an OpenAI-backed model — confirm.
from superduper.ext.openai import OpenAIChatCompletion

llm = VllmModel(
    identifier=MODEL_IDENTIFIER_LLM,
    model_name="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
    prompt_template=prompt_template,
    vllm_kwargs={"max_model_len": 2048, "quantization": "awq"},
    inference_kwargs={"max_tokens": 2048},
)

# Register the LLM with the datalayer so it can be looked up by identifier.
db.add(llm)

## Define a QA function

In [None]:
from IPython.display import Markdown
from IPython.display import display
import pandas as pd
def qa(query, vector_search_top_k=5):
    """Answer ``query`` with the LLM, using vector-search hits as context.

    Displays the LLM answer and a table with the page number, centre point and
    similarity score of every chunk that was used as context. Returns nothing.

    Args:
        query: The user's question.
        vector_search_top_k: Number of chunks retrieved as context.
    """
    # `Collection` was never imported (NameError); use the same table handle
    # as `vector_search`.
    collection = db[COLLECTION_NAME_CHUNK]
    # NOTE(review): `db.predict(..., context_select=..., context_key=...)` and
    # `output.content` are kept from the original cell, which was never
    # executed — confirm they exist in the installed superduper version.
    output, out = db.predict(
        model_name=MODEL_IDENTIFIER_LLM,
        input=query,
        context_select=collection.like(
            Document({CHUNK_OUTPUT_KEY: query}),
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=vector_search_top_k,
        ).select({}),
        # The chunk text sits directly at CHUNK_OUTPUT_KEY ("_outputs.chunk.txt");
        # the former ".0.txt" suffix pointed at a path that does not exist in
        # the stored documents.
        context_key=CHUNK_OUTPUT_KEY,
    )
    # Best match first; tolerate an empty or missing result set.
    sources = sorted(out, key=lambda hit: hit["score"], reverse=True) if out else []
    page_messages = []
    for source in sources:
        # Same access pattern as `vector_search`.
        metadata = source["_outputs.chunk"]["metadata"]
        page_messages.append(
            {
                "page_number": metadata["page_number"],
                "points": metadata["points"],
                "score": source["score"],
            }
        )
    df = pd.DataFrame(page_messages)
    display(output.content)
    display(df)

In [None]:
# Example question; displays the LLM answer and the table of source chunks.
qa("What is the function of keys 10 to 12 on the left steering wheel keypad?")