# Vector Search and RAG Application Built on SuperDuperDB

In [None]:
import os
import click
from tqdm import tqdm

import sentence_transformers
from dotenv import load_dotenv
from superduper import (
    Document,
    Listener,
    model,ObjectModel,
    Schema,
    VectorIndex,
    superduper,
    vector
)
# from superduper.backends.mongodb import
import superduper_mongodb
load_dotenv()

## Connect to the MongoDB database

In [2]:
# Connection settings are read from the environment, with fallbacks when unset.
# NOTE(review): the MONGODB_URI fallback "superduperdb-demo" is not a full
# mongodb:// URI — confirm superduper accepts it, or set MONGODB_URI explicitly.
mongodb_uri = os.getenv("MONGODB_URI", "superduperdb-demo")
artifact_store = os.getenv("ARTIFACT_STORE", "data/artifact_store")

# Build the datalayer; artifacts go to a filesystem store rooted at `artifact_store`.
db = superduper(mongodb_uri, artifact_store=f"filesystem://{artifact_store}")

2024-Sep-05 20:08:52.45| INFO     | localhost.localdomain| superduper.base.build:56   | Data Client is ready. MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-05 20:08:52.45| INFO     | localhost.localdomain| superduper.base.build:35   | Connecting to Metadata Client with engine:  MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-05 20:08:52.46| INFO     | localhost.localdomain| superduper.base.build:141  | Connecting to compute client: Compute(uri=None, compute_kwargs={}, _path='superduper.backends.local.compute.LocalComputeBackend')
2024-Sep-05 20:08:52.46| INFO     | localhost.localdomain| superduper.base.datalayer:106  | Building Data Layer
2024-Sep-05 20:08:52.46| INFO     | localhost.localdomain| superduper.base.build:208  | Configuration: 
 +----------------+-----------------------------------+
| Configuration  |          

## Parse PDF files and store them in the database

In [3]:
from superduper.ext.unstructured.encoder import unstructured_encoder

# Register the `unstructured` datatype so parsed PDF elements can be stored.
db.apply(unstructured_encoder)

pdf_folder = 'pdf-folders'

# os.listdir returns every directory entry in arbitrary order, so keep only
# regular files with a .pdf extension and sort them for a reproducible run.
pdf_paths = sorted(
    os.path.join(pdf_folder, name)
    for name in os.listdir(pdf_folder)
    if name.lower().endswith('.pdf')
    and os.path.isfile(os.path.join(pdf_folder, name))
)

# One document per PDF; the parsed elements live under the "elements" key.
to_insert = [
    Document({"elements": unstructured_encoder(pdf_path)}) for pdf_path in pdf_paths
]

# Kept as the last expression so the inserted ids are shown as the cell output.
db['source'].insert_many(to_insert).execute()

2024-Sep-05 20:09:43.79| INFO     | localhost.localdomain| superduper.backends.mongodb.data_backend:226  | Table source does not exist, auto creating...


([ObjectId('66da56765ebeedc9cbee8787'), ObjectId('66da56765ebeedc9cbee8788')],
 None)

In [4]:
# List the components registered in the datalayer so far.
db.show()

[{'type_id': 'datatype', 'identifier': 'unstructured'}]

In [None]:
# Fetch a single document from the `source` table and unpack it for inspection.
db['source'].find_one().execute().unpack()

## Create a chunking model to split the parsed PDF elements into chunks

In [5]:
def merge_metadatas(metadatas, return_center=False):
    """Merge the bounding boxes of several element metadata dicts into one.

    Args:
        metadatas: Sequence of dicts. Each must have ``["coordinates"]["points"]``
            holding four ``(x, y)`` corner points and a ``"page_number"`` entry.
        return_center: If True, ``"points"`` in the result is the centre of the
            merged box as ``{"x": ..., "y": ...}`` instead of its four corners.

    Returns:
        ``{}`` when ``metadatas`` is empty; otherwise a dict with ``"points"``
        (merged corners or centre) and ``"page_number"`` taken from the last
        metadata entry.
    """
    MAX_NUM = 999999999
    if not metadatas:
        return {}
    # Corner accumulators: p1=(min x, min y), p2=(min x, max y),
    # p3=(max x, max y), p4=(max x, min y).
    p1, p2, p3, p4 = (MAX_NUM, MAX_NUM), (MAX_NUM, 0), (0, 0), (0, MAX_NUM)
    for metadata in metadatas:
        p1_, p2_, p3_, p4_ = metadata["coordinates"]["points"]
        p1 = (min(p1[0], p1_[0]), min(p1[1], p1_[1]))
        p2 = (min(p2[0], p2_[0]), max(p2[1], p2_[1]))
        p3 = (max(p3[0], p3_[0]), max(p3[1], p3_[1]))
        p4 = (max(p4[0], p4_[0]), min(p4[1], p4_[1]))
    points = (p1, p2, p3, p4)
    if return_center:
        points = {"x": (p1[0] + p3[0]) / 2, "y": (p1[1] + p3[1]) / 2}
    # Bound unconditionally: it used to be set only inside the return_center
    # branch, so the default return_center=False raised UnboundLocalError.
    page_number = metadata["page_number"]
    return {"points": points, "page_number": page_number}


def create_chunk_and_metadatas(page_elements, stride=3, window=10):
    """Slide a window over one page's elements and build text chunks.

    Args:
        page_elements: Elements of a single page; each is read via ``.text``
            and ``.metadata.to_dict()``.
        stride: Step between consecutive window start positions.
        window: Maximum number of elements joined into one chunk.

    Returns:
        A list of ``{"txt": ..., "metadata": ...}`` dicts, one per window.
        ``"txt"`` is the window's texts joined by newlines and ``"metadata"``
        is the merged metadata with the box centre.
    """
    chunks = []
    for start in range(0, len(page_elements), stride):
        window_elements = page_elements[start : start + window]
        window_metadatas = [el.metadata.to_dict() for el in window_elements]
        text = "\n".join(el.text for el in window_elements)
        merged = merge_metadatas(window_metadatas, return_center=True)
        chunks.append({"txt": text, "metadata": merged})
    return chunks


@model(flatten=True, model_update_kwargs={'document_embedded': False})
def get_chunks(elements):
    """Group parsed elements by page and split each page into windowed chunks.

    Args:
        elements: Iterable of parsed elements; each exposes
            ``.metadata.page_number``.

    Returns:
        A flat list of the chunk dicts produced by
        ``create_chunk_and_metadatas`` for every page, in first-seen page order.
    """
    # Imported locally, as in the original, so the function carries its own dependency.
    from collections import defaultdict

    pages_elements = defaultdict(list)
    for element in elements:
        pages_elements[element.metadata.page_number].append(element)

    # Flatten with a comprehension: sum(list_of_lists, []) re-copies the
    # accumulated list for every page, which is quadratic in the chunk count.
    return [
        chunk
        for page_elements in pages_elements.values()
        for chunk in create_chunk_and_metadatas(page_elements)
    ]


In [6]:
MODEL_IDENTIFIER_CHUNK = "chunker"

# Listener that feeds the "elements" field of documents selected from `source`
# to the `get_chunks` model.
upstream_listener = Listener(
    model=get_chunks,
    select=db['source'].select(),
    key="elements",
    uuid=MODEL_IDENTIFIER_CHUNK,
)
db.apply(upstream_listener)

2024-Sep-05 20:10:53.43| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.chunker with ids: [ObjectId('66da56765ebeedc9cbee8787'), ObjectId('66da56765ebeedc9cbee8788')]
2024-Sep-05 20:10:53.48| INFO     | localhost.localdomain| superduper.backends.local.compute:58   | Submitting job. function:<function method_job at 0x7f6fed414720>
2024-Sep-05 20:10:53.48| INFO     | localhost.localdomain| superduper.components.model:672  | Requesting prediction in db - [get_chunks] with predict_id chunker
Using select {'_base': '?source-find', '_builds': {'source-find': {'_path': 'superduper.backends.mongodb.query.parse_query', 'documents': [], 'query': 'source.find()'}}, '_blobs': {}, '_files': {}} and ids [ObjectId('66da56765ebeedc9cbee8787'), ObjectId('66da56765ebeedc9cbee8788')]
2024-Sep-05 20:10:54.72| INFO     | localhost.localdomain| superduper.components.model:797  | Adding 2 model outputs to `db`
2024-Sep-05 20:10:55.97| SUCCESS  | localhost.localdomain

(['listener.chunker'],
 Listener(identifier='chunker', uuid='chunker', upstream=None, plugins=None, key='elements', model=ObjectModel(identifier='get_chunks', uuid='07e06313-91dd-42cf-bc0e-9993d0741d90', upstream=None, plugins=None, signature='*args,**kwargs', datatype=None, output_schema=None, flatten=True, model_update_kwargs={'document_embedded': False}, predict_kwargs={}, compute_kwargs={}, validation=None, metric_values={}, num_workers=0, object=<function get_chunks at 0x7f6f9e7eca40>), select=source.find(), active=True, predict_kwargs={}))

In [7]:
# Check that the chunking listener and its model are now registered.
db.show()

[{'type_id': 'datatype', 'identifier': 'unstructured'},
 {'type_id': 'listener', 'identifier': 'chunker'},
 {'type_id': 'model', 'identifier': 'get_chunks'}]

In [8]:
# List the collections in the underlying database, including the listener's output collection.
db.databackend.db.list_collection_names() 


['_jobs', '_outputs.chunker', '_parent_child_mappings', 'source', '_objects']

In [9]:
# Show the key under which the chunking listener stores its outputs.
upstream_listener.outputs_key
# '_outputs.chunker'

'_outputs.chunker'

In [None]:
# MODEL_IDENTIFIER_CHUNK = "chunk"
# from superduper import ObjectModel
# chunk_model = ObjectModel(
#     identifier=MODEL_IDENTIFIER_CHUNK,
#     object=get_chunks,
#     flatten=True,
#     model_update_kwargs={"document_embedded": False},
#     output_schema=Schema(identifier="myschema", fields={"txt": "string"}),
# )

# db.add(
#     Listener(
#         model=chunk_model,
#         select=select,
#         key="elements",
#     )
# )
# upstream_listener= Listener(
#         model=get_chunks,
#         select=db['source'].find(),
#         key="elements",
#        uuid=MODEL_IDENTIFIER_CHUNK
# )
# db.apply(upstream_listener)

## Embedding all text blocks and building vector indexes

In [10]:
# Identifiers and keys shared by the embedding, indexing and search cells.
# SOURCE_KEY = "elements"
MODEL_IDENTIFIER_EMBEDDING = "embedding"
VECTOR_INDEX_IDENTIFIER = "vector-index"
# COLLECTION_NAME_CHUNK = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"
COLLECTION_NAME_CHUNK = f"_outputs.{MODEL_IDENTIFIER_CHUNK}" # "_outputs.chunker": collection holding the chunker outputs
# CHUNK_OUTPUT_KEY = f"_outputs.{SOURCE_KEY}.{MODEL_IDENTIFIER_CHUNK}"
CHUNK_OUTPUT_KEY = f"_outputs.{MODEL_IDENTIFIER_CHUNK}.txt"
indexing_key = upstream_listener.outputs_key # "_outputs.chunker": same value as COLLECTION_NAME_CHUNK, not CHUNK_OUTPUT_KEY (no ".txt" suffix)
chunk_collection = db[COLLECTION_NAME_CHUNK]

def preprocess(x):
    """Return the text to embed from ``x``.

    Non-dict inputs are passed through unchanged. For a dict, the entry whose
    key sorts last is selected and its ``"txt"`` field is returned.
    """
    if not isinstance(x, dict):
        return x
    # For model chains, the logic of this key needs to be optimized.
    _, chunk = sorted(x.items())[-1]
    return chunk["txt"]
from superduper_sentence_transformers import SentenceTransformer
# Wrap the "BAAI/bge-large-en-v1.5" sentence-transformers model as a superduper
# model whose outputs are declared as 1024-dimensional vectors.
# NOTE(review): both the underlying model and the wrapper are pinned to
# device="cuda", so this cell presumably requires a CUDA GPU — confirm.
embedding_model = SentenceTransformer(
    identifier=MODEL_IDENTIFIER_EMBEDDING,
    object=sentence_transformers.SentenceTransformer("BAAI/bge-large-en-v1.5", device="cuda"),
    datatype=vector(shape=(1024,)),
    device="cuda",
    # predict_method="encode",
    # preprocess=preprocess,
    # Convert the model output to a plain list via .tolist().
    postprocess=lambda x: x.tolist(),
    # batch_predict=True,
    predict_kwargs={"show_progress_bar": True},
    # device='cuda'

)
# Define the vector index: its listener runs the embedding model on the
# CHUNK_OUTPUT_KEY field of the documents selected from the chunk collection.
vector_index = VectorIndex(
    VECTOR_INDEX_IDENTIFIER,
    indexing_listener=Listener(
        identifier="embedding-bge-large-listener",
        uuid="embedding-bge-large",
        model=embedding_model,
        select=chunk_collection.select(),
        key=CHUNK_OUTPUT_KEY,
    ),
)
# db.apply()

## Start Indexing Embeddings

In [12]:
# Register the vector index (and its indexing listener) with the datalayer.
db.apply(vector_index)

2024-Sep-05 20:17:40.48| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.embedding-bge-large-listener with ids: [ObjectId('66da569f5ebeedc9cbee878d'), ObjectId('66da569f5ebeedc9cbee878e'), ObjectId('66da569f5ebeedc9cbee878f'), ObjectId('66da569f5ebeedc9cbee8790'), ObjectId('66da569f5ebeedc9cbee8791'), ObjectId('66da569f5ebeedc9cbee8792'), ObjectId('66da569f5ebeedc9cbee8793'), ObjectId('66da569f5ebeedc9cbee8794'), ObjectId('66da569f5ebeedc9cbee8795'), ObjectId('66da569f5ebeedc9cbee8796'), ObjectId('66da569f5ebeedc9cbee8797'), ObjectId('66da569f5ebeedc9cbee8798'), ObjectId('66da569f5ebeedc9cbee8799'), ObjectId('66da569f5ebeedc9cbee879a'), ObjectId('66da569f5ebeedc9cbee879b'), ObjectId('66da569f5ebeedc9cbee879c'), ObjectId('66da569f5ebeedc9cbee879d'), ObjectId('66da569f5ebeedc9cbee879e'), ObjectId('66da569f5ebeedc9cbee879f'), ObjectId('66da569f5ebeedc9cbee87a0'), ObjectId('66da569f5ebeedc9cbee87a1'), ObjectId('66da569f5ebeedc9cbee87a2'), ObjectId('

  return torch.load(io.BytesIO(b))


2024-Sep-05 20:17:43.06| INFO     | localhost.localdomain| superduper.components.model:672  | Requesting prediction in db - [embedding] with predict_id embedding-bge-large
Using select {'_base': '?-outputs-chunker-find', '_builds': {'-outputs-chunker-find': {'_path': 'superduper.backends.mongodb.query.parse_query', 'documents': [], 'query': '_outputs.chunker.find()'}}, '_blobs': {}, '_files': {}} and ids [ObjectId('66da569f5ebeedc9cbee878d'), ObjectId('66da569f5ebeedc9cbee878e'), ObjectId('66da569f5ebeedc9cbee878f'), ObjectId('66da569f5ebeedc9cbee8790'), ObjectId('66da569f5ebeedc9cbee8791'), ObjectId('66da569f5ebeedc9cbee8792'), ObjectId('66da569f5ebeedc9cbee8793'), ObjectId('66da569f5ebeedc9cbee8794'), ObjectId('66da569f5ebeedc9cbee8795'), ObjectId('66da569f5ebeedc9cbee8796'), ObjectId('66da569f5ebeedc9cbee8797'), ObjectId('66da569f5ebeedc9cbee8798'), ObjectId('66da569f5ebeedc9cbee8799'), ObjectId('66da569f5ebeedc9cbee879a'), ObjectId('66da569f5ebeedc9cbee879b'), ObjectId('66da569f5eb

Batches:   0%|          | 0/116 [00:00<?, ?it/s]

2024-Sep-05 20:18:00.40| INFO     | localhost.localdomain| superduper.components.model:797  | Adding 3699 model outputs to `db`
2024-Sep-05 20:18:15.65| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.embedding-bge-large-listener with ids: [ObjectId('66da569f5ebeedc9cbee878d'), ObjectId('66da569f5ebeedc9cbee878e'), ObjectId('66da569f5ebeedc9cbee878f'), ObjectId('66da569f5ebeedc9cbee8790'), ObjectId('66da569f5ebeedc9cbee8791'), ObjectId('66da569f5ebeedc9cbee8792'), ObjectId('66da569f5ebeedc9cbee8793'), ObjectId('66da569f5ebeedc9cbee8794'), ObjectId('66da569f5ebeedc9cbee8795'), ObjectId('66da569f5ebeedc9cbee8796'), ObjectId('66da569f5ebeedc9cbee8797'), ObjectId('66da569f5ebeedc9cbee8798'), ObjectId('66da569f5ebeedc9cbee8799'), ObjectId('66da569f5ebeedc9cbee879a'), ObjectId('66da569f5ebeedc9cbee879b'), ObjectId('66da569f5ebeedc9cbee879c'), ObjectId('66da569f5ebeedc9cbee879d'), ObjectId('66da569f5ebeedc9cbee879e'), ObjectId('66da569f5ebeedc9cbee879f

0it [00:00, ?it/s]

2024-Sep-05 20:18:17.59| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7f6fd07cd550>.  function:<function method_job at 0x7f6fed414720> future:63cd17f4-8cc7-45bd-8099-5f9c8d83ebc7
2024-Sep-05 20:18:17.64| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7f6fd07cd550>.  function:<function method_job at 0x7f6fed414720> future:94528a9e-b417-4d13-b4d4-05fe3fbc139c





2024-Sep-05 20:18:21.07| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for vector_index.vector-index with ids: [ObjectId('66da569f5ebeedc9cbee878d'), ObjectId('66da569f5ebeedc9cbee878e'), ObjectId('66da569f5ebeedc9cbee878f'), ObjectId('66da569f5ebeedc9cbee8790'), ObjectId('66da569f5ebeedc9cbee8791'), ObjectId('66da569f5ebeedc9cbee8792'), ObjectId('66da569f5ebeedc9cbee8793'), ObjectId('66da569f5ebeedc9cbee8794'), ObjectId('66da569f5ebeedc9cbee8795'), ObjectId('66da569f5ebeedc9cbee8796'), ObjectId('66da569f5ebeedc9cbee8797'), ObjectId('66da569f5ebeedc9cbee8798'), ObjectId('66da569f5ebeedc9cbee8799'), ObjectId('66da569f5ebeedc9cbee879a'), ObjectId('66da569f5ebeedc9cbee879b'), ObjectId('66da569f5ebeedc9cbee879c'), ObjectId('66da569f5ebeedc9cbee879d'), ObjectId('66da569f5ebeedc9cbee879e'), ObjectId('66da569f5ebeedc9cbee879f'), ObjectId('66da569f5ebeedc9cbee87a0'), ObjectId('66da569f5ebeedc9cbee87a1'), ObjectId('66da569f5ebeedc9cbee87a2'), ObjectId('66da569f5ebe

Loading vectors into vector-table...: 3699it [00:03, 1184.85it/s]

2024-Sep-05 20:18:28.04| SUCCESS  | localhost.localdomain| superduper.backends.local.compute:64   | Job submitted on <superduper.backends.local.compute.LocalComputeBackend object at 0x7f6fd07cd550>.  function:<function callable_job at 0x7f6fed4147c0> future:2519f094-6bd6-454a-b43e-e14183699780





(['listener.embedding-bge-large-listener', 'vector_index.vector-index'],
 VectorIndex(identifier='vector-index', uuid='5e1ca825-aed9-4e99-868a-b505b6b3732d', upstream=None, plugins=None, indexing_listener=Listener(identifier='embedding-bge-large-listener', uuid='embedding-bge-large', upstream=None, plugins=None, key='_outputs.chunker.txt', model=SentenceTransformer(preferred_devices=('cuda', 'mps', 'cpu'), device='cuda', identifier='embedding', uuid='92233180-af9c-4632-b6c0-1d684c6fa199', upstream=None, plugins=None, signature='singleton', datatype=DataType(identifier='vector[1024]', uuid='fbc9d03a-3ee4-4f2d-9787-680638f26900', upstream=None, plugins=None, encoder=None, decoder=None, info=None, shape=(1024,), directory=None, encodable='native', bytes_encoding=<BytesEncoding.BYTES: 'Bytes'>, intermediate_type='bytes', media_type=None), output_schema=None, flatten=False, model_update_kwargs={}, predict_kwargs={'show_progress_bar': True}, compute_kwargs={}, validation=None, metric_values=

In [13]:
# Sanity check: the embedding length should match the declared vector shape (1024).
print(len(embedding_model.predict("What is superduper")))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

1024


In [14]:
# Check that the embedding model, its listener and the vector index are registered.
db.show()

[{'type_id': 'datatype', 'identifier': 'unstructured'},
 {'type_id': 'listener', 'identifier': 'chunker'},
 {'type_id': 'listener', 'identifier': 'embedding-bge-large-listener'},
 {'type_id': 'model', 'identifier': 'embedding'},
 {'type_id': 'model', 'identifier': 'get_chunks'},
 {'type_id': 'vector_index', 'identifier': 'vector-index'}]

In [15]:
# Inspect one document from the chunk output collection.
db[COLLECTION_NAME_CHUNK].find_one().execute().unpack()


{'_id': ObjectId('66da569f5ebeedc9cbee878d'),
 '_outputs': {'chunker': {'txt': "DEAR VOLVO OWNER,\nCongratulations on your new truck and thank you for your vote of confidence! We hope that you will derive great satisfaction and benefit from your truck for many years to come.\nThis Driver Guide contains information tailored to your particular truck. It describes the truck's equipment, care and maintenance, and gives tips for safe and fuel-efficient driving.\nYou have the Driver Guide app in the side display in your truck. There is also an app that can be downloaded to Android and Apple devices. You find the app at your device's app store. The Driver Guide is available in web format at the following address: https://driverguide.volvotrucks.com\nTo get direct access to the Driver Guide for your truck, scan the QR code.\nIf you have any questions or want to know more about your truck, please contact your authorised Volvo dealer.\n©2012 Volvo Trucks, Göteborg.\nReproduction of the contents 

## Define a vector search function

In [18]:
from pprint import pprint
def vector_search(query, top_k=5):
    """Print the chunks most similar to ``query``, highest score first.

    Args:
        query: Text to search for in the chunk collection.
        top_k: Number of nearest chunks requested from the vector index.

    Returns:
        None. For every hit the chunk text is printed, followed by a dict
        with its score and metadata and a separator line.
    """
    chunk_table = db[COLLECTION_NAME_CHUNK]
    search = chunk_table.like(
        Document({CHUNK_OUTPUT_KEY: query}),
        vector_index=VECTOR_INDEX_IDENTIFIER,
        n=top_k,
    ).select({})
    out = db.execute(search)

    if out:
        out = sorted(out, key=lambda hit: hit['score'], reverse=True)
    for hit in out:
        score = hit["score"]
        # The chunk payload is stored under the upstream listener's output key.
        chunk_data = hit[upstream_listener.outputs_key]
        chunk_message = {"score": score, "metadata": chunk_data["metadata"]}
        print(chunk_data["txt"])
        print()
        print(chunk_message)
        print("\n\n", '-' * 20)

In [19]:
# Try the search helper on a sample question about the manual.
vector_search("What is the function of keys 10 to 12 on the left steering wheel keypad?")

2024-Sep-05 20:21:22.87| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

NOTE
For the sake of road safety, it is advised that you primarily use voice control (if available) or the steering wheel keypads when driving.
Steering wheel keypads
Keys 10 and 11 are used for phone calls. The others are used for navigating in the displays and controlling the infotainment system. The function of each key is the following:
1 Navigate left.
AA 338361
2 Navigate up.
3 Navigate right.
4 Navigate down.
5 Select.

{'score': 0.8308685339539639, 'metadata': {'points': {'x': 169.35586999999998, 'y': 225.92667}, 'page_number': 267}}


 --------------------
wheel is locked in its new position.
Steering wheel keypads
Left-hand keypad
Keys 10 to 12 control the audio in the truck. The others control cruise control or adaptive cruise control. The function of each key is the following:
1 Resume the previously set speed.
2 Increase speed.
3 Decrease speed.
4 Select current speed as set speed.
5 Switch off cruise control or adaptive
cruise control.

{'score': 0.7987961896635563, 'meta

## Define an LLM model

In [30]:
from superduper_anthropic import AnthropicCompletions
MODEL_IDENTIFIER_LLM = "llm"

# NOTE(review): the API key is presumably read from the ANTHROPIC_API_KEY
# environment variable — set it outside the notebook, never hard-code it here.
predict_kwargs = dict(max_tokens=1024, temperature=0.8)

llm = AnthropicCompletions(
    identifier=MODEL_IDENTIFIER_LLM,
    model='claude-2.1',
    predict_kwargs=predict_kwargs,
)

# Smoke test: the completion is shown as the cell output.
llm.predict("Tell me a joke")

"Why can't a bicycle stand up by itself? Because it's two-tired!"

In [31]:
# Register the LLM with the datalayer so it can be loaded by identifier later.
db.apply(llm)

([],
 AnthropicCompletions(identifier='llm', uuid='2586fe40-e59a-46ba-8f97-237ffeb11d4f', upstream=None, plugins=None, signature='*args,**kwargs', datatype=None, output_schema=None, flatten=False, model_update_kwargs={}, predict_kwargs={'max_tokens': 1024, 'temperature': 0.8}, compute_kwargs={}, validation=None, metric_values={}, num_workers=0, model='claude-2.1', max_batch_size=8, client_kwargs={}, prompt=''))

In [32]:
# Check that the LLM now appears among the registered models.
db.show()

[{'type_id': 'datatype', 'identifier': 'unstructured'},
 {'type_id': 'listener', 'identifier': 'chunker'},
 {'type_id': 'listener', 'identifier': 'embedding-bge-large-listener'},
 {'type_id': 'model', 'identifier': 'MODEL_IDENTIFIER_LLM'},
 {'type_id': 'model', 'identifier': 'embedding'},
 {'type_id': 'model', 'identifier': 'get_chunks'},
 {'type_id': 'model', 'identifier': 'llm'},
 {'type_id': 'vector_index', 'identifier': 'vector-index'}]

In [33]:
# Load the registered LLM by type and identifier and run a test prediction.
print(db.load("model","llm").predict("Tell me a joke"))

Why can't a bicycle stand up by itself? Because it's two-tired!


In [None]:
# NOTE(review): MODEL_IDENTIFIER_LLM and prompt_template are assigned with the
# same values in other cells of this notebook ("Define an LLM model" and
# "Prompt Template for LLM"); this cell looks like leftover setup for the
# commented-out VllmModel alternative below — confirm whether it is still needed.
MODEL_IDENTIFIER_LLM = "llm"
prompt_template = (
    "The following is a document and question about the volvo user manual\n"
    "Only provide a very concise answer\n"
    "{context}\n\n"
    "Here's the question:{input}\n"
    "answer:"
)

# from superduper.ext.vllm import VllmModel
from superduper_vllm import VllmModel
from superduper.ext.openai import OpenAIChatCompletion

# llm = VllmModel(
#     identifier=MODEL_IDENTIFIER_LLM,
#     model_name="TheBloke/Mistral-7B-Instruct-v0.2-AWQ",
#     prompt_func=prompt_template,
#     vllm_kwargs={ 
#         "gpu_memory_utilization": 0.50,
#         "max_model_len": 2048,
#         "quantization": "awq"
#                    },
#     predict_kwargs={"max_tokens": 1024, "temperature": 0.8},
# )
# Add the llm instance

# db.apply(llm)

## Prompt Template for LLM

In [68]:
# Prompt skeleton: retrieved context first, then the user's question.
prompt_template = (
    "The following is a document and question about the volvo user manual\n"
    "Only provide a very concise answer\n"
    "{context}\n\n"
    "Here's the question:{input}\n"
    "answer:"
)


def build_prompt(query, docs):
    """Fill ``prompt_template`` with the retrieved chunks and the question.

    Args:
        query: The question, substituted for ``{input}``.
        docs: Sequence of dicts whose ``["text"]["txt"]`` holds a chunk's text.

    Returns:
        The prompt string, with chunk texts joined by blank lines as ``{context}``.
    """
    context = "\n\n".join(doc["text"]["txt"] for doc in docs)
    return prompt_template.format(context=context, input=query)

## Test Prompt with documents from vector search output

In [55]:
from superduper.components.model import QueryModel

# Reuse the keys defined with the vector index instead of repeating the
# '_outputs.chunker...' literals, so renaming the chunker only needs one change.
# '<var:query>' is the placeholder filled by predict(query=...).
item = {CHUNK_OUTPUT_KEY: '<var:query>'}
top_k = 3
vector_search_model = QueryModel(
    identifier="VectorSearch",
    select=chunk_collection.like(
        item,
        vector_index=VECTOR_INDEX_IDENTIFIER,
        n=top_k,
    ).select(),
    # The _source is the identifier of the upstream data, which can be used to
    # locate the data from upstream sources using `_source`.
    postprocess=lambda docs: [
        {
            "text": doc[upstream_listener.outputs_key],
            "_source": doc["_source"],
            "score": doc["score"],
        }
        for doc in docs
    ],
    db=db,
)

## Test Vector Search Model

In [47]:
# Run the vector-search model on a sample question and pretty-print the hits.
query="What is the function of keys 10 to 12 on the left steering wheel keypad?"
pprint(vector_search_model.predict(query=query))

2024-Sep-05 20:56:05.26| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[{'_source': ObjectId('66da56765ebeedc9cbee8787'),
  'score': 0.7850211625334125,
  'text': {'metadata': {'page_number': 88,
                        'points': {'x': 379.85684499999996, 'y': 198.238195}},
           'txt': '4 Depress the pedal (1) a small\n'
                  'distance, and adjust the angle of the steering wheel.\n'
                  '5 Release the pedal. The steering\n'
                  'wheel is locked in its new position.\n'
                  'Steering wheel keypads\n'
                  'Left-hand keypad\n'
                  'Keys 10 to 12 control the audio in the truck. The others '
                  'control cruise control or adaptive cruise control. The '
                  'function of each key is the following:\n'
                  '1 Resume the previously set speed.\n'
                  '2 Increase speed.\n'
                  '3 Decrease speed.'}},
 {'_source': ObjectId('66da56765ebeedc9cbee8787'),
  'score': 0.7987961896635563,
  'text': {'metadata': {'page_nu

In [70]:
# Retrieve context chunks for the question and assemble the LLM prompt.
# (The stray no-op statement `type(docs[0])` was removed: its value was discarded.)
query = "What is the function of keys 10 to 12 on the left steering wheel keypad?"
docs = vector_search_model.predict(query=query)
print(len(docs))
prompt = build_prompt(query, docs)

2024-Sep-05 21:05:18.12| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

3


In [71]:
# Send the assembled prompt to the registered LLM and print its answer.
print(db.load("model","llm").predict(prompt))

Keys 10 to 12 on the left steering wheel keypad control the audio in the truck.


## Define a QA function

In [78]:
from IPython.display import display
import pandas as pd
from superduper.components.model import QueryModel

def qa(query, vector_search_top_k=5):
    """Answer ``query`` with the LLM, using vector-search hits as context.

    Displays the LLM answer, then a table with the page number, points and
    score of every supporting chunk, highest score first.

    Args:
        query: The question to answer.
        vector_search_top_k: Number of chunks to retrieve as context.

    Returns:
        None; results are rendered with ``display``. If the search returns
        nothing, a short notice is displayed instead and the LLM is not called.
    """
    # '<var:query>' is the placeholder filled by predict(query=...).
    item = {CHUNK_OUTPUT_KEY: '<var:query>'}
    vector_search_model = QueryModel(
        identifier="VectorSearch",
        select=chunk_collection.like(
            item,
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=vector_search_top_k,
        ).select(),
        postprocess=lambda docs: [
            {
                "text": doc[upstream_listener.outputs_key],
                "_source": doc["_source"],
                "score": doc["score"],
            }
            for doc in docs
        ],
        db=db,
    )
    out = vector_search_model.predict(query=query)
    if not out:
        # `output` used to be bound only when hits existed, so an empty search
        # crashed with UnboundLocalError at display(output).
        display("No matching chunks were found for this query.")
        return

    out = sorted(out, key=lambda x: x["score"], reverse=True)
    prompt = build_prompt(query, out)
    output = db.load("model", MODEL_IDENTIFIER_LLM).predict(prompt)

    # One row per supporting chunk, in the same order as the prompt context.
    page_messages = [
        {
            "page_number": source["text"]["metadata"]["page_number"],
            "points": source["text"]["metadata"]["points"],
            "score": source["score"],
        }
        for source in out
    ]
    display(output)
    display(pd.DataFrame(page_messages))
    

In [79]:
# Ask a sample question; the answer and its supporting chunks are displayed.
query="What is the function of keys 10 to 12 on the left steering wheel keypad?"
qa(query, vector_search_top_k=5)

2024-Sep-05 23:42:55.68| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

'Keys 10 to 12 on the left steering wheel keypad control the audio in the truck. Specifically:\n\n10 Mute\n11 Reduce volume \n12 Increase volume'

Unnamed: 0,page_number,points,score
0,267,"{'x': 169.35586999999998, 'y': 225.92667}",0.830869
1,88,"{'x': 388.781385, 'y': 218.15015499999998}",0.798796
2,267,"{'x': 191.64962, 'y': 225.92667}",0.786558
3,88,"{'x': 379.85684499999996, 'y': 198.238195}",0.785021
4,89,"{'x': 210.55261000000002, 'y': 225.92667}",0.776941


## Define a QA function (Legacy)

In [None]:
from IPython.display import Markdown
from IPython.display import display
import pandas as pd
def qa(query, vector_search_top_k=5):
    """Legacy QA helper: retrieve context and query the LLM through ``db.execute``.

    NOTE(review): this redefines ``qa`` from the earlier "Define a QA function"
    cell, so running this cell replaces that implementation. The call shape used
    here (``db.execute(model_name=..., context_select=..., context_key=...)``
    and ``source.outputs("elements", "chunk")``) differs from the rest of the
    notebook and presumably targets an older superduper API — confirm before use.

    Args:
        query: The question to answer.
        vector_search_top_k: Number of chunks requested from the vector index.
    """
    collection = db[COLLECTION_NAME_CHUNK]
    # Expects a pair back: the model output and the context documents.
    output, out = db.execute(
        model_name=MODEL_IDENTIFIER_LLM,
        query=query,
        context_select=collection.like(
            Document({CHUNK_OUTPUT_KEY: query}),
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=vector_search_top_k,
        ).select({}),
        context_key=f"{CHUNK_OUTPUT_KEY}.0.txt",
    )
    # Order the context documents by score, highest first.
    if out:
        out = sorted(out, key=lambda x: x["score"], reverse=True)
    page_messages = []
    for source in out:
        chunk_data = source.outputs("elements", "chunk")
        metadata = chunk_data["metadata"]
        page_number = metadata["page_number"]
        points = metadata["points"]
        score = source["score"]
        page_messages.append(
            {"page_number": page_number, "points": points, "score": score}
        )
    df = pd.DataFrame(page_messages)
    display(output.content)
    display(df)

In [None]:
# Calls whichever `qa` was defined last; if the legacy cell above was run, that is the legacy version.
qa("What is the function of keys 10 to 12 on the left steering wheel keypad?")