# Vector search and RAG application built on SuperDuperDB

In [None]:
import os
import click
from tqdm import tqdm

import sentence_transformers
from dotenv import load_dotenv
from superduper import (
    Document,
    Listener,
    model,ObjectModel,
    Schema,
    VectorIndex,
    superduper,
    vector
)
# from superduper.backends.mongodb import
import superduper_mongodb
load_dotenv()

## Connect to mongodb database

In [2]:
# Connection settings are read from the environment (load_dotenv() was called above),
# falling back to local demo defaults when a variable is unset.
mongodb_uri = os.environ.get("MONGODB_URI", "superduperdb-demo")
artifact_store = os.environ.get("ARTIFACT_STORE", "data/artifact_store")

# The artifact store is addressed through a filesystem:// URI built from the path above.
db = superduper(
    mongodb_uri,
    artifact_store="filesystem://" + artifact_store,
)

2024-Sep-03 15:24:49.58| INFO     | localhost.localdomain| superduper.base.build:56   | Data Client is ready. MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-03 15:24:49.58| INFO     | localhost.localdomain| superduper.base.build:35   | Connecting to Metadata Client with engine:  MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True, serverselectiontimeoutms=5000)
2024-Sep-03 15:24:49.58| INFO     | localhost.localdomain| superduper.backends.local.artifacts:29   | Creating artifact store directory
2024-Sep-03 15:24:49.58| INFO     | localhost.localdomain| superduper.base.build:141  | Connecting to compute client: Compute(uri=None, compute_kwargs={}, _path='superduper.backends.local.compute.LocalComputeBackend')
2024-Sep-03 15:24:49.58| INFO     | localhost.localdomain| superduper.base.datalayer:106  | Building Data Layer
2024-Sep-03 15:24:49.59| INFO     | localhost.locald

In [3]:
 db.show()

[]

## Parse pdf files and store them in the database

In [4]:
from superduper.ext.unstructured.encoder import unstructured_encoder

# Add the encoder to the database before inserting documents that use it.
db.add(unstructured_encoder)

pdf_folder = 'pdf-folders'

# Every entry found in `pdf_folder` is treated as a document to parse.
pdf_paths = [
    os.path.join(pdf_folder, file_name) for file_name in os.listdir(pdf_folder)
]

# One document per file; the encoded file is stored under the "elements" key.
to_insert = [
    Document({"elements": unstructured_encoder(pdf_path)})
    for pdf_path in pdf_paths
]

db['source'].insert_many(to_insert).execute()

2024-Sep-03 15:25:01.54| INFO     | localhost.localdomain| superduper.backends.mongodb.data_backend:226  | Table source does not exist, auto creating...


([ObjectId('66d770bb3016e6a40679dd3a'), ObjectId('66d770bb3016e6a40679dd3b')],
 None)

In [None]:
db.show()

In [None]:
db['source'].find_one().execute().unpack()

## Create a chunking model to split the parsed PDF elements into chunks

In [5]:
def merge_metadatas(metadatas, return_center=False):
    """Merge several element metadata dicts into one bounding box and a page number.

    Each metadata dict must provide ``metadata["coordinates"]["points"]`` as four
    ``(x, y)`` corner points and ``metadata["page_number"]``. The four corners are
    aggregated as (min-x, min-y), (min-x, max-y), (max-x, max-y), (max-x, min-y),
    so the result is the smallest axis-aligned box that encloses every input box.

    Args:
        metadatas: Sequence of metadata dicts. An empty sequence yields ``{}``.
        return_center: When True, ``"points"`` is the box centre as
            ``{"x": ..., "y": ...}``; otherwise it is the tuple of four corners.

    Returns:
        ``{"points": ..., "page_number": ...}``, or ``{}`` for empty input. The page
        number is taken from the last metadata dict in ``metadatas``.
    """
    if not metadatas:
        return {}

    # Sentinels for the running min/max; this assumes coordinates are non-negative
    # and smaller than MAX_NUM.
    MAX_NUM = 999999999
    p1, p2, p3, p4 = (MAX_NUM, MAX_NUM), (MAX_NUM, 0), (0, 0), (0, MAX_NUM)
    for metadata in metadatas:
        p1_, p2_, p3_, p4_ = metadata["coordinates"]["points"]
        p1 = (min(p1[0], p1_[0]), min(p1[1], p1_[1]))
        p2 = (min(p2[0], p2_[0]), max(p2[1], p2_[1]))
        p3 = (max(p3[0], p3_[0]), max(p3[1], p3_[1]))
        p4 = (max(p4[0], p4_[0]), min(p4[1], p4_[1]))

    # Assigned unconditionally: previously this happened only inside the
    # `return_center` branch, so the default call raised UnboundLocalError.
    page_number = metadata["page_number"]

    if return_center:
        # p1 and p3 are opposite corners, so their midpoint is the box centre.
        points = {"x": (p1[0] + p3[0]) / 2, "y": (p1[1] + p3[1]) / 2}
    else:
        points = (p1, p2, p3, p4)
    return {"points": points, "page_number": page_number}


def create_chunk_and_metadatas(page_elements, stride=3, window=10):
    """Slide a window over one page's elements and build a text chunk per position.

    Windows start at every ``stride``-th index and span up to ``window`` elements,
    so consecutive chunks overlap whenever ``stride < window``.

    Args:
        page_elements: Sequence of elements exposing ``.text`` and
            ``.metadata.to_dict()``.
        stride: Step between successive window start indices.
        window: Maximum number of elements per chunk.

    Returns:
        A list of ``{"txt": str, "metadata": dict}`` dicts, one per window, where
        ``metadata`` is ``merge_metadatas(..., return_center=True)`` for that window.
    """

    def _build_chunk(start):
        window_elements = page_elements[start : start + window]
        metadatas = [element.metadata.to_dict() for element in window_elements]
        text = "\n".join([element.text for element in window_elements])
        return {
            "txt": text,
            "metadata": merge_metadatas(metadatas, return_center=True),
        }

    return [_build_chunk(start) for start in range(0, len(page_elements), stride)]


@model(flatten=True, model_update_kwargs={'document_embedded': False})
def get_chunks(elements):
    """Group parsed elements by page and turn every page into overlapping chunks.

    Args:
        elements: Iterable of elements exposing ``.metadata.page_number``, plus
            whatever ``create_chunk_and_metadatas`` reads from them.

    Returns:
        A flat list of the chunk dicts produced by ``create_chunk_and_metadatas``
        for each page, in the order the pages were first encountered.
    """
    # Imported inside the function, as in the original cell.
    # NOTE(review): presumably this keeps the function self-contained when the model
    # is stored — confirm.
    from collections import defaultdict

    pages_elements = defaultdict(list)
    for element in elements:
        pages_elements[element.metadata.page_number].append(element)

    # Flatten in one pass. `sum(list_of_lists, [])` copies the growing accumulator
    # for every page, which is quadratic in the number of chunks.
    return [
        chunk
        for page_elements in pages_elements.values()
        for chunk in create_chunk_and_metadatas(page_elements)
    ]


In [6]:
MODEL_IDENTIFIER_CHUNK = "chunker"

# Apply `get_chunks` to the "elements" field of the documents selected from `source`.
# The same string is used as both identifier and uuid of the listener.
upstream_listener = Listener(
    identifier=MODEL_IDENTIFIER_CHUNK,
    uuid=MODEL_IDENTIFIER_CHUNK,
    model=get_chunks,
    key="elements",
    select=db['source'].select(),
)
db.apply(upstream_listener)

2024-Sep-03 15:31:09.67| INFO     | localhost.localdomain| superduper.jobs.queue:87   | Running jobs for listener.chunker with ids: [ObjectId('66d770bb3016e6a40679dd3a'), ObjectId('66d770bb3016e6a40679dd3b')]
2024-Sep-03 15:31:09.72| INFO     | localhost.localdomain| superduper.backends.local.compute:58   | Submitting job. function:<function method_job at 0x7fea15414720>
2024-Sep-03 15:31:09.72| INFO     | localhost.localdomain| superduper.components.model:672  | Requesting prediction in db - [get_chunks] with predict_id chunker
Using select {'_base': '?source-find', '_builds': {'source-find': {'_path': 'superduper.backends.mongodb.query.parse_query', 'documents': [], 'query': 'source.find()'}}, '_blobs': {}, '_files': {}} and ids [ObjectId('66d770bb3016e6a40679dd3a'), ObjectId('66d770bb3016e6a40679dd3b')]
2024-Sep-03 15:31:10.94| INFO     | localhost.localdomain| superduper.components.model:797  | Adding 2 model outputs to `db`
2024-Sep-03 15:31:12.12| SUCCESS  | localhost.localdomain

(['listener.chunker'],
 Listener(identifier='chunker', uuid='chunker', upstream=None, plugins=None, key='elements', model=ObjectModel(identifier='get_chunks', uuid='fb0fdf5f-e514-425b-aaec-bfe2c6813dfa', upstream=None, plugins=None, signature='*args,**kwargs', datatype=None, output_schema=None, flatten=True, model_update_kwargs={'document_embedded': False}, predict_kwargs={}, compute_kwargs={}, validation=None, metric_values={}, num_workers=0, object=<function get_chunks at 0x7fe9c5d3d4e0>), select=source.find(), active=True, predict_kwargs={}))

In [7]:
db.show()

[{'type_id': 'datatype', 'identifier': 'unstructured'},
 {'type_id': 'listener', 'identifier': 'chunker'},
 {'type_id': 'model', 'identifier': 'get_chunks'}]

In [9]:
upstream_listener.outputs_key
# '_outputs.chunker'

'_outputs.chunker'

In [10]:
  upstream_listener.outputs
# '_outputs.chunker'

'_outputs.chunker'

## Build text embedding model

In [11]:
from superduper_sentence_transformers import SentenceTransformer
# MODEL_IDENTIFIER_EMBEDDING = "embedding-bge-large"
embedding_model_identifier = "embedding-bge-large"
# Not in use
def preprocess(x):
    """Extract the chunk text from a dict input; pass anything else through unchanged.

    For a dict, the items are sorted and the value of the last item is taken as the
    chunk, whose ``"txt"`` entry is returned. The ``preprocess=`` argument of the
    embedding model below is commented out, so this function is not wired in.
    """
    if not isinstance(x, dict):
        return x
    # For model chains, the logic of this key needs to be optimized.
    _last_key, chunk = sorted(x.items())[-1]
    return chunk["txt"]

import torch

# Use the GPU when one is available and fall back to CPU otherwise, so the notebook
# still runs on machines without CUDA (the device was previously hard-coded to "cuda").
embedding_device = "cuda" if torch.cuda.is_available() else "cpu"

embedding_model = SentenceTransformer(
    identifier=embedding_model_identifier,
    object=sentence_transformers.SentenceTransformer(
        "BAAI/bge-large-en-v1.5", device=embedding_device
    ),
    # The output datatype is declared as a 1024-dimensional vector.
    datatype=vector(shape=(1024,)),
    device=embedding_device,
    # Convert the model output to a plain Python list.
    postprocess=lambda x: x.tolist(),
    # preprocess = preprocess,
    predict_kwargs={"show_progress_bar": True},
)

In [12]:
print(len(embedding_model.predict("What is superduper")))

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

1024


## Create Vector Index

In [14]:
from superduper import VectorIndex, Listener
# VECTOR_INDEX_IDENTIFIER = "vector-index"
vector_index_name = 'vector-index'
# Output key of the chunker listener, i.e. '_outputs.chunker'; it is also used as
# the name of the table the indexing listener selects from.
indexing_key = upstream_listener.outputs

# Vector index whose indexing listener runs `embedding_model` on `indexing_key`.
vector_index = VectorIndex(
    vector_index_name,
    indexing_listener=Listener(
        identifier="embedding-bge-large-listener",
        uuid="embedding-bge-large",
        model=embedding_model,
        key=indexing_key,
        select=db[indexing_key].select(),
    ),
)

## Vector Search Function

In [None]:
from superduper import Application

# Bundle the chunker listener and the vector index into one application and apply it.
search_application = Application(
    'text-vector-search',
    components=[upstream_listener, vector_index],
)
db.apply(search_application)

In [17]:
db.databackend.db.list_collection_names() 

['_parent_child_mappings', '_jobs', 'source', '_objects', '_outputs.chunker']

In [18]:
db.show()

[{'type_id': 'application', 'identifier': 'text-vector-search'},
 {'type_id': 'datatype', 'identifier': 'unstructured'},
 {'type_id': 'listener', 'identifier': 'chunker'},
 {'type_id': 'listener', 'identifier': 'embedding-bge-large'},
 {'type_id': 'model', 'identifier': 'embedding-bge-large'},
 {'type_id': 'model', 'identifier': 'get_chunks'},
 {'type_id': 'vector_index', 'identifier': 'vector-index'}]

## Embedding all text blocks and building vector indexes

In [19]:
# indexing_key =  upstream_listener.outputs
# '_outputs.chunker'
# COLLECTION_NAME_CHUNK ='_outputs.chunker'
db[indexing_key].find_one().execute().unpack()


{'_id': ObjectId('66d772103016e6a40679dd40'),
 '_outputs': {'chunker': {'txt': "DEAR VOLVO OWNER,\nCongratulations on your new truck and thank you for your vote of confidence! We hope that you will derive great satisfaction and benefit from your truck for many years to come.\nThis Driver Guide contains information tailored to your particular truck. It describes the truck's equipment, care and maintenance, and gives tips for safe and fuel-efficient driving.\nYou have the Driver Guide app in the side display in your truck. There is also an app that can be downloaded to Android and Apple devices. You find the app at your device's app store. The Driver Guide is available in web format at the following address: https://driverguide.volvotrucks.com\nTo get direct access to the Driver Guide for your truck, scan the QR code.\nIf you have any questions or want to know more about your truck, please contact your authorised Volvo dealer.\n©2012 Volvo Trucks, Göteborg.\nReproduction of the contents 

## Define a vector search function

In [50]:
from pprint import pprint
def vector_search(query, top_k=5):
    """Print the chunks most similar to `query`, best match first.

    Runs a `like` query against the chunk table using the vector index named
    `vector_index_name`, then prints each hit's text followed by a dict holding its
    score and metadata, and a separator line.

    Args:
        query: Text to search for.
        top_k: Number of results requested from the vector index (passed as `n`).

    Returns:
        None; results are printed.
    """
    search_query = db[indexing_key].like(
        Document({'_outputs.chunker': query}),
        vector_index=vector_index_name,
        n=top_k,
    ).select({})
    out = db.execute(search_query)

    if out:
        out = sorted(out, key=lambda hit: hit['score'], reverse=True)

    for hit in out:
        score = hit["score"]
        chunk_data = hit['_outputs.chunker']
        chunk_message = {"score": score, "metadata": chunk_data["metadata"]}
        print(chunk_data["txt"])
        print()
        print(chunk_message)
        print("\n\n", '-' * 20)

In [51]:
vector_search("What is the function of keys 10 to 12 on the left steering wheel keypad?")

2024-Sep-03 17:31:23.04| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

NOTE
For the sake of road safety, it is advised that you primarily use voice control (if available) or the steering wheel keypads when driving.
Steering wheel keypads
Keys 10 and 11 are used for phone calls. The others are used for navigating in the displays and controlling the infotainment system. The function of each key is the following:
1 Navigate left.
AA 338361
2 Navigate up.
3 Navigate right.
4 Navigate down.
5 Select.

{'score': 0.8308685339539639, 'metadata': {'points': {'x': 169.35586999999998, 'y': 225.92667}, 'page_number': 267}}


 --------------------
wheel is locked in its new position.
Steering wheel keypads
Left-hand keypad
Keys 10 to 12 control the audio in the truck. The others control cruise control or adaptive cruise control. The function of each key is the following:
1 Resume the previously set speed.
2 Increase speed.
3 Decrease speed.
4 Select current speed as set speed.
5 Switch off cruise control or adaptive
cruise control.

{'score': 0.7987961896635563, 'meta

## Define an LLM model from Anthropic and test it

In [101]:
from superduper_anthropic import AnthropicCompletions
# import os
#os.environ["ANTHROPIC_API_KEY"] = ""
# Generation settings handed to the model as `predict_kwargs`.
predict_kwargs = dict(max_tokens=1024, temperature=0.8)

llm = AnthropicCompletions(
    identifier='llm',
    model='claude-2.1',
    predict_kwargs=predict_kwargs,
)
# Quick smoke test: the cell output is the model's reply.
llm.predict("Tell me a joke")

"Why can't a bicycle stand up by itself? Because it's two-tired!"

## Approach with RAG Graph

In [172]:
# Approach with RAG Graph
from superduper import model
from superduper.components.graph import Graph, input_node

prompt_template = (
    "The following is a document and question about the volvo user manual\n"
    "Only provide a very concise answer\n"
    "{context}\n\n"
    "Here's the question:{input}\n"
    "answer:"
)
 
@model
def build_prompt(query, docs):
    """Fill `prompt_template` with the retrieved chunk texts and the user question.

    Args:
        query: The question; substituted for ``{input}`` in the template.
        docs: Iterable of search results, each providing ``doc["text"]["txt"]``.

    Returns:
        The formatted prompt string, with the chunk texts joined by blank lines
        as ``{context}``.
    """
    context = "\n\n".join(doc["text"]["txt"] for doc in docs)
    return prompt_template.format(context=context, input=query)

from superduper.components.model import QueryModel
chunk_collection = db[indexing_key]
top_k = 5
# '<var:query>' is a placeholder; presumably it is replaced by the `query` keyword
# given to `predict` — confirm against the QueryModel documentation.
item = {'_outputs.chunker': '<var:query>'}

vector_search_model = QueryModel(
    identifier="VectorSearch",
    select=chunk_collection.like(
        item, vector_index=vector_index_name, n=top_k
    ).select(),
    # Keep only the chunk, its `_source` and its score for each hit. `_source` is the
    # identifier of the upstream data and can be used to locate the source document.
    postprocess=lambda docs: [
        {
            "text": doc['_outputs.chunker'],
            "_source": doc["_source"],
            "score": doc["score"],
        }
        for doc in docs
    ],
    db=db,
)
# Build a graph that wires the whole pipeline together.

# Create an input node with a single input parameter, `query`.
in_ = input_node('query')
# Pass the query to the vector search model.
vector_search_results = vector_search_model(query=in_)
# Pass the query and the search results to the prompt builder.
prompt = build_prompt(query=in_, docs=vector_search_results)
# Pass the prompt to the LLM.
answer = llm(prompt)
# Turn the final node into a graph named "rag"; the graph's output is the answer.
rag = answer.to_graph("rag")

In [177]:
query= "What is the function of keys 10 to 12 on the left steering wheel keypad?"
vector_search_model.predict(query=query)

2024-Sep-04 00:43:49.44| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[{'text': {'txt': '4 Depress the pedal (1) a small\ndistance, and adjust the angle of the steering wheel.\n5 Release the pedal. The steering\nwheel is locked in its new position.\nSteering wheel keypads\nLeft-hand keypad\nKeys 10 to 12 control the audio in the truck. The others control cruise control or adaptive cruise control. The function of each key is the following:\n1 Resume the previously set speed.\n2 Increase speed.\n3 Decrease speed.',
   'metadata': {'points': {'x': 379.85684499999996, 'y': 198.238195},
    'page_number': 88}},
  '_source': ObjectId('66d770bb3016e6a40679dd3a'),
  'score': 0.7850211625334125},
 {'text': {'txt': 'wheel is locked in its new position.\nSteering wheel keypads\nLeft-hand keypad\nKeys 10 to 12 control the audio in the truck. The others control cruise control or adaptive cruise control. The function of each key is the following:\n1 Resume the previously set speed.\n2 Increase speed.\n3 Decrease speed.\n4 Select current speed as set speed.\n5 Switch o

In [None]:
db.show()
# [{'type_id': 'application', 'identifier': 'rag-app'},
#  {'type_id': 'application', 'identifier': 'text-vector-search'},
#  {'type_id': 'datatype', 'identifier': 'unstructured'},
#  {'type_id': 'listener', 'identifier': 'chunker'},
#  {'type_id': 'listener', 'identifier': 'embedding-bge-large'},
#  {'type_id': 'model', 'identifier': 'VectorSearch'},
#  {'type_id': 'model', 'identifier': '_input'},
#  {'type_id': 'model', 'identifier': 'build_prompt'},
#  {'type_id': 'model', 'identifier': 'embedding-bge-large'},
#  {'type_id': 'model', 'identifier': 'get_chunks'},
#  {'type_id': 'model', 'identifier': 'llm'},
#  {'type_id': 'model', 'identifier': 'rag'},
#  {'type_id': 'vector_index', 'identifier': 'vector-index'}]

In [None]:
from superduper import Application

# Bundle every component of the RAG pipeline under a single application.
app = Application(
    'rag-app',
    components=[upstream_listener, vector_index, vector_search_model, rag],
)

db.apply(app)

In [41]:
db.databackend.db.list_collection_names() 


['_parent_child_mappings', '_jobs', 'source', '_objects', '_outputs.chunker']

In [175]:
rag = db.load("model", 'rag')
query= "What is the function of keys 10 to 12 on the left steering wheel keypad?"
print(rag.predict(query))

2024-Sep-04 00:38:40.66| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['Keys 10 to 12 on the left steering wheel keypad control the audio in the truck. Specifically:\n\n10 Mute\n11 Reduce volume \n12 Increase volume']


## New QA Function

In [160]:
from IPython.display import display
import pandas as pd
def qa(query):
    """Answer `query` with the RAG graph and show which chunks back the answer.

    Displays the value returned by `rag.predict(query)`, followed by a DataFrame
    with one row per retrieved chunk (page number, points, score), ordered by
    descending score. The vector search model is called separately from the RAG
    graph to obtain those rows.

    Args:
        query: The question to answer.

    Returns:
        None; the answer and the table are rendered with `display`.
    """
    output = rag.predict(query)
    hits = vector_search_model.predict(query=query)
    if hits:
        hits = sorted(hits, key=lambda hit: hit["score"], reverse=True)

    page_messages = []
    for hit in hits:
        metadata = hit['text']["metadata"]
        page_messages.append(
            {
                "page_number": metadata["page_number"],
                "points": metadata["points"],
                "score": hit["score"],
            }
        )

    sources_table = pd.DataFrame(page_messages)
    display(output)
    display(sources_table)


In [176]:
query= "What is the function of keys 10 to 12 on the left steering wheel keypad?"
qa(query)

2024-Sep-04 00:38:51.13| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

2024-Sep-04 00:38:52.83| INFO     | localhost.localdomain| superduper.base.datalayer:905  | {}


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

['Keys 10 to 12 on the left steering wheel keypad control the audio in the truck. Specifically:\n\n10 Mute\n11 Reduce volume \n12 Increase volume']

Unnamed: 0,page_number,points,score
0,267,"{'x': 169.35586999999998, 'y': 225.92667}",0.830869
1,88,"{'x': 388.781385, 'y': 218.15015499999998}",0.798796
2,267,"{'x': 191.64962, 'y': 225.92667}",0.786558
3,88,"{'x': 379.85684499999996, 'y': 198.238195}",0.785021
4,89,"{'x': 210.55261000000002, 'y': 225.92667}",0.776941


## Define a QA function (`db.predict`-based variant)

In [None]:
from IPython.display import Markdown
from IPython.display import display
import pandas as pd
def qa(query, vector_search_top_k=5):
    """Answer `query` through `db.predict` with a vector-search context and show the sources.

    Displays `output.content` and a DataFrame with one row per returned source
    (page number, points, score), ordered by descending score.

    NOTE(review): this definition reuses the name `qa` and therefore replaces the
    `qa(query)` function defined in an earlier cell once this cell is run.
    NOTE(review): COLLECTION_NAME_CHUNK, MODEL_IDENTIFIER_LLM, CHUNK_OUTPUT_KEY and
    VECTOR_INDEX_IDENTIFIER are not assigned anywhere in the visible notebook (only
    commented-out assignments exist for two of them), so on a fresh kernel this
    function raises NameError when called — define them or remove this cell.

    Args:
        query: The question to answer.
        vector_search_top_k: Number of results requested from the vector index
            (passed as `n`).
    """
    collection = db[COLLECTION_NAME_CHUNK]
    # `db.predict` is expected to return a pair: the model output and the context documents.
    output, out = db.predict(
        model_name=MODEL_IDENTIFIER_LLM,
        query=query,
        context_select=collection.like(
            Document({CHUNK_OUTPUT_KEY: query}),
            vector_index=VECTOR_INDEX_IDENTIFIER,
            n=vector_search_top_k,
        ).select({}),
        context_key=f"{CHUNK_OUTPUT_KEY}.0.txt",
    )
    if out:
        # Highest-scoring sources first.
        out = sorted(out, key=lambda x: x["score"], reverse=True)
    page_messages = []
    for source in out:
        # chunk_data = source.outputs("elements", "chunk")
        chunk_data = source['_outputs.chunk'] # NOTE(review): other cells read '_outputs.chunker' (upstream_listener.outputs_key) — confirm this key
        metadata = chunk_data["metadata"]
        page_number = metadata["page_number"]
        points = metadata["points"]
        score = source["score"]
        page_messages.append(
            {"page_number": page_number, "points": points, "score": score}
        )
    df = pd.DataFrame(page_messages)
    display(output.content)
    display(df)

In [None]:
qa("What is the function of keys 10 to 12 on the left steering wheel keypad?")