In [19]:
from __future__ import annotations
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
import os
import sys
import chromadb

# Make the backend modules importable by adding the backend directory to the
# module search path. os.path.dirname(".") is the empty string, so the entry is
# relative to the current working directory; adjust it if the notebook moves.
backend_dir = os.path.join(os.path.dirname("."), "../../backend/")
sys.path.append(backend_dir)

In [20]:
from modules.utils import *
from modules.rag_llm import *
from modules.results_gen import *

In [21]:
# Load the base configuration, then override the entries this demo relies on.
config = load_config_and_device("../../backend/config.json")
config.update(
    {
        "persist_dir": "../data/doc_examples/chroma_db/",
        "data_dir": "../data/doc_examples/",
        "type_of_data": "dataset",
        "training": False,
        # Demo-only settings: set both of these to False while training.
        "testing_flag": True,
        "test_subset": True,
    }
)

# Open the persistent ChromaDB database stored under the configured directory.
client = chromadb.PersistentClient(path=config["persist_dir"])
print(config)

[INFO] Finding device.
[INFO] Device found: mps
{'rqa_prompt_template': 'This database is a list of metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the {question} given. If you do not know the answer, say you do not know. {context}', 'llm_prompt_template': 'The following is a set of documents {docs}. Based on these docs, please summarize the content concisely. Also give a list of main concepts found in the documents. Do not add any new information. Helpful Answer: ', 'num_return_documents': 30, 'embedding_model': 'BAAI/bge-large-en-v1.5', 'llm_model': 'llama3', 'num_documents_for_llm': 30, 'data_dir': '../data/doc_examples/', 'persist_dir': '../data/doc_examples/chroma_db/', 'testing_flag': True, 'ignore_downloading_data': False, 'test_subset': True, 'data_download_n_jobs': 20, 'training': False, 'temperature': 0.95, 'top_p': 0.95, 'search_type': 'similarity', 'reranking': False, 'long_context_reorder': False, 'struc

## Change the way the data is combined
- Before the data is passed to the RAG pipeline, all the metadata is combined into a single string. By default, this is done by concatenating all the metadata fields with a space separator.
- We can change how the data is combined in whatever way we want. For example, we can concatenate all the metadata fields with a "~" separator.

In [22]:
def join_attributes(attribute: object, attr_name: str) -> str:
    """
    Description: Join one mapping-valued attribute of an OpenML object into a
    single string of "key : value," entries separated by " ~ ".

    Args:
        attribute: The object that carries the attribute.
        attr_name: Name of the attribute to read; its value is expected to
            expose ``.items()`` (e.g. a dict).

    Returns:
        The joined string, e.g. "a : 1, ~ b : 2,". An empty string is returned
        when the attribute is missing, is None, or is an empty mapping.
    """
    # A single lookup with a None default covers both "attribute is missing" and
    # "attribute is present but unset", which would otherwise fail on .items().
    mapping = getattr(attribute, attr_name, None)
    if mapping is None:
        return ""
    return " ~ ".join(f"{k} : {v}," for k, v in mapping.items())

In [23]:
# Override the handler's attribute-joining logic with the "~"-separated variant.
# The replacement takes no `self`, so it is wrapped in staticmethod: that way it
# behaves the same whether it is called through the class or through an instance
# (a plain function would receive the instance as its first argument).
OpenMLObjectHandler.join_attributes = staticmethod(join_attributes)

In [24]:
# Create the QA setup handler from the demo config, the configured data type and
# the ChromaDB client (per the original note, this is the object that sets up the
# LLM chain, the retriever and the Retrieval QA).
qa_dataset_handler = QASetup(config=config, data_type=config["type_of_data"], client=client)

In [None]:
# The call yields two values; only the first is kept (as qa_dataset) and the
# second is discarded. NOTE(review): presumably this builds or loads the vector
# DB and returns the QA object first — confirm against QASetup.
qa_dataset, _ = qa_dataset_handler.setup_vector_db_and_qa()