# Load the ChromaDB database and get retrieval results for a given query
- How do you load the ChromaDB database and retrieve results for a given query?

In [1]:
from __future__ import annotations
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
import os
import sys
import chromadb

In [2]:
from backend.modules.utils import *
from backend.modules.rag_llm import *
from backend.modules.results_gen import *

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Start from the backend's stored configuration, then layer the demo overrides on top.
config = load_config_and_device("../../../backend/config.json")
config.update(
    {
        "persist_dir": "../../data/doc_examples/chroma_db/",
        "data_dir": "../../data/doc_examples/",
        "type_of_data": "dataset",
        "training": False,
        # Demo-only switches: set both of these to False while training.
        "testing_flag": True,
        "test_subset": True,
    }
)
# Open the persistent ChromaDB database stored at the persist directory set above.
client = chromadb.PersistentClient(path=config["persist_dir"])
print(config)

[INFO] Finding device.
[INFO] Device found: mps
{'rqa_prompt_template': 'This database is a list of metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the {question} given. If you do not know the answer, say you do not know. {context}', 'llm_prompt_template': 'The following is a set of documents {docs}. Based on these docs, please summarize the content concisely. Also give a list of main concepts found in the documents. Do not add any new information. Helpful Answer: ', 'num_return_documents': 30, 'embedding_model': 'BAAI/bge-large-en-v1.5', 'llm_model': 'llama3', 'num_documents_for_llm': 30, 'data_dir': '../../data/doc_examples/', 'persist_dir': '../../data/doc_examples/chroma_db/', 'testing_flag': True, 'ignore_downloading_data': False, 'test_subset': True, 'data_download_n_jobs': 20, 'training': False, 'temperature': 0.95, 'top_p': 0.95, 'search_type': 'similarity', 'reranking': False, 'long_context_reorder': False, 

In [4]:
# Build the QA setup helper for the configured data type, backed by the ChromaDB client.
# (Original author's note: this sets up the llm chain, initializes the retriever and llm,
# and sets up Retrieval QA.)
qa_dataset_handler = QASetup(config=config, data_type=config["type_of_data"], client=client)

# The setup call returns a pair; only the first element is used in this notebook.
qa_setup_result = qa_dataset_handler.setup_vector_db_and_qa()
qa_dataset, _ = qa_setup_result

[INFO] Loading metadata from file.
[INFO] Loading model...
[INFO] Model loaded.
[INFO] Subsetting the data.
[INFO] Generating unique documents. Total documents: 500
Number of unique documents: 0 vs Total documents: 500
No new documents to add.


In [5]:
# get the llm chain and set the cache
# The handler is created from the shared config with local=True.
llm_chain_handler = LLMChainCreator(config=config, local=True)
# Caching is switched on before the chain is requested; keep this ordering.
# NOTE(review): presumably this installs an LLM response cache — confirm in LLMChainCreator.
llm_chain_handler.enable_cache()
llm_chain = llm_chain_handler.get_llm_chain()

## Retrieve the documents only

In [6]:
# Natural-language search text; reused by the retrieval and QueryProcessor cells below.
query = "give me datasets about mushrooms"

In [7]:
# Query the retriever first, then cap the returned list at ten entries.
# NOTE(review): top_k=5 is requested while the slice allows up to 10 results —
# confirm which limit is intended and whether invoke() honours top_k at all.
retrieved_docs = qa_dataset.invoke(input=query, top_k=5)
res = retrieved_docs[:10]
res

[Document(metadata={'MajorityClassSize': 4208.0, 'MaxNominalAttDistinctValues': 12.0, 'MinorityClassSize': 3916.0, 'NumberOfClasses': 2.0, 'NumberOfFeatures': 23.0, 'NumberOfInstances': 8124.0, 'NumberOfInstancesWithMissingValues': 2480.0, 'NumberOfMissingValues': 2480.0, 'NumberOfNumericFeatures': 0.0, 'NumberOfSymbolicFeatures': 23.0, 'Unnamed: 0': 19, 'description': "**Author**: [Jeff Schlimmer](Jeffrey.Schlimmer@a.gp.cs.cmu.edu)  \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/mushroom) - 1981     \n**Please cite**:  The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf \n\n\n### Description\n\nThis dataset describes mushrooms in terms of their physical characteristics. They are classified into: poisonous or edible.\n\n### Source\n```\n(a) Origin: \nMushroom records are drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf \n\n(b) Dono

In [8]:
# Inspect the metadata attached to the first retrieved document.
res[0].metadata

{'MajorityClassSize': 4208.0,
 'MaxNominalAttDistinctValues': 12.0,
 'MinorityClassSize': 3916.0,
 'NumberOfClasses': 2.0,
 'NumberOfFeatures': 23.0,
 'NumberOfInstances': 8124.0,
 'NumberOfInstancesWithMissingValues': 2480.0,
 'NumberOfMissingValues': 2480.0,
 'NumberOfNumericFeatures': 0.0,
 'NumberOfSymbolicFeatures': 23.0,
 'Unnamed: 0': 19,
 'description': "**Author**: [Jeff Schlimmer](Jeffrey.Schlimmer@a.gp.cs.cmu.edu)  \n**Source**: [UCI](https://archive.ics.uci.edu/ml/datasets/mushroom) - 1981     \n**Please cite**:  The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf \n\n\n### Description\n\nThis dataset describes mushrooms in terms of their physical characteristics. They are classified into: poisonous or edible.\n\n### Source\n```\n(a) Origin: \nMushroom records are drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf \n\n(b) Donor: \nJef

In [9]:
# Print the text content of the first retrieved document.
print(res[0].page_content)

### Description

This dataset describes mushrooms in terms of their physical characteristics. They are classified into: poisonous or edible.

### Source
```
(a) Origin: 
Mushroom records are drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf 

(b) Donor: 
Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu)
```

### Dataset description

This dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family. Each species is identified as definitely edible, definitely poisonous, or of unknown edibility and not recommended. This latter class was combined with the poisonous one. The Guide clearly states that there is no simple rule for determining the edibility of a mushroom; no rule like ``leaflets three, let it be'' for Poisonous Oak and Ivy.


## Process the results and return a DataFrame instead

In [18]:
# Wrap the query and the retriever in a QueryProcessor, then unpack its result pair:
# a table of matching entries and the corresponding list of ids.
query_processor = QueryProcessor(
    query=query, qa=qa_dataset, type_of_query=config["type_of_data"], config=config
)
output_df, ids_order = query_processor.get_result_from_query()

In [21]:
# Show the ids returned by the query processor; the list can contain repeated ids.
ids_order

[24,
 24,
 294,
 120,
 120,
 42,
 188,
 42,
 187,
 199,
 183,
 134,
 23,
 134,
 287,
 334,
 335,
 333,
 42,
 42,
 287,
 343,
 8,
 334,
 24,
 333,
 179,
 335,
 61,
 13]

In [20]:
# Preview the first rows of the processed results table.
output_df.head()

Unnamed: 0,id,name,Description,OpenML URL,Command
0,24,mushroom,StdvNominalAttDistinctValues : 3.1809710899501...,"<a href=""https://www.openml.org/search?type=da...",dataset = openml.datasets.get_dataset(24)
2,294,satellite_image,Data Set Information:,"<a href=""https://www.openml.org/search?type=da...",dataset = openml.datasets.get_dataset(294)
3,120,BNG(mushroom),"RandomTreeDepth3ErrRate : 0.024243, RandomTree...","<a href=""https://www.openml.org/search?type=da...",dataset = openml.datasets.get_dataset(120)
5,42,soybean,"did - 42, name - soybean, version - 1, uploade...","<a href=""https://www.openml.org/search?type=da...",dataset = openml.datasets.get_dataset(42)
6,188,eucalyptus,Kirsten Thomson and Robert J. McQueen (1996) M...,"<a href=""https://www.openml.org/search?type=da...",dataset = openml.datasets.get_dataset(188)
