In [1]:
import chromadb

from modules.llm import *
from modules.utils import *
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
# Register a SQLite-backed LLM cache (stored at ./data/.langchain.db) as the
# global LangChain cache, so repeated identical LLM calls can be served from disk.
set_llm_cache(SQLiteCache(database_path="./data/.langchain.db"))

In [2]:
# Read config.json; the loader also detects the device and stores it in the config.
config = load_config_and_device("config.json")
# Turn training mode off and select the "dataset" metadata type for this session.
config.update({"training": False, "type_of_data": "dataset"})

# Open the persistent ChromaDB store located at the configured directory.
client = chromadb.PersistentClient(path=config["persist_dir"])
print(config)

# Set up the vector DB, retriever and LLM into the dataset Retrieval-QA object.
qa_dataset = setup_vector_db_and_qa(
    config=config,
    data_type="dataset",
    client=client,
)

[INFO] Finding device.
[INFO] Device found: cpu
{'rqa_prompt_template': 'This database is a list of dataset metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the {question} given. If you do not know the answer, say you do not know. {context}', 'num_return_documents': 50, 'embedding_model': 'BAAI/bge-base-en-v1.5', 'persist_dir': './data/chroma_db/', 'data_download_n_jobs': 20, 'training': False, 'ignore_downloading_data': True, 'search_type': 'similarity', 'temperature': 0.95, 'top_p': 0.95, 'reranking': False, 'long_context_reorder': True, 'device': 'cpu', 'type_of_data': 'dataset'}
[INFO] Loading metadata from file.
[INFO] Metadata loaded.
[INFO] Loading model...




[INFO] Model loaded.


In [3]:
# Obtain the OpenML data objects, their ids and the raw metadata for the
# data type selected in `config`.
openml_data_object, data_id, all_metadata = get_all_metadata_from_openml(config=config)

# Combine everything into a single metadata dataframe; `all_metadata` is
# rebound to the value returned alongside it.
metadata_df, all_metadata = create_metadata_dataframe(
    openml_data_object,
    data_id,
    all_metadata,
    config=config,
)


[INFO] Loading metadata from file.
[INFO] Metadata loaded.


In [4]:

# Build the vector store from the combined metadata dataframe, using the
# persistent Chroma client opened earlier.
vectordb = load_document_and_create_vector_store(
    metadata_df,
    config=config,
    chroma_client=client,
)

[INFO] Loading model...




[INFO] Model loaded.


In [11]:
# Only the bare retriever is exercised here; the full chain
# (`initialize_llm_chain(vectordb=vectordb, config=config)`) is left disabled.
# The retriever uses the configured search type and returns the top 5 documents.
qa = vectordb.as_retriever(search_type=config["search_type"], search_kwargs=dict(k=5))

In [19]:
# Sanity-check the retriever with a simple query and inspect the returned documents.
qa.invoke("mushroom dataset")

[Document(page_content='Meta-Album Fungi dataset is created by sampling the Danish Fungi 2020 dataset(https://arxiv.org/abs/2103.10107), itself a sampling of the Atlas of Danish Fungi repository. The images and labels which enter this database are sourced by a group consisting of 3 300 citizen botanists, then verified by their peers using a ranking of each person reliability, then finally verified by experts working at the Atlas. Of the 128 classes in the original Danish Fungi 2020 dataset, FNG retains the 25 most populous classes, belonging to six genera, for a total of 15 122 images total, with min 372, and max 1 221 images per class. Each image contains a colored 128x128 image of a fungus or a piece of a fungus from the corresponding class. Because the initial data were of widely varying sizes, we needed to crop a significant portion of the images, which we implemented by taking the largest possible square with center at the middle of the initial image. We then scaled each squared i

In [20]:
# Run the same query directly against the vector store; the result is a list
# of (document, relevance score) pairs, which shows how strong each match is.
vectordb.similarity_search_with_relevance_scores("mushroom dataset")

[(Document(page_content='Meta-Album Fungi dataset is created by sampling the Danish Fungi 2020 dataset(https://arxiv.org/abs/2103.10107), itself a sampling of the Atlas of Danish Fungi repository. The images and labels which enter this database are sourced by a group consisting of 3 300 citizen botanists, then verified by their peers using a ranking of each person reliability, then finally verified by experts working at the Atlas. Of the 128 classes in the original Danish Fungi 2020 dataset, FNG retains the 25 most populous classes, belonging to six genera, for a total of 15 122 images total, with min 372, and max 1 221 images per class. Each image contains a colored 128x128 image of a fungus or a piece of a fungus from the corresponding class. Because the initial data were of widely varying sizes, we needed to crop a significant portion of the images, which we implemented by taking the largest possible square with center at the middle of the initial image. We then scaled each squared 

In [21]:
# Same query using maximal-marginal-relevance search for comparison.
# NOTE(review): per the LangChain docs a lower lambda_mult favours diversity
# over pure similarity — confirm against the installed version.
vectordb.max_marginal_relevance_search("mushroom dataset", lambda_mult=0.3)

[Document(page_content='Meta-Album Fungi dataset is created by sampling the Danish Fungi 2020 dataset(https://arxiv.org/abs/2103.10107), itself a sampling of the Atlas of Danish Fungi repository. The images and labels which enter this database are sourced by a group consisting of 3 300 citizen botanists, then verified by their peers using a ranking of each person reliability, then finally verified by experts working at the Atlas. Of the 128 classes in the original Danish Fungi 2020 dataset, FNG retains the 25 most populous classes, belonging to six genera, for a total of 15 122 images total, with min 372, and max 1 221 images per class. Each image contains a colored 128x128 image of a fungus or a piece of a fungus from the corresponding class. Because the initial data were of widely varying sizes, we needed to crop a significant portion of the images, which we implemented by taking the largest possible square with center at the middle of the initial image. We then scaled each squared i

## Get the most frequently used words in the dataset descriptions

In [11]:
# Baseline run: long-context reordering switched off.
# NOTE: this changes the shared `config` in place, so later cells see this
# value until it is set again.
config.update(long_context_reorder=False)
get_result_from_query(
    "find me a dataset about food preferences",
    qa=qa_dataset,
    type_of_query="dataset",
    config=config,
)

Unnamed: 0,id,name,command,OpenML URL,Description
0,43339,Chocolate-Bar-Ratings,dataset = openml.datasets.get_dataset(43339),"<a href=""https://www.openml.org/search?type=da...","did - 43339, name - Chocolate-Bar-Ratings, ver..."
1,43446,Online-Food-Delivery-Preferences-Bangalore-region,dataset = openml.datasets.get_dataset(43446),"<a href=""https://www.openml.org/search?type=da...","(string)], 28 : [28 - Unavailability (string)]..."
2,43825,Nutritional-values-for-common-foods-and-products,dataset = openml.datasets.get_dataset(43825),"<a href=""https://www.openml.org/search?type=da...","- serine (string)], 54 : [54 - threonine (stri..."
3,42133,cacao_flavor,dataset = openml.datasets.get_dataset(42133),"<a href=""https://www.openml.org/search?type=da...","did - 42133, name - cacao_flavor, version - 3,..."
4,43600,Updated-Wine-Enthusiast-Reviews,dataset = openml.datasets.get_dataset(43600),"<a href=""https://www.openml.org/search?type=da...","(numeric)], 4 : [4 - price (numeric)], 5 : [5 ..."
5,42089,vancouver_employee,dataset = openml.datasets.get_dataset(42089),"<a href=""https://www.openml.org/search?type=da...","2 : [2 - review_time (numeric)], 3 : [3 - revi..."
6,985,squash-unstored,dataset = openml.datasets.get_dataset(985),"<a href=""https://www.openml.org/search?type=da...","10 : [10 - groundspot_a* (numeric)], 11 : [11 ..."
7,42078,beer_reviews,dataset = openml.datasets.get_dataset(42078),"<a href=""https://www.openml.org/search?type=da...","(numeric)], 3 : [3 - review_overall (numeric)]..."
10,1498,sa-heart,dataset = openml.datasets.get_dataset(1498),"<a href=""https://www.openml.org/search?type=da...",sbp systolic blood pressure \ntobacco cumu...
11,340,squash-stored,dataset = openml.datasets.get_dataset(340),"<a href=""https://www.openml.org/search?type=da...",25. Acceptability - the acceptability of the f...


In [8]:
# Same query with long-context reordering switched on, for comparison.
# NOTE: this changes the shared `config` in place, so later cells run with
# reordering enabled.
config.update(long_context_reorder=True)
get_result_from_query(
    "find me a dataset about food preferences",
    qa=qa_dataset,
    type_of_query="dataset",
    config=config,
)

[INFO] Reordering results...
[INFO] Reordering complete.


Unnamed: 0,id,name,command,OpenML URL,Description
0,43446,Online-Food-Delivery-Preferences-Bangalore-region,dataset = openml.datasets.get_dataset(43446),"<a href=""https://www.openml.org/search?type=da...","(string)], 28 : [28 - Unavailability (string)]..."
1,42133,cacao_flavor,dataset = openml.datasets.get_dataset(42133),"<a href=""https://www.openml.org/search?type=da...","did - 42133, name - cacao_flavor, version - 3,..."
2,42089,vancouver_employee,dataset = openml.datasets.get_dataset(42089),"<a href=""https://www.openml.org/search?type=da...","2 : [2 - review_time (numeric)], 3 : [3 - revi..."
3,42078,beer_reviews,dataset = openml.datasets.get_dataset(42078),"<a href=""https://www.openml.org/search?type=da...","(numeric)], 3 : [3 - review_overall (numeric)]..."
4,342,squash-unstored,dataset = openml.datasets.get_dataset(342),"<a href=""https://www.openml.org/search?type=da...","16 : [16 - starch (numeric)], 17 : [17 - sweet..."
5,340,squash-stored,dataset = openml.datasets.get_dataset(340),"<a href=""https://www.openml.org/search?type=da...","(numeric)], 18 : [18 - sweetness (numeric)], 1..."
8,43589,combined-wine-data,dataset = openml.datasets.get_dataset(43589),"<a href=""https://www.openml.org/search?type=da...",Title: Wine Quality\nSources Created by: Paulo...
9,40498,wine-quality-white,dataset = openml.datasets.get_dataset(40498),"<a href=""https://www.openml.org/search?type=da...",Available at: [@Elsevier] http://dx.doi.org/10...
10,43536,Chennai-Zomato-Restaurants-Data,dataset = openml.datasets.get_dataset(43536),"<a href=""https://www.openml.org/search?type=da...","- Zomato_URL (string)], 1 : [1 - Name_of_Resta..."
13,43739,Country_data,dataset = openml.datasets.get_dataset(43739),"<a href=""https://www.openml.org/search?type=da...","- Obesity_-_adult_prevalence_rate (string)], 7..."


In [17]:
# Read the entire dataset-descriptions CSV into memory as one raw string
# (it is not parsed into rows or columns here).
with open("data/all_dataset_description.csv", "r") as descriptions_file:
    dataset_descriptions = descriptions_file.read()

In [18]:
# Peek at the first 100 characters of the raw text to check the CSV header
# and the start of the first record.
dataset_descriptions[:100]

'did,description,qualities,features\n2,"**Author**: Unknown. Donated by David Sterling and Wray Buntin'

## Aggregate and test multiple queries

In [14]:
# Several phrasings of a COVID-19 search. Results of all queries are
# aggregated per dataset (grouped by id and name) and sorted by the "query" column.
# NOTE(review): the "query" column presumably counts how many of the queries
# returned each dataset — verify in aggregate_multiple_queries_and_count.
queries = [
    "Find datasets related to COVID-19",
    "Find datasets related to COVID-19 and India",
    "COVID-19 dataset",
    "COVID-19 dataset India",
    "Mexico historical covid",
]
combined_df = aggregate_multiple_queries_and_count(
    queries,
    qa_dataset=qa_dataset,
    config=config,
    group_cols=["id", "name"],
    sort_by="query",
)

 60%|██████    | 3/5 [00:00<00:00, 11.53it/s]

[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.


100%|██████████| 5/5 [00:00<00:00, 11.80it/s]

[INFO] Reordering results...
[INFO] Reordering complete.





In [15]:
# Show the first ten rows of the aggregated, sorted results.
combined_df.head(10)

Unnamed: 0,id,name,query
56,43733,Covid-19--historical-data,5
34,43367,COVID-19-Indonesia-Dataset,4
47,43509,COVID-19-Rio-de-Janeiro-(City),4
33,43365,Covid-19-Case-Surveillance-Public-Use-Dataset,4
35,43400,COVID-19-community-mobility-reports,4
37,43405,Covid-19-Turkey-Daily-Details-Dataset,4
38,43410,Coronavirus-Disease-(COVID-19),4
39,43412,COVID-19-Visualisation-and-Epidemic-Analysis-Data,4
41,43428,Mexico-COVID-19-clinical-data,4
42,43457,COVID19-Dataset-with-100-World-Countries,4


In [6]:
# Mushroom / fungi search expressed in several ways, including slang and a
# constraint on the number of features. Results are aggregated per dataset.
queries = [
    "Find me datasets related to mushrooms",
    "Fungi dataset",
    "Mushroom dataset",
    "shroom data",
    "types of mushroom",
    "earth fungus",
    "low features mushroom dataset",
]
combined_df = aggregate_multiple_queries_and_count(
    queries,
    qa_dataset=qa_dataset,
    config=config,
    group_cols=["id", "name"],
    sort_by="query",
)
# Show the first ten rows of the aggregated result.
combined_df.head(10)

Unnamed: 0,id,name,query
80,44272,Meta_Album_FNG_Micro,6
97,44302,Meta_Album_FNG_Mini,6
1,24,mushroom,6
113,44335,Meta_Album_FNG_Extended,6
98,44303,Meta_Album_PLT_DOC_Mini,5
71,44242,Meta_Album_PLT_VIL_Micro,5
108,44321,Meta_Album_PLT_VIL_Extended,5
81,44273,Meta_Album_PLT_DOC_Micro,5
114,44336,Meta_Album_PLT_DOC_Extended,5
67,44237,Meta_Album_BCT_Micro,5


In [9]:
# Plant datasets with a "small / few features" constraint, phrased three ways.
# Results are aggregated per dataset.
queries = [
    "plant datasets, low features",
    "plant, less number of features",
    "plant dataset, tiny",
]
combined_df = aggregate_multiple_queries_and_count(
    queries,
    qa_dataset=qa_dataset,
    config=config,
    group_cols=["id", "name"],
    sort_by="query",
)
# Show the first ten rows of the aggregated result.
combined_df.head(10)

Unnamed: 0,id,name,query
22,44154,iris_reproduced,3
34,44299,Meta_Album_MED_LF_Mini,3
29,44273,Meta_Album_PLT_DOC_Micro,3
24,44242,Meta_Album_PLT_VIL_Micro,3
20,40983,wilt,3
32,44286,Meta_Album_PLT_VIL_Mini,3
33,44293,Meta_Album_PLT_NET_Mini,3
16,1493,one-hundred-plants-texture,3
15,1492,one-hundred-plants-shape,3
14,1491,one-hundred-plants-margin,3
