In [1]:
from __future__ import annotations
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
import os
import sys
import chromadb
# Make the backend package (`modules.*`) importable from this notebook.
# The directory is resolved against the current working directory once, so the
# sys.path entry is absolute and keeps working even if the working directory
# changes later. The previous `os.path.dirname(".")` evaluated to "" and
# contributed nothing to the path.
backend_dir = os.path.abspath(os.path.join("..", "..", "backend"))
# Re-running this cell in the same kernel must not pile up duplicate entries.
if backend_dir not in sys.path:
    sys.path.append(backend_dir)

In [2]:
from modules.utils import load_config_and_device
from modules.llm import setup_vector_db_and_qa
from modules.results_gen import aggregate_multiple_queries_and_count

In [3]:
# Configuration and database locations

# Path overrides applied on top of the loaded configuration; both are relative
# to this notebook's directory rather than to the backend itself.
path_overrides = {
    "persist_dir": "../../backend/data/chroma_db/",
    "data_dir": "../../backend/data/",
}

# Load the backend configuration (the helper also reports the device it finds),
# then point the storage paths at the backend data folder.
config = load_config_and_device("../../backend/config.json")
for option, location in path_overrides.items():
    config[option] = location

[INFO] Finding device.
[INFO] Device found: cpu


In [4]:
# Open the on-disk ChromaDB store located at the configured persist directory.
persist_dir = config["persist_dir"]
client = chromadb.PersistentClient(path=persist_dir)

In [5]:
# Build the retrieval QA object for datasets: per the helper's description this
# sets up the LLM chain, the retriever and the LLM on top of the Chroma client.
qa_dataset = setup_vector_db_and_qa(
    config=config,
    data_type="dataset",
    client=client,
)

[INFO] Loading metadata from file.
[INFO] Loading model...




[INFO] Model loaded.


## Aggregate results across queries (with counts)

In [6]:
# Several phrasings of a COVID-19 dataset search.
queries = [
    "Find datasets related to COVID-19",
    "Find datasets related to COVID-19 and India",
    "COVID-19 dataset",
    "COVID-19 dataset India",
    "Mexico historical covid",
]

# Hand all queries to the aggregation helper with counting enabled,
# grouping on (id, name) and sorting on the `query` column.
combined_df = aggregate_multiple_queries_and_count(
    queries,
    qa_dataset=qa_dataset,
    config=config,
    group_cols=["id", "name"],
    sort_by="query",
    count=True,
)

  0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 20%|██        | 1/5 [00:02<00:08,  2.14s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 40%|████      | 2/5 [00:02<00:03,  1.17s/it]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 60%|██████    | 3/5 [00:03<00:01,  1.14it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 80%|████████  | 4/5 [00:03<00:00,  1.32it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:03<00:00,  1.26it/s]


In [7]:
# Preview the first rows of the count=True result. In the saved output below,
# the `query` column holds a number for each (id, name) pair rather than text.
combined_df.head()

Unnamed: 0,id,name,query
36,43495,COVID-19-Mexico-Clean--Order-by-States,4
52,43844,Coronavirus-Worldwide-Dataset,4
26,43349,COVID-19-World-Vaccination-Progress,4
27,43365,Covid-19-Case-Surveillance-Public-Use-Dataset,4
28,43367,COVID-19-Indonesia-Dataset,4


## Collate results without counting

In [8]:
# Same search phrasings as in the counting run, repeated here so this cell is
# self-contained.
queries = [
    "Find datasets related to COVID-19",
    "Find datasets related to COVID-19 and India",
    "COVID-19 dataset",
    "COVID-19 dataset India",
    "Mexico historical covid",
]

# Hand all queries to the aggregation helper with counting disabled,
# keeping the same grouping columns and sort key as before.
combined_df = aggregate_multiple_queries_and_count(
    queries,
    qa_dataset=qa_dataset,
    config=config,
    group_cols=["id", "name"],
    sort_by="query",
    count=False,
)

  0%|          | 0/5 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 20%|██        | 1/5 [00:00<00:02,  1.43it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 40%|████      | 2/5 [00:01<00:01,  1.99it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 60%|██████    | 3/5 [00:01<00:01,  1.50it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

 80%|████████  | 4/5 [00:02<00:00,  1.85it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:02<00:00,  2.01it/s]


In [9]:
# Preview the first rows of the count=False result. In the saved output below,
# each row carries the text of the query in the `query` column.
combined_df.head()

Unnamed: 0,id,name,query
0,43410,Coronavirus-Disease-(COVID-19),Find datasets related to COVID-19
1,43412,COVID-19-Visualisation-and-Epidemic-Analysis-Data,Find datasets related to COVID-19
2,43365,Covid-19-Case-Surveillance-Public-Use-Dataset,Find datasets related to COVID-19
3,43367,COVID-19-Indonesia-Dataset,Find datasets related to COVID-19
4,43684,COVID-19-Stats-and-Mobility-Trends,Find datasets related to COVID-19
