In [1]:
import os
import sys
sys.path.append(os.path.join(os.path.dirname("."), '..'))

In [2]:
import chromadb

from modules.llm import *
from modules.utils import *
from langchain.globals import set_llm_cache
from langchain_community.cache import SQLiteCache
set_llm_cache(SQLiteCache(database_path="../data/.langchain.db"))

In [3]:
# load the configuration and device
config = load_config_and_device("../config.json")
config["training"] = False
config["type_of_data"] = "dataset"
config["persist_dir"] = "../data/chroma_db"
config["data_dir"] = "../data/"
# load the persistent database using ChromaDB
client = chromadb.PersistentClient(path=config["persist_dir"])
print(config)
# Loading the metadata for all types

# Setup llm chain, initialize the retriever and llm, and setup Retrieval QA
qa_dataset = setup_vector_db_and_qa(config=config, data_type="dataset", client=client)

[INFO] Finding device.
[INFO] Device found: cpu
{'rqa_prompt_template': 'This database is a list of metadata. Use the following pieces of context to find the relevant document. Answer only from the context given using the {question} given. If you do not know the answer, say you do not know. {context}', 'num_return_documents': 50, 'embedding_model': 'BAAI/bge-large-en-v1.5', 'data_dir': '../data/', 'persist_dir': '../data/chroma_db', 'testing_flag': False, 'ignore_downloading_data': False, 'test_subset_2000': False, 'data_download_n_jobs': 20, 'training': False, 'temperature': 0.95, 'top_p': 0.95, 'search_type': 'similarity', 'reranking': False, 'long_context_reorder': False, 'device': 'cpu', 'type_of_data': 'dataset'}
[INFO] Loading metadata from file.
[INFO] Loading model...




[INFO] Model loaded.


In [4]:
from transformers import pipeline
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

In [5]:
from langchain_community.llms import Ollama
from langchain.chains import RetrievalQA

In [6]:
llm = Ollama(
    # model="phi3"
    model = "qwen2:1.5b" # super duper tiny model 
)  
# assuming you have Ollama installed and have llama3 model pulled with `ollama pull llama3 `


In [28]:
from langchain import PromptTemplate
from operator import itemgetter
from langchain.schema.runnable import RunnableMap    
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough
from langchain_core.runnables import RunnableParallel, RunnablePassthrough

In [34]:
# prompt = """
# 1. Use the following pieces of context to answer the question at the end.
# 2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.\n
# 3. Keep the answer crisp and limited to 3,4 sentences.

# Context: {context}

# Question: {question}

# Helpful Answer:"""
prompt = """
Give a short summary of the following retriever results. Be concise and do not include any information that is not present in the context. \n Context : {context} \n"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt) 

In [37]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

In [38]:
from langchain import PromptTemplate
from langchain.chains.llm import LLMChain
from langchain.chains.combine_documents.stuff import StuffDocumentsChain

In [44]:
template = """Use the following pieces of context to answer the question at the end.
    If you don't know the answer, just say that you don't know, don't try to make up an answer.
    Use three sentences maximum and keep the answer as concise as possible.
    {context}
    Question: {question}
    Helpful Answer:"""
prompt = """
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.\n
3. Keep the answer crisp and limited to 3,4 sentences.

Context: {context}

Question: {question}

Helpful Answer:"""

prompt = """
Give a short summary of the following retriever results. Be concise and do not include any information that is not present in the context. \n Context : {context} \n"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(prompt) 

In [45]:
from langchain.chains.combine_documents import create_stuff_documents_chain

In [53]:
document_prompt = PromptTemplate(
    input_variables=["page_content"],
    template="{page_content}"
)
document_variable_name = "context"
prompt = PromptTemplate.from_template(
    "Summarize this content: {context}"
)
llm_chain = LLMChain(llm=llm, prompt=prompt)
chain = StuffDocumentsChain(
    llm_chain=llm_chain,
    document_prompt=document_prompt,
    document_variable_name=document_variable_name
)

In [57]:
chain = create_stuff_documents_chain(llm, QA_CHAIN_PROMPT)

In [103]:
query = "eating disorder"
docs_res = qa_dataset.invoke(input = query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [115]:
# Map
map_template = """The following is a set of documents
{docs}
Based on this list of docs, please summarize the content concisely.
Helpful Answer:"""
map_prompt = PromptTemplate.from_template(map_template)
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [110]:
from langchain import hub

In [113]:
reduce_prompt = hub.pull("rlm/reduce-prompt")
map_chain = LLMChain(llm=llm, prompt=map_prompt)

In [116]:
map_chain.invoke({"docs": docs_res, "question": query})

{'docs': [Document(page_content='(numeric)], 50018 : [50018 - Var50019 (numeric)], 50019 : [50019 - Var50020 (numeric)], 50020 : [50020 - Var50021 (numeric)], 50021 : [50021 - Var50022 (numeric)], 50022 : [50022 - Var50023 (numeric)], 50023 : [50023 - Var50024 (numeric)], 50024 : [50024 - Var50025 (numeric)], 50025 : [50025 - Var50026 (numeric)], 50026 : [50026 - Var50027 (numeric)], 50027 : [50027 - Var50028 (numeric)], 50028 : [50028 - Var50029 (numeric)], 50029 : [50029 - Var50030 (numeric)], 50030 : [50030 - Var50031 (numeric)], 50031 : [50031 - Var50032 (numeric)], 50032 : [50032 - Var50033 (numeric)], 50033 : [50033 - Var50034 (numeric)], 50034 : [50034 - Var50035 (numeric)], 50035 : [50035 - Var50036 (numeric)], 50036 : [50036 - Var50037 (numeric)], 50037 : [50037 - Var50038 (numeric)], 50038 : [50038 - Var50039 (numeric)], 50039 : [50039 - Var50040 (numeric)], 50040 : [50040 - Var50041 (numeric)], 50041 : [50041 - Var50042 (numeric)], 50042 : [50042 - Var50043 (numeric)], 50043

In [98]:
from langchain.chains import MapReduceDocumentsChain, ReduceDocumentsChain

In [92]:
from langchain.chains import load_summarize_chain

In [96]:
chain = load_summarize_chain(llm, chain_type="stuff")

In [100]:
chain.invoke({"input_documents": docs_res, "question": query})

{'input_documents': [Document(page_content='did - 44343, name - Meta_Album_BTS_Extended, version - 1, uploader - 30980, status - active, format - arff, MajorityClassSize - 8930.0, MaxNominalAttDistinctValues - nan, MinorityClassSize - 1802.0, NumberOfClasses - 26.0, NumberOfFeatures - 3.0, NumberOfInstances - 138367.0, NumberOfInstancesWithMissingValues - 138367.0, NumberOfMissingValues - 138367.0, NumberOfNumericFeatures - 1.0, NumberOfSymbolicFeatures - 0.0, description - ## **Meta-Album Boats Dataset (Extended)**\n***', metadata={'did': 44343, 'name': 'Meta_Album_BTS_Extended'}),
  Document(page_content='did - 44279, name - Meta_Album_BTS_Micro, version - 1, uploader - 30980, status - active, format - arff, MajorityClassSize - 40.0, MaxNominalAttDistinctValues - nan, MinorityClassSize - 40.0, NumberOfClasses - 20.0, NumberOfFeatures - 3.0, NumberOfInstances - 800.0, NumberOfInstancesWithMissingValues - 800.0, NumberOfMissingValues - 800.0, NumberOfNumericFeatures - 1.0, NumberOfSymb

In [73]:
from langchain import PromptTemplate
from operator import itemgetter
from langchain.schema.runnable import RunnableMap    
from langchain.schema import StrOutputParser
from langchain.schema.runnable import RunnablePassthrough

In [74]:
llm_chain = LLMChain(
                  llm=llm, 
                  prompt=QA_CHAIN_PROMPT, 
                  callbacks=None, 
                  verbose=True)

document_prompt = PromptTemplate(
    input_variables=["page_content", "did"],
    template="Context:\ncontent:{page_content}\ndid:{did}",
)
# document_prompt = PromptTemplate(
#     input_variables=["page_content"],
#     template="Context:\ncontent:{page_content}",
# )

In [75]:
combine_documents_chain = StuffDocumentsChain(
                  llm_chain=llm_chain,
                  document_variable_name="context",
                  document_prompt=document_prompt,
                  callbacks=None,
              )

In [76]:
qa_dataset.search_kwargs = {"k":10}

In [83]:
qa = combine_documents_chain.invoke({"input_documents": docs_res})

AttributeError: 'str' object has no attribute 'page_content'

In [77]:

qa = RetrievalQA(
                  combine_documents_chain=combine_documents_chain,
                  verbose=True,
                  retriever=qa_dataset,
                  return_source_documents=True,
              )


In [20]:
query = "Find me datasets about mushrooms and explain why they are relavant"
result = qa(query)



[1m> Entering new RetrievalQA chain...[0m


  warn_deprecated(


Batches:   0%|          | 0/1 [00:00<?, ?it/s]



[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3m
1. Use the following pieces of context to answer the question at the end.
2. If you don't know the answer, just say that "I don't know" but don't make up an answer on your own.

3. Keep the answer crisp and limited to 3,4 sentences.

Context: Context:
content:### Description

This dataset describes mushrooms in terms of their physical characteristics. They are classified into: poisonous or edible.

### Source
```
(a) Origin: 
Mushroom records are drawn from The Audubon Society Field Guide to North American Mushrooms (1981). G. H. Lincoff (Pres.), New York: Alfred A. Knopf 

(b) Donor: 
Jeff Schlimmer (Jeffrey.Schlimmer '@' a.gp.cs.cmu.edu)
```

### Dataset description

This dataset includes descriptions of hypothetical samples corresponding to 23 species of gilled mushrooms in the Agaricus and Lepiota Family. Each species is identified as definitely edible, definitely poisonous, or of unknown edibility an

In [21]:
result

{'query': 'Find me datasets about mushrooms and explain why they are relavant',
 'result': "The mushroom dataset is a great example of a data set used for machine learning tasks in biology or other fields where the data is directly related to the research or application. Here's how it can be relevant:\n\n1. **Research**: It can serve as a benchmark for researchers in the field to test their algorithms and models. For instance, comparing results from different ML models on mushroom dataset could help identify which approaches are most effective.\n\n2. **Prediction and Classification**: By training models on mushroom datasets, practitioners can predict whether an image belongs to a certain genus or species. This can be crucial for applications like mushroom identification, forensics (where identifying the type of mushrooms in crime scenes), or even food quality control.\n\n3. **Data Science Methods**: The dataset can also be used as part of data science techniques such as anomaly detecti

In [80]:
print(result['result'])

 Based on the provided contexts for meta-album fungi datasets in Danish Fungi, here are two relevant datasets regarding mushrooms along with their relevance:

1. Meta_Album_FNG_Micro (did: 44272): The Micro dataset is a subset of the larger Danish Fungi Dataset, focusing on micro-level data analysis using images and labels sourced from citizen botanists verified by their peers and experts. This dataset contains information about fungi samples in various classes (representing different species) with 15,122 instances (images). The Micro dataset is relevant for mushroom research because it provides a large number of images, enabling analysis through machine learning techniques to classify, identify and study the characteristics of specific mushroom species.

2. Meta_Album_FNG_Extended (did: 44335): This dataset is an extended version of the Micro dataset that focuses on even fewer classes but with a larger number of instances per class, enabling more detailed analysis of individual specie

In [21]:
embeddings = load_model(config)

[INFO] Loading model...




[INFO] Model loaded.


In [22]:
user_query = "How to create a pipeline object?"
query_vector = embeddings.embed_query(user_query)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [18]:
qa_dataset.vectorstore

<langchain_community.vectorstores.chroma.Chroma at 0x39c654310>

In [11]:
qa_dataset

NameError: name 'qa_dataset' is not defined

In [31]:
test = qa_dataset.invoke("Mushroom dataset")

In [34]:
test[0].metadata

{'did': 44272, 'name': 'Meta_Album_FNG_Micro'}

In [30]:
[x for x in test if x.metadata["did"] in [43025]]

[Document(page_content='(numeric)], 18 : [18 - classe_circonference (nominal)], 19 : [19 - port_arbre (nominal)], 20 : [20 - vigueur_pousse (nominal)], 21 : [21 - champignon_collet (nominal)], 22 : [22 - insecte_collet (nominal)], 23 : [23 - plaie_collet (nominal)], 24 : [24 - observation_collet (numeric)], 25 : [25 - champignon_tronc (nominal)], 26 : [26 - insecte_tronc (nominal)], 27 : [27 - fissure_tronc (nominal)], 28 : [28 - rejet_tronc (nominal)], 29 : [29 - tuteurage_arbre (nominal)], 30 : [30 - canisse_arbre (nominal)], 31 : [31 - plaie_tronc (nominal)], 32 : [32 - observation_tronc (numeric)], 33 : [33 - champignon_houppier (nominal)], 34 : [34 - insecte_houppier (nominal)], 35 : [35 - fissure_houppier (nominal)], 36 : [36 - ecorce_incluse_houppier (nominal)], 37 : [37 - bois_mort_houppier (nominal)], 38 : [38 - plaie_houppier (nominal)], 39 : [39 - observation_houppier (numeric)], 40 : [40 - esperance_maintien (numeric)], 41 : [41 - contrainte (nominal)], 42 : [42 - classific

In [50]:
openml_data_object, data_id, all_metadata = get_all_metadata_from_openml(
        config=config
    )
# Create the combined metadata dataframe
metadata_df, all_metadata = create_metadata_dataframe(
    openml_data_object, data_id, all_metadata, config=config
)


[INFO] Loading metadata from file.
[INFO] Metadata loaded.


In [51]:

# Create the vector store
vectordb = load_document_and_create_vector_store(
    metadata_df, config=config, chroma_client=client
)

[INFO] Loading model...




[INFO] Model loaded.


In [117]:
embedding_vector = embeddings.embed_query("mushroom dataset")
docs = vectordb.similarity_search_by_vector(embedding_vector)
docs

[Document(page_content='Meta-Album Fungi dataset is created by sampling the Danish Fungi 2020 dataset(https://arxiv.org/abs/2103.10107), itself a sampling of the Atlas of Danish Fungi repository. The images and labels which enter this database are sourced by a group consisting of 3 300 citizen botanists, then verified by their peers using a ranking of each person reliability, then finally verified by experts working at the Atlas. Of the 128 classes in the original Danish Fungi 2020 dataset, FNG retains the 25 most populous classes, belonging to six genera, for a total of 15 122 images total, with min 372, and max 1 221 images per class. Each image contains a colored 128x128 image of a fungus or a piece of a fungus from the corresponding class. Because the initial data were of widely varying sizes, we needed to crop a significant portion of the images, which we implemented by taking the largest possible square with center at the middle of the initial image. We then scaled each squared i

In [95]:
source_nodes = vectordb.get('')

KeyboardInterrupt: 

In [None]:
# Create BM25 retriever with nodes
bm25_retriever = BM25Retriever.from_defaults(nodes=nodes)

In [56]:
metadata_df["did"][:10].values

array([ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11])

In [61]:
import langchain

In [113]:
metadata_df["description"]

0       **Author**: Unknown. Donated by David Sterling...
1       Author: Alen Shapiro\nSource: [UCI](https://ar...
2       **Author**: Unknown\n**Source**: Collective Ba...
3       **Author**: H. Altay Guvenir, Burak Acar, Hald...
4       **Author**: David J. Slate  \n**Source**: [UCI...
                              ...                        
5661    **flare** dataset from the **KEEL** repository...
5662    **flare** dataset from the **KEEL** repository...
5663    daily pickup data for 329 FHV companies from J...
5664    Monthly sales car parts. 2674 series. Jan 1998...
5665                                            test rats
Name: description, Length: 5666, dtype: object

In [129]:
# qa = initialize_llm_chain(vectordb=vectordb, config=config)

qa = vectordb.as_retriever(
        search_type=config["search_type"],
        search_kwargs={"k": 5},
    )

In [118]:
from langchain.chains import RetrievalQA

In [129]:
from langchain.chains.question_answering import load_qa_chain

In [132]:
model = "BAAI/bge-base-en-v1.5"
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
tokenizer = AutoTokenizer.from_pretrained(model)
model = AutoModelForQuestionAnswering.from_pretrained(model)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at BAAI/bge-base-en-v1.5 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [138]:
from langchain.llms import llamacpp

In [None]:
from langchain.chat_models import 

In [142]:
from langchain.document_loaders import HuggingFaceDatasetLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from transformers import AutoTokenizer, AutoModelForQuestionAnswering
from transformers import AutoTokenizer, pipeline
from langchain import HuggingFacePipeline
from langchain.chains import RetrievalQA

NameError: name 'PromptTemplate' is not defined

In [155]:
# Define a question-answering pipeline using the model and tokenizer
question_answerer = pipeline(
    "question-answering", 
    model=config["embedding_model"], 
    tokenizer=tokenizer,
    return_tensors='pt'
)

# Create an instance of the HuggingFacePipeline, which wraps the question-answering pipeline
# with additional model-specific arguments (temperature and max_length)
llm = HuggingFacePipeline(
    pipeline = question_answerer,
    model_kwargs={"temperature": 0.7, "max_length": 512},
)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at BAAI/bge-base-en-v1.5 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [156]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vectordb.as_retriever(), return_source_documents=False)

In [157]:
question = "Find me a disaster dataset"
result = qa.run({"query": question})
print(result["result"])

ValueError: Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer.

* Dataset:

Data Set Information:

Data Set Information:

### **Dataset Details**
![](https://meta-album.github.io/assets/img/samples/RSD.png)

**Meta Album ID**: REM_SEN.RSD  
**Meta Album URL**: [https://meta-album.github.io/datasets/RSD.html](https://meta-album.github.io/datasets/RSD.html)  
**Domain ID**: REM_SEN  
**Domain Name**: Remote Sensing  
**Dataset ID**: RSD  
**Dataset Name**: RSD  
**Short Description**: Remote sensing dataset  
**\# Classes**: 38  
**\# Images**: 1520  
**Keywords**: remote sensing, satellite image, aerial image, land cover  
**Data Format**: images  
**Image size**: 128x128  

**License (original data release)**: Open for research and non-profit purposes  
**License (Meta-Album data release)**: CC BY-NC 4.0  
**License URL (Meta-Album data release)**: [https://creativecommons.org/licenses/by-nc/4.0/](https://creativecommons.org/licenses/by-nc/4.0/)

Question: Find me a disaster dataset
Helpful Answer: argument needs to be of type (SquadExample, dict)

In [144]:
qa_chain = RetrievalQA.from_chain_type(
    llm, retriever=vectordb.as_retriever(), chain_type_kwargs={"prompt": "mushroom"}
)

ValidationError: 1 validation error for LLMChain
prompt
  value is not a valid dict (type=type_error.dict)

In [140]:
retrieval_chain = RetrievalQA.from_chain_type(model,llamacpp, retriever=vectordb.as_retriever())


ValueError: Got unsupported chain type: <module 'langchain_community.llms.llamacpp' from '/Users/eragon/.pyenv/versions/3.9.19/envs/openml/lib/python3.9/site-packages/langchain_community/llms/llamacpp.py'>. Should be one of dict_keys(['stuff', 'map_reduce', 'refine', 'map_rerank'])

In [None]:

retrieval_chain.run(query)


In [1]:
vectordb.similarity_search_with_relevance_scores("mushroom dataset")

NameError: name 'vectordb' is not defined

In [21]:
vectordb.max_marginal_relevance_search("mushroom dataset", lambda_mult=.3)

[Document(page_content='Meta-Album Fungi dataset is created by sampling the Danish Fungi 2020 dataset(https://arxiv.org/abs/2103.10107), itself a sampling of the Atlas of Danish Fungi repository. The images and labels which enter this database are sourced by a group consisting of 3 300 citizen botanists, then verified by their peers using a ranking of each person reliability, then finally verified by experts working at the Atlas. Of the 128 classes in the original Danish Fungi 2020 dataset, FNG retains the 25 most populous classes, belonging to six genera, for a total of 15 122 images total, with min 372, and max 1 221 images per class. Each image contains a colored 128x128 image of a fungus or a piece of a fungus from the corresponding class. Because the initial data were of widely varying sizes, we needed to crop a significant portion of the images, which we implemented by taking the largest possible square with center at the middle of the initial image. We then scaled each squared i

## Get the most frequently used words in the dataset descriptions

In [38]:
from langchain_core.prompts import ChatPromptTemplate

In [39]:
xml_system = """You're a helpful AI assistant. Given a user question and some Wikipedia article snippets, \
answer the user question and provide citations. If none of the articles answer the question, just say you don't know.

Remember, you must return both an answer and citations. A citation consists of a VERBATIM quote that \
justifies the answer and the ID of the quote article. Return a citation for every quote across all articles \
that justify the answer. Use the following format for your final output:

<cited_answer>
    <answer></answer>
    <citations>
        <citation><source_id></source_id><quote></quote></citation>
        <citation><source_id></source_id><quote></quote></citation>
        ...
    </citations>
</cited_answer>

Here are the Wikipedia articles:{context}"""
xml_prompt = ChatPromptTemplate.from_messages(
    [("system", xml_system), ("human", "{input}")]
)

In [79]:
config["rqa_prompt_template"] = "This database is a list of metadata. Use the following pieces of {context} to find the relevant document. Answer only from the context given using the {question} given. If you can't find the answer directly then return the document that best answers the question. Order the answers by relevance."

In [82]:
config["rqa_prompt_template"] = xml_prompt

In [83]:
# Setup llm chain, initialize the retriever and llm, and setup Retrieval QA
qa_dataset = setup_vector_db_and_qa(config=config, data_type="dataset", client=client)

[INFO] Loading metadata from file.
[INFO] Metadata loaded.
[INFO] Loading model...




[INFO] Model loaded.


In [84]:
get_result_from_query("find me a dataset about food preferences", qa=qa_dataset, 
type_of_query="dataset", config=config)

Unnamed: 0,id,name,command,OpenML URL,Description
0,43339,Chocolate-Bar-Ratings,dataset = openml.datasets.get_dataset(43339),"<a href=""https://www.openml.org/search?type=da...","did - 43339, name - Chocolate-Bar-Ratings, ver..."
1,43446,Online-Food-Delivery-Preferences-Bangalore-region,dataset = openml.datasets.get_dataset(43446),"<a href=""https://www.openml.org/search?type=da...","(string)], 28 : [28 - Unavailability (string)]..."
2,43825,Nutritional-values-for-common-foods-and-products,dataset = openml.datasets.get_dataset(43825),"<a href=""https://www.openml.org/search?type=da...","- serine (string)], 54 : [54 - threonine (stri..."
3,42133,cacao_flavor,dataset = openml.datasets.get_dataset(42133),"<a href=""https://www.openml.org/search?type=da...","did - 42133, name - cacao_flavor, version - 3,..."
4,43600,Updated-Wine-Enthusiast-Reviews,dataset = openml.datasets.get_dataset(43600),"<a href=""https://www.openml.org/search?type=da...","(numeric)], 4 : [4 - price (numeric)], 5 : [5 ..."
5,42089,vancouver_employee,dataset = openml.datasets.get_dataset(42089),"<a href=""https://www.openml.org/search?type=da...","2 : [2 - review_time (numeric)], 3 : [3 - revi..."
6,985,squash-unstored,dataset = openml.datasets.get_dataset(985),"<a href=""https://www.openml.org/search?type=da...","10 : [10 - groundspot_a* (numeric)], 11 : [11 ..."
7,42078,beer_reviews,dataset = openml.datasets.get_dataset(42078),"<a href=""https://www.openml.org/search?type=da...","(numeric)], 3 : [3 - review_overall (numeric)]..."
10,1498,sa-heart,dataset = openml.datasets.get_dataset(1498),"<a href=""https://www.openml.org/search?type=da...",sbp systolic blood pressure \ntobacco cumu...
11,340,squash-stored,dataset = openml.datasets.get_dataset(340),"<a href=""https://www.openml.org/search?type=da...",25. Acceptability - the acceptability of the f...


In [17]:
config["long_context_reorder"] = True

In [19]:
%%timeit -n 20
get_result_from_query("find me a dataset about food preferences", qa=qa_dataset, 
type_of_query="dataset", config=config)

[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[I

In [17]:
with open("data/all_dataset_description.csv", "r") as f:
    dataset_descriptions = f.read()

In [18]:
dataset_descriptions[:100]

'did,description,qualities,features\n2,"**Author**: Unknown. Donated by David Sterling and Wray Buntin'

## Aggregate and test multiple queries

In [14]:
## Aggregate results from multiple queries
queries = ["Find datasets related to COVID-19", "Find datasets related to COVID-19 and India", "COVID-19 dataset", "COVID-19 dataset India", "Mexico historical covid"]
combined_df = aggregate_multiple_queries_and_count(queries,qa_dataset=qa_dataset, config=config, group_cols = ["id", "name"], sort_by="query")

 60%|██████    | 3/5 [00:00<00:00, 11.53it/s]

[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.
[INFO] Reordering results...
[INFO] Reordering complete.


100%|██████████| 5/5 [00:00<00:00, 11.80it/s]

[INFO] Reordering results...
[INFO] Reordering complete.





In [15]:
combined_df.head(10)

Unnamed: 0,id,name,query
56,43733,Covid-19--historical-data,5
34,43367,COVID-19-Indonesia-Dataset,4
47,43509,COVID-19-Rio-de-Janeiro-(City),4
33,43365,Covid-19-Case-Surveillance-Public-Use-Dataset,4
35,43400,COVID-19-community-mobility-reports,4
37,43405,Covid-19-Turkey-Daily-Details-Dataset,4
38,43410,Coronavirus-Disease-(COVID-19),4
39,43412,COVID-19-Visualisation-and-Epidemic-Analysis-Data,4
41,43428,Mexico-COVID-19-clinical-data,4
42,43457,COVID19-Dataset-with-100-World-Countries,4


In [6]:
queries = ["Find me datasets related to mushrooms", "Fungi dataset", "Mushroom dataset", "shroom data", "types of mushroom", "earth fungus", "low features mushroom dataset"]
combined_df = aggregate_multiple_queries_and_count(queries,qa_dataset=qa_dataset, config=config, group_cols = ["id", "name"], sort_by="query")
combined_df.head(10)

Unnamed: 0,id,name,query
80,44272,Meta_Album_FNG_Micro,6
97,44302,Meta_Album_FNG_Mini,6
1,24,mushroom,6
113,44335,Meta_Album_FNG_Extended,6
98,44303,Meta_Album_PLT_DOC_Mini,5
71,44242,Meta_Album_PLT_VIL_Micro,5
108,44321,Meta_Album_PLT_VIL_Extended,5
81,44273,Meta_Album_PLT_DOC_Micro,5
114,44336,Meta_Album_PLT_DOC_Extended,5
67,44237,Meta_Album_BCT_Micro,5


In [9]:
queries = ["plant datasets, low features", "plant, less number of features", "plant dataset, tiny"]
combined_df = aggregate_multiple_queries_and_count(queries,qa_dataset=qa_dataset, config=config, group_cols = ["id", "name"], sort_by="query")
combined_df.head(10)

Unnamed: 0,id,name,query
22,44154,iris_reproduced,3
34,44299,Meta_Album_MED_LF_Mini,3
29,44273,Meta_Album_PLT_DOC_Micro,3
24,44242,Meta_Album_PLT_VIL_Micro,3
20,40983,wilt,3
32,44286,Meta_Album_PLT_VIL_Mini,3
33,44293,Meta_Album_PLT_NET_Mini,3
16,1493,one-hundred-plants-texture,3
15,1492,one-hundred-plants-shape,3
14,1491,one-hundred-plants-margin,3
