In [1]:
from modules.llm import *
from modules.utils import *

## Config

In [2]:
# Central settings for this notebook; the cells below read their options from here.
config = {
    # Prompt text handed to the QA chain. {question} and {context} look like
    # template placeholders filled in by the chain -- confirm in modules.llm.
    "rqa_prompt_template": (
        "This database is a list of dataset metadata. "
        "Use the following pieces of context to find the relevant document. "
        "Answer only from the context given using the {question} given. "
        "If you do not know the answer, say you do not know. "
        "{context}"
    ),
    # Passed as `num_return_documents` when the retriever is created.
    "num_return_documents": 50,
    # Embedding model name used when building the vector store
    # (the commented line is an alternative model kept for reference).
    "embedding_model": "BAAI/bge-base-en-v1.5",
    # "embedding_model": "Intel/bge-small-en-v1.5-rag-int8-static",
    # Model repo id passed to the LLM factory.
    "llm_model": "HuggingFaceH4/zephyr-7b-beta",
    # Directory passed as `persist_directory` for the vector store.
    "persist_dir": "./chroma_db/",
    # Toggle: passed as `recreate_chroma` to the vector-store helper.
    # "recreate_chroma": False,
    "recreate_chroma": True,
    # Toggle: passed as `recreate_cache` to the metadata download helper.
    "recreate_data_cache": False,
    # "recreate_data_cache": True,
    # Passed as `n_jobs` to the metadata download helper.
    "data_download_n_jobs": 30,
    # Use "cuda" if you have a GPU or "cpu" if you don't. "mps" is for Mac M{1..3} machines.
    "device": "cuda",
    # Kind of OpenML entity this notebook works on; the result helper below handles "dataset" and "flow".
    "type_of_data": "flow",
}

## Data Processing

In [3]:
# Fetch the OpenML metadata for the configured `type_of_data`.
# The helper comes from the star imports at the top of the notebook; judging by
# its `recreate_cache` argument it can reuse a local cache -- confirm in the module.
openml_data_object, data_id, all_dataset_metadata = get_all_metadata_from_openml(
    recreate_cache=config["recreate_data_cache"],
    type_of_data=config["type_of_data"],
    n_jobs=config["data_download_n_jobs"],
)

In [4]:
# Step 1: turn the downloaded objects into a metadata DataFrame.
# Note that `all_dataset_metadata` is rebound to the value returned here.
metadata_df, all_dataset_metadata = create_metadata_dataframe(
    openml_data_object,
    data_id,
    all_dataset_metadata,
    type_of_data=config["type_of_data"],
)

# Step 2: pass the frame through the cleaning helper; `metadata_df` now refers
# to the cleaned version used by the vector-store cell.
metadata_df = clean_metadata_dataframe(
    metadata_df,
    type_of_data=config["type_of_data"],
)

In [6]:
# Build (or reload, depending on `recreate_chroma`) the vector store from the
# metadata frame. With `recreate_chroma=True` the saved output of this cell
# shows a progress bar of roughly three minutes.
vectordb = load_document_and_create_vector_store(
    metadata_df,
    model_name=config["embedding_model"],
    recreate_chroma=config["recreate_chroma"],
    persist_directory=config["persist_dir"],
    device=config["device"],
    type_of_data=config["type_of_data"],
)

100%|███████████████████████████████████████████████████████████████████████| 111/111 [03:13<00:00,  1.74s/it]


In [7]:
# Create the retriever and the LLM on top of the vector store ...
retriever, llm = create_retriever_and_llm(
    vectordb,
    num_return_documents=config["num_return_documents"],
    model_repo_id=config["llm_model"],
)

# ... then combine them with the prompt template into `qa`, the object the
# result cells below call via `qa.invoke({"query": ...})`.
qa = create_llm_chain_and_query(
    vectordb=vectordb,
    retriever=retriever,
    llm=llm,
    prompt_template=config["rqa_prompt_template"],
)

  warn_deprecated(


## Getting results

In [10]:
def create_result_dataframe(query, qa, all_dataset_metadata, type_of_data = "runs") -> pd.DataFrame:
    """
    Run `query` through the QA chain and tabulate the retrieved source documents.

    Parameters
    ----------
    query : str
        Natural-language question passed to the chain as ``{"query": query}``.
    qa
        Object exposing ``invoke(dict)``. The returned mapping must contain a
        ``"source_documents"`` list whose items have ``.metadata`` (a dict with
        a ``"name"`` entry and an id entry, see below) and ``.page_content``.
    all_dataset_metadata
        Not used by this function; kept so existing call sites keep working.
    type_of_data : str
        ``"dataset"`` (id read from ``metadata["did"]``, URLs under ``/data/``)
        or ``"flow"`` (id read from ``metadata["id"]``, URLs under ``/flow/``).
        The default ``"runs"`` is not supported and raises ``ValueError``.

    Returns
    -------
    pd.DataFrame
        One row per distinct document id with the columns ``index`` (the id),
        ``name``, ``page_content`` and ``urls``. If the chain returns no source
        documents, a zero-row frame with those same columns is returned.

    Raises
    ------
    ValueError
        If `type_of_data` is neither ``"dataset"`` nor ``"flow"``. Previously
        this case silently returned ``None`` despite the annotated return type.
    """
    # Supported type -> (metadata key holding the OpenML id, URL path segment).
    id_key_and_url_segment = {
        "dataset": ("did", "data"),
        "flow": ("id", "flow"),
    }
    # Validate before invoking the chain so an unsupported type fails fast.
    if type_of_data not in id_key_and_url_segment:
        supported = ", ".join(repr(key) for key in id_key_and_url_segment)
        raise ValueError(
            f"Unsupported type_of_data {type_of_data!r}; expected one of: {supported}"
        )
    id_key, url_segment = id_key_and_url_segment[type_of_data]

    results = qa.invoke({"query": query})

    # Keyed by id: if the same id is retrieved twice, the later document wins.
    dict_results = {
        result.metadata[id_key]: {
            "name": result.metadata["name"],
            "page_content": result.page_content,
        }
        for result in results["source_documents"]
    }

    # Without this guard an empty retrieval would yield a frame lacking the
    # "name" / "page_content" columns that the inspection cells index into.
    if not dict_results:
        return pd.DataFrame(columns=["index", "name", "page_content", "urls"])

    # Ids are the dict keys, so transpose to get one row per id and move the
    # ids into a regular "index" column.
    output_df = pd.DataFrame(dict_results).T.reset_index()
    output_df["urls"] = output_df["index"].apply(
        lambda x: f"https://www.openml.org/api/v1/json/{url_segment}/{x}"
    )
    return output_df

In [16]:
# %time
# Example queries from earlier dataset-oriented runs, kept for reference:
# query = "Which datasets would be useful for stock market support?"
# query = "Which datasets would be useful for heart disease"
# query = "Which datasets would be useful for flowers"
# query = "Which datasets would be useful for image classification"
# query = "My supervisor wants me to work on cloud cover, which datasets can I use"
# query = "Are there any datasets from the netherlands?"
# query = "Are there any datasets about farm animals?"
# query = "Find chinese authors"
query = "Which flow can I use for classifying categories of data efficiently"

# Read the data type from `config` (as every other cell does) instead of
# hard-coding "flow", so this cell cannot drift from the vector store it queries.
results = create_result_dataframe(
    query,
    qa,
    all_dataset_metadata,
    type_of_data=config["type_of_data"],
)
results.head()

Unnamed: 0,index,name,page_content,urls
0,18090,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18090, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18090
1,18431,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18431, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18431
2,18401,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18401, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18401
3,18433,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18433, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18433
4,18229,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18229, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18229


In [17]:
# Text of the first ten retrieved documents.
results["page_content"].head(10).values

array(['id - 18090, full_name - sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectPercentile,step_2=sklearn.tree._classes.DecisionTreeClassifier)(1), name - sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectPercentile,step_2=sklearn.tree._classes.DecisionTreeClassifier), version - 1, external_version - automl==0.0.1,openml==0.10.2,sklearn==0.22.1, uploader - 12269,',
       'id - 18431, full_name - sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectKBest,step_2=sklearn.tree._classes.DecisionTreeClassifier)(1), name - sklearn.pipeline.Pipeline(step_0=automl.compon

In [14]:
# OpenML API URLs of the first ten retrieved documents.
results["urls"].head(10).values

array(['https://www.openml.org/api/v1/json/flow/8188',
       'https://www.openml.org/api/v1/json/flow/8187',
       'https://www.openml.org/api/v1/json/flow/8283',
       'https://www.openml.org/api/v1/json/flow/9178',
       'https://www.openml.org/api/v1/json/flow/8091',
       'https://www.openml.org/api/v1/json/flow/18698',
       'https://www.openml.org/api/v1/json/flow/7823',
       'https://www.openml.org/api/v1/json/flow/8035',
       'https://www.openml.org/api/v1/json/flow/5848',
       'https://www.openml.org/api/v1/json/flow/18925'], dtype=object)

In [84]:
# Names of the first ten retrieved documents.
results["name"].head(10).values

array(['BTC', 'COVID-19-biotech-companies-on-stock-exchange(2020)',
       'Apple-Historical-Dataset',
       'Ethereum-Cryptocurrency-Historical-Dataset',
       'Corporate-Credit-Rating', 'Stock-Information',
       'Historical-data-on-the-trading-of-cryptocurrencies',
       'Stock-price-trend-prediction', 'Stock-Market-NIFTY50-Index-Data',
       'Google-Stock-10Year-data2004-2020'], dtype=object)