In [1]:
from modules.llm import *
from modules.utils import *

# Config

In [2]:
# Load the run settings from config.json. Judging by the helper's name and the
# "[INFO] Device found" log it prints, the same call also detects the compute device.
config = load_config_and_device("config.json")

[INFO] Finding device.
[INFO] Device found: cuda


In [9]:
# Switch this session to OpenML flows: every cell below receives `config` and
# therefore sees this value.
# NOTE(review): this mutates `config` in place and the execution count jumps
# from In [2] to In [9] — confirm the notebook still works under
# Restart Kernel -> Run All.
config["type_of_data"] = "flow"

## Data Processing

In [10]:
# Download the data if it does not already exist
openml_data_object, data_id, all_dataset_metadata = get_all_metadata_from_openml(
    config=config
)

# Combine everything fetched above into a single metadata dataframe.
metadata_df, all_dataset_metadata = create_metadata_dataframe(
    openml_data_object,
    data_id,
    all_dataset_metadata,
    config=config,
)

# Build the Chroma vector database. Each type of data gets its own collection,
# so a single database holds several collections and there are fewer databases
# to manage. The embedding model is downloaded here as well if it does not
# exist yet.
vectordb = load_document_and_create_vector_store(metadata_df, config=config)

# Set up the LLM chain: initialise the retriever and the LLM, then wire them
# together for Retrieval QA.
qa = initialize_llm_chain(vectordb=vectordb, config=config)

[INFO] Training is set to False.




## Getting results

In [11]:
# Natural-language question passed to the retrieval chain. Alternative example
# queries are kept here for reference; swap one in to try it:
#   "Which datasets would be useful for stock market support?"
#   "Which datasets would be useful for heart disease"
#   "Which datasets would be useful for flowers"
#   "Which datasets would be useful for image classification"
#   "My supervisor wants me to work on cloud cover, which datasets can I use"
#   "Are there any datasets from the netherlands?"
#   "Are there any datasets about farm animals?"
#   "Find chinese authors"
query = "Which flow can I use for classifying categories of data efficiently"

# Run the query through the QA chain; the matches come back as a dataframe.
result_data_frame = get_result_from_query(query=query, qa=qa, config=config)

# Preview the first rows of the result.
result_data_frame.head()

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,index,name,page_content,urls
0,4074,sklearn.tree.tree.DecisionTreeClassifier,"id - 4074, description - Flow generated by run...",https://www.openml.org/api/v1/json/flow/4074
1,3404,sklearn.tree.tree.DecisionTreeClassifier,"id - 3404, description - Flow generated by run...",https://www.openml.org/api/v1/json/flow/3404
2,3436,sklearn.ensemble.bagging.BaggingClassifier,"id - 3436, description - Flow generated by run...",https://www.openml.org/api/v1/json/flow/3436
3,6591,sklearn.pipeline.Pipeline(featureunion=sklearn...,"id - 6591, description - Automatically created...",https://www.openml.org/api/v1/json/flow/6591
4,6540,sklearn.pipeline.Pipeline(imputer=sklearn.prep...,"id - 6540, description - Automatically created...",https://www.openml.org/api/v1/json/flow/6540


In [12]:
# Show the page_content text of the first (up to) ten results as a raw array,
# since the dataframe preview above truncates this column.
result_data_frame['page_content'].values[:10]

array(["id - 4074, description - Flow generated by run_task, name - sklearn.tree.tree.DecisionTreeClassifier, tags - ['Verified_Supervised_Classification'],",
       "id - 3404, description - Flow generated by run_task, name - sklearn.tree.tree.DecisionTreeClassifier, tags - ['Verified_Supervised_Classification'],",
       "id - 3436, description - Flow generated by run_task, name - sklearn.ensemble.bagging.BaggingClassifier, tags - ['Verified_Supervised_Classification'],",
       'id - 6591, description - Automatically created scikit-learn flow., name - sklearn.pipeline.Pipeline(featureunion=sklearn.pipeline.FeatureUnion(votingclassifier=sklearn.ensemble.voting_classifier.VotingClassifier(est=sklearn.tree.tree.ExtraTreeClassifier),functiontransformer=sklearn.preprocessing._function_transformer.FunctionTransformer),logisticregression=sklearn.linear_model.logistic.LogisticRegression), tags - [],',
       'id - 6540, description - Automatically created scikit-learn flow., name - sklearn.

In [13]:
# List the OpenML URLs of the first (up to) ten results.
result_data_frame['urls'].values[:10]

array(['https://www.openml.org/api/v1/json/flow/4074',
       'https://www.openml.org/api/v1/json/flow/3404',
       'https://www.openml.org/api/v1/json/flow/3436',
       'https://www.openml.org/api/v1/json/flow/6591',
       'https://www.openml.org/api/v1/json/flow/6540',
       'https://www.openml.org/api/v1/json/flow/4834',
       'https://www.openml.org/api/v1/json/flow/6579',
       'https://www.openml.org/api/v1/json/flow/5440',
       'https://www.openml.org/api/v1/json/flow/3434',
       'https://www.openml.org/api/v1/json/flow/8004'], dtype=object)