In [2]:
from modules.llm import *
from modules.utils import *

# Config

In [None]:
# Load the project settings from config.json. Judging by the helper's name it
# also selects the compute device -- TODO confirm against modules.utils.
config = load_config_and_device("config.json")

In [None]:
# Choose which kind of OpenML entity the rest of the notebook indexes and
# searches. The active query in the "Getting results" section asks for a flow,
# and the saved results there are OpenML flow URLs, so this is set to "flow"
# to keep a fresh Restart & Run All consistent with that query and its outputs.
# NOTE(review): switch back to "dataset" only together with a dataset-oriented
# query.
config["type_of_data"] = "flow"

## Data Processing

In [None]:
 # Download the data if it does not exist
openml_data_object, data_id, all_dataset_metadata = get_all_metadata_from_openml(config=config)

# Build the combined metadata dataframe from the objects fetched above.
metadata_df, all_dataset_metadata = create_metadata_dataframe(
    openml_data_object,
    data_id,
    all_dataset_metadata,
    config=config,
)

# Build the Chroma vector database. Each type of data lives in its own
# collection, so a single database holds several collections and there are
# fewer databases to manage. The embedding model is downloaded here as well
# if it is not already present.
vectordb = load_document_and_create_vector_store(metadata_df, config=config)

# Wire up the LLM chain: retriever + LLM, combined into a Retrieval QA object.
qa = initialize_llm_chain(vectordb=vectordb, config=config)

## Getting results

In [10]:
# Alternative example queries, kept for reference (assign one to `query` to try it):
#   "Which datasets would be useful for stock market support?"
#   "Which datasets would be useful for heart disease"
#   "Which datasets would be useful for flowers"
#   "Which datasets would be useful for image classification"
#   "My supervisor wants me to work on cloud cover, which datasets can I use"
#   "Are there any datasets from the netherlands?"
#   "Are there any datasets about farm animals?"
#   "Find chinese authors"
# (Timing via %time was disabled in the original cell and stays disabled.)

# Query that is actually executed.
query = "Which flow can I use for classifying categories of data efficiently"

# Pass the query to the QA object and collect the results as a dataframe.
result_data_frame = get_result_from_query(query=query, qa=qa, config=config)

# Display the first rows of the result.
result_data_frame.head()

Unnamed: 0,index,name,page_content,urls
0,18090,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18090, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18090
1,18431,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18431, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18431
2,18401,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18401, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18401
3,18433,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18433, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18433
4,18229,sklearn.pipeline.Pipeline(step_0=automl.compon...,"id - 18229, full_name - sklearn.pipeline.Pipel...",https://www.openml.org/api/v1/json/flow/18229


In [11]:
# Show the 'page_content' values of the first ten result rows as an array.
result_data_frame['page_content'].values[:10]

array(['id - 18090, full_name - sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectPercentile,step_2=sklearn.tree._classes.DecisionTreeClassifier)(1), name - sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectPercentile,step_2=sklearn.tree._classes.DecisionTreeClassifier), version - 1, external_version - automl==0.0.1,openml==0.10.2,sklearn==0.22.1, uploader - 12269,',
       'id - 18431, full_name - sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectKBest,step_2=sklearn.tree._classes.DecisionTreeClassifier)(1), name - sklearn.pipeline.Pipeline(step_0=automl.compon

In [12]:
# Show the 'urls' values of the first ten result rows as an array.
result_data_frame['urls'].values[:10]

array(['https://www.openml.org/api/v1/json/flow/18090',
       'https://www.openml.org/api/v1/json/flow/18431',
       'https://www.openml.org/api/v1/json/flow/18401',
       'https://www.openml.org/api/v1/json/flow/18433',
       'https://www.openml.org/api/v1/json/flow/18229',
       'https://www.openml.org/api/v1/json/flow/18440',
       'https://www.openml.org/api/v1/json/flow/18112',
       'https://www.openml.org/api/v1/json/flow/20050',
       'https://www.openml.org/api/v1/json/flow/18130',
       'https://www.openml.org/api/v1/json/flow/18100'], dtype=object)

In [14]:
# Show the 'name' values of the first ten result rows as an array.
result_data_frame['name'].values[:10]

array(['sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectPercentile,step_2=sklearn.tree._classes.DecisionTreeClassifier)',
       'sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=sklearn.feature_selection._univariate_selection.SelectKBest,step_2=sklearn.tree._classes.DecisionTreeClassifier)',
       'sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=automl.util.sklearn.StackingEstimator(estimator=sklearn.tree._classes.DecisionTreeClassifier),step_2=sklearn.naive_bayes.BernoulliNB)',
       'sklearn.pipeline.Pipeline(step_0=automl.components.feature_preprocessing.multi_column_label_encoder.MultiColumnLabelEncoderComponent,step_1=automl.util.sklearn.StackingEstimator