In [56]:

from dash import dcc, html
from flask import Flask, render_template, jsonify, request, send_file
from glob import glob
from interpret import set_visualize_provider
from interpret.glassbox import ExplainableBoostingClassifier
from interpret.provider import InlineProvider
from itables import to_html_datatable
from jinja2 import Environment, FileSystemLoader
from ollama import ChatResponse
from ollama import chat
from pathlib import Path
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from tqdm.auto import tqdm
from typing import Optional, Union
from utils import OpenMLTaskHandler
import base64
import dash
import dash_bootstrap_components as dbc
import io
import json
import markdown
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import openml
import os
import pandas as pd
import plotly
import plotly.express as px
import plotly.graph_objs as go
import re
import seaborn as sns
import sqlite3
from typing import Any
matplotlib.use('agg')
set_visualize_provider(InlineProvider())

In [57]:
def safe_load_file(file_path, file_type) -> Union[pd.DataFrame, dict, None]:
        """
        This function is responsible for safely loading a file. It returns None if the file is not found or if there is an error loading the file.
        """
        if file_type == "json":
            try:
                with open(str(Path(file_path)), "r") as f:
                    return json.load(f)
            except:
                return None
        elif file_type == "pd":
            try:
                return pd.read_csv(str(file_path))
            except:
                return None
        elif file_type == "textdict":
            try:
                with open(file_path, "r") as f:
                    return json.loads(f.read())
            except:
                return None
        else:
            raise NotImplementedError
    

In [58]:
def find_max_existing_dataset_id()->int:
    conn = sqlite3.connect("./data/runs.db")
    c = conn.cursor()
    c.execute("SELECT DISTINCT dataset_id FROM runs")
    rows = c.fetchall()
    conn.close()
    return max([x[0] for x in rows]) if rows else 0

In [59]:
class DataReportGenerator:
    def __init__(self, generated_ebm_report_dir):
        self.generated_ebm_report_dir = generated_ebm_report_dir

    def generate_ebm_report(self, names, scores):
        fig = px.bar(
            x=names,
            y=scores,
            orientation='v',
            color_discrete_sequence=px.colors.qualitative.Safe
        )
        fig.update_layout(
            title="Feature Importance",
            xaxis_title="Feature",
            yaxis_title="Score"
        )
        return fig.to_html(full_html=False, include_plotlyjs='cdn')

    def run_ebm_on_dataset(self, dataset_id, X_train, y_train):
        try:
            ebm = ExplainableBoostingClassifier(random_state=42)
            ebm.fit(X_train, y_train)
            ebm_global = ebm.explain_global().data()
            names, scores = ebm_global["names"], ebm_global["scores"]
            return self.generate_ebm_report(names, scores)
        except Exception as e:
            return "<div>Unable to generate feature importance report</div>"

    def get_data_and_split(self, dataset_id):
        dataset = openml.datasets.get_dataset(dataset_id=dataset_id)
        X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)
        X = pd.get_dummies(X, prefix_sep='.').astype(float)
        y, y_categories = y.factorize()
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
        return X, y, X_train, y_train

    
    def get_feature_distribution(self, X):
        try:
            return to_html_datatable(X.describe().T)
        except Exception as e:
            return "<div>Unable to generate feature distribution</div>"

    def class_imbalance(self, y):
        try:
            return to_html_datatable(pd.DataFrame(y).value_counts(), index=True)
        except Exception as e:
            return "<div>Unable to generate class imbalance report</div>"
        
    def get_missing_value_count(self, X):
        try:
            return to_html_datatable(pd.DataFrame(X.isnull().sum(), columns=["Missing Value Count"]))
        except Exception as e:
            return "<div>Unable to generate missing value count</div>"

    def generate_data_report_for_dataset(self, dataset_id):
        report_path = f"{self.generated_ebm_report_dir}/{dataset_id}_report.html"
        if os.path.exists(report_path):
            pass
        else:
            try:
                X, y, X_train, y_train = self.get_data_and_split(dataset_id)

                ebm_report = self.run_ebm_on_dataset(dataset_id, X_train, y_train)
                missing_value_count = self.get_missing_value_count(X)
                feature_distribution = self.get_feature_distribution(X)
                class_imbalance_report = self.class_imbalance(y)

                report_html = f"""
                    <h1>Extra Dataset Information</h1>
                    
                    <h2>Feature Importance</h2>
                    {ebm_report}
                    <h2>Feature Distribution</h2>
                    {feature_distribution}
                    <h2>Class Imbalance</h2>
                    {class_imbalance_report}
                    <h2>Missing Value Count</h2>
                    {missing_value_count}
                    </div>
                    """
                
                report_path = f"{self.generated_ebm_report_dir}/{dataset_id}_report.html"
                with open(report_path, "w") as f:
                    f.write(report_html)
            except Exception as e:
                print(f"Error processing dataset {dataset_id}: {e}")

In [77]:
class ResultCollector:
    def __init__(self, path: str = "./data/results/*"):
        self.experiment_directory = Path(path)

        self.all_run_paths = glob(pathname=str(self.experiment_directory))
        self.all_results = pd.DataFrame()
        self.openml_task_handler = OpenMLTaskHandler()
        # Required columns
        self.required_columns = {
            "metric",
            "result",
            "framework",
            "dataset_id",
            "id",
            "task",
            "predict_duration",
            "models",
        }

        # Define how to find the best result for the metric
        self.metric_used_dict = {
            "auc": lambda x: x.max(),
            "neg_logloss": lambda x: x.min(),
        }
    
    def get_dataset_description_from_id(self, dataset_id: int) -> Optional[str]:
        return openml.datasets.get_dataset(dataset_id).description

    def collect_all_run_info_to_df(self):
        """
        This function is responsible for loading all the results files from the runs and storing them in self.all_results. This is further used to generate the dashboard.
        """
        all_results_list = []  # Temporary list to store individual DataFrames

        for run_path in tqdm(self.all_run_paths, total=len(self.all_run_paths)):
            run_path = Path(run_path)
            results_file_path = run_path / "results.csv"

            # Load results file if it exists
            results_file = safe_load_file(results_file_path, "pd")

            # If results file is loaded, proceed to process it
            if results_file is not None:
                # Get the model path specific to this run_path
                models_path_list = list((run_path / "models").rglob("models.*"))
                leaderboard_path_list = list(
                    (run_path / "models").rglob("leaderboard.*")
                )
                # models_path = str(models_path_list[0]) if len(models_path_list) >0 else None

                if len(models_path_list) > 0:
                    models_path = str(models_path_list[0])
                elif len(leaderboard_path_list) > 0:
                    models_path = str(leaderboard_path_list[0])
                else:
                    models_path = None

                # Add the model path as a new column in the current results_file DataFrame
                results_file["models"] = models_path

                # Get the dataset ID for each row in the results file
                results_file["dataset_id"] = results_file["id"].apply(
                    self.openml_task_handler.get_dataset_id_from_task_id
                )
                results_file["dataset_description"] = results_file["dataset_id"].apply(
                    self.get_dataset_description_from_id
                )

                # Append the processed DataFrame to our list
                all_results_list.append(results_file)

        # Concatenate all individual DataFrames into self.all_results
        if all_results_list:
            self.all_results = pd.concat(all_results_list, ignore_index=True)
    
    def validate_dataframe_and_add_extra_info(self):
        # Validate DataFrame
        if self.all_results is None or self.all_results.empty:
            return "Error: Provided DataFrame is empty or None."

        # Handle duplicate frameworks by keeping the one with the best result
        self.all_results = self.all_results.drop_duplicates(subset=["framework"], keep="first")

        # Add missing columns with default values
        for column in self.required_columns:
            if column not in self.all_results.columns:
                self.all_results[column] = "N/A"
        
    def __call__(self):
        self.collect_all_run_info_to_df()
        return self.all_results
        # self.validate_dataframe_and_add_extra_info()

In [101]:
class GenerateCompleteReportForDataset:
    def __init__(self, dataset_id: int, collector_results, GENERATED_REPORTS_DIR: str = "./data/generated_reports", GENERATED_DATA_REPORT_DIR: str = "./data/generated_data_reports"):
        self.dataset_id = dataset_id
        self.collector_results = collector_results
        self.current_results = self.get_results_for_dataset_id(self.dataset_id)
        self.jinja_environment = Environment(
            loader=FileSystemLoader("./website_assets/templates/")
        )
        self.generated_final_reports_dir = GENERATED_REPORTS_DIR
        self.generated_data_reports_dir = GENERATED_DATA_REPORT_DIR
        self.template_to_use = {
            "best_result": "best_result_table.html",
            "framework_table": "framework_table.html",
            "metric_vs_result": "metric_vs_result.html",
        }
        binary_metrics = [
            "auc",
            "logloss",
            "acc",
            "balacc",
        ]  # available metrics: auc (AUC), acc (Accuracy), balacc (Balanced Accuracy), pr_auc (Precision Recall AUC), logloss (Log Loss), f1, f2, f05 (F-beta scores with beta=1, 2, or 0.5), max_pce, mean_pce (Max/Mean Per-Class Error).
        multiclass_metrics = [
            "logloss",
            "acc",
            "balacc",
        ]  # available metrics: same as for binary, except auc, replaced by auc_ovo (AUC One-vs-One), auc_ovr (AUC One-vs-Rest). AUC metrics and F-beta metrics are computed with weighted average.
        regression_metrics = [
            "rmse",
            "r2",
            "mae",
        ]  # available metrics: mae (Mean Absolute Error), mse (Mean Squared Error), msle (Mean Squared Logarithmic Error), rmse (Root Mean Square Error), rmsle (Root Mean Square Logarithmic Error), r2 (R^2).
        timeseries_metrics = [
            "mase",
            "mape",
            "smape",
            "wape",
            "rmse",
            "mse",
            "mql",
            "wql",
            "sql",
        ]  # available metrics: mase (Mean Absolute Scaled Error), mape (Mean Absolute Percentage Error),
        self.all_metrics = (
            binary_metrics
            + multiclass_metrics
            + regression_metrics
            + timeseries_metrics
        )

        # run the function to get the best result
        self.framework_names = ["Auto-sklearn", "H20AutoML", "AutoGluon", "All results"]
        self.process_fns = [
            self.process_auto_sklearn_data(self.current_results),
            self.get_rows_for_framework_from_df(
                df=self.current_results, framework_name="H20AutoML", top_n=10
            ),
            self.get_rows_for_framework_from_df(
                df=self.current_results, framework_name="AutoGluon", top_n=10
            ),
            self.get_rows_for_framework_from_df(
                df=self.current_results, framework_name="All results"
            ),
        ]

        self.best_framework = ""
        self.best_metric = ""
        self.type_of_task =""
        self.dataset_id = ""
        self.task_id = ""
        self.task_name = ""
        self.best_result_for_metric = ""
        self.description = ""
        self.metric_and_result = ""

        self.get_best_result()
    def get_results_for_dataset_id(self, dataset_id: int) -> Optional[pd.DataFrame]:
        """
        This function returns the results for a given dataset_id. If no results are found, it returns None.
        """
        results_for_dataset = self.collector_results[
            self.collector_results["dataset_id"] == dataset_id
        ]
        if results_for_dataset.empty:
            return None
        return results_for_dataset


    def get_best_result(self):
        """
        This function returns the best result from the current_results DataFrame. It first sorts the DataFrame based on the metric used and then returns the best result.
        """
        if self.current_results is None:
            return None
        metric_used = self.current_results["metric"].iloc[0]
        if metric_used in ["auc", "acc", "balacc"]:
            # Since higher value is better we sort in descending order
            sort_in_ascending_order = False
        elif metric_used in ["logloss", "neg_logloss"]:
            # Since lower value is better we sort in ascending order
            sort_in_ascending_order = True
        else:
            sort_in_ascending_order = False

        sorted_results = self.current_results.sort_values(
            by="result", ascending=sort_in_ascending_order
        ).head()

        best_result = sorted_results.iloc[0]
        self.best_framework = best_result.get("framework", "")
        self.best_metric = best_result.get("metric", "")
        self.type_of_task = best_result.get("type", "")
        self.dataset_id = best_result.get("dataset_id", "")
        self.task_id = "https://" + best_result.get("id", "")
        self.task_name = best_result.get("task", "")
        self.best_result_for_metric = best_result.get("result", "")
        self.description = best_result.get("dataset_description", "")

        # all metric columns that are in the dataframe and in the list of all metrics
        metric_columns = [
            col for col in self.current_results.columns if col in self.all_metrics
        ]
        all_metrics_present = []
        for metric in metric_columns:
            try:
                all_metrics_present.append(self.current_results[metric].values[0])
            except:
                pass

        self.metric_and_result = " ".join(
            [
                f"The {metric} is {result} "
                for metric, result in zip(metric_columns, all_metrics_present)
            ]
        )

    def generate_best_result_table(self):
        """
        This function generates the best result table using the best result information.
        """
        template = self.jinja_environment.get_template(
            self.template_to_use["best_result"]
        )
        return template.render(
            best_framework=self.best_framework,
            best_metric=self.best_metric,
            type_of_task=self.type_of_task,
            dataset_id=self.dataset_id,
            task_id=self.task_id,
            task_name=self.task_name,
        )

    def process_auto_sklearn_data(self, df, top_n=10):
        auto_sklearn_data = pd.DataFrame()
        try:
            auto_sklearn_rows = df[df["framework"] == "autosklearn"]
            # for each row, read the json file from the models column and get the model id and cost
            for _, row in auto_sklearn_rows.iterrows():
                models_path = row["models"]
                try:
                    with open(models_path, "r") as f:
                        models_file = json.load(f)
                        for model in models_file:
                            model_type = (
                                "sklearn_classifier"
                                if "sklearn_classifier" in models_file[model]
                                else "sklearn_regressor"
                            )

                            auto_sklearn_data = pd.concat(
                                [auto_sklearn_data, pd.DataFrame([models_file[model]])],
                                ignore_index=True,
                            )
                except:
                    pass
                auto_sklearn_data = auto_sklearn_data.sort_values(
                    by="cost", ascending=True
                ).head(top_n)
                return to_html_datatable(auto_sklearn_data, caption="Auto Sklearn Models")
        except Exception as e:
            print(e)
            return "<div></div>"

        # return auto_sklearn_data.to_html()

    def get_rows_for_framework_from_df(
        self, df: pd.DataFrame, framework_name, top_n=40
    ):
        try:
            if framework_name == "All results":
                # drop the description column if it exists
                try:
                    df.drop("dataset_description", axis=1)
                except:
                    pass
                return to_html_datatable(df, caption="All Results")
            framework_rows: pd.DataFrame = df[df["framework"] == framework_name][
                "models"
            ].values[0]
            framework_data = safe_load_file(framework_rows, "pd")
            if top_n is not None:
                framework_data = framework_data.head(40)

            return to_html_datatable(framework_data, caption=f"{framework_name} Models")
        except:
            return ""

    def generate_framework_table(self):
        """
        This function generates the framework table using the framework_name information.
        """
        complete_html = ""
        for framework_name, process_fn in zip(self.framework_names, self.process_fns):
            try:
                complete_html += process_fn
            except:
                pass


        return f"""<div class="container">
                <h2>{framework_name}</h2>
                    {complete_html}
                </div>
                """

    def generate_dashboard_section(self):
        dashboard_html = f"""
        <div style="text-align: center; margin-bottom: 20px; margin-top: 20px;">
            <h1>Framework Performance Dashboard</h1>
        </div>
        <div style="display: grid; grid-template-columns: 1fr 1fr; gap: 30px; margin-bottom: 40px;">
        {self.graph_and_heading(self.current_results, self.best_metric.upper() + "-task", "task", "result", "framework", f"{self.best_metric.upper()} of each Framework", "1", "This is a plot of the main metric used in the experiment against the result of the experiment for each framework for each task. Use this plot to compare the performance of each framework for each task.", "bar")}
        {self.graph_and_heading(self.current_results, "predict-duration-task", "framework", "predict_duration", "framework", "Predict Duration of each Framework", "2", "This is a plot of the prediction duration for each framework for each task. Use this plot to find the framework with the fastest prediction time.", "bar")}
        {self.graph_and_heading(self.current_results, "framework-performance", "framework", "result", "framework", "Performance of each Framework", "1", "This is a plot of the performance of each framework for each task. Use this plot find the best framework for the tasks.", "bar")}
        {self.graph_and_heading(self.current_results, "predict-duration-performance", "predict_duration", "result", "framework", "Predict Duration vs Performance", "2", "This is a scatter plot of the prediction duration against the performance of each framework for each task. Use this plot to find the best framework for the tasks.", "scatter")}
        </div>
        """
        return dashboard_html

    def graph_and_heading(
        self,
        df,
        graph_id,
        x,
        y,
        color,
        title,
        grid_column,
        description,
        plot_type="bar",
    ):
        try:
            colors = px.colors.qualitative.Safe
            if len(x) == 0:
                return "<div></div>"

            # use plotly to create the plot
            if plot_type == "bar":
                fig = px.bar(df, x=x, y=y, color=color, title=title, color_discrete_sequence=colors)
            elif plot_type == "scatter":
                fig = px.scatter(df, x=x, y=y, color=color, title=title, color_discrete_sequence=colors)

            fig.update_layout(
                title=title,
                xaxis_title=x,
                yaxis_title=y,
            )
            encoded_image = fig.to_html(full_html=False, include_plotlyjs="cdn")

            return f"<div style='grid-column: {grid_column};'>{encoded_image}</div>"
        except Exception as e:
            print(e)
            return f"<div style='grid-column: {grid_column};'><p>Error generating graph: {str(e)}</p></div>"

    def get_explanation_from_llm(self):
        prompt_format = f"""For a dataset called {self.task_name} , the best framework is {self.best_framework} with a {self.best_metric} of {self.best_result_for_metric}. This is a {self.type_of_task} task. The results are as follows {self.metric_and_result}. For each metric, tell me if this is a good score (and why), and if it is not, how can I improve it? Keep your answer to the point.
        The dataset description is: {self.description}
    """
        response: ChatResponse = chat(
            model="llama3.2",
            messages=[
                {
                    "role": "user",
                    "content": prompt_format,
                },
            ],
            options={
                "temperature": 0.3,
            }
        )
        response = response["message"]["content"]
        markdown_response = markdown.markdown(response)
        return markdown_response
    
    def get_data_report(self):
        try:
            with open(f"{self.generated_data_reports_dir}/{self.dataset_id}_report.html", "r") as f:
                return f.read()
        except FileNotFoundError:
            return "<div><p>Feature importance not available for this dataset</p></div>"
    
    def __call__(self):
        best_result_table = self.generate_best_result_table()
        framework_table = self.generate_framework_table()
        dashboard_section = self.generate_dashboard_section()
        explanation = self.get_explanation_from_llm()
        feature_importance = self.get_data_report()
        combined_html = f"""
    <!-- Latest compiled and minified CSS -->
<link rel="stylesheet" href="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/css/bootstrap.min.css">

<!-- jQuery library -->
<script src="https://ajax.googleapis.com/ajax/libs/jquery/3.7.1/jquery.min.js"></script>

<!-- Latest compiled JavaScript -->
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.4.1/js/bootstrap.min.js"></script>
    <div class="container">
        {best_result_table}
        {feature_importance}
        {dashboard_section}
        <div>
        <h1>Explanation and What's next?</h1>
        <p>!!! This is an AI-generated (llama3.2) explanation of the results. Please take the response with a grain of salt and use your own judgement.</p>
        <p>{explanation}</p>
        </div>
        {framework_table}

        </div>
        """

        with open(Path(self.generated_final_reports_dir)/f"report_{self.dataset_id}.html", "w") as f:
            f.write(combined_html)

In [102]:
# set paths
GENERATED_DATA_REPORT_DIR = Path("./data/generated_data_reports")
os.makedirs(GENERATED_DATA_REPORT_DIR, exist_ok=True)

GENERATED_REPORTS_DIR = Path("./data/generated_reports")
GENERATED_REPORTS_DIR.mkdir(exist_ok=True)

# find the largest dataset id that has been processed
max_existing_dataset_id:int = find_max_existing_dataset_id()

In [63]:
def run_report_script_for_all_datasets(GENERATED_DATA_REPORT_DIR, GENERATED_REPORTS_DIR, max_existing_dataset_id):
    # collect all the results from the runs
    collector = ResultCollector()
    all_results = collector()
    drg = DataReportGenerator(GENERATED_DATA_REPORT_DIR)
    for dataset_id in tqdm(range(1,max_existing_dataset_id + 1)):
        try:
            # generate the data report for all datasets
            drg.generate_data_report_for_dataset(dataset_id=dataset_id)
            # write complete report to a file
            GenerateCompleteReportForDataset(dataset_id=dataset_id,collector_results=all_results, GENERATED_DATA_REPORT_DIR=GENERATED_DATA_REPORT_DIR, GENERATED_REPORTS_DIR=GENERATED_REPORTS_DIR)()
        except Exception as e:
            print(f"Error generating report for dataset {dataset_id}: {str(e)}")


In [64]:
run_report_script_for_all_datasets(GENERATED_DATA_REPORT_DIR, GENERATED_REPORTS_DIR, max_existing_dataset_id)

100%|██████████| 65/65 [00:00<00:00, 394.66it/s]
  0%|          | 0/11 [00:00<?, ?it/s]

Error generating report for dataset 1: 'NoneType' object is not subscriptable


 27%|██▋       | 3/11 [00:18<00:48,  6.05s/it]


KeyboardInterrupt: 

In [1]:
import openml

In [None]:
openml.tasks.get_task()

In [1]:
import openml
openml.config.apikey = openml.config.get_config_as_dict()['apikey']

In [3]:
t = openml.tasks.get_task(2)

In [8]:
df = t.get_dataset().get_data()[0]
df.head()

Unnamed: 0,family,product-type,steel,carbon,hardness,temper_rolling,condition,formability,strength,non-ageing,...,s,p,shape,thick,width,len,oil,bore,packing,class
0,,C,A,8.0,0.0,,S,,0.0,,...,,,COIL,0.7,610.0,0.0,,0,,3
1,,C,R,0.0,0.0,,S,2.0,0.0,,...,,,COIL,3.2,610.0,0.0,,0,,3
2,,C,R,0.0,0.0,,S,2.0,0.0,,...,,,SHEET,0.7,1300.0,762.0,,0,,3
3,,C,A,0.0,60.0,T,,,0.0,,...,,,COIL,2.801,385.1,0.0,,0,,3
4,,C,A,0.0,60.0,T,,,0.0,,...,,,SHEET,0.801,255.0,269.0,,0,,3


In [10]:
df["class"].unique()

['3', 'U', '1', '5', '2']
Categories (6, object): ['1' < '2' < '3' < '4' < '5' < 'U']

In [98]:
d = openml.datasets.get_dataset(128)

In [125]:
from pandas import CategoricalDtype
import pandas as pd
import numpy as np

def get_target_col_type(dataset, target_col_name):
    try:
        if dataset.features:
            return next((feature.data_type for feature in dataset.features.values() if feature.name == target_col_name), None)
    except Exception as e:
        print(e)
        return None
def check_if_api_key_is_valid():
    if not openml.config.get_config_as_dict()['apikey']:
        print("API key is not set. Please set the API key using openml.config.apikey = 'your-key'")
        return False
    else:
        return True


def try_create_task(dataset_id):
    try:
        dataset = openml.datasets.get_dataset(dataset_id)
        target_col_name = dataset.default_target_attribute
        target_col_type = get_target_col_type(dataset, target_col_name)

        if target_col_type:
            if target_col_type in ['nominal', 'string', 'categorical']:
                evaluation_measure="predictive_accuracy"
                task_type = openml.tasks.TaskType.SUPERVISED_CLASSIFICATION
            elif target_col_type == 'numeric':
                evaluation_measure="mean_absolute_error"
                task_type = openml.tasks.TaskType.SUPERVISED_REGRESSION
            else:
                return None
            
            task = openml.tasks.create_task(
                dataset_id=dataset_id,
                task_type=task_type,
                target_name=target_col_name,
                evaluation_measure=evaluation_measure,
                estimation_procedure_id=1)
            # try:
            if check_if_api_key_is_valid():
                task.publish()
            else:
                return None
            print(f"Task created: {task}, task_id: {task.task_id}")
            return task.task_id
        else:
            return None
    except Exception as e:
        print(e)
    
    return None



In [128]:
try_create_task(46342)

Task created: OpenML Classification Task
Task Type Description: https://www.openml.org/tt/TaskType.SUPERVISED_CLASSIFICATION
Task ID..............: 362130
Task URL.............: https://www.openml.org/t/362130
Estimation Procedure.: None
Evaluation Measure...: predictive_accuracy
Target Feature.......: result
Cost Matrix..........: Available, task_id: 362130


362130

- todo
  - requirements to poetry
  - install all required frameworks
  - check if results exist

In [1]:
import sqlite3


In [5]:
# connect to the database and delete rows that have framework==autogluon
conn = sqlite3.connect('./data/runs.db')
c = conn.cursor()
benchmarks_to_use = [ "autoweka", "decisiontree", "flaml", "gama", "h20automl", "hyperoptsklearn", "lightautoml", "oboe", "tpot", "autogluon"]
for framework in benchmarks_to_use:
    c.execute(f"DELETE FROM runs WHERE framework='{framework}'")
conn.commit()

In [14]:
import sqlite3
# ['dataset_id', 'task_id', 'framework']
conn = sqlite3.connect('./data/runs.db')
c = conn.cursor()

c.execute("SELECT distinct dataset_id FROM runs")
rows = c.fetchall()
rows = [x[0] for x in rows]
max(rows)


11

In [7]:
# conn = sqlite3.connect('./data/runs.db')
# c = conn.cursor()

# # add task (10, 10, 'autogluon') to the database
# c.execute("INSERT INTO runs VALUES (10, 10, 'autogluon')")
# conn.commit()

# load the db as writable
conn = sqlite3.connect('./data/runs.db')
c = conn.cursor()

# check if the task (10, 10, 'autogluon') is in the database
c.execute("SELECT * FROM runs WHERE task_id=10 AND dataset_id=10 AND framework='autogluon'")
print(c.fetchall())

# write a function that takes a task_id and dataset_id and framework and adds it to the database

def add_run(task_id, dataset_id, framework):
    conn = sqlite3.connect('./data/runs.db')
    c = conn.cursor()
    c.execute(f"INSERT INTO runs VALUES ({task_id}, {dataset_id}, '{framework}')")
    conn.commit()
    conn = sqlite3.connect('./data/runs.db')
    c = conn.cursor()
    c.execute(f"SELECT * FROM runs WHERE task_id={task_id} AND dataset_id={dataset_id} AND framework='{framework}'")
    print(c.fetchall())

add_run(10, 10, 'autogluon')



[]


OperationalError: attempt to write a readonly database

In [6]:
# select * from runs
c.execute("SELECT * FROM runs")
print(c.fetchall())


[(2, 2, 'randomforest'), (3, 3, 'randomforest'), (4, 4, 'randomforest'), (5, 5, 'randomforest'), (6, 6, 'randomforest'), (7, 7, 'randomforest'), (8, 8, 'randomforest'), (9, 9, 'randomforest'), (10, 10, 'randomforest'), (11, 11, 'randomforest'), (2, 2, 'autosklearn'), (3, 3, 'autosklearn'), (4, 4, 'autosklearn'), (5, 5, 'autosklearn'), (6, 6, 'autosklearn'), (7, 7, 'autosklearn'), (8, 8, 'autosklearn'), (9, 9, 'autosklearn'), (10, 10, 'autosklearn'), (11, 11, 'autosklearn')]


In [None]:
import openml

In [4]:
openml.datasets.get_dataset(1)

OpenML Dataset
Name..........: anneal
Version.......: 2
Format........: ARFF
Upload Date...: 2014-04-06 23:19:20
Licence.......: Public
Download URL..: https://api.openml.org/data/v1/download/1/anneal.arff
OpenML URL....: https://www.openml.org/d/1
# of features.: 39
# of instances: 898

In [1]:
import openml
import pandas as pd
import os
import subprocess
from tqdm.auto import tqdm
import json

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class TaskConfig:
    def __init__(self, *args):
        self.args = args

In [23]:
class TaskFinder:
    def __init__(
        self,
        testing_mode=True,
        use_cache=True,
        run_mode="docker",
        num_tasks_to_return=1,
        save_every_n_tasks=10,
    ):
        self.testing_mode = testing_mode
        self.cache_file_name = "data/dataset_list.csv"
        self.global_results_store = {}
        self.num_tasks_to_return = num_tasks_to_return
        self.use_cache = use_cache
        self.save_every_n_tasks = save_every_n_tasks
        # self.benchmarks_to_use = ["autosklearn"]
        self.benchmarks_to_use = ["randomforest", "autogluon"]
        # Ensure required folders exist
        self.make_files(["data"])
        self.run_mode = run_mode

        self.check_run_mode()
        # Load datasets from cache or OpenML
        self.datasets = self.load_datasets()

        # If in testing mode, only take the first dataset
        if self.testing_mode:
            # self.datasets = self.datasets.head(1)
            # 43972 - tic tac
            self.datasets = self.datasets[self.datasets["did"] == 50]

    def check_run_mode(self):
        possible = ["local", "aws", "docker", "singularity"]
        if self.run_mode not in possible:
            raise ValueError(
                f"Invalid run mode: {self.run_mode}. Possible values are: {possible}"
            )

    def load_datasets(self):
        datasets = openml.datasets.list_datasets(output_format="dataframe")
        # if the cache file exists, append the new unique datasets to it
        if self.use_cache and os.path.exists(self.cache_file_name):
            cached_datasets = pd.read_csv(self.cache_file_name)
        else:
            if os.path.exists(self.cache_file_name):
                cached_datasets = pd.read_csv(self.cache_file_name)
                datasets = pd.concat([datasets, cached_datasets], ignore_index=True)
                datasets = datasets.drop_duplicates(subset=["did"])

                # Save the updated dataset list to the cache
                datasets.to_csv(self.cache_file_name)
        return datasets

    def make_files(self, folders):
        for folder in folders:
            os.makedirs(folder, exist_ok=True)

    def get_tasks_from_dataset(self, dataset_id, num_tasks_to_return=1):
        try:
            tasks = openml.tasks.list_tasks(
                data_id=dataset_id, output_format="dataframe"
            )
            # if the task column estimation_procedure is not 10-fold Crossvalidation drop the row
            tasks = tasks[tasks["estimation_procedure"] == "10-fold Crossvalidation"]
            # return the first num_tasks_to_return tasks
            tasks = tasks.head(num_tasks_to_return)
            return tasks["tid"].tolist() if not len(tasks) == 0 else None
        except Exception as e:
            print(f"Error retrieving tasks for dataset {dataset_id}: {e}")
            return None

    def run_all_benchmarks_on_task(self, task_id):
        # task_id = task_id.strip()
        for benchmark_type in self.benchmarks_to_use:
            try:
                result = subprocess.run(
                    [
                        "python3",
                        "automlbenchmark/runbenchmark.py",
                        benchmark_type,
                        f"openml/t/{task_id}",
                        "--mode",
                        self.run_mode,
                    ],
                    text=True,
                    capture_output=True,
                )
                print(result.stderr)
                # Filter and return relevant benchmark output
                if self.run_mode == "local":
                    return [
                        line for line in result.stderr.split("\n") if "TaskConfig" in line
                    ]
                elif self.run_mode == "docker":
                   return [
                        line for line in result.stderr.split("\n") if "Starting docker: docker run --name " in line
                    ] 
            except subprocess.CalledProcessError as e:
                print(f"Benchmark run failed for task {task_id}: {e}")
                return []

    def get_task_config_from_str(self, task_str):
        try:
            if self.run_mode == "local":
                return eval(task_str).__dict__["args"][0]
            elif self.run_mode == "docker":
                # --name randomforest.openml_t_49.test.docker.20241009T153843.sIiURqJG8Z99apxsQC8ISg__ 
                
        except Exception as e:
            print(f"Error parsing task config: {e}")
            return None

    def get_task_for_dataset(self, dataset_id):
        task_ids = self.get_tasks_from_dataset(dataset_id, self.num_tasks_to_return)
        if task_ids:
            for task_id in tqdm(
                task_ids, desc=f"Running benchmark on dataset {dataset_id}"
            ):
                benchmark_results = self.run_all_benchmarks_on_task(
                    task_id,
                )
                for result in tqdm(
                    benchmark_results, desc=f"Processing results for task {task_id}"
                ):
                    task_config = self.get_task_config_from_str(result)
                    if task_config and self.run_mode == "local":
                        current_run_info = {
                            "dataset_id": dataset_id,
                            "task_id": task_id,
                            "task_config": task_config,
                        }
                        self.global_results_store[dataset_id] = current_run_info
                    if task_config and self.run_mode == "docker":
                        current_run_info = {
                            "dataset_id": dataset_id,
                            "task_id": task_id,
                        }
                # write the results to a file every 10 tasks
                if len(self.global_results_store) % self.save_every_n_tasks == 0:
                    self.write_global_to_file()

    def write_global_to_file(
        self, file_name="data/links_to_automl_files_per_dataset.json"
    ):
        # read the current file if it exists and append the new results
        if os.path.exists(file_name):
            with open(file_name, "r") as f:
                data = json.load(f)
                data.update(self.global_results_store)
        else:
            data = self.global_results_store
        with open(file_name, "w") as f:
            json.dump(data, f)

    def load_global_from_file(
        self, file_name="data/links_to_automl_files_per_dataset.json"
    ):
        if os.path.exists(file_name):
            with open(file_name, "r") as f:
                self.global_results_store = json.load(f)
        else:
            print("No global results file found")

    def run_benchmark_on_all_datasets(self):
        for _, row in self.datasets.iterrows():
            dataset_id = row["did"]
            self.get_task_for_dataset(dataset_id)

        # save info
        self.write_global_to_file()

    def upload_results_to_openml(self):
        raise NotImplementedError()

In [28]:
tf = TaskFinder()

In [29]:
tf.run_benchmark_on_all_datasets()

Running benchmark on dataset 50:   0%|          | 0/1 [00:00<?, ?it/s]

Running benchmark `randomforest` on `openml/t/49` framework in `docker` mode.
Loading frameworks definitions from ['/Users/smukherjee/Documents/CODE/Github/OpenML-auto-automl/automlbenchmark/resources/frameworks.yaml'].
Loading benchmark constraint definitions from ['/Users/smukherjee/Documents/CODE/Github/OpenML-auto-automl/automlbenchmark/resources/constraints.yaml'].
Loading openml task 49.
Running cmd `docker images -q automlbenchmark/randomforest:stable-dev`
Running cmd `docker pull automlbenchmark/randomforest:stable-dev`
Error response from daemon: manifest for automlbenchmark/randomforest:stable-dev not found: manifest unknown: manifest unknown



Error response from daemon: manifest for automlbenchmark/randomforest:stable-dev not found: manifest unknown: manifest unknown

Running cmd `docker images -q automlbenchmark/randomforest:stable`
[MONITORING] [docker.openml_t_49.test.all_tasks.all_folds.RandomForest] CPU Utilization: 26.1%

---------------------------------------------

Processing results for task 49: 100%|██████████| 2/2 [00:00<00:00, 18157.16it/s]
Running benchmark on dataset 50: 100%|██████████| 1/1 [00:41<00:00, 41.49s/it]


In [3]:
from pathlib import Path
store = tf.global_results_store
for datasets in store.keys():
    result_file = Path(store[datasets]["task_config"]["output_dir"])/"scores/results.csv"
    pandas_file = pd.read_csv(result_file)
    print(pandas_file.head())
    break

NameError: name 'tf' is not defined

In [30]:
pandas_file.columns

Index(['id', 'task', 'framework', 'constraint', 'fold', 'type', 'result',
       'metric', 'mode', 'version', 'params', 'app_version', 'utc', 'duration',
       'training_duration', 'predict_duration', 'models_count', 'seed', 'info',
       'acc', 'auc', 'balacc', 'logloss'],
      dtype='object')

In [14]:
import pandas as pd
from glob import glob
from pathlib import Path
import json

{'framework': 'RandomForest', 'framework_params': {'n_estimators': 2000}, 'framework_version': '1.0', 'type': 'classification', 'name': 'anneal', 'fold': 1, 'metric': 'logloss', 'metrics': ['logloss', 'acc', 'balacc'], 'seed': 1095784781, 'job_timeout_seconds': 1200, 'max_runtime_seconds': 600, 'cores': 4, 'max_mem_size_mb': 10721, 'min_vol_size_mb': -1, 'input_dir': '/input', 'output_dir': '/output/', 'output_predictions_file': '/output/predictions/anneal/1/predictions.csv', 'ext': {}, 'type_': 'multiclass', 'output_metadata_file': '/output/predictions/anneal/1/metadata.json'}


In [1]:
import dash
import dash_bootstrap_components as dbc
from dash import dcc, html
from dash.dependencies import Input, Output
import plotly.express as px

In [32]:
# Initialize Dash app
app = dash.Dash(__name__, external_stylesheets=[dbc.themes.BOOTSTRAP])

# Layout
app.layout = dbc.Container([
    dbc.Row([
        dbc.Col(html.H1("Model Results Dashboard"), width=12)
    ]),
    
    dbc.Row([
        dbc.Col([
            html.Label("Select Metric:"),
            dcc.Dropdown(
                id='metric-dropdown',
                options=[
                    {'label': 'Accuracy', 'value': 'acc'},
                    {'label': 'AUC', 'value': 'auc'},
                    {'label': 'Log Loss', 'value': 'logloss'}
                ],
                value='acc',
                clearable=False
            )
        ], width=4),
    ]),

    dbc.Row([
        dbc.Col(dcc.Graph(id='metric-graph'), width=12)
    ]),

    dbc.Row([
        dbc.Col(dcc.Graph(id='task-pie-chart'), width=6),
        dbc.Col(dcc.Graph(id='framework-bar-chart'), width=6),
    ]),

], fluid=True)

# Callbacks
@app.callback(
    Output('metric-graph', 'figure'),
    Output('task-pie-chart', 'figure'),
    Output('framework-bar-chart', 'figure'),
    Input('metric-dropdown', 'value')
)


# Run app and display result inline in the notebook
app.run_server(mode='inline')

In [40]:
import re
def get_task_id_from_folder_name(folder_name):
    name = re.findall(pattern=r"_t_.*[0-9]\.", string=folder_name)
    print(name)
    if isinstance(name, list) and len(name) > 0:
        return str(name[0])
    else:
        return name

In [41]:
get_task_id_from_folder_name(folder_name="randomforest.openml_t_5.test.docker.20241016T141320")

['_t_5.']


'_t_5.'

In [42]:
import openml

In [51]:
t = openml.tasks.get_task(10, download_data=False, download_qualities= False)
t.dataset_id

10

In [None]:
i