In [1]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml


from optuna_kfoldCV import evaluate_dataset_with_model, run_all_openML_with_model
from classification_param_specs import evaluate_LogisticRegression, evaluate_XGBoostClassifier

np.set_printoptions(precision=3, threshold=5) # Print options

# OpenML code

In [None]:
# # Fetch the collection with ID 99 https://www.openml.org/search?type=study&study_type=task&id=99&sort=runs_included
# collection = openml.study.get_suite(99)
# dataset_ids = collection.data
# metadata_list = []

# # Fetch and process each dataset
# for i, dataset_id in enumerate(dataset_ids):
#     dataset = openml.datasets.get_dataset(dataset_id)
#     X, y, categorical_indicator, attribute_names = dataset.get_data(
#         target=dataset.default_target_attribute
#     )

#     #count missing values in X
#     missing_values_count = X.isnull().sum().sum()
#     print(f"Missing values in X: {missing_values_count}")

#     X = np.array(X)
#     y = np.array(y)[..., None]
#     print(X.shape)
#     print(y.shape)
    
#     # Determine if the dataset has categorical features
#     has_categorical = any(categorical_indicator)
    
#     # Extract the required metadata
#     metadata = {
#         'dataset_id': dataset.id,
#         'name': dataset.name,
#         'n_obs': int(dataset.qualities['NumberOfInstances']),
#         'n_features': int(dataset.qualities['NumberOfFeatures']),
#         '%_unique_y': len(np.unique(y))/len(y),
#         'n_unique_y': len(np.unique(y)),
#         'has_categorical': has_categorical,
#         'n_missing_values': missing_values_count,
#     }
    
#     metadata_list.append(metadata)
#     print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# # Create a DataFrame from the metadata list
# df_metadata = pd.DataFrame(metadata_list).sort_values('%_unique_y', ascending=False).set_index("dataset_id").sort_index()
# df_metadata.sort_values('%_unique_y', ascending=True)

# # Display the metadata DataFrame
# df_metadata

In [None]:
# pd.set_option('display.max_rows', 72)
# df_metadata.head(72).sort_values("n_obs")

In [None]:
# df_metadata.index

In [None]:
from optuna_kfoldCV import np_load_openml_dataset, openML_cls_ids
import numpy as np


# Sanity check: load every classification dataset and print its shapes and NaN counts.
# The loop variable is `dataset_id` rather than `id` so that the builtin `id()` is not
# shadowed for the rest of the kernel session.
for idx, dataset_id in enumerate(openML_cls_ids):
    X, y = np_load_openml_dataset(dataset_id, "classification")
    print("idx", idx, "id", dataset_id, "X", X.shape, "y", y.shape, np.isnan(X).sum(), np.isnan(y).sum())

# for id in df_metadata.index:
#     X,y = np_load_openml_dataset(id, "classification")
#     print("id", id, "X", X.shape, "y", y.shape, np.isnan(X).sum(), np.isnan(y).sum())

## TODO (next): investigate the NaNs and the "Mean of empty slice" warning. Also look at the case of fewer than 200 classes, and check performance on regression.

# Run experiments (just for testing)

In [None]:
# !python classification_param_specs.py \
#     --models End2End \
#     --dataset_indices 0 2 \
#     --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLClassification/ \
#     --n_optuna_trials 2 \
#     --device cuda \
#     --k_folds 5 \
#     --cv_seed 42

In [None]:
# !python classification_param_specs.py \
#     --models LogisticRegression \
#     --dataset_indices 0 2 \
#     --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLClassification/ \
#     --n_optuna_trials 10 \
#     --device cuda \
#     --k_folds 2 \
#     --cv_seed 42

In [None]:
# !python classification_param_specs.py \
#     --models XGBoostClassifier \
#     --dataset_indices 0 2 \
#     --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLClassification/ \
#     --n_optuna_trials 2 \
#     --device cuda \
#     --k_folds 2 \
#     --cv_seed 42

In [None]:
# !python classification_param_specs.py \
#     --models RFNN \
#     --dataset_indices 0 2 \
#     --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLClassification/ \
#     --n_optuna_trials 2 \
#     --device cuda \
#     --k_folds 5 \
#     --cv_seed 42

In [None]:
# !python classification_param_specs.py \
#     --models RFRBoost \
#     --dataset_indices 0 2 \
#     --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLClassification/ \
#     --n_optuna_trials 2 \
#     --device cuda \
#     --k_folds 5 \
#     --cv_seed 42

# Join JSON results

In [2]:
from pydantic.v1.utils import deep_update
import json
import os
import numpy as np
import pandas as pd

from optuna_kfoldCV import openML_cls_ids


def read_json(path):
    """Load and return the parsed contents of the JSON file at ``path``.

    The file is opened explicitly as UTF-8 so that decoding does not depend on
    the platform's default locale encoding.

    Raises:
        OSError: if the file cannot be opened (e.g. ``FileNotFoundError``).
        json.JSONDecodeError: if the file does not contain valid JSON.
    """
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)
    


def custom_deep_update(original, update):
    """Recursively merge ``update`` into ``original`` in place.

    For every key in ``update``:
      * if both the new value and the value already stored under that key are
        dicts, the two are merged recursively;
      * otherwise the new value replaces (or creates) the entry. The value is
        stored by reference, not copied.

    Checking that the *existing* value is a dict as well avoids recursing into a
    scalar or list, which previously raised ``TypeError``.

    Returns:
        None. ``original`` is modified in place.
    """
    for key, value in update.items():
        existing = original.get(key)
        if isinstance(value, dict) and isinstance(existing, dict):
            custom_deep_update(existing, value)
        else:
            original[key] = value



def get_joined_results_json(
        models = ["End2End", "Ridge", "XGBoostRegressor",
                  "GradientRFRBoost", "GradientRFRBoostID", 
                  "GreedyRFRBoostDense", "GreedyRFRBoostDiag", "GreedyRFRBoostScalar",
                  "RandomFeatureNetwork"],
        datasets = openML_cls_ids,
        save_dir = "/home/nikita/Code/random-feature-boosting/save/OpenMLClassification/",
        classification_or_regression = "classification",
        ):
    """Read one result file per (model, dataset) pair and merge them into one dict.

    Each file is looked up as
    ``{save_dir}/{classification_or_regression}_{dataset}_{model}.json``; its
    parsed content is merged into the accumulator with ``custom_deep_update``.
    Nothing is caught here, so whatever ``read_json`` raises for a missing or
    invalid file propagates to the caller.

    Returns:
        The merged dict (an empty dict when there is nothing to read).
    """
    merged = {}
    for model in models:
        for dataset in datasets:
            file_name = f"{classification_or_regression}_{dataset}_{model}.json"
            loaded = read_json(os.path.join(save_dir, file_name))
            if merged == {}:
                # Nothing accumulated yet: adopt this file's content as the accumulator.
                merged = loaded
                continue
            custom_deep_update(merged, loaded)
    return merged



def join_jsons_into_array(
        results_json,
        ):
    """Stack the per-fold metrics of a nested results dict into one array.

    ``results_json`` is iterated as ``{dataset: {model: model_results}}`` in dict
    order. From every ``model_results`` the entries ``"score_train"``,
    ``"score_test"``, ``"t_fit"`` and ``"t_inference"`` are stacked, in that
    order; any other keys are ignored.

    Returns:
        Array of shape (n_datasets, n_models, 4, n_folds).
    """
    metric_keys = ("score_train", "score_test", "t_fit", "t_inference")
    per_dataset = [
        [
            np.stack([model_results[key] for key in metric_keys])
            for model_results in dataset_results.values()
        ]
        for dataset_results in results_json.values()
    ]
    return np.stack(per_dataset) # (n_datasets, n_models, 4, n_folds)


def results_to_df(
        models = ["End2End", "Ridge", "XGBoostRegressor", 
                  "GradientRFRBoost", "GradientRFRBoostID", 
                  "GradientRFRBoost_relu",
                  "GreedyRFRBoostDense", "GreedyRFRBoostDiag", "GreedyRFRBoostScalar",
                  "GreedyRFRBoostDense_relu", "GreedyRFRBoostDiag_relu", "GreedyRFRBoostScalar_relu",
                  "RandomFeatureNetwork", "RandomFeatureNetwork_iid",
                  "RandomFeatureNetwork_relu", "RandomFeatureNetwork_iid_relu",
                  ],
        datasets = openML_cls_ids[:],
        save_dir = "/home/nikita/Code/random-feature-boosting/save/OpenMLClassification/",
        ):
    """Aggregate per-fold results into one DataFrame per metric.

    Loads the result files for every (model, dataset) pair, then averages each
    metric over the folds.

    Args:
        models: model names; they become the DataFrame columns.
        datasets: dataset ids; they become the DataFrame index.
        save_dir: directory containing the result JSON files.

    Returns:
        dict mapping ``"score_train"``, ``"score_test"``, ``"t_fit"``,
        ``"t_inference"`` to a float DataFrame of fold means, and
        ``"<metric>_std"`` to a float DataFrame of fold standard deviations
        (``np.std`` default, i.e. population std).

    Raises:
        ValueError: if the loaded results do not contain exactly
            ``len(datasets)`` x ``len(models)`` entries. The array is indexed
            positionally, so a mismatch would otherwise mislabel rows/columns.
    """
    # Load and join the JSON data
    results_json = get_joined_results_json(models, datasets, save_dir)
    results = join_jsons_into_array(results_json) # (n_datasets, n_models, 4, n_folds)

    expected_shape = (len(datasets), len(models))
    if results.shape[:2] != expected_shape:
        raise ValueError(
            f"Loaded results have (n_datasets, n_models) = {results.shape[:2]}, "
            f"expected {expected_shape} from the requested datasets/models."
        )

    # Calculate means and stds across folds
    results_mean = np.mean(results, axis=-1)  # (n_datasets, n_models, 4)
    results_std = np.std(results, axis=-1)    # (n_datasets, n_models, 4)

    # Build each frame directly from its (n_datasets, n_models) slice. This keeps a
    # numeric dtype, whereas filling an empty frame cell by cell yields object dtype.
    metrics = ["score_train", "score_test", "t_fit", "t_inference"]
    metric_dfs = {}
    for metric_idx, metric in enumerate(metrics):
        metric_dfs[metric] = pd.DataFrame(
            results_mean[:, :, metric_idx], index=datasets, columns=models)
        metric_dfs[f"{metric}_std"] = pd.DataFrame(
            results_std[:, :, metric_idx], index=datasets, columns=models)

    return metric_dfs


In [4]:
# Baselines first, then every RFRBoost ablation variant.
models = [
    "End2End_cpu",
    "LogisticRegression", "XGBoostClassifier",
    "RFNN", "RFNN_iid",
]
# Enumeration order is upscale (outermost), batchnorm, linesearch (innermost);
# inside each name the suffixes appear as upscale, linesearch, batchnorm.
models += [
    f"RFRBoost{up}{ls}{bn}"
    for up in ("", "_upscaleiid", "_ID")
    for bn in ("", "_batchnormFalse")
    for ls in ("", "_linesearchFalse")
]

# for name in  ["RFRBoost_upscaleiid", "RFRBoost_upscaleiid_linesearchFalse", 
#               "RFRBoost_upscaleiid_batchnormFalse", "RFRBoost_ID",
#               "RFRBoost_linesearchFalse_batchnormFalse",
#               "RFRBoost_upscaleiid_linesearchFalse_batchnormFalse",
#               "End2End_cpu",
#               "XGBoostClassifier",]:
#     models.remove(name)

#save_dir = "/home/nikita/Code/random-feature-boosting/save/OpenMLClassification/"
# Directory the result JSON files are read from; it is passed as `save_dir` to
# results_to_df in the cells below.
# NOTE(review): this is a machine-specific absolute path. The "_ce" suffix presumably
# marks the cross-entropy runs — confirm.
save_dir = "/home/nikita/Code/random-feature-boosting/save/OpenMLClassification_ce/"

In [None]:

# Probe every (dataset, model) pair on its own and report the pairs whose results
# cannot be loaded or aggregated.
# NOTE(review): 57 is presumably the number of datasets to check — confirm against
# len(openML_cls_ids).
for i in range(57):
    for model in models:
        try:
            results_to_df(models=[model], datasets=[openML_cls_ids[i]],
                          save_dir=save_dir)
        except Exception as err:
            # `Exception` instead of a bare `except:` so that interrupting the kernel
            # still stops the scan; the cause is printed rather than discarded.
            print(f"Failed for {model} on {i}, ie {openML_cls_ids[i]}: {type(err).__name__}: {err}")

In [5]:
# Aggregate the results of every model in `models`, leaving out the dataset at
# position 33 of openML_cls_ids (the reason for the exclusion is not recorded here).
kept_positions = [pos for pos in range(len(openML_cls_ids)) if pos != 33]
df = results_to_df(models=models, datasets=openML_cls_ids[kept_positions], save_dir=save_dir)


In [9]:

# Re-aggregate for a hand-picked subset of models (this rebinds `df`), again
# leaving out the dataset at position 33 of openML_cls_ids.
selected_models = [
    "End2End_cpu",
    "LogisticRegression",
    "XGBoostClassifier",
    "RFNN", "RFNN_iid",
    "RFRBoost_ID_batchnormFalse",
    #"RFRBoost_upscaleiid_batchnormFalse",
]
kept_positions = [pos for pos in range(len(openML_cls_ids)) if pos != 33]
df = results_to_df(models=selected_models, datasets=openML_cls_ids[kept_positions], save_dir=save_dir)

In [None]:
# Mean test score per model over all datasets, smallest value first.
df["score_test"].mean().sort_values()
#ce ce ce ce ce
# XGBoostClassifier                                    -0.857749
# RFRBoost_ID_batchnormFalse                           -0.857634
# RFRBoost_upscaleiid_batchnormFalse                   -0.857406
# RFRBoost_upscaleiid_linesearchFalse_batchnormFalse   -0.857285
# RFRBoost_ID_linesearchFalse_batchnormFalse           -0.857017
# RFRBoost_batchnormFalse                              -0.856695
# RFRBoost_linesearchFalse_batchnormFalse              -0.856688
# RFRBoost                                             -0.854604
# RFRBoost_linesearchFalse                             -0.854588
# RFRBoost_upscaleiid_linesearchFalse                  -0.854255
# RFRBoost_ID                                          -0.853843
# RFRBoost_upscaleiid                                  -0.853546
# End2End_cpu                                          -0.852974
# RFRBoost_ID_linesearchFalse                          -0.851653
# RFNN                                                 -0.850191
# RFNN_iid                                              -0.84904
# LogisticRegression                                   -0.828274

In [None]:
# Mean train score per model over all datasets, smallest value first.
df["score_train"].mean().sort_values()

In [None]:
# Mean fit time per model over all datasets, smallest value first.
df["t_fit"].mean().sort_values()

In [None]:
# Rank the models within each dataset (rank 1 = smallest test score), then
# average each model's rank over the datasets.
df["score_test"].rank(axis="columns").mean().sort_values()
# GradientRFRBoost_upscaleiid        2.400000
# GreedyRFRBoostDense_upscaleiid     2.400000
# End2End                            3.542857
# GreedyRFRBoostDiag_upscaleiid      3.685714
# GreedyRFRBoostScalar_upscaleiid    4.657143
# RandomFeatureNetwork_iid           5.371429
# Ridge                              5.942857
# dtype: float64

In [None]:
# Number of datasets on which each model holds rank 1 (the smallest test score).
df["score_test"].rank(axis=1).eq(1).sum()

In [None]:
from aeon.visualisation import plot_critical_difference, plot_significance
import matplotlib.pyplot as plt

# Generate the critical-difference diagram over the per-dataset test scores.
# lower_better=True: smaller test scores are treated as better.
plot = plot_critical_difference(df["score_test"].values,
                                df["score_test"].columns.tolist(), 
                                alpha=0.05, 
                                lower_better=True)

# The first element of the result is the Figure (it is the object `savefig` is called
# on); the second is taken to be the Axes, following aeon's documented return value.
fig, ax = plot[0], plot[1]

# Adjust figure size and layout
fig.set_size_inches(6, 3)
fig.tight_layout()

# Save the figures; create the output directory first so savefig cannot fail on a
# missing folder.
# NOTE(review): the file name says "OpenMLReg" although it is written to the
# classification results folder — confirm the intended name.
out_dir = "results/OpenMLClassification"
os.makedirs(out_dir, exist_ok=True)
for ext in ("eps", "png"):
    fig.savefig(os.path.join(out_dir, f"OpenMLReg_critical_difference.{ext}"), bbox_inches='tight')

In [None]:
import pandas as pd
import numpy as np

def create_latex_table(df, caption="Test accuracies on the concentric circles task.",
                       label="tab:concentric-circles"):
    """Render per-model mean and standard deviation as a LaTeX ``table``.

    Args:
        df: DataFrame with one column per model; each column's values are
            summarised with ``np.mean`` and ``np.std``.
        caption: text placed inside ``\\caption{...}``. Defaults to the caption
            that was previously hard-coded.
        label: text placed inside ``\\label{...}``. Defaults to the label that
            was previously hard-coded.

    Returns:
        The LaTeX source of the table as a string.

    Underscores in model names are escaped (``_`` -> ``\\_``) because a bare
    underscore is invalid in LaTeX text mode. Names that are already escaped
    are left as they are.
    """
    header = (
        "\n"
        "\\begin{table}[t]\n"
        "\\caption{" + caption + "}\n"
        "\\label{" + label + "}\n"
        "\\vskip 0.15in\n"
        "\\begin{center}\n"
        "\\begin{small}\n"
        "\\begin{sc}\n"
        "\\begin{tabular}{lcc}\n"
        "\\toprule\n"
        "Model & Mean Acc & Std Dev \\\\\n"
        "\\midrule\n"
    )

    rows = []
    for model_name in df.columns:
        accs = df[model_name]
        mean_acc = np.mean(accs)
        std_acc = np.std(accs)
        # Un-escape first so that an already escaped "\_" is not escaped twice.
        safe_name = str(model_name).replace("\\_", "_").replace("_", "\\_")
        rows.append(f"{safe_name} & {mean_acc:.4f} & {std_acc:.4f} \\\\\n")

    footer = (
        "\n"
        "\\bottomrule\n"
        "\\end{tabular}\n"
        "\\end{sc}\n"
        "\\end{small}\n"
        "\\end{center}\n"
        "\\vskip -0.1in\n"
        "\\end{table}\n"
    )
    return header + "".join(rows) + footer

# Example usage: summarise the per-model test scores held in `df["score_test"]`.
# This cell previously read from `df_old`, which no cell in this notebook defines,
# so it raised NameError on a fresh kernel.
# NOTE(review): confirm that `df` is the intended replacement for `df_old`.
latex_table = create_latex_table(df["score_test"])
print(latex_table)

In [None]:
# df2 = results_to_df(        
#     datasets = openML_reg_ids_noCat[[0,1,2  ,4,5,6,7,8,9,10,     13,14,15,16,17,18,19]],
#     save_dir = "/home/nikita/Code/random-feature-boosting/save/regv2_added40nlayers/OpenMLRegression/",
#     )

In [None]:
# df2["score_test"].mean().sort_values(ascending=True)

In [None]:
# df2["score_train"].mean().sort_values(ascending=True)

In [None]:
# df2["score_test"].rank(axis=1).mean().sort_values()

In [None]:
#number of first places
# (df2["score_test"].rank(axis=1) == 1).sum().sort_values()

# Look at small datasets

In [None]:
# Index labels of the datasets whose `n_obs` is below 5000 / below 1000
# (presumably dataset ids — verify against how df_metadata is indexed).
# NOTE(review): `df_metadata` is only built in the commented-out metadata cell near the
# top of this notebook, so on a fresh kernel this cell raises NameError until that cell
# is restored and run.
less5000 = df_metadata.query("n_obs < 5000").index
less1000 = df_metadata.query("n_obs < 1000").index
less5000

In [None]:
# Mean test score per model, restricted to the `less5000` datasets, smallest first.
df["score_test"].loc[less5000, :].mean().sort_values()

In [None]:
# Average within-dataset rank per model, restricted to the `less5000` datasets.
df["score_test"].loc[less5000, :].rank(axis=1).mean().sort_values()

In [None]:
# Within-dataset rank of every model for each of the `less5000` datasets.
df["score_test"].loc[less5000, :].rank(axis="columns")

In [None]:
# Raw test scores for the `less1000` datasets.
df["score_test"].loc[less1000, :]

# Look at distribution of params

In [None]:
import matplotlib.pyplot as plt

def _plot_param_histograms(model, param_name, param_values):
    """Show linear- and log-scale histograms of one hyperparameter side by side."""
    fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

    # Linear scale plot
    ax1.hist(param_values, bins=20)
    ax1.set_title(f"{model} {param_name}\n(linear scale)")
    ax1.set_xlabel(param_name)
    ax1.set_ylabel("Count")

    # Log scale plot: only numeric, strictly positive values have a logarithm, so
    # everything else is left out of this panel instead of producing invalid bins.
    try:
        numeric = np.asarray(param_values, dtype=float)
    except (TypeError, ValueError):
        numeric = np.array([])
    positive = numeric[numeric > 0]
    if positive.size == 0:
        ax2.text(0.5, 0.5, "no positive numeric values",
                 ha="center", va="center", transform=ax2.transAxes)
    else:
        min_val, max_val = positive.min(), positive.max()
        if min_val < max_val:
            bins = np.logspace(np.log10(min_val), np.log10(max_val), 20)
        else:
            # All values are identical: use a single bin around that value.
            bins = [min_val / 2, max_val * 2]
        ax2.hist(positive, bins=bins)
        ax2.set_xscale('log')
    ax2.set_title(f"{model} {param_name}\n(log scale)")
    ax2.set_xlabel(param_name)
    ax2.set_ylabel("Count")

    plt.tight_layout()
    plt.show()
    # Release the figure; one figure is created per (model, parameter) pair.
    plt.close(fig)


def plot_param_distribution(
        models = [#"End2End_cpu", 
                    "LogisticRegression", "XGBoostClassifier", 
                    #"RFNN", "RFNN_iid", 
                    "RFRBoost_ID_batchnormFalse",
                    "RFRBoost_upscaleiid_batchnormFalse",
                    ],
        datasets = openML_cls_ids[[i for i in range(len(openML_cls_ids)) if i not in [33]]],
        save_dir = "/home/nikita/Code/random-feature-boosting/save/OpenMLClassification/",
        ):
    """Plot the distribution of the hyperparameters stored with each fold.

    Loads the joined result JSON for ``models`` x ``datasets`` from ``save_dir``,
    collects every value found under ``"hyperparams"`` per model and parameter
    name, and shows a linear- and a log-scale histogram for each parameter that
    is not in the hard-coded skip list below.

    Args:
        models: model names whose result files are read.
        datasets: dataset ids whose result files are read; the first one is
            used to discover each model's parameter names.
        save_dir: directory containing the result JSON files.

    Returns:
        None. Figures are shown with ``plt.show()`` and then closed.
    """
    # Load and join the JSON data
    results_json = get_joined_results_json(models, datasets, save_dir)

    # model: list_of_param_names
    modelwise_param_names = {model: list(results_json[str(datasets[0])][model]['hyperparams'][0])
                            for model in models} 

    # model: param_name: list_of_param_values
    param_distribution = { model: {param: [] for param in param_names}
                          for model, param_names in modelwise_param_names.items()}

    # Populate the param_distribution nested dict. setdefault keeps this from failing
    # on a model or parameter name that did not appear in the first fold of the first
    # dataset.
    for dataset, dataset_results in results_json.items():
        for model_name, model_results in dataset_results.items():
            per_model = param_distribution.setdefault(model_name, {})
            for fold in model_results["hyperparams"]:
                for param_name, param_val in fold.items():
                    per_model.setdefault(param_name, []).append(param_val)

    # For each model, plot the distribution of each parameter
    skipped_params = [
        "out_dim", "loss", "objective", "feature_type",
        "upscale_type", "sandwich_solver", "n_classes", "activation",
        "use_batchnorm", "do_linesearch",
        ]
    for model, param_dict in param_distribution.items():
        print(f"Model: {model}")
        for param_name, param_values in param_dict.items():
            if param_name not in skipped_params:
                print(param_name)
                _plot_param_histograms(model, param_name, param_values)

# Runs with the function's own defaults (its default `models`, `datasets` and
# `save_dir`), not with the notebook-level `models` / `save_dir` variables set earlier.
plot_param_distribution()