In [None]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
# Extend the module search path with the parent and the grandparent of the
# current working directory.
# NOTE(review): presumably this is what lets the project-local imports below
# (`optuna_kfoldCV`, `regression_param_specs`) resolve; it depends on the
# directory the kernel was started in — confirm.
sys.path.append(os.path.dirname(os.getcwd()))
sys.path.append(os.path.dirname(os.path.dirname(os.getcwd())))

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml

from optuna_kfoldCV import evaluate_dataset_with_model, run_all_openML_with_model
from regression_param_specs import evaluate_Ridge, evaluate_XGBoostRegressor

# Print options: show 3 digits of precision and summarize arrays that have more
# than 5 elements, so array reprs in cell outputs stay short.
np.set_printoptions(precision=3, threshold=5)

# OpenML code

In [None]:
# Fetch the OpenML suite with ID 353 and collect one metadata row per dataset.
collection = openml.study.get_suite(353)
dataset_ids = collection.data
metadata_list = []

# Fetch and process each dataset
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )

    # Count missing values while X still exposes .isnull(), i.e. before the
    # NumPy conversion below.
    missing_values_count = X.isnull().sum().sum()
    print(f"Missing values in X: {missing_values_count}")

    X = np.array(X)
    y = np.array(y)[..., None]  # append a trailing axis to the target array
    print(X.shape)
    print(y.shape)

    # True as soon as at least one feature is flagged as categorical.
    has_categorical = any(categorical_indicator)

    # Compute the number of distinct target values once and reuse it for both
    # the absolute count and the fraction.
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y / len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical,
        'n_missing_values': missing_values_count,
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# One row per dataset, indexed and ordered by dataset_id. (Sorting by
# '%_unique_y' first would be undone by sort_index, so it is not done here.)
df_metadata = pd.DataFrame(metadata_list).set_index("dataset_id").sort_index()

# Manual override: force dataset 44962 to count as having categorical features.
# NOTE(review): the reason for this override is not recorded in the notebook.
# The membership check prevents .loc from silently appending a new, mostly-NaN
# row if that dataset is ever absent from the suite.
if 44962 in df_metadata.index:
    df_metadata.loc[44962, "has_categorical"] = True

# Display the metadata DataFrame
df_metadata

In [None]:
# Dataset IDs whose `has_categorical` flag is False, as a sorted list of plain
# Python ints (the index values are converted one by one).
dataset_ids_no_categorical = sorted(
    int(dataset_id)
    for dataset_id in df_metadata.query("has_categorical == False").index.values
)
# Evaluated but not displayed: a cell only shows its last expression.
len(dataset_ids_no_categorical)
dataset_ids_no_categorical

# Run experiments (just for testing)

In [None]:
# run_all_openML_with_model(
#     dataset_ids_no_categorical[0:2], 
#     evaluate_Ridge,
#     name_model="Ridge",
#     k_folds=5,
#     cv_seed=42,
#     regression_or_classification="regression",
#     n_optuna_trials=2,
#     device="cuda",
#     save_dir = "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/"
# )

Available models (values passed to `--models` in the cells below): End2End Ridge RidgeCV XGBoostRegressor GradientRFBoost GradientRFBoostID GreedyRFBoostDense GreedyRFBoostDiag GreedyRFBoostScalar


In [None]:
# End2End: 50 Optuna trials, 5 CV folds, CV seed 42, device cuda.
# NOTE(review): --save_dir is an absolute, machine-specific path.
!python regression_param_specs.py --models End2End --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 50 --device cuda --k_folds 5 --cv_seed 42 --save_experiments_individually

In [6]:
# Ridge and RidgeCV on dataset indices 17, 18 and 19 only: 100 Optuna trials,
# 5 CV folds, CV seed 42, device cuda.
# Unlike the other run cells, this one does not pass --save_experiments_individually.
!python regression_param_specs.py --models Ridge RidgeCV --dataset_indices 17 18 19  --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 100 --device cuda --k_folds 5 --cv_seed 42

[32m[I 2024-12-02 20:50:48,435][0m A new study created in memory with name: no-name-674f959a-9fde-473e-b2fd-8f2f38c81b92[0m
[32m[I 2024-12-02 20:50:48,446][0m Trial 0 finished with value: 0.4133633732795715 and parameters: {'l2_reg': 0.0017053709204126154}. Best is trial 0 with value: 0.4133633732795715.[0m
[32m[I 2024-12-02 20:50:48,454][0m Trial 1 finished with value: 0.4133248865604401 and parameters: {'l2_reg': 2.0638495199064525e-05}. Best is trial 1 with value: 0.4133248865604401.[0m
[32m[I 2024-12-02 20:50:48,462][0m Trial 2 finished with value: 0.41373313069343565 and parameters: {'l2_reg': 0.0064613987690671825}. Best is trial 1 with value: 0.4133248865604401.[0m
[32m[I 2024-12-02 20:50:48,470][0m Trial 3 finished with value: 0.41357163786888124 and parameters: {'l2_reg': 0.004677157441922423}. Best is trial 1 with value: 0.4133248865604401.[0m
[32m[I 2024-12-02 20:50:48,481][0m Trial 4 finished with value: 0.4133392870426178 and parameters: {'l2_reg': 0.00110

In [None]:
# XGBoostRegressor: 1 Optuna trial, 2 CV folds, CV seed 42, device cpu.
# NOTE(review): presumably a quick smoke test given the tiny trial/fold counts — confirm.
!python regression_param_specs.py --models XGBoostRegressor --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 1 --device cpu --k_folds 2 --cv_seed 42 --save_experiments_individually

In [None]:
# GradientRFBoost: 100 Optuna trials, 5 CV folds, CV seed 42, device cuda.
!python regression_param_specs.py --models GradientRFBoost --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 100 --device cuda --k_folds 5 --cv_seed 42 --save_experiments_individually

In [None]:
# GradientRFBoostID: 1 Optuna trial, 2 CV folds, CV seed 42, device cuda.
# NOTE(review): presumably a quick smoke test given the tiny trial/fold counts — confirm.
!python regression_param_specs.py --models GradientRFBoostID --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 1 --device cuda --k_folds 2 --cv_seed 42 --save_experiments_individually

In [None]:
# GreedyRFBoostDense: 100 Optuna trials, 5 CV folds, CV seed 42, device cuda.
# The device must be spelled "cuda" like in the other GPU cells; "gpu" is not a
# PyTorch device type (assumes the script forwards --device to torch — TODO confirm).
!python regression_param_specs.py --models GreedyRFBoostDense --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 100 --device cuda --k_folds 5 --cv_seed 42 --save_experiments_individually

In [None]:
# GreedyRFBoostDiag: 1 Optuna trial, 2 CV folds, CV seed 42, device cuda.
# NOTE(review): presumably a quick smoke test given the tiny trial/fold counts — confirm.
!python regression_param_specs.py --models GreedyRFBoostDiag --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 1 --device cuda --k_folds 2 --cv_seed 42 --save_experiments_individually

In [None]:
# GreedyRFBoostScalar: 1 Optuna trial, 2 CV folds, CV seed 42, device cuda.
# NOTE(review): presumably a quick smoke test given the tiny trial/fold counts — confirm.
!python regression_param_specs.py --models GreedyRFBoostScalar --save_dir /home/nikita/Code/random-feature-boosting/save/OpenMLRegression/ --n_optuna_trials 1 --device cuda --k_folds 2 --cv_seed 42 --save_experiments_individually

# Join JSON results

In [None]:
from pydantic.v1.utils import deep_update
import json
import os
import numpy as np
import pandas as pd

def read_json(path):
    """Read the file at ``path`` as JSON and return the parsed object."""
    with open(path, "r") as handle:
        parsed = json.loads(handle.read())
    return parsed
    


def custom_deep_update(original, update):
    """Recursively merge ``update`` into ``original`` in place.

    For each key in ``update``: when both the incoming value and the value
    already stored under that key are dicts, the two are merged recursively.
    In every other case the incoming value is stored under the key, adding it
    or replacing whatever was there.

    Args:
        original: dict that is mutated to receive the merged content.
        update: dict whose entries are merged into ``original``.

    Returns:
        None. The result is the mutated ``original``.
    """
    for key, value in update.items():
        # Recurse only when BOTH sides are dicts. Recursing into a non-dict
        # (e.g. a number, list or None stored under the same key) would raise
        # a TypeError; in that case the new value simply replaces the old one.
        if isinstance(value, dict) and isinstance(original.get(key), dict):
            custom_deep_update(original[key], value)
        else:
            original[key] = value



def get_joined_results_json(
        models = ["End2End", "Ridge", "RidgeCV", "XGBoostRegressor", 
                  "GradientRFBoost", "GradientRFBoostID", 
                  "GreedyRFBoostDense", "GreedyRFBoostDiag", "GreedyRFBoostScalar"],
        save_dir = "/home/nikita/Code/random-feature-boosting/save/OpenMLRegression/",
        ):
    """Load ``regression_<model>.json`` for every model and merge them into one dict.

    Files are read from ``save_dir`` in the order given by ``models``. The
    first loaded document becomes the accumulator; each later document is
    merged into it with ``custom_deep_update``.

    Returns:
        The merged JSON content (an empty dict when ``models`` is empty).
    """
    merged = {}
    for model_name in models:
        loaded = read_json(os.path.join(save_dir, f"regression_{model_name}.json"))
        if merged != {}:
            # Something is already accumulated: merge the new file into it.
            custom_deep_update(merged, loaded)
            continue
        # Nothing accumulated yet: adopt this document as the accumulator.
        merged = loaded
    return merged



def join_jsons_into_array(
        results_json,
        ):
    """Stack the per-fold metrics of a nested results dict into one array.

    ``results_json`` maps dataset -> model -> metrics dict. For every model
    entry the values under "score_train", "score_test", "t_fit" and
    "t_inference" are stacked in that order; entries are taken in dict
    iteration order for both datasets and models.

    Returns:
        Array of shape (n_datasets, n_models, 4, n_folds).
    """
    metric_keys = ("score_train", "score_test", "t_fit", "t_inference")
    per_dataset = [
        [
            np.stack([model_results[key] for key in metric_keys])
            for model_results in dataset_results.values()
        ]
        for dataset_results in results_json.values()
    ]
    return np.stack(per_dataset) # (n_datasets, n_models, 4, n_folds)



def results_to_df(
        models = ["End2End", "Ridge", "RidgeCV", "XGBoostRegressor", 
                  "GradientRFBoost", "GradientRFBoostID", 
                  "GreedyRFBoostDense", "GreedyRFBoostDiag", "GreedyRFBoostScalar"],
        save_dir = "/home/nikita/Code/random-feature-boosting/save/OpenMLRegression/",
        ):
    """Load the saved results and return one fold-averaged DataFrame per metric.

    Args:
        models: model names whose ``regression_<model>.json`` files are loaded
            (forwarded to ``get_joined_results_json``). The column labels of
            the returned frames come from the keys found in the loaded JSON,
            not from this list.
        save_dir: directory containing the result files.

    Returns:
        Dict keyed by "score_train", "score_test", "t_fit" and "t_inference".
        Each value is a float DataFrame indexed by dataset key with one column
        per model, holding the mean over folds.

    Raises:
        ValueError: if no results were loaded, or if some dataset does not
            contain exactly the same models as the first dataset.
    """
    # Load and join the JSON data
    results_json = get_joined_results_json(models, save_dir)
    datasets = list(results_json.keys())
    if not datasets:
        raise ValueError("No results found: the joined results JSON is empty.")

    # Column order is defined by the first dataset's entries.
    model_names = list(results_json[datasets[0]].keys())

    # Re-key every dataset in that same model order before stacking. The array
    # is positional, so without this a dataset whose models are stored in a
    # different order would end up with its values under the wrong column.
    aligned = {}
    for dataset in datasets:
        entries = results_json[dataset]
        if set(entries) != set(model_names):
            raise ValueError(
                f"Dataset {dataset!r} has models {sorted(entries)}, "
                f"expected {sorted(model_names)}."
            )
        aligned[dataset] = {name: entries[name] for name in model_names}

    results = join_jsons_into_array(aligned) # (n_datasets, n_models, 4, n_folds)
    #TODO for now just average. later, add stds too
    results = np.mean(results, axis=-1)      # (n_datasets, n_models, 4)

    # Order must match the stacking order used by join_jsons_into_array.
    metrics = ["score_train", "score_test", "t_fit", "t_inference"]

    # Build each metric's frame in one shot from its 2-D slice; .copy() keeps
    # the frames independent of the shared `results` buffer.
    metric_dfs = {
        metric: pd.DataFrame(
            results[:, :, metric_idx].copy(), index=datasets, columns=model_names
        )
        for metric_idx, metric in enumerate(metrics)
    }

    return metric_dfs  # Return a dictionary of metric-specific DataFrames


# NOTE: despite its name, `df` is a dict mapping each metric name
# ("score_train", "score_test", "t_fit", "t_inference") to a DataFrame
# (rows: datasets, columns: models). Built with the default models and save_dir.
df = results_to_df()

In [None]:
# Fold-averaged test score of each model, averaged over all datasets, smallest first.
# NOTE(review): whether smaller is better depends on the score metric, which is not visible here.
df["score_test"].mean().sort_values(ascending=True)

In [None]:
# Fold-averaged train score of each model, averaged over all datasets, smallest first.
df["score_train"].mean().sort_values(ascending=True)

In [None]:
# Rank the models within each dataset by test score (rank 1 = smallest score),
# then average each model's rank over the datasets and sort ascending.
df["score_test"].rank(axis=1).mean().sort_values()