In [3]:
from typing import Tuple, List, Union, Any, Optional, Dict, Literal, Callable
import time
import collections
import os
import sys
# Put the parent and the grandparent of the current working directory on the import path.
# NOTE(review): presumably required so the project-local modules imported further down
# resolve when the notebook is started from a nested folder — confirm.
_parent_dir = os.path.dirname(os.getcwd())
sys.path.extend([_parent_dir, os.path.dirname(_parent_dir)])

from tqdm import tqdm
import numpy as np
import torch
import torch.nn as nn
from torch import Tensor, tensor
import pandas as pd
import openml


from optuna_kfoldCV import evaluate_dataset_with_model, run_all_openML_with_model
from classification_param_specs import evaluate_LogisticRegression, evaluate_XGBoostClassifier

# Compact NumPy printing: 3 decimal places, and arrays with more than 5 elements are summarised.
np.set_printoptions(threshold=5, precision=3)

# OpenML code

In [4]:
# Fetch the collection with ID 99 https://www.openml.org/search?type=study&study_type=task&id=99&sort=runs_included
collection = openml.study.get_suite(99)
dataset_ids = collection.data
metadata_list = []

# Download every dataset in the suite and record one row of summary statistics for it.
for i, dataset_id in enumerate(dataset_ids):
    dataset = openml.datasets.get_dataset(dataset_id)
    X, y, categorical_indicator, attribute_names = dataset.get_data(
        target=dataset.default_target_attribute
    )

    # Count missing entries while X still exposes .isnull(), i.e. before the NumPy conversion.
    missing_values_count = X.isnull().sum().sum()
    print(f"Missing values in X: {missing_values_count}")

    X = np.array(X)
    y = np.array(y)[..., None]  # add a trailing axis so y has shape (n_obs, 1)
    print(X.shape)
    print(y.shape)

    # Determine if the dataset has categorical features
    has_categorical = any(categorical_indicator)

    # Number of distinct target values; computed once and reused for both target columns.
    n_unique_y = len(np.unique(y))

    # Extract the required metadata
    # NOTE(review): in the recorded output 'n_features' is one larger than X.shape[1]
    # (e.g. 37 vs 36 for kr-vs-kp), so the 'NumberOfFeatures' quality appears to count the
    # target column as well — confirm before relying on the `n_features < 500` filter.
    metadata = {
        'dataset_id': dataset.id,
        'name': dataset.name,
        'n_obs': int(dataset.qualities['NumberOfInstances']),
        'n_features': int(dataset.qualities['NumberOfFeatures']),
        '%_unique_y': n_unique_y/len(y),
        'n_unique_y': n_unique_y,
        'has_categorical': has_categorical,
        'n_missing_values': missing_values_count,
    }

    metadata_list.append(metadata)
    print(f" {i+1}/{len(dataset_ids)} Processed dataset {dataset.id}: {dataset.name}")

# Create a DataFrame from the metadata list, indexed and ordered by dataset id.
# (The former sort by '%_unique_y' was overridden by sort_index(), and the follow-up
# sort_values() call discarded its result, so both were dropped without changing the table.)
df_metadata = pd.DataFrame(metadata_list).set_index("dataset_id").sort_index()

# Display the metadata DataFrame
df_metadata

Missing values in X: 0
(3196, 36)
(3196, 1)
 1/72 Processed dataset 3: kr-vs-kp
Missing values in X: 0
(20000, 16)
(20000, 1)
 2/72 Processed dataset 6: letter
Missing values in X: 0
(625, 4)
(625, 1)
 3/72 Processed dataset 11: balance-scale
Missing values in X: 0
(2000, 216)
(2000, 1)
 4/72 Processed dataset 12: mfeat-factors
Missing values in X: 0
(2000, 76)
(2000, 1)
 5/72 Processed dataset 14: mfeat-fourier
Missing values in X: 16
(699, 9)
(699, 1)
 6/72 Processed dataset 15: breast-w
Missing values in X: 0
(2000, 64)
(2000, 1)
 7/72 Processed dataset 16: mfeat-karhunen
Missing values in X: 0
(2000, 6)
(2000, 1)
 8/72 Processed dataset 18: mfeat-morphological
Missing values in X: 0
(2000, 47)
(2000, 1)
 9/72 Processed dataset 22: mfeat-zernike
Missing values in X: 0
(1473, 9)
(1473, 1)
 10/72 Processed dataset 23: cmc
Missing values in X: 0
(5620, 64)
(5620, 1)
 11/72 Processed dataset 28: optdigits
Missing values in X: 67
(690, 15)
(690, 1)
 12/72 Processed dataset 29: credit-app

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical,n_missing_values
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3,kr-vs-kp,3196,37,0.000626,2,True,0
6,letter,20000,17,0.001300,26,False,0
11,balance-scale,625,5,0.004800,3,False,0
12,mfeat-factors,2000,217,0.005000,10,False,0
14,mfeat-fourier,2000,77,0.005000,10,False,0
...,...,...,...,...,...,...,...
40983,wilt,4839,6,0.000413,2,False,0
40984,segment,2310,20,0.003030,7,False,0
40994,climate-model-simulation-crashes,540,21,0.003704,2,False,0
40996,Fashion-MNIST,70000,785,0.000143,10,False,0


In [5]:
# Keep only datasets with fewer than 500 features, no categorical columns and no missing
# values; the result is the array of their dataset ids in ascending order.
ids_nfeatsLess500_noCat_noMissing = (
    df_metadata
    .query("n_features < 500 and has_categorical==False and n_missing_values==0")
    .sort_index()
    .index.values
)
# A bare `len(...)` in the middle of a cell is evaluated and thrown away, so print the count.
print(f"{len(ids_nfeatsLess500_noCat_noMissing)} datasets selected")

# Comma-separated string of the selected ids, shown as the cell output.
allIDS_str = ",".join(str(i) for i in ids_nfeatsLess500_noCat_noMissing)
allIDS_str

'6,11,12,14,16,18,22,28,32,37,44,54,182,458,1049,1050,1063,1067,1068,1462,1464,1475,1487,1489,1494,1497,1501,1510,4538,23517,40499,40979,40982,40983,40984,40994,41027'

In [4]:
# Raise the pandas row-display limit to 72 so the selection below is rendered without truncation,
# then show the metadata rows of the selected dataset ids.
_max_rows_to_show = 72
pd.options.display.max_rows = _max_rows_to_show
df_metadata.loc[ids_nfeatsLess500_noCat_noMissing]

Unnamed: 0_level_0,name,n_obs,n_features,%_unique_y,n_unique_y,has_categorical,n_missing_values
dataset_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
6,letter,20000,17,0.0013,26,False,0
11,balance-scale,625,5,0.0048,3,False,0
12,mfeat-factors,2000,217,0.005,10,False,0
14,mfeat-fourier,2000,77,0.005,10,False,0
16,mfeat-karhunen,2000,65,0.005,10,False,0
18,mfeat-morphological,2000,7,0.005,10,False,0
22,mfeat-zernike,2000,48,0.005,10,False,0
28,optdigits,5620,65,0.001779,10,False,0
32,pendigits,10992,17,0.00091,10,False,0
37,diabetes,768,9,0.002604,2,False,0


# OpenML classification code (only for testing purposes)

In [8]:
# Smoke test of the tuning/evaluation pipeline with logistic regression on a two-dataset slice
# (positions 19 and 20 of the filtered id array; datasets 1462 and 1464 in the recorded run).
# The save directory keeps its original default but can be redirected through the
# SWIM_SAVE_DIR environment variable, so the cell is not tied to one machine's filesystem.
SAVE_DIR = os.environ.get("SWIM_SAVE_DIR", "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/")
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead of failing.
# NOTE(review): assumes run_all_openML_with_model accepts device="cpu" — confirm.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

run_all_openML_with_model(
    ids_nfeatsLess500_noCat_noMissing[19:21],
    evaluate_LogisticRegression,
    name_model="LogisticRegression",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="classification",
    n_optuna_trials=5,
    device=DEVICE,
    save_dir=SAVE_DIR,
)

[I 2024-11-24 20:16:25,180] A new study created in memory with name: no-name-afd15901-75aa-43c9-ba44-b130aa93bf03
[I 2024-11-24 20:16:27,096] Trial 0 finished with value: -0.9544167637825012 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 0.06164572167296636, 'max_iter': 64}. Best is trial 0 with value: -0.9544167637825012.
[I 2024-11-24 20:16:27,271] Trial 1 finished with value: -0.9908841848373413 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 1.6860318828068558e-07, 'max_iter': 190}. Best is trial 1 with value: -0.9908841848373413.
[I 2024-11-24 20:16:27,461] Trial 2 finished with value: -0.9908841848373413 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 5.43659545255938e-07, 'max_iter': 93}. Best is trial 1 with value: -0.9908841848373413.
[I 2024-11-24 20:16:27,646] Trial 3 finished with value: -0.9908841848373413 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 3.5568130060428566e-07, 'max_iter': 52}. Best is trial 1 with value: -0.9908841848373413.
[I 202

 1/2 Processed dataset 1462


[I 2024-11-24 20:16:31,355] Trial 2 finished with value: -0.7360504269599915 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 7.433736930960216e-08, 'max_iter': 65}. Best is trial 0 with value: -0.7893837571144104.
[I 2024-11-24 20:16:31,490] Trial 3 finished with value: -0.7860224127769471 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 0.001363192622680815, 'max_iter': 77}. Best is trial 0 with value: -0.7893837571144104.
[I 2024-11-24 20:16:31,590] Trial 4 finished with value: -0.7893837571144104 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 0.00023005482339100665, 'max_iter': 21}. Best is trial 0 with value: -0.7893837571144104.
[I 2024-11-24 20:16:31,603] A new study created in memory with name: no-name-30e32f57-44be-41b3-b03b-f2789f6ca4eb
[I 2024-11-24 20:16:31,702] Trial 0 finished with value: -0.7826330542564393 and parameters: {'in_dim': 4, 'out_dim': 2, 'l2_reg': 8.343713451443599e-06, 'max_iter': 132}. Best is trial 0 with value: -0.7826330542564393.
[I 20

 2/2 Processed dataset 1464


{'1462': {'LogisticRegression': {'score_train': [-0.9954420924186707,
    -0.9908842444419861,
    -0.994535505771637,
    -0.993624746799469,
    -0.9918032884597778],
   'score_test': [-0.9854545593261719,
    -0.9927272796630859,
    -0.9927007555961609,
    -0.9927007555961609,
    -1.0],
   't_fit': [0.20907500300381798,
    0.02446504100225866,
    0.042668960995797534,
    0.033831611995992716,
    0.04433432000223547],
   't_inference': [8.301500201923773e-05,
    0.00010463900252943859,
    7.278300472535193e-05,
    8.614700345788151e-05,
    0.00010356999700888991],
   'hyperparams': [{'in_dim': 4,
     'out_dim': 2,
     'l2_reg': 2.5958265576242355e-05,
     'max_iter': 195},
    {'in_dim': 4,
     'out_dim': 2,
     'l2_reg': 2.7259670188469582e-06,
     'max_iter': 22},
    {'in_dim': 4,
     'out_dim': 2,
     'l2_reg': 1.004935809739053e-08,
     'max_iter': 37},
    {'in_dim': 4,
     'out_dim': 2,
     'l2_reg': 5.525811842979985e-07,
     'max_iter': 59},
    {'in_d

In [6]:
# Smoke test of the tuning/evaluation pipeline with the XGBoost classifier on a two-dataset
# slice (positions 19 and 20 of the filtered id array; datasets 1462 and 1464 in the recorded run).
# The save directory keeps its original default but can be redirected through the
# SWIM_SAVE_DIR environment variable, so the cell is not tied to one machine's filesystem.
SAVE_DIR = os.environ.get("SWIM_SAVE_DIR", "/home/nikita/Code/zephyrox/pytorch_based/SWIM/save/")
# Use the GPU when PyTorch can see one; otherwise fall back to the CPU instead of failing.
# NOTE(review): assumes run_all_openML_with_model accepts device="cpu" — confirm.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

run_all_openML_with_model(
    ids_nfeatsLess500_noCat_noMissing[19:21],
    evaluate_XGBoostClassifier,
    name_model="XGBoost",
    k_folds=5,
    cv_seed=42,
    regression_or_classification="classification",
    n_optuna_trials=5,
    device=DEVICE,
    save_dir=SAVE_DIR,
)

[I 2024-11-24 20:15:29,692] A new study created in memory with name: no-name-6b9389d2-edca-486d-88f8-a3549bc4bad0
[I 2024-11-24 20:15:30,087] Trial 0 finished with value: -0.9899709343910217 and parameters: {'objective': 'multi:softmax', 'num_class': 2, 'lambda': 0.009022423344545576, 'learning_rate': 0.1083330254029035, 'n_estimators': 73, 'max_depth': 9, 'subsample': 0.5836071610028561, 'colsample_bytree': 0.8275271365005856}. Best is trial 0 with value: -0.9899709343910217.
[I 2024-11-24 20:15:30,503] Trial 1 finished with value: -0.9908841729164124 and parameters: {'objective': 'multi:softmax', 'num_class': 2, 'lambda': 0.07450351970780383, 'learning_rate': 0.28961481614509105, 'n_estimators': 223, 'max_depth': 9, 'subsample': 0.6963461304444939, 'colsample_bytree': 0.8475119079656562}. Best is trial 1 with value: -0.9908841729164124.
[I 2024-11-24 20:15:30,998] Trial 2 finished with value: -0.9908800244331359 and parameters: {'objective': 'multi:softmax', 'num_class': 2, 'lambda':

 1/2 Processed dataset 1462


[I 2024-11-24 20:15:42,123] Trial 0 finished with value: -0.7274369835853577 and parameters: {'objective': 'multi:softmax', 'num_class': 2, 'lambda': 1.202428218141367, 'learning_rate': 0.28385165838889875, 'n_estimators': 96, 'max_depth': 10, 'subsample': 0.9143176355167892, 'colsample_bytree': 0.5254376676933088}. Best is trial 0 with value: -0.7274369835853577.
[I 2024-11-24 20:15:42,557] Trial 1 finished with value: -0.760896360874176 and parameters: {'objective': 'multi:softmax', 'num_class': 2, 'lambda': 0.050836644248845694, 'learning_rate': 0.042014169858407495, 'n_estimators': 247, 'max_depth': 4, 'subsample': 0.7145554930249872, 'colsample_bytree': 0.6019721047495981}. Best is trial 1 with value: -0.760896360874176.
[I 2024-11-24 20:15:43,092] Trial 2 finished with value: -0.7357983231544495 and parameters: {'objective': 'multi:softmax', 'num_class': 2, 'lambda': 0.1916291239241457, 'learning_rate': 0.059374656775846205, 'n_estimators': 193, 'max_depth': 6, 'subsample': 0.778

 2/2 Processed dataset 1464


{'1462': {'XGBoost': {'score_train': [-1.0, -1.0, -1.0, -1.0, -1.0],
   'score_test': [-0.996363639831543,
    -1.0,
    -0.9963503479957581,
    -0.9963503479957581,
    -0.9927007555961609],
   't_fit': [0.07142497899621958,
    0.04901612400135491,
    0.12023879699700046,
    0.1677685449976707,
    0.06706753799517173],
   't_inference': [0.002409009000984952,
    0.002744094999798108,
    0.00787384899740573,
    0.006493987995781936,
    0.003907447004166897],
   'hyperparams': [{'objective': 'multi:softmax',
     'num_class': 2,
     'lambda': 0.07450351970780383,
     'learning_rate': 0.28961481614509105,
     'n_estimators': 223,
     'max_depth': 9,
     'subsample': 0.6963461304444939,
     'colsample_bytree': 0.8475119079656562},
    {'objective': 'multi:softmax',
     'num_class': 2,
     'lambda': 0.07964227087643784,
     'learning_rate': 0.18139793238565285,
     'n_estimators': 109,
     'max_depth': 3,
     'subsample': 0.635547226459491,
     'colsample_bytree': 0.7

In [7]:
# TODO: Now I just need to implement all the models, then run them on the cluster.