In [3]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score
from sklearn_extra.cluster import KMedoids
import gower

#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361110
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Transform y to int type, to then be able to apply BCEWithLogitsLoss
y=y.astype('int')

# Set the random seed for reproducibility
N_TRIALS=100
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


N_CLUSTERS=20

X_gower = X.copy()

for col in X_gower.select_dtypes(['category']).columns:
    X_gower[col] = X_gower[col].astype('object')

gower_dist_matrix = gower.gower_matrix(X_gower)

kmedoids = KMedoids(n_clusters=N_CLUSTERS, random_state=0, metric='precomputed', init='k-medoids++').fit(gower_dist_matrix)
distances=[]
gower_dist=[]
counts=[]
ideal_len=len(kmedoids.labels_)/5

for i in np.arange(N_CLUSTERS):
    cluster_data = X_gower.loc[kmedoids.labels_==i,:]
    # Compute the Gower distance between each data point in the cluster and each data point in the global dataset
    distances_matrix = gower.gower_matrix(cluster_data, X_gower)
    # Compute the average distance
    average_distance = np.mean(distances_matrix)
    gower_dist.append(average_distance)
    counts.append(cluster_data.shape[0])

dist_df=pd.DataFrame(data={'gower_dist': gower_dist, 'count': counts}, index=np.arange(N_CLUSTERS))
dist_df=dist_df.sort_values('gower_dist', ascending=False)
dist_df['cumulative_count']=dist_df['count'].cumsum()
dist_df['abs_diff']=np.abs(dist_df['cumulative_count']-ideal_len)

final=(np.where(dist_df['abs_diff']==np.min(dist_df['abs_diff']))[0])[0]
labelss=dist_df.index[0:final+1].to_list()
labels=pd.Series(kmedoids.labels_).isin(labelss)
labels.index=X.index
close_index=labels.index[np.where(labels==False)[0]]
far_index=labels.index[np.where(labels==True)[0]]

X_train = X.loc[close_index,:]
X_gower_ = X_train.copy()

for col in X_gower_.select_dtypes(['category']).columns:
    X_gower_[col] = X_gower_[col].astype('object')

gower_dist_matrix_ = gower.gower_matrix(X_gower_)

kmedoids_ = KMedoids(n_clusters=N_CLUSTERS, random_state=0, metric='precomputed', init='k-medoids++').fit(gower_dist_matrix_)
distances_=[]
gower_dist_=[]
counts_=[]
ideal_len_=len(kmedoids.labels_)/5

for i in np.arange(N_CLUSTERS):
    cluster_data_ = X_gower_.loc[kmedoids_.labels_==i,:]
    # Compute the Gower distance between each data point in the cluster and each data point in the global dataset
    distances_matrix_ = gower.gower_matrix(cluster_data_, X_gower_)
    # Compute the average distance
    average_distance_ = np.mean(distances_matrix_)
    gower_dist_.append(average_distance_)
    counts_.append(cluster_data_.shape[0])

dist_df_=pd.DataFrame(data={'gower_dist': gower_dist_, 'count': counts_}, index=np.arange(N_CLUSTERS))
dist_df_=dist_df_.sort_values('gower_dist', ascending=False)
dist_df_['cumulative_count']=dist_df_['count'].cumsum()
dist_df_['abs_diff']=np.abs(dist_df_['cumulative_count']-ideal_len_)

final_=(np.where(dist_df_['abs_diff']==np.min(dist_df_['abs_diff']))[0])[0]
labelss_=dist_df_.index[0:final_+1].to_list()
labels_=pd.Series(kmedoids_.labels_).isin(labelss_)
labels_.index=X_train.index
close_index_train=labels_.index[np.where(labels_==False)[0]]
far_index_train=labels_.index[np.where(labels_==True)[0]]

# Convert data to PyTorch tensors
# Modify X_train_, X_val, X_train, and X_test to have dummy variables
X = pd.get_dummies(X.astype(str), drop_first=True)

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

X_train_ = X_train.loc[close_index_train,:]
X_val = X_train.loc[far_index_train,:]
y_train_ = y_train.loc[close_index_train]
y_val = y_train.loc[far_index_train]

# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.


ValueError: Cannot cast object dtype to int32

In [56]:
#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361110
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Transform y to int type, to then be able to apply BCEWithLogitsLoss
# Create a label encoder
le = LabelEncoder()
# Fit the label encoder and transform y to get binary labels
y_encoded = le.fit_transform(y)
# Convert the result back to a pandas Series
y = pd.Series(y_encoded, index=y.index)


Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.


In [9]:
y

0       0
1       0
2       0
3       0
4       0
       ..
7603    1
7604    1
7605    1
7606    1
7607    1
Name: label, Length: 7608, dtype: category
Categories (2, object): ['0' < '1']

In [11]:
y

0        DOWN
1        DOWN
2        DOWN
3        DOWN
4        DOWN
         ... 
38469      UP
38470      UP
38471      UP
38472      UP
38473      UP
Name: class, Length: 38474, dtype: category
Categories (2, object): ['DOWN' < 'UP']

In [54]:
y.astype('int')

0        0
1        0
2        0
3        0
4        0
        ..
16709    1
16710    1
16711    1
16712    1
16713    1
Length: 16714, dtype: int32

In [13]:
pd.to_numeric(y, errors='coerce')

0       NaN
1       NaN
2       NaN
3       NaN
4       NaN
         ..
38469   NaN
38470   NaN
38471   NaN
38472   NaN
38473   NaN
Name: class, Length: 38474, dtype: float64

In [15]:
y.dtype

CategoricalDtype(categories=['DOWN', 'UP'], ordered=True, categories_dtype=object)

In [16]:
#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361111
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Transform y to int type, to then be able to apply BCEWithLogitsLoss
#y=y.astype('int')
y.dtype

Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.


CategoricalDtype(categories=['0', '1'], ordered=True, categories_dtype=object)

In [50]:
# [361110, 361111, 361113, 361282, 361283, 361285, 361286]
SUITE_ID = 334 # Classification on numerical and categorical features
task_id=361286
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Transform y to int type, to then be able to apply BCEWithLogitsLoss
print(y)
y=y.astype('int')
print(y)

0       0
1       0
2       0
3       0
4       0
       ..
4961    1
4962    1
4963    1
4964    1
4965    1
Name: twoyearrecid, Length: 4966, dtype: category
Categories (2, object): ['0' < '1']
0       0
1       0
2       0
3       0
4       0
       ..
4961    1
4962    1
4963    1
4964    1
4965    1
Name: twoyearrecid, Length: 4966, dtype: int32


Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.


In [49]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
le = LabelEncoder()

# Fit the label encoder and transform y to get binary labels
y = le.fit_transform(y)
y

array([0, 0, 0, ..., 1, 1, 1], dtype=int64)

In [38]:
X

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer
0,0.898987,2,0.957447,0.068632,0.568283,0.004456,0.456499,0.644737
1,0.867616,5,0.234043,0.033716,0.337102,0.001672,0.329622,0.846930
2,0.009159,6,0.255319,0.059175,0.185808,0.003467,0.422915,0.414912
3,0.898987,2,0.531915,0.087577,0.539572,0.004936,0.637752,0.491667
4,0.868280,6,0.085106,0.027021,0.165129,0.001271,0.265924,0.748246
...,...,...,...,...,...,...,...,...
38469,0.915800,6,0.404255,0.077549,0.456263,0.005332,0.378560,0.356140
38470,0.915800,6,0.425532,0.074397,0.444213,0.005110,0.377525,0.369737
38471,0.915800,6,0.468085,0.072835,0.423386,0.005019,0.354480,0.380263
38472,0.915800,6,0.829787,0.065420,0.353913,0.004508,0.319524,0.319737


In [20]:
benchmark_suite.tasks

[361110, 361111, 361113, 361282, 361283, 361285, 361286]

In [39]:
SUITE_ID = 337 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite
benchmark_suite.tasks

[361055,
 361060,
 361061,
 361062,
 361063,
 361065,
 361066,
 361068,
 361069,
 361070,
 361273,
 361274,
 361275,
 361276,
 361277,
 361278]

In [40]:
X.shape

(38474, 8)

In [41]:
for task_id in benchmark_suite.tasks:
    print(task_id)
    task = openml.tasks.get_task(task_id)  # download the OpenML task
    dataset = task.get_dataset()

    X, y, categorical_indicator, attribute_names = dataset.get_data(
            dataset_format="dataframe", target=dataset.default_target_attribute)
    y.astype('int')

361055
361060


Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.
Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until vers

ValueError: Cannot cast object dtype to int32

In [42]:
X

Unnamed: 0,date,period,nswprice,nswdemand,vicprice,vicdemand,transfer
0,0.898987,0.957447,0.068632,0.568283,0.004456,0.456499,0.644737
1,0.867616,0.234043,0.033716,0.337102,0.001672,0.329622,0.846930
2,0.009159,0.255319,0.059175,0.185808,0.003467,0.422915,0.414912
3,0.898987,0.531915,0.087577,0.539572,0.004936,0.637752,0.491667
4,0.868280,0.085106,0.027021,0.165129,0.001271,0.265924,0.748246
...,...,...,...,...,...,...,...
38469,0.915800,0.404255,0.077549,0.456263,0.005332,0.378560,0.356140
38470,0.915800,0.425532,0.074397,0.444213,0.005110,0.377525,0.369737
38471,0.915800,0.468085,0.072835,0.423386,0.005019,0.354480,0.380263
38472,0.915800,0.829787,0.065420,0.353913,0.004508,0.319524,0.319737


In [33]:
y.dtype

CategoricalDtype(categories=['DOWN', 'UP'], ordered=True, categories_dtype=object)

In [None]:
from sklearn.preprocessing import LabelEncoder

# Create a label encoder
le = LabelEncoder()

# Fit the label encoder and transform y to get binary labels
y_encoded = le.fit_transform(y)

# Convert the result back to a pandas Series
y = pd.Series(y_encoded, index=y.index)

In [52]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score


#SUITE_ID = 336 # Regression on numerical features
SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361055
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)
y=y.astype('int')

# Set the random seed for reproducibility
N_TRIALS=5
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


# New new implementation
N_CLUSTERS=20
# calculate the mean and covariance matrix of the dataset
mean = np.mean(X, axis=0)
cov = np.cov(X.T)
scaler = StandardScaler()

# transform data to compute the clusters
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_scaled)
distances=[]
mahalanobis_dist=[]
counts=[]
ideal_len=len(kmeans.labels_)/5
for i in np.arange(N_CLUSTERS):
    distances.append(np.abs(np.sum(kmeans.labels_==i)-ideal_len))
    counts.append(np.sum(kmeans.labels_==i))
    mean_k= np.mean(X.loc[kmeans.labels_==i,:], axis=0)
    mahalanobis_dist.append(mahalanobis(mean_k, mean, np.linalg.inv(cov)))

dist_df=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist, 'count': counts}, index=np.arange(N_CLUSTERS))
dist_df=dist_df.sort_values('mahalanobis_dist', ascending=False)
dist_df['cumulative_count']=dist_df['count'].cumsum()
dist_df['abs_diff']=np.abs(dist_df['cumulative_count']-ideal_len)

final=(np.where(dist_df['abs_diff']==np.min(dist_df['abs_diff']))[0])[0]
labelss=dist_df.index[0:final+1].to_list()
labels=pd.Series(kmeans.labels_).isin(labelss)
labels.index=X.index
close_index=labels.index[np.where(labels==False)[0]]
far_index=labels.index[np.where(labels==True)[0]]

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

# calculate the mean and covariance matrix of the dataset
mean_ = np.mean(X_train, axis=0)
cov_ = np.cov(X_train.T)
scaler_ = StandardScaler()

# transform data to compute the clusters
X_train_scaled = scaler_.fit_transform(X_train)

kmeans_ = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_train_scaled)
distances_=[]
counts_=[]
mahalanobis_dist_=[]
ideal_len_=len(kmeans_.labels_)/5
for i in np.arange(N_CLUSTERS):
    distances_.append(np.abs(np.sum(kmeans_.labels_==i)-ideal_len_))
    counts_.append(np.sum(kmeans_.labels_==i))
    mean_k_= np.mean(X_train.loc[kmeans_.labels_==i,:], axis=0)
    mahalanobis_dist_.append(mahalanobis(mean_k_, mean_, np.linalg.inv(cov_)))

dist_df_=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist_, 'count': counts_}, index=np.arange(N_CLUSTERS))
dist_df_=dist_df_.sort_values('mahalanobis_dist', ascending=False)
dist_df_['cumulative_count']=dist_df_['count'].cumsum()
dist_df_['abs_diff']=np.abs(dist_df_['cumulative_count']-ideal_len_)

final_=(np.where(dist_df_['abs_diff']==np.min(dist_df_['abs_diff']))[0])[0]
labelss_=dist_df_.index[0:final_+1].to_list()
labels_=pd.Series(kmeans_.labels_).isin(labelss_)
labels_.index=X_train.index
close_index_=labels_.index[np.where(labels_==False)[0]]
far_index_=labels_.index[np.where(labels_==True)[0]]

X_train_ = X_train.loc[close_index_,:]
X_val = X_train.loc[far_index_,:]
y_train_ = y_train.loc[close_index_]
y_val = y_train.loc[far_index_]


# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

# GAM model
def gam_model(trial):

    # Define the hyperparameters to optimize
    params = {'n_splines': trial.suggest_int('n_splines', 5, 20),
              'lam': trial.suggest_loguniform('lam', 1e-3, 1)}

    # Create and train the model
    gam = LogisticGAM(s(0, n_splines=params['n_splines'], lam=params['lam'])).fit(X_train_, y_train_)

    # Predict on the validation set and calculate the accuracy
    y_val_hat_gam = gam.predict(X_val)
    accuracy_gam = accuracy_score(y_val, y_val_hat_gam)

    return accuracy_gam

# Create the sampler and study
sampler_gam = optuna.samplers.TPESampler(seed=seed)
study_gam = optuna.create_study(sampler=sampler_gam, direction='maximize')

# Optimize the model
study_gam.optimize(gam_model, n_trials=N_TRIALS)

# Create the final model with the best parameters
best_params = study_gam.best_params
final_gam_model = LogisticGAM(s(0, n_splines=best_params['n_splines'], lam=best_params['lam']))

# Fit the model
final_gam_model.fit(X_train, y_train)

# Predict on the test set
y_test_hat_gam = final_gam_model.predict(X_test)
# Calculate the accuracy
accuracy_gam = accuracy_score(y_test, y_test_hat_gam)
print("Accuracy GAM: ", accuracy_gam)

Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.
[I 2024-02-05 17:46:52,747] A new study created in memory with name: no-name-924550f2-1350-4f70-8901-895fe7db3898
suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.
[I 2024-02-05 17:46:52,965] Trial 0 finished with value: 0.7761388286334057 and parameters: {'n_splines': 17, 'lam': 0.001154132

Accuracy GAM:  0.7056033204862141


In [53]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score


#SUITE_ID = 336 # Regression on numerical features
SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361055
task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Create a label encoder
le = LabelEncoder()

# Fit the label encoder and transform y to get binary labels
y_encoded = le.fit_transform(y)

# Convert the result back to a pandas Series
y = pd.Series(y_encoded, index=y.index)

# Set the random seed for reproducibility
N_TRIALS=5
N_SAMPLES=100
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


# New new implementation
N_CLUSTERS=20
# calculate the mean and covariance matrix of the dataset
mean = np.mean(X, axis=0)
cov = np.cov(X.T)
scaler = StandardScaler()

# transform data to compute the clusters
X_scaled = scaler.fit_transform(X)

kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_scaled)
distances=[]
mahalanobis_dist=[]
counts=[]
ideal_len=len(kmeans.labels_)/5
for i in np.arange(N_CLUSTERS):
    distances.append(np.abs(np.sum(kmeans.labels_==i)-ideal_len))
    counts.append(np.sum(kmeans.labels_==i))
    mean_k= np.mean(X.loc[kmeans.labels_==i,:], axis=0)
    mahalanobis_dist.append(mahalanobis(mean_k, mean, np.linalg.inv(cov)))

dist_df=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist, 'count': counts}, index=np.arange(N_CLUSTERS))
dist_df=dist_df.sort_values('mahalanobis_dist', ascending=False)
dist_df['cumulative_count']=dist_df['count'].cumsum()
dist_df['abs_diff']=np.abs(dist_df['cumulative_count']-ideal_len)

final=(np.where(dist_df['abs_diff']==np.min(dist_df['abs_diff']))[0])[0]
labelss=dist_df.index[0:final+1].to_list()
labels=pd.Series(kmeans.labels_).isin(labelss)
labels.index=X.index
close_index=labels.index[np.where(labels==False)[0]]
far_index=labels.index[np.where(labels==True)[0]]

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

# calculate the mean and covariance matrix of the dataset
mean_ = np.mean(X_train, axis=0)
cov_ = np.cov(X_train.T)
scaler_ = StandardScaler()

# transform data to compute the clusters
X_train_scaled = scaler_.fit_transform(X_train)

kmeans_ = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_train_scaled)
distances_=[]
counts_=[]
mahalanobis_dist_=[]
ideal_len_=len(kmeans_.labels_)/5
for i in np.arange(N_CLUSTERS):
    distances_.append(np.abs(np.sum(kmeans_.labels_==i)-ideal_len_))
    counts_.append(np.sum(kmeans_.labels_==i))
    mean_k_= np.mean(X_train.loc[kmeans_.labels_==i,:], axis=0)
    mahalanobis_dist_.append(mahalanobis(mean_k_, mean_, np.linalg.inv(cov_)))

dist_df_=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist_, 'count': counts_}, index=np.arange(N_CLUSTERS))
dist_df_=dist_df_.sort_values('mahalanobis_dist', ascending=False)
dist_df_['cumulative_count']=dist_df_['count'].cumsum()
dist_df_['abs_diff']=np.abs(dist_df_['cumulative_count']-ideal_len_)

final_=(np.where(dist_df_['abs_diff']==np.min(dist_df_['abs_diff']))[0])[0]
labelss_=dist_df_.index[0:final_+1].to_list()
labels_=pd.Series(kmeans_.labels_).isin(labelss_)
labels_.index=X_train.index
close_index_=labels_.index[np.where(labels_==False)[0]]
far_index_=labels_.index[np.where(labels_==True)[0]]

X_train_ = X_train.loc[close_index_,:]
X_val = X_train.loc[far_index_,:]
y_train_ = y_train.loc[close_index_]
y_val = y_train.loc[far_index_]


# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train_.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

# GAM model
def gam_model(trial):

    # Define the hyperparameters to optimize
    params = {'n_splines': trial.suggest_int('n_splines', 5, 20),
              'lam': trial.suggest_loguniform('lam', 1e-3, 1)}

    # Create and train the model
    gam = LogisticGAM(s(0, n_splines=params['n_splines'], lam=params['lam'])).fit(X_train_, y_train_)

    # Predict on the validation set and calculate the accuracy
    y_val_hat_gam = gam.predict(X_val)
    accuracy_gam = accuracy_score(y_val, y_val_hat_gam)

    return accuracy_gam

# Create the sampler and study
sampler_gam = optuna.samplers.TPESampler(seed=seed)
study_gam = optuna.create_study(sampler=sampler_gam, direction='maximize')

# Optimize the model
study_gam.optimize(gam_model, n_trials=N_TRIALS)

# Create the final model with the best parameters
best_params = study_gam.best_params
final_gam_model = LogisticGAM(s(0, n_splines=best_params['n_splines'], lam=best_params['lam']))

# Fit the model
final_gam_model.fit(X_train, y_train)

# Predict on the test set
y_test_hat_gam = final_gam_model.predict(X_test)
# Calculate the accuracy
accuracy_gam = accuracy_score(y_test, y_test_hat_gam)
print("Accuracy GAM: ", accuracy_gam)

Starting from Version 0.15.0 `download_splits` will default to ``False`` instead of ``True`` and be independent from `download_data`. To disable this message until version 0.15 explicitly set `download_splits` to a bool.
Starting from Version 0.15 `download_data`, `download_qualities`, and `download_features_meta_data` will all be ``False`` instead of ``True`` by default to enable lazy loading. To disable this message until version 0.15 explicitly set `download_data`, `download_qualities`, and `download_features_meta_data` to a bool while calling `get_dataset`.
[I 2024-02-05 17:47:40,461] A new study created in memory with name: no-name-1a22f0e1-08d4-4461-9351-891cce8abc2a
suggest_loguniform has been deprecated in v3.0.0. This feature will be removed in v6.0.0. See https://github.com/optuna/optuna/releases/tag/v3.0.0. Use suggest_float(..., log=True) instead.
[I 2024-02-05 17:47:40,612] Trial 0 finished with value: 0.7761388286334057 and parameters: {'n_splines': 17, 'lam': 0.001154132

Accuracy GAM:  0.7056033204862141


divide by zero encountered in divide


In [1]:
from umap import UMAP
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.preprocessing import LabelEncoder
from utils import EarlyStopping, train, train_trans, train_no_early_stopping, train_trans_no_early_stopping, train_GP, ExactGPModel
from torch.utils.data import TensorDataset, DataLoader
import gower

#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

task_id=361110

# Create the checkpoint directory if it doesn't exist
os.makedirs('CHECKPOINTS/UMAP', exist_ok=True)
CHECKPOINT_PATH = f'CHECKPOINTS/UMAP/task_{task_id}.pt'

print(f"Task {task_id}")

task = openml.tasks.get_task(task_id)  # download the OpenML task
dataset = task.get_dataset()

X, y, categorical_indicator, attribute_names = dataset.get_data(
        dataset_format="dataframe", target=dataset.default_target_attribute)

# Find features with absolute correlation > 0.9
corr_matrix = X.corr().abs()
upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]

# Drop one of the highly correlated features
X = X.drop(high_corr_features, axis=1)

Task 361110




In [2]:
corr_matrix

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer
date,1.0,0.004318,0.001347,0.143463,0.069539,0.007842,0.077647,0.398192
day,0.004318,1.0,0.006288,0.004045,0.058458,0.000778,0.077045,0.058479
period,0.001347,0.006288,1.0,0.101543,0.424553,0.021871,0.178263,0.103193
nswprice,0.143463,0.004045,0.101543,1.0,0.308241,0.289463,0.301555,0.268668
nswdemand,0.069539,0.058458,0.424553,0.308241,1.0,0.086709,0.667414,0.260809
vicprice,0.007842,0.000778,0.021871,0.289463,0.086709,1.0,0.128585,0.080543
vicdemand,0.077647,0.077045,0.178263,0.301555,0.667414,0.128585,1.0,0.544034
transfer,0.398192,0.058479,0.103193,0.268668,0.260809,0.080543,0.544034,1.0


In [3]:
X

Unnamed: 0,date,day,period,nswprice,nswdemand,vicprice,vicdemand,transfer
0,0.898987,2,0.957447,0.068632,0.568283,0.004456,0.456499,0.644737
1,0.867616,5,0.234043,0.033716,0.337102,0.001672,0.329622,0.846930
2,0.009159,6,0.255319,0.059175,0.185808,0.003467,0.422915,0.414912
3,0.898987,2,0.531915,0.087577,0.539572,0.004936,0.637752,0.491667
4,0.868280,6,0.085106,0.027021,0.165129,0.001271,0.265924,0.748246
...,...,...,...,...,...,...,...,...
38469,0.915800,6,0.404255,0.077549,0.456263,0.005332,0.378560,0.356140
38470,0.915800,6,0.425532,0.074397,0.444213,0.005110,0.377525,0.369737
38471,0.915800,6,0.468085,0.072835,0.423386,0.005019,0.354480,0.380263
38472,0.915800,6,0.829787,0.065420,0.353913,0.004508,0.319524,0.319737


In [2]:
# Transform y to int type, to then be able to apply BCEWithLogitsLoss
# Create a label encoder
le = LabelEncoder()
# Fit the label encoder and transform y to get binary labels
y_encoded = le.fit_transform(y)
# Convert the result back to a pandas Series
y = pd.Series(y_encoded, index=y.index)

# Set the random seed for reproducibility
N_TRIALS=100
N_SAMPLES=100
PATIENCE=40
N_EPOCHS=1000
GP_ITERATIONS=1000
BATCH_SIZE=1024
seed=10
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
random.seed(seed)


# Apply UMAP decomposition
umap = UMAP(n_components=2, random_state=42)
X_umap = umap.fit_transform(X)

# calculate the Euclidean distance matrix
euclidean_dist_matrix = euclidean_distances(X_umap)

# calculate the Euclidean distance for each data point
euclidean_dist = np.mean(euclidean_dist_matrix, axis=1)

euclidean_dist = pd.Series(euclidean_dist, index=X.index)
far_index = euclidean_dist.index[np.where(euclidean_dist >= np.quantile(euclidean_dist, 0.8))[0]]
close_index = euclidean_dist.index[np.where(euclidean_dist < np.quantile(euclidean_dist, 0.8))[0]]

X_train = X.loc[close_index,:]

# Apply UMAP decomposition on the training set
X_umap_train = umap.fit_transform(X_train)

# calculate the Euclidean distance matrix for the training set
euclidean_dist_matrix_train = euclidean_distances(X_umap_train)

# calculate the Euclidean distance for each data point in the training set
euclidean_dist_train = np.mean(euclidean_dist_matrix_train, axis=1)

euclidean_dist_train = pd.Series(euclidean_dist_train, index=X_train.index)
far_index_train = euclidean_dist_train.index[np.where(euclidean_dist_train >= np.quantile(euclidean_dist_train, 0.8))[0]]
close_index_train = euclidean_dist_train.index[np.where(euclidean_dist_train < np.quantile(euclidean_dist_train, 0.8))[0]]


# Convert data to PyTorch tensors
# Modify X_train_, X_val, X_train, and X_test to have dummy variables
X = pd.get_dummies(X.astype(str), drop_first=True)

X_train = X.loc[close_index,:]
X_test = X.loc[far_index,:]
y_train = y.loc[close_index]
y_test = y.loc[far_index]

X_train_ = X_train.loc[close_index_train,:]
X_val = X_train.loc[far_index_train,:]
y_train_ = y_train.loc[close_index_train]
y_val = y_train.loc[far_index_train]

# Standardize the data for non-dummy variables
non_dummy_cols = X.select_dtypes(exclude=['bool']).columns
mean_X_train_ = np.mean(X_train_[non_dummy_cols], axis=0)
std_X_train_ = np.std(X_train_[non_dummy_cols], axis=0)
X_train__scaled = X_train_.copy()
X_train__scaled[non_dummy_cols] = (X_train_[non_dummy_cols] - mean_X_train_) / std_X_train_
X_val_scaled = X_val.copy()
X_val_scaled[non_dummy_cols] = (X_val[non_dummy_cols] - mean_X_train_) / std_X_train_

mean_X_train = np.mean(X_train[non_dummy_cols], axis=0)
std_X_train = np.std(X_train[non_dummy_cols], axis=0)
X_train_scaled = X_train.copy()
X_train_scaled[non_dummy_cols] = (X_train[non_dummy_cols] - mean_X_train) / std_X_train
X_test_scaled = X_test.copy()
X_test_scaled[non_dummy_cols] = (X_test[non_dummy_cols] - mean_X_train) / std_X_train

# Convert data to PyTorch tensors
X_train__tensor = torch.tensor(X_train__scaled.values, dtype=torch.float32)
y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_scaled.values, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

# Convert to use GPU if available
if torch.cuda.is_available():
    X_train__tensor = X_train__tensor.cuda()
    y_train__tensor = y_train__tensor.cuda()
    X_train_tensor = X_train_tensor.cuda()
    y_train_tensor = y_train_tensor.cuda()
    X_val_tensor = X_val_tensor.cuda()
    y_val_tensor = y_val_tensor.cuda()
    X_test_tensor = X_test_tensor.cuda()
    y_test_tensor = y_test_tensor.cuda()

# Create flattened versions of the data
y_val_np = y_val.values.flatten()
y_test_np = y_test.values.flatten()

# Create TensorDatasets for training and validation sets
train__dataset = TensorDataset(X_train__tensor, y_train__tensor)
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders for training and validation sets
train__loader = DataLoader(train__dataset, batch_size=BATCH_SIZE, shuffle=True)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

d_out = 1  
d_in=X_train_.shape[1]

  warn(f"n_jobs value {self.n_jobs} overridden to 1 by setting random_state. Use no seed for parallelism.")


In [None]:
N_TRIALS=5

In [None]:
#### Boosted trees, random forest, engression, linear regression
def boosted(trial):

    params = {'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.5, log=True),
            'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
            'max_depth': trial.suggest_int('max_depth', 1, 30),
            'num_leaves': 2**10,
            'min_child_samples': trial.suggest_int('min_child_samples', 10, 100)}
    
    boosted_tree_model=lgbm.LGBMClassifier(**params)
    boosted_tree_model.fit(X_train_, y_train_)
    y_val_hat_boost=boosted_tree_model.predict(X_val)
    print(y_val_hat_boost)
    accuracy_boost=accuracy_score(y_val, y_val_hat_boost)

    return accuracy_boost

sampler_boost = optuna.samplers.TPESampler(seed=seed)
study_boost = optuna.create_study(sampler=sampler_boost, direction='maximize')
study_boost.optimize(boosted, n_trials=N_TRIALS)
params=study_boost.best_params
params['num_leaves']=2**10
boosted_model=lgbm.LGBMClassifier(**params)

def rf(trial):

    params = {'n_estimators': trial.suggest_int('n_estimators', 100, 500),
            'max_depth': trial.suggest_int('max_depth', 1, 30),
            'max_features': trial.suggest_float('max_features', 0, 1),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 10, 100)}
    
    rf_model=RandomForestClassifier(**params)
    rf_model.fit(X_train_, y_train_)
    y_val_hat_rf=rf_model.predict(X_val)
    accuracy_rf=accuracy_score(y_val, y_val_hat_rf)

    return accuracy_rf

sampler_rf = optuna.samplers.TPESampler(seed=seed)
study_rf = optuna.create_study(sampler=sampler_rf, direction='maximize')
study_rf.optimize(rf, n_trials=N_TRIALS)
rf_model=RandomForestClassifier(**study_rf.best_params)


# Fit the boosted model and make predictions
boosted_model.fit(X_train, y_train)
y_test_hat_boosted = boosted_model.predict(X_test)
accuracy_boosted = accuracy_score(y_test, y_test_hat_boosted)

# Fit the random forest model and make predictions
rf_model.fit(X_train, y_train)
y_test_hat_rf = rf_model.predict(X_test)
accuracy_rf = accuracy_score(y_test, y_test_hat_rf)

# Fit the logistic regression model and make predictions
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
y_test_hat_logreg = log_reg.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_test_hat_logreg)

constant_prediction = np.full_like(y_test, np.mean(y_train))
constant_prediction = np.where(constant_prediction >= 0.5, 1, 0)
accuracy_constant = accuracy_score(y_test, constant_prediction)

print("Accuracy logistic regression: ", accuracy_logreg)
print("Accuracy boosted trees: ", accuracy_boosted)
print("Accuracy random forest: ", accuracy_rf)
print("Accuracy constant prediction: ", accuracy_constant)

In [2]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import accuracy_score
import gower
from sklearn.preprocessing import LabelEncoder
from utils import EarlyStopping, train, train_trans, train_no_early_stopping, train_trans_no_early_stopping, train_GP, ExactGPModel
from torch.utils.data import TensorDataset, DataLoader
import re


#SUITE_ID = 336 # Regression on numerical features
#SUITE_ID = 337 # Classification on numerical features
SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

#task_id=361110
for task_id in benchmark_suite.tasks:  # iterate over all tasks in the suite

    # Set the random seed for reproducibility
    N_TRIALS=100
    N_SAMPLES=100
    PATIENCE=40
    N_EPOCHS=1000
    GP_ITERATIONS=1000
    BATCH_SIZE=1024
    seed=10
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)

    print(f"Task {task_id}")

    # Create the checkpoint directory if it doesn't exist
    os.makedirs('CHECKPOINTS/GOWER', exist_ok=True)
    CHECKPOINT_PATH = f'CHECKPOINTS/GOWER/task_{task_id}.pt'

    print(f"Task {task_id}")

    task = openml.tasks.get_task(task_id)  # download the OpenML task
    dataset = task.get_dataset()

    X, y, categorical_indicator, attribute_names = dataset.get_data(
            dataset_format="dataframe", target=dataset.default_target_attribute)
    
    if len(X) > 15000:
        indices = np.random.choice(X.index, size=15000, replace=False)
        X = X.iloc[indices,]
        y = y[indices]

    # Remove categorical columns with more than 20 unique values and non-categorical columns with less than 10 unique values
    # Remove non-categorical columns with more than 70% of the data in one category
    for col in [attribute for attribute, indicator in zip(attribute_names, categorical_indicator) if indicator]:
        if len(X[col].unique()) > 20:
            X = X.drop(col, axis=1)

    for col in [attribute for attribute, indicator in zip(attribute_names, categorical_indicator) if not indicator]:
        if len(X[col].unique()) < 10:
            X = X.drop(col, axis=1)
        elif X[col].value_counts(normalize=True).max() > 0.7:
                X = X.drop(col, axis=1)
    
    # Find features with absolute correlation > 0.9
    corr_matrix = X.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]

    # Drop one of the highly correlated features
    X = X.drop(high_corr_features, axis=1)

    # Rename columns to avoid problems with LGBM
    X = X.rename(columns = lambda x:re.sub('[^A-Za-z0-9_]+', '', x))

    # Transform y to int type, to then be able to apply BCEWithLogitsLoss
    # Create a label encoder
    le = LabelEncoder()
    # Fit the label encoder and transform y to get binary labels
    y_encoded = le.fit_transform(y)
    # Convert the result back to a pandas Series
    y = pd.Series(y_encoded, index=y.index)

    print(X.shape)

Task 361093
Task 361093
(4052, 6)
Task 361094
Task 361094




(8641, 4)
Task 361096
Task 361096
(15000, 6)
Task 361097
Task 361097




(4209, 263)
Task 361098
Task 361098
(10692, 10)
Task 361099
Task 361099




(15000, 10)
Task 361101
Task 361101




(15000, 9)
Task 361102
Task 361102
(15000, 16)
Task 361103
Task 361103




(15000, 6)
Task 361104
Task 361104




(15000, 7)
Task 361287
Task 361287




(8885, 57)
Task 361288
Task 361288
(4177, 3)
Task 361289
Task 361289




(15000, 3)
Task 361291
Task 361291




(15000, 9)
Task 361292
Task 361292




(15000, 117)
Task 361293
Task 361293




(15000, 5)
Task 361294
Task 361294
(15000, 3)


