In [16]:
import pandas as pd
import numpy as np
import setuptools
import openml
from sklearn.linear_model import LogisticRegression 
import lightgbm as lgbm
import optuna
from scipy.spatial.distance import mahalanobis
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.gaussian_process.kernels import Matern
from engression import engression, engression_bagged
import torch
from sklearn.preprocessing import StandardScaler
from scipy.spatial.distance import mahalanobis
from scipy.stats import norm
from sklearn.metrics import mean_squared_error
from rtdl_revisiting_models import MLP, ResNet, FTTransformer
import random
import gpytorch
import tqdm.auto as tqdm
import os
from pygam import LogisticGAM, s
import torch
from torch import nn
from torch.optim import Adam
from sklearn.metrics import log_loss
from sklearn.preprocessing import LabelEncoder
from utils import EarlyStopping, train, train_trans, train_no_early_stopping, train_trans_no_early_stopping
from torch.utils.data import TensorDataset, DataLoader


#SUITE_ID = 336 # Regression on numerical features
SUITE_ID = 337 # Classification on numerical features
#SUITE_ID = 335 # Regression on numerical and categorical features
#SUITE_ID = 334 # Classification on numerical and categorical features
benchmark_suite = openml.study.get_suite(SUITE_ID)  # obtain the benchmark suite

#task_id=361055
for task_id in benchmark_suite.tasks:
    # Create the checkpoint directory if it doesn't exist
    os.makedirs('CHECKPOINTS/CLUSTERING', exist_ok=True)
    CHECKPOINT_PATH = f'CHECKPOINTS/CLUSTERING/task_{task_id}.pt'

    print(f"Task {task_id}")

    task = openml.tasks.get_task(task_id)  # download the OpenML task
    dataset = task.get_dataset()

    X, y, categorical_indicator, attribute_names = dataset.get_data(
            dataset_format="dataframe", target=dataset.default_target_attribute)

    # Find features with absolute correlation > 0.9
    corr_matrix = X.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    high_corr_features = [column for column in upper_tri.columns if any(upper_tri[column] > 0.9)]

    # Drop one of the highly correlated features
    X = X.drop(high_corr_features, axis=1)

    # Transform y to int type, to then be able to apply BCEWithLogitsLoss
    # Create a label encoder
    le = LabelEncoder()
    # Fit the label encoder and transform y to get binary labels
    y_encoded = le.fit_transform(y)
    # Convert the result back to a pandas Series
    y = pd.Series(y_encoded, index=y.index)

    # Set the random seed for reproducibility
    N_TRIALS=100
    N_SAMPLES=100
    PATIENCE=40
    N_EPOCHS=1000
    GP_ITERATIONS=1000
    BATCH_SIZE=1024
    seed=10
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    random.seed(seed)


    # New new implementation
    N_CLUSTERS=20
    # calculate the mean and covariance matrix of the dataset
    mean = np.mean(X, axis=0)
    cov = np.cov(X.T)
    scaler = StandardScaler()

    # transform data to compute the clusters
    X_scaled = scaler.fit_transform(X)

    kmeans = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_scaled)
    distances=[]
    mahalanobis_dist=[]
    counts=[]
    ideal_len=len(kmeans.labels_)/5
    for i in np.arange(N_CLUSTERS):
        distances.append(np.abs(np.sum(kmeans.labels_==i)-ideal_len))
        counts.append(np.sum(kmeans.labels_==i))
        mean_k= np.mean(X.loc[kmeans.labels_==i,:], axis=0)
        mahalanobis_dist.append(mahalanobis(mean_k, mean, np.linalg.inv(cov)))

    dist_df=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist, 'count': counts}, index=np.arange(N_CLUSTERS))
    dist_df=dist_df.sort_values('mahalanobis_dist', ascending=False)
    dist_df['cumulative_count']=dist_df['count'].cumsum()
    dist_df['abs_diff']=np.abs(dist_df['cumulative_count']-ideal_len)

    final=(np.where(dist_df['abs_diff']==np.min(dist_df['abs_diff']))[0])[0]
    labelss=dist_df.index[0:final+1].to_list()
    labels=pd.Series(kmeans.labels_).isin(labelss)
    labels.index=X.index
    close_index=labels.index[np.where(labels==False)[0]]
    far_index=labels.index[np.where(labels==True)[0]]

    X_train = X.loc[close_index,:]
    X_test = X.loc[far_index,:]
    y_train = y.loc[close_index]
    y_test = y.loc[far_index]

    # calculate the mean and covariance matrix of the dataset
    mean_ = np.mean(X_train, axis=0)
    cov_ = np.cov(X_train.T)
    scaler_ = StandardScaler()

    # transform data to compute the clusters
    X_train_scaled = scaler_.fit_transform(X_train)

    kmeans_ = KMeans(n_clusters=N_CLUSTERS, random_state=0, n_init="auto").fit(X_train_scaled)
    distances_=[]
    counts_=[]
    mahalanobis_dist_=[]
    ideal_len_=len(kmeans_.labels_)/5
    for i in np.arange(N_CLUSTERS):
        distances_.append(np.abs(np.sum(kmeans_.labels_==i)-ideal_len_))
        counts_.append(np.sum(kmeans_.labels_==i))
        mean_k_= np.mean(X_train.loc[kmeans_.labels_==i,:], axis=0)
        mahalanobis_dist_.append(mahalanobis(mean_k_, mean_, np.linalg.inv(cov_)))

    dist_df_=pd.DataFrame(data={'mahalanobis_dist': mahalanobis_dist_, 'count': counts_}, index=np.arange(N_CLUSTERS))
    dist_df_=dist_df_.sort_values('mahalanobis_dist', ascending=False)
    dist_df_['cumulative_count']=dist_df_['count'].cumsum()
    dist_df_['abs_diff']=np.abs(dist_df_['cumulative_count']-ideal_len_)

    final_=(np.where(dist_df_['abs_diff']==np.min(dist_df_['abs_diff']))[0])[0]
    labelss_=dist_df_.index[0:final_+1].to_list()
    labels_=pd.Series(kmeans_.labels_).isin(labelss_)
    labels_.index=X_train.index
    close_index_=labels_.index[np.where(labels_==False)[0]]
    far_index_=labels_.index[np.where(labels_==True)[0]]

    X_train_ = X_train.loc[close_index_,:]
    X_val = X_train.loc[far_index_,:]
    y_train_ = y_train.loc[close_index_]
    y_val = y_train.loc[far_index_]


    # Standardize the data
    mean_X_train_ = np.mean(X_train_, axis=0)
    std_X_train_ = np.std(X_train_, axis=0)
    X_train__scaled = (X_train_ - mean_X_train_) / std_X_train_
    X_val_scaled = (X_val - mean_X_train_) / std_X_train_

    mean_X_train = np.mean(X_train, axis=0)
    std_X_train = np.std(X_train, axis=0)
    X_train_scaled = (X_train - mean_X_train) / std_X_train
    X_test_scaled = (X_test - mean_X_train) / std_X_train


    # Convert data to PyTorch tensors
    X_train__tensor = torch.tensor(X_train__scaled.values, dtype=torch.float32)
    y_train__tensor = torch.tensor(y_train_.values, dtype=torch.float32)
    X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)
    X_val_tensor = torch.tensor(X_val_scaled.values, dtype=torch.float32)
    y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32)
    X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32)

    # Convert to use GPU if available
    if torch.cuda.is_available():
        X_train__tensor = X_train__tensor.cuda()
        y_train__tensor = y_train__tensor.cuda()
        X_train_tensor = X_train_tensor.cuda()
        y_train_tensor = y_train_tensor.cuda()
        X_val_tensor = X_val_tensor.cuda()
        y_val_tensor = y_val_tensor.cuda()
        X_test_tensor = X_test_tensor.cuda()
        y_test_tensor = y_test_tensor.cuda()

    # Create flattened versions of the data
    y_val_np = y_val.values.flatten()
    y_test_np = y_test.values.flatten()

    # Create TensorDatasets for training and validation sets
    train__dataset = TensorDataset(X_train__tensor, y_train__tensor)
    train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
    val_dataset = TensorDataset(X_val_tensor, y_val_tensor)
    test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

    # Create DataLoaders for training and validation sets
    train__loader = DataLoader(train__dataset, batch_size=BATCH_SIZE, shuffle=True)
    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

    d_out = 1  
    d_in=X_train_.shape[1]

    # Create and train the model
    gam = LogisticGAM(s(0, n_splines=5, lam=0)).fit(X_train_, y_train_)
    # Predict on the validation set and calculate the log loss
    y_val_hat_gam = gam.predict_proba(X_val)
    print(np.sum(np.isnan(y_val_hat_gam)))

    # Create and train the model
    gam = LogisticGAM(s(0, n_splines=5, lam=1)).fit(X_train, y_train)
    # Predict on the validation set and calculate the log loss
    y_test_hat_gam = gam.predict_proba(X_test)
    print(np.sum(np.isnan(y_test_hat_gam)))

Task 361055


  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)
  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  self.link.gradient(mu, self.distribution) ** 2
  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)
  out[mask] = y[mask] * np.log(y[mask] / u[mask])


3
13
Task 361060
0
0
Task 361061




0
0
Task 361062




0
0
Task 361063




0
0
Task 361065


  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2


0
0
Task 361066




0
0
Task 361068


  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  self.link.gradient(mu, self.distribution) ** 2
  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)
  return dist.levels / (mu * (dist.levels - mu))
  out[mask] = y[mask] * np.log(y[mask] / u[mask])
  out[mask] = y[mask] * np.log(y[mask] / u[mask])


did not converge
0


  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)


173
Task 361069
0
0
Task 361070




0
0
Task 361273




0
0
Task 361274




0
0
Task 361275




0
0
Task 361276




LinAlgError: Singular matrix

In [17]:
corr_matrix = X.corr().abs()
corr_matrix

Unnamed: 0,D1,D2,D3,D5,D6,D7,D8,D9,D10,D11,...,D929,D933,D935,D937,D938,D946,D947,D948,D950,D951
D1,1.000000,0.255137,0.311990,0.089471,0.362843,0.425464,0.103463,0.190638,0.329269,0.134115,...,0.416037,0.121998,0.270514,0.297385,0.383392,0.391688,0.430166,0.067279,0.612300,0.479129
D2,0.255137,1.000000,0.434960,0.207307,0.148738,0.016596,0.169440,0.357585,0.226661,0.020539,...,0.022226,0.105852,0.113732,0.121147,0.164105,0.050552,0.026892,0.049136,0.191521,0.330083
D3,0.311990,0.434960,1.000000,0.135801,0.045007,0.199014,0.088895,0.142389,0.006725,0.027162,...,0.276039,0.105379,0.150724,0.176491,0.285083,0.310214,0.211288,0.056003,0.227322,0.107875
D5,0.089471,0.207307,0.135801,1.000000,0.034942,0.098635,0.450795,0.198092,0.054048,0.466983,...,0.097102,0.027681,0.031046,0.036977,0.075792,0.077651,0.167525,0.050904,0.295367,0.213655
D6,0.362843,0.148738,0.045007,0.034942,1.000000,0.785271,0.408143,0.859403,0.888090,0.248665,...,0.110053,0.073124,0.154782,0.101266,0.091264,0.048149,0.356526,0.241758,0.307950,0.583836
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
D946,0.391688,0.050552,0.310214,0.077651,0.048149,0.054211,0.006797,0.029747,0.043040,0.032129,...,0.783898,0.210663,0.290536,0.316193,0.487899,1.000000,0.272062,0.007217,0.282823,0.266464
D947,0.430166,0.026892,0.211288,0.167525,0.356526,0.428132,0.265090,0.311173,0.458063,0.708899,...,0.414354,0.129694,0.269644,0.255508,0.242798,0.272062,1.000000,0.036361,0.175305,0.708477
D948,0.067279,0.049136,0.056003,0.050904,0.241758,0.160702,0.050996,0.229993,0.188304,0.072346,...,0.012483,0.021634,0.048616,0.003819,0.005024,0.007217,0.036361,1.000000,0.165090,0.090653
D950,0.612300,0.191521,0.227322,0.295367,0.307950,0.248351,0.088807,0.294395,0.232855,0.163787,...,0.343515,0.181610,0.250923,0.226153,0.314023,0.282823,0.175305,0.165090,1.000000,0.432931


In [13]:
# GAM model
def gam_model(trial):

    # Define the hyperparameters to optimize
    params = {'n_splines': trial.suggest_int('n_splines', 5, 20),
            'lam': trial.suggest_float('lam', 1e-3, 1, log=True)}
    
    # Create and train the model
    gam = LogisticGAM(s(0, n_splines=params['n_splines'], lam=params['lam'])).fit(X_train_, y_train_)

    # Predict on the validation set and calculate the log loss
    y_val_hat_gam = gam.predict_proba(X_val)
    y_val_hat_gam_df = pd.DataFrame(y_val_hat_gam)
    y_val_hat_gam_df.fillna(0.5, inplace=True)
    y_val_hat_gam = y_val_hat_gam_df.values
    log_loss_gam = log_loss(y_val, y_val_hat_gam)

    return log_loss_gam

# Create the sampler and study
sampler_gam = optuna.samplers.TPESampler(seed=seed)
study_gam = optuna.create_study(sampler=sampler_gam, direction='minimize')  # We want to minimize log loss

# Optimize the model
study_gam.optimize(gam_model, n_trials=N_TRIALS)

# Create the final model with the best parameters
best_params = study_gam.best_params
final_gam_model = LogisticGAM(s(0, n_splines=best_params['n_splines'], lam=best_params['lam']))

# Fit the model
final_gam_model.fit(X_train, y_train)

# Predict on the test set
y_test_hat_gam = final_gam_model.predict_proba(X_test)
y_test_hat_gam_df = pd.DataFrame(y_test_hat_gam)
y_test_hat_gam_df.fillna(0.5, inplace=True)
y_test_hat_gam = y_test_hat_gam_df.values
# Calculate the log loss
log_loss_gam = log_loss(y_test, y_test_hat_gam)
print("Log Loss GAM: ", log_loss_gam)


[I 2024-03-15 00:01:41,274] A new study created in memory with name: no-name-c3686d59-39e4-4d00-a71a-791301a0c72c
  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)
[I 2024-03-15 00:01:41,528] Trial 0 finished with value: 0.57351659322509 and parameters: {'n_splines': 17, 'lam': 0.001154132971137168}. Best is trial 0 with value: 0.57351659322509.
  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
[I 2024-03-15 00:01:41,652] Trial 1 finished with value: 0.6035648660853605 and parameters: {'n_splines': 15, 'lam': 0.17636469336159113}. Best is trial 0 with value: 0.57351659322509.
  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
[I 2024-03-15 00:01:41,757] Trial 2 finished with value: 0.588202876804868 and parameters: {'n_splines': 12, 'lam': 0.00472487079152679}. Best is trial 0 with value:

Log Loss GAM:  0.5633269210014135


  out[mask] = y[mask] * np.log(y[mask] / u[mask])


In [3]:
# Create and train the model
gam = LogisticGAM(s(0, n_splines=5, lam=0)).fit(X_train_, y_train_)
# Predict on the validation set and calculate the log loss
y_val_hat_gam = gam.predict_proba(X_val)
np.sum(np.isnan(y_val_hat_gam))

  elp = np.exp(lp)
  return dist.levels * elp / (elp + 1)
  return dist.levels / (mu * (dist.levels - mu))
  self.link.gradient(mu, self.distribution) ** 2
  self.link.gradient(mu, self.distribution) ** 2


In [10]:
y_val_hat_gam
y_val_hat_gam_df = pd.DataFrame(y_val_hat_gam)
y_val_hat_gam_df.fillna(0.5, inplace=True)
y_val_hat_gam = y_val_hat_gam_df.values
y_val_hat_gam

array([[0.25079435],
       [0.68667415],
       [0.23950299],
       ...,
       [0.77106915],
       [0.56891915],
       [0.60820294]])

In [11]:
log_loss_gam = log_loss(y_val, y_val_hat_gam)

In [12]:
log_loss_gam

0.5709890342165368

In [8]:
np.sum(np.isnan(y_val_hat_gam))

3

In [14]:
np.sum(np.isnan(X_train_))

  return reduction(axis=axis, out=out, **passkwargs)


RevolvingUtilizationOfUnsecuredLines    0
age                                     0
NumberOfTime30-59DaysPastDueNotWorse    0
DebtRatio                               0
MonthlyIncome                           0
NumberOfOpenCreditLinesAndLoans         0
NumberRealEstateLoansOrLines            0
NumberOfDependents                      0
dtype: int64