In [1]:
import numpy as np
import time
import tracemalloc
import matplotlib.pyplot as plt
from scipy.stats import randint as sp_randint
from scipy.stats import uniform, loguniform

from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
#MLP
from sklearn.neural_network import MLPClassifier
#SVC
from sklearn.svm import SVC
#Random Forest
from sklearn.ensemble import RandomForestClassifier
#XGBoost
import xgboost
from xgboost import XGBClassifier
#GBC
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
dname="./DATA/"
str0="_XGB_24.dat"
fnamex=dname+'x'+str0
fnamey=dname+'y'+str0
x = np.loadtxt(fnamex, delimiter=" ",dtype=float)
y = np.loadtxt(fnamey)
y = y.astype(int)
N,L = len(x), len(x[0])

N_train = int(0.75*N)
x_train,y_train = x[:N_train],y[:N_train]
x_test,y_test = x[N_train:],y[N_train:]
print(f"N={N}, N_train={N_train}, L={L}")

N=4000, N_train=3000, L=4


In [3]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [4]:
n_iter_search = 20

## MLPClassifier

In [5]:
param_dist_mlp = {
    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 100), (100, 50, 10)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': uniform(0.0001, 0.1),
    'learning_rate_init': uniform(0.001, 0.1)
}

random_search_mlp = RandomizedSearchCV(MLPClassifier(), param_distributions=param_dist_mlp, n_iter=n_iter_search, cv=3, random_state=42)
random_search_mlp.fit(x_train_scaled, y_train)



In [6]:
best_params_random = random_search_mlp.best_params_

print("Best parameters from RandomizedSearchCV:", best_params_random)

param_grid_mlp = {
    'hidden_layer_sizes': [best_params_random['hidden_layer_sizes'], (best_params_random['hidden_layer_sizes'][0] + 10, )],
    'activation': [best_params_random['activation']],
    'solver': [best_params_random['solver']],
    'alpha': [best_params_random['alpha'] * 0.9, best_params_random['alpha'], best_params_random['alpha'] * 1.1],
    'learning_rate_init': [best_params_random['learning_rate_init'] * 0.9, best_params_random['learning_rate_init'], best_params_random['learning_rate_init'] * 1.1]
}

Best parameters from RandomizedSearchCV: {'activation': 'tanh', 'alpha': 0.0797542986860233, 'hidden_layer_sizes': (50, 50), 'learning_rate_init': 0.07896910002727693, 'solver': 'sgd'}


In [7]:
grid_search_mlp = GridSearchCV(MLPClassifier(), param_grid=param_grid_mlp, cv=3)
grid_search_mlp.fit(x_train_scaled, y_train)

best_params_grid = grid_search_mlp.best_params_

print("Best parameters from GridSearchCV:", best_params_grid)



Best parameters from GridSearchCV: {'activation': 'tanh', 'alpha': 0.07177886881742097, 'hidden_layer_sizes': (50, 50), 'learning_rate_init': 0.08686601003000463, 'solver': 'sgd'}




## SVC

In [8]:
param_dist_svc = {
    'C': loguniform(1e-3, 1e3),
    'gamma': loguniform(1e-4, 1e-1),
    'kernel': ['rbf', 'linear']
}

random_search_svc = RandomizedSearchCV(SVC(), param_distributions=param_dist_svc, n_iter=n_iter_search, cv=3, random_state=42, verbose=1)
random_search_svc.fit(x_train_scaled, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [9]:
best_params_random_svc = random_search_svc.best_params_
print("Best parameters from RandomizedSearchCV for SVC:", best_params_random_svc)

param_grid_svc = {
    'C': [best_params_random_svc['C'] / 2, best_params_random_svc['C'], best_params_random_svc['C'] * 2],
    'gamma': [best_params_random_svc['gamma'] / 2, best_params_random_svc['gamma'], best_params_random_svc['gamma'] * 2],
    'kernel': [best_params_random_svc['kernel']]  # Assuming kernel chosen is optimal
}

Best parameters from RandomizedSearchCV for SVC: {'C': 793.2047656808546, 'gamma': 0.0025135566617708314, 'kernel': 'rbf'}


In [10]:
grid_search_svc = GridSearchCV(SVC(), param_grid=param_grid_svc, cv=3, verbose=1)
grid_search_svc.fit(x_train_scaled, y_train)

best_params_grid_svc = grid_search_svc.best_params_
print("Best parameters from GridSearchCV for SVC:", best_params_grid_svc)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
Best parameters from GridSearchCV for SVC: {'C': 396.6023828404273, 'gamma': 0.0025135566617708314, 'kernel': 'rbf'}


## RANDOM FOREST

In [11]:
param_dist_rf = {
    'n_estimators': sp_randint(10, 200),
    'max_depth': sp_randint(1, 50),
    'min_samples_split': sp_randint(2, 11),
    'min_samples_leaf': sp_randint(1, 11)
}

random_search_rf = RandomizedSearchCV(RandomForestClassifier(), param_distributions=param_dist_rf, n_iter=n_iter_search, cv=3, random_state=42, verbose=1)
random_search_rf.fit(x_train_scaled, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [12]:
best_params_random_rf = random_search_rf.best_params_
print("Best parameters from RandomizedSearchCV for RandomForest:", best_params_random_rf)

# Define a more focused grid based on insights from RandomizedSearchCV
param_grid_rf = {
    'n_estimators': [best_params_random_rf['n_estimators'] - 50, best_params_random_rf['n_estimators'], best_params_random_rf['n_estimators'] + 50],
    'max_depth': [best_params_random_rf['max_depth'] - 10, best_params_random_rf['max_depth'], best_params_random_rf['max_depth'] + 10],
    'min_samples_split': [best_params_random_rf['min_samples_split'] - 2, best_params_random_rf['min_samples_split'], best_params_random_rf['min_samples_split'] + 2],
    'min_samples_leaf': [best_params_random_rf['min_samples_leaf'] - 1, best_params_random_rf['min_samples_leaf'], best_params_random_rf['min_samples_leaf'] + 1]
}

Best parameters from RandomizedSearchCV for RandomForest: {'max_depth': 21, 'min_samples_leaf': 1, 'min_samples_split': 7, 'n_estimators': 98}


In [13]:
grid_search_rf = GridSearchCV(RandomForestClassifier(), param_grid=param_grid_rf, cv=3, verbose=1)
grid_search_rf.fit(x_train_scaled, y_train)

best_params_grid_rf = grid_search_rf.best_params_
print("Best parameters from GridSearchCV for RandomForest:", best_params_grid_rf)

Fitting 3 folds for each of 81 candidates, totalling 243 fits


81 fits failed out of a total of 243.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
81 fits failed with the following error:
Traceback (most recent call last):
  File "/home/alcadis/anaconda3/lib/python3.11/site-packages/sklearn/model_selection/_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/home/alcadis/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 1144, in wrapper
    estimator._validate_params()
  File "/home/alcadis/anaconda3/lib/python3.11/site-packages/sklearn/base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "/home/alcadis/anaconda3/lib/python3.11/site-packages/sklearn/utils/_param_validation.py", line 95, in validate_paramet

Best parameters from GridSearchCV for RandomForest: {'max_depth': 21, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 148}


## XGBOOST

In [14]:
param_dist_xgb = {
    'n_estimators': sp_randint(50, 400),
    'max_depth': sp_randint(3, 10),
    'learning_rate': uniform(0.01, 0.3),
    'colsample_bytree': uniform(0.5, 0.5),
    'subsample': uniform(0.5, 0.5),
    'gamma': uniform(0, 5),
    'reg_lambda': uniform(0.5, 1.5)
}

random_search_xgb = RandomizedSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_distributions=param_dist_xgb, n_iter=n_iter_search, cv=3, random_state=42, verbose=1)
random_search_xgb.fit(x_train_scaled, y_train)



Fitting 3 folds for each of 20 candidates, totalling 60 fits




In [15]:
best_params_random_xgb = random_search_xgb.best_params_
print("Best parameters from RandomizedSearchCV for XGBoost:", best_params_random_xgb)

param_grid_xgb = {
    'n_estimators': [best_params_random_xgb['n_estimators'] - 50, best_params_random_xgb['n_estimators'], best_params_random_xgb['n_estimators'] + 50],
    'max_depth': [max(best_params_random_xgb['max_depth'] - 1, 3), best_params_random_xgb['max_depth'], min(best_params_random_xgb['max_depth'] + 1, 10)],
    'learning_rate': [best_params_random_xgb['learning_rate']],
    'colsample_bytree': [max(best_params_random_xgb['colsample_bytree'] - 0.1, 0.3), best_params_random_xgb['colsample_bytree'], min(best_params_random_xgb['colsample_bytree'] + 0.1, 1.0)],
    'subsample': [max(best_params_random_xgb['subsample'] - 0.1, 0.5), best_params_random_xgb['subsample'], min(best_params_random_xgb['subsample'] + 0.1, 1.0)],
    'gamma': [max(best_params_random_xgb['gamma'] - 0.5, 0), best_params_random_xgb['gamma'], min(best_params_random_xgb['gamma'] + 0.5, 5)],  # Increased steps
    'reg_lambda': [max(best_params_random_xgb['reg_lambda'] - 0.25, 0), best_params_random_xgb['reg_lambda'], min(best_params_random_xgb['reg_lambda'] + 0.25, 2)]
}

Best parameters from RandomizedSearchCV for XGBoost: {'colsample_bytree': 0.8854835899772805, 'gamma': 2.4689779818219537, 'learning_rate': 0.16681984881459822, 'max_depth': 9, 'n_estimators': 395, 'reg_lambda': 0.5381286901161428, 'subsample': 0.5539457134966522}


In [16]:
grid_search_xgb = GridSearchCV(XGBClassifier(use_label_encoder=False, eval_metric='logloss'), param_grid=param_grid_xgb, cv=3, verbose=1)
grid_search_xgb.fit(x_train_scaled, y_train)

best_params_grid_xgb = grid_search_xgb.best_params_
print("Best parameters from GridSearchCV for XGBoost:", best_params_grid_xgb)



Fitting 3 folds for each of 729 candidates, totalling 2187 fits




Best parameters from GridSearchCV for XGBoost: {'colsample_bytree': 0.7854835899772805, 'gamma': 2.4689779818219537, 'learning_rate': 0.16681984881459822, 'max_depth': 9, 'n_estimators': 345, 'reg_lambda': 0.7881286901161428, 'subsample': 0.6539457134966522}


## GBC

In [17]:
param_dist_gbc = {
    'n_estimators': sp_randint(100, 500),
    'learning_rate': uniform(0.01, 0.2),
    'max_depth': sp_randint(3, 10),
    'min_samples_split': sp_randint(2, 20)
}

random_search_gbc = RandomizedSearchCV(GradientBoostingClassifier(), param_distributions=param_dist_gbc, n_iter=n_iter_search, cv=3, random_state=42, verbose=1)
random_search_gbc.fit(x_train_scaled, y_train)

Fitting 3 folds for each of 20 candidates, totalling 60 fits


In [18]:
best_params_random_gbc = random_search_gbc.best_params_
print("Best parameters from RandomizedSearchCV for GradientBoostingClassifier:", best_params_random_gbc)

param_grid_gbc = {
    'n_estimators': [best_params_random_gbc['n_estimators'] - 100, best_params_random_gbc['n_estimators'], best_params_random_gbc['n_estimators'] + 100],
    'learning_rate': [max(best_params_random_gbc['learning_rate'] - 0.05, 0.01), best_params_random_gbc['learning_rate'], min(best_params_random_gbc['learning_rate'] + 0.05, 0.3)],
    'max_depth': [best_params_random_gbc['max_depth'] - 2, best_params_random_gbc['max_depth'], best_params_random_gbc['max_depth'] + 2],
    'min_samples_split': [max(best_params_random_gbc['min_samples_split'] - 5, 2), best_params_random_gbc['min_samples_split'], min(best_params_random_gbc['min_samples_split'] + 5, 20)]
}

Best parameters from RandomizedSearchCV for GradientBoostingClassifier: {'learning_rate': 0.1554543991712842, 'max_depth': 9, 'min_samples_split': 9, 'n_estimators': 314}


In [19]:
grid_search_gbc = GridSearchCV(GradientBoostingClassifier(), param_grid=param_grid_gbc, cv=3, verbose=1)
grid_search_gbc.fit(x_train_scaled, y_train)

best_params_grid_gbc = grid_search_gbc.best_params_
print("Best parameters from GridSearchCV for GradientBoostingClassifier:", best_params_grid_gbc)

Fitting 3 folds for each of 81 candidates, totalling 243 fits
Best parameters from GridSearchCV for GradientBoostingClassifier: {'learning_rate': 0.20545439917128422, 'max_depth': 11, 'min_samples_split': 14, 'n_estimators': 414}
