In [1]:
import warnings

# Silence every warning category for the rest of the kernel session.
# NOTE(review): a blanket filter also hides library deprecation and
# data-conversion warnings; consider narrowing it to specific categories.
warnings.simplefilter('ignore')

In [2]:
import sys
# Install Optuna with the pip that belongs to the interpreter running this
# kernel (sys.executable), so the package lands in the kernel's environment.
# NOTE(review): the version is not pinned, so a fresh run may pull a different
# Optuna release than the one these results were produced with.
!{sys.executable} -m pip install optuna



In [3]:
import sys
# Upgrade scikit-learn in the kernel's own environment (pip is invoked through
# sys.executable). The saved output below reports version 1.2.2 already present.
# NOTE(review): --upgrade without a pin makes the installed version depend on
# when the notebook is run.
!{sys.executable} -m pip install scikit-learn --upgrade

Requirement already up-to-date: scikit-learn in /home/idies/miniconda3/envs/py38/lib/python3.8/site-packages (1.2.2)


In [4]:
import sys
# Install the Intel extension for scikit-learn (provides the `sklearnex`
# module imported in the next cell) into the kernel's environment.
# NOTE(review): version is not pinned.
!{sys.executable} -m pip install scikit-learn-intelex



In [5]:
from sklearnex import patch_sklearn
# Switch scikit-learn to the Intel-accelerated implementations. This runs
# before any scikit-learn estimator is imported further down in the notebook.
# NOTE(review): global_patch=True presumably applies the patch beyond this
# kernel session -- confirm that this is intended on a shared environment.
patch_sklearn(global_patch=True)

Scikit-learn was successfully globally patched by Intel(R) Extension for Scikit-learn


Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


In [6]:
import pandas as pd
import numpy as np

In [7]:
# Download the dataset straight from the GitLab repository, then order the
# rows by their index labels.
df = pd.read_csv(
    'https://gitlab.com/mirsakhawathossain/exodata/-/raw/main/dataset/exodata.csv'
)
df = df.sort_index(axis=0)

In [8]:
# Drop columns whose mean is exactly 1 or exactly 0.
# NOTE(review): this equals "constant column" only if every feature is a 0/1
# indicator -- confirm against the dataset description.
# The column means are computed once and the mask is applied to the means' own
# index, so the selection stays aligned with the columns that were averaged.
column_means = df.mean()
columns_1 = column_means.index[column_means == 1]
columns_2 = column_means.index[column_means == 0]
df = df.drop(columns=columns_1.union(columns_2))

In [9]:
# Feature matrix: every remaining column except the 'exoplanet' label.
X = df.drop(columns=['exoplanet'])

In [10]:
# Show (n_rows, n_feature_columns) of the feature matrix.
X.shape

(75458, 741)

In [11]:
# Target: the 'exoplanet' column. The double brackets keep it as a one-column
# DataFrame rather than a Series.
y = df[['exoplanet']]

In [12]:
# Class balance of the target (number of rows per label value).
y.value_counts()

exoplanet
1            40084
0            35374
dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 40% of the rows for testing. The split is stratified on the label
# so both parts keep the same class proportions, and seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=43,
    stratify=y,
)

In [15]:
# Print the train and test feature-matrix shapes, one per line.
print(X_train.shape, X_test.shape, sep='\n')

(45274, 741)
(30184, 741)


In [16]:
from sklearn.preprocessing import QuantileTransformer
# QuantileTransformer draws random numbers when it subsamples rows to estimate
# the quantiles. Seed it, like the split and the forest, so the transformed
# features are the same on every run.
qt = QuantileTransformer(random_state=43)

In [17]:
# Fit the quantile mapping on the training features only, then apply that same
# fitted mapping to the test features (no test-set statistics are learned).
# NOTE(review): both names are rebound to the transformed data, so running
# this cell a second time would re-fit on already-transformed values.
X_train = qt.fit_transform(X_train)
X_test = qt.transform(X_test)

In [18]:
import optuna
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.ensemble import RandomForestClassifier


In [19]:
def objective(trial):
    """Optuna objective: cross-validated error rate of a random forest.

    A forest configuration is sampled from ``trial`` and scored on the
    module-level ``X_train`` / ``y_train`` with 5-fold cross-validation
    repeated 10 times (50 fits). The running error is reported to the trial
    after every fold so that a pruner attached to the study can stop an
    unpromising configuration early.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample hyperparameters and to report intermediate
        values.

    Returns
    -------
    float
        ``1 - mean accuracy`` over all folds (lower is better).

    Raises
    ------
    optuna.TrialPruned
        If the pruner asks to stop the trial after an intermediate report.
    """
    # Imported here so the function only needs this one extra name.
    from sklearn.base import clone

    max_depth = trial.suggest_int('max_depth', 5, 10)
    min_samples_split = trial.suggest_int('min_samples_split', 6, 10)
    min_samples_leaf = trial.suggest_int('min_samples_leaf', 5, 10)
    max_features = trial.suggest_categorical('max_features', ['sqrt', 'log2', None])
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    n_estimators = trial.suggest_int('n_estimators', 150, 200, step=10)

    # NOTE(review): the forest already uses every core (n_jobs=-1). If the
    # study also runs trials in parallel, the two levels compete for the
    # same CPUs.
    clf = RandomForestClassifier(
        max_depth=max_depth,
        min_samples_split=min_samples_split,
        min_samples_leaf=min_samples_leaf,
        max_features=max_features,
        criterion=criterion,
        n_estimators=n_estimators,
        random_state=43,
        n_jobs=-1,
    )

    # NOTE(review): the name suggests a stratified splitter was intended;
    # RepeatedKFold does not stratify on the label. Kept as-is.
    rskf = RepeatedKFold(n_splits=5, n_repeats=10, random_state=41)

    # Positional indexing below needs arrays, and classifiers expect a 1-D
    # target rather than a single-column frame.
    features = np.asarray(X_train)
    labels = np.ravel(y_train)

    fold_accuracies = []
    for step, (fit_idx, val_idx) in enumerate(rskf.split(features)):
        # Fresh unfitted copy per fold, as cross_val_score would do.
        model = clone(clf)
        model.fit(features[fit_idx], labels[fit_idx])
        fold_accuracies.append(model.score(features[val_idx], labels[val_idx]))

        # Report the running error so the pruner has something to act on.
        trial.report(1 - float(np.mean(fold_accuracies)), step)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return 1 - float(np.mean(fold_accuracies))

In [20]:
# Create the in-memory study that drives the search. The objective returns an
# error rate, hence direction='minimize'. A Hyperband pruner is attached; a
# pruner can only act on trials that report intermediate values.
study = optuna.create_study(
    direction='minimize',
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=1,
        max_resource='auto',
        reduction_factor=3,
    ),
)

[32m[I 2023-03-25 05:54:28,175][0m A new study created in memory with name: no-name-e595c699-c079-42b6-bc64-2e7d52d22304[0m


In [None]:
# Run the hyperparameter search.
# No trial limit is set; the search is bounded by the timeout (in seconds).
# NOTE(review): n_jobs=-1 here runs trials in parallel while each forest also
# uses n_jobs=-1, so the two levels of parallelism compete for cores. A
# 60-second budget also looks small for 50 forest fits per trial -- confirm.
n_trials, n_jobs, timeout = None, -1, 60
study.optimize(
    objective,
    n_trials=n_trials,
    n_jobs=n_jobs,
    timeout=timeout,
    show_progress_bar=True,
    gc_after_trial=True,
)

In [22]:
# Show the parameter set of the best trial found so far.
# NOTE(review): the saved output below lists no 'n_estimators' although the
# objective samples it -- presumably left over from an earlier run; re-run to
# refresh.
print('Best hyperparameters:', study.best_params, sep='\n')

Best hyperparameters:
{'max_depth': 10, 'min_samples_split': 8, 'min_samples_leaf': 10, 'max_features': None, 'criterion': 'gini'}


In [None]:
# The study minimises (1 - accuracy), so convert the best objective value back
# into an accuracy before printing it.
best_score = 1 - study.best_value
print(f'Best score: {best_score}')