In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [None]:
from itertools import product
from Bio import SeqIO
import pandas as pd

# Define a function to compute TPC
def compute_tpc(sequence):
    tripeptides = [''.join(p) for p in product("ACDEFGHIKLMNPQRSTVWY", repeat=3)]
    tpc_counts = {tripeptide: 0 for tripeptide in tripeptides}

    for i in range(len(sequence) - 2):
        tripeptide = sequence[i:i+3]
        if tripeptide in tpc_counts:
            tpc_counts[tripeptide] += 1

    total_tripeptides = sum(tpc_counts.values())
    if total_tripeptides > 0:
        tpc_counts = {key: val / total_tripeptides for key, val in tpc_counts.items()}

    return tpc_counts

# Process FASTA files
def process_fasta(input_path, output_path):
    data = []
    for record in SeqIO.parse(input_path, "fasta"):
        tpc_features = compute_tpc(str(record.seq))
        data.append(tpc_features)  # Only append TPC features, no IDs

    df = pd.DataFrame(data)
    df.to_csv(output_path, index=False)

# Paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/POSITIVE_main (2) (1).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/NEGATIVE_main (2) (1).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/POSITIVE_validation (2) (1).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/NEGATIVE_validation (2) (1).fasta"

# Output paths
output_main_p = "/content/positive_main_tpc.csv"
output_main_n = "/content/negative_main_tpc.csv"
output_validation_p = "/content/positive_validation_tpc.csv"
output_validation_n = "/content/negative_validation_tpc.csv"

# Process and save
process_fasta(main_p, output_main_p)
process_fasta(main_n, output_main_n)
process_fasta(validation_p, output_validation_p)
process_fasta(validation_n, output_validation_n)


# **All Model **

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/positive_main_tpc.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/negative_main_tpc (1).csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/positive_validation_tpc.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/10_TPC (Tripeptide Composition)/negative_validation_tpc.csv")

In [None]:
# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values


In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "MLP": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the neural network models
models["Neural Network"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
models["MLP"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Store accuracies
results = []

# Train each model and evaluate
for name, model in models.items():
    print(f"\nTraining {name}...")

    if name in ["Neural Network", "MLP"]:
        # Neural Network training
        model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=0)
        train_pred = (model.predict(X_train) > 0.5).astype("int32")
        val_pred = (model.predict(X_val) > 0.5).astype("int32")
    else:
        # Traditional ML model training
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

    # Calculate train and validation accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)

    results.append({"Model": name, "Train Accuracy": train_accuracy, "Validation Accuracy": val_accuracy})


Training SVM...

Training Decision Tree...

Training Random Forest...

Training Logistic Regression...

Training k-NN...

Training Naive Bayes...

Training Gradient Boosting...

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.




Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001556 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Training CatBoost...

Training AdaBoost...





Training Neural Network...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Training MLP...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step 


In [None]:
# Convert to DataFrame and sort by Validation Accuracy and Train Accuracy in descending order
results_df = pd.DataFrame(results).sort_values(by=["Validation Accuracy", "Train Accuracy"], ascending=False).reset_index(drop=True)

# Display results
print("\nModel Accuracy Table (Descending Order of Validation Accuracy)")
print(results_df)


Model Accuracy Table (Descending Order of Validation Accuracy)
                  Model  Train Accuracy  Validation Accuracy
0         Random Forest        0.995704             0.926667
1         Decision Tree        0.995704             0.913333
2        Neural Network        0.995704             0.906667
3                   MLP        0.995704             0.903333
4              CatBoost        0.970790             0.900000
5               XGBoost        0.963918             0.896667
6              AdaBoost        0.939863             0.890000
7     Gradient Boosting        0.939863             0.876667
8           Naive Bayes        0.959622             0.866667
9                  k-NN        0.928694             0.856667
10             LightGBM        0.887457             0.840000
11  Logistic Regression        0.849656             0.823333
12                  SVM        0.809278             0.773333


# **CROSS VALIDATION**

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
}


In [None]:
# Define Neural Network models
def create_neural_network(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_mlp(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model



In [None]:
# Cross-validation for traditional models
results = []

# For traditional ML models, we use cross_val_score
for name, model in models.items():
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Calculate cross-validation accuracy
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    std_accuracy = np.std(cv_scores)

    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})

# Cross-validation for Neural Networks (manual implementation)
for name, create_model in [("Neural Network", create_neural_network), ("MLP", create_mlp)]:
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Custom function to calculate accuracy for neural networks
    def neural_network_cross_val(model_func, X_train, y_train):
        accuracies = []
        for train_index, val_index in cv.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            model = model_func(X_train.shape[1])  # Create a new model for each fold
            model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, verbose=0)

            y_pred = (model.predict(X_val_fold) > 0.5).astype("int32")
            accuracy = accuracy_score(y_val_fold, y_pred)
            accuracies.append(accuracy)

        return np.mean(accuracies), np.std(accuracies)

    mean_accuracy, std_accuracy = neural_network_cross_val(create_model, X_train, y_train)
    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})



Performing Cross-validation for SVM...

Performing Cross-validation for Decision Tree...

Performing Cross-validation for Random Forest...

Performing Cross-validation for Logistic Regression...

Performing Cross-validation for k-NN...

Performing Cross-validation for Naive Bayes...

Performing Cross-validation for Gradient Boosting...

Performing Cross-validation for XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Performing Cross-validation for LightGBM...
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001110 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 631
[LightGBM] [Info] Number of data points in the train set: 931, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499463 -> initscore=-0.002148
[LightGBM] [Info] Start training from score -0.002148
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000955 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 610
[LightGBM] [Info] Number of data points in the train set: 931, number of used 




Performing Cross-validation for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step

Performing Cross-validation for MLP...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


In [None]:
# Convert to DataFrame and display
cv_results_df = pd.DataFrame(results).sort_values(by="Mean CV Accuracy", ascending=False).reset_index(drop=True)
print("\nCross-Validation Accuracy Table")
print(cv_results_df)


Cross-Validation Accuracy Table
                  Model  Mean CV Accuracy  STD CV Accuracy
0                   MLP          0.932992         0.004345
1        Neural Network          0.925259         0.008398
2         Random Forest          0.919239         0.011677
3              CatBoost          0.898613         0.018209
4           Naive Bayes          0.892648         0.025944
5               XGBoost          0.888320         0.013813
6         Decision Tree          0.869406         0.018605
7     Gradient Boosting          0.868555         0.017520
8              AdaBoost          0.866845         0.015296
9                  k-NN          0.856504         0.022010
10  Logistic Regression          0.804976         0.016045
11             LightGBM          0.784372         0.013287
12                  SVM          0.766320         0.012068


# **Hyperparameter optimization with Optuna**

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m13.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/positive_main_tpc.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/negative_main_tpc (1).csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/positive_validation_tpc.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/negative_validation_tpc.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]



# Define models with MLP included
models = {
    "SVM": lambda trial: SVC(
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}


results = []

def optimize_model(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        return accuracy_score(y_val, preds)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the results
    results.append({
        "Model": model_name,
        "Accuracy": study.best_value,
        "Best Params": study.best_params
    })

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model(model_name, model_func)


# Convert results to a DataFrame
results_df = pd.DataFrame(results)


# Display the DataFrame
print(results_df)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-09 12:44:45,792] A new study created in memory with name: no-name-ae6c1430-21bc-496f-a97e-12f78d739c7b


Optimizing SVM...


[I 2025-01-09 12:44:56,357] Trial 0 finished with value: 0.9033333333333333 and parameters: {'C': 5.157137305824213, 'kernel': 'rbf'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 12:45:01,139] Trial 1 finished with value: 0.8766666666666667 and parameters: {'C': 5.440333510640703, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 12:45:08,342] Trial 2 finished with value: 0.8966666666666666 and parameters: {'C': 6.586790435439739, 'kernel': 'linear'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 12:45:17,761] Trial 3 finished with value: 0.9033333333333333 and parameters: {'C': 2.568006909352526, 'kernel': 'rbf'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 12:45:22,999] Trial 4 finished with value: 0.8866666666666667 and parameters: {'C': 6.06768284122189, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 12:45:27,593] Trial 5 finished with value: 0.8866666666666667

Optimizing Decision Tree...


[I 2025-01-09 12:49:17,848] Trial 0 finished with value: 0.7466666666666667 and parameters: {'max_depth': 5, 'min_samples_split': 2}. Best is trial 0 with value: 0.7466666666666667.
[I 2025-01-09 12:49:18,136] Trial 1 finished with value: 0.81 and parameters: {'max_depth': 11, 'min_samples_split': 2}. Best is trial 1 with value: 0.81.
[I 2025-01-09 12:49:18,472] Trial 2 finished with value: 0.8533333333333334 and parameters: {'max_depth': 18, 'min_samples_split': 2}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-09 12:49:18,680] Trial 3 finished with value: 0.7366666666666667 and parameters: {'max_depth': 5, 'min_samples_split': 3}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-09 12:49:18,925] Trial 4 finished with value: 0.7933333333333333 and parameters: {'max_depth': 9, 'min_samples_split': 4}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-09 12:49:19,209] Trial 5 finished with value: 0.7466666666666667 and parameters: {'max_depth': 5, 'min_sa

Optimizing Random Forest...


[I 2025-01-09 12:49:30,305] Trial 0 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 262, 'max_depth': 6, 'min_samples_split': 2}. Best is trial 0 with value: 0.8166666666666667.
[I 2025-01-09 12:49:31,396] Trial 1 finished with value: 0.86 and parameters: {'n_estimators': 178, 'max_depth': 14, 'min_samples_split': 8}. Best is trial 1 with value: 0.86.
[I 2025-01-09 12:49:32,033] Trial 2 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 88, 'max_depth': 15, 'min_samples_split': 6}. Best is trial 1 with value: 0.86.
[I 2025-01-09 12:49:34,292] Trial 3 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 399, 'max_depth': 11, 'min_samples_split': 3}. Best is trial 1 with value: 0.86.
[I 2025-01-09 12:49:36,556] Trial 4 finished with value: 0.83 and parameters: {'n_estimators': 367, 'max_depth': 6, 'min_samples_split': 10}. Best is trial 1 with value: 0.86.
[I 2025-01-09 12:49:38,416] Trial 5 finished with value: 0.8433333

Optimizing Logistic Regression...


[I 2025-01-09 12:50:13,372] Trial 0 finished with value: 0.8833333333333333 and parameters: {'C': 6.533464638260647, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 12:50:13,801] Trial 1 finished with value: 0.8866666666666667 and parameters: {'C': 9.913998897787232, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-09 12:50:14,035] Trial 2 finished with value: 0.8766666666666667 and parameters: {'C': 4.34884549798885, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-09 12:50:14,272] Trial 3 finished with value: 0.8666666666666667 and parameters: {'C': 2.6469067717233514, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-09 12:50:14,652] Trial 4 finished with value: 0.8233333333333334 and parameters: {'C': 0.8983802713044782, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-09 12:50:15,026] Trial 5 finished with value: 0.883

Optimizing k-NN...


[I 2025-01-09 12:50:30,932] Trial 0 finished with value: 0.88 and parameters: {'n_neighbors': 9}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:50:31,315] Trial 1 finished with value: 0.86 and parameters: {'n_neighbors': 4}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:50:31,710] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 6}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:50:32,095] Trial 3 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 15}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:50:32,473] Trial 4 finished with value: 0.8566666666666667 and parameters: {'n_neighbors': 5}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:50:32,874] Trial 5 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 12}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:50:33,247] Trial 6 finished with value: 0.86 and parameters: {'n_neighbors': 4}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:5

Optimizing Naive Bayes...


[I 2025-01-09 12:50:44,705] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 12:50:44,988] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 12:50:45,263] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 12:50:45,562] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 12:50:45,849] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 12:50:46,126] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 12:50:46,409] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2025-01-09 12:51:14,969] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 289, 'learning_rate': 0.3842903994135972, 'max_depth': 3}. Best is trial 0 with value: 0.88.
[I 2025-01-09 12:51:58,484] Trial 1 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 340, 'learning_rate': 0.15524974744822628, 'max_depth': 6}. Best is trial 1 with value: 0.8966666666666666.
[I 2025-01-09 12:52:12,837] Trial 2 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 157, 'learning_rate': 0.34192793462483645, 'max_depth': 4}. Best is trial 1 with value: 0.8966666666666666.
[I 2025-01-09 12:53:54,219] Trial 3 finished with value: 0.88 and parameters: {'n_estimators': 282, 'learning_rate': 0.4910033291185185, 'max_depth': 20}. Best is trial 1 with value: 0.8966666666666666.
[I 2025-01-09 12:54:37,522] Trial 4 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 300, 'learning_rate': 0.4154426626754472, 'max_depth': 7}. Best is

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 13:15:57,204] Trial 0 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 291, 'max_depth': 18, 'learning_rate': 0.29183604172169636}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 13:16:18,459] Trial 1 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 285, 'max_depth': 16, 'learning_rate': 0.31482003300437217}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 13:16:38,620] Trial 2 finished with value: 0.9133333333333333 and parameters: {'n_estimators': 255, 'max_depth': 11, 'learning_rate': 0.4637300826539857}. Best is trial 2 with value: 0.9133333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 13:17:11,927] Trial 3 finished with value: 0.9 and parameters: {'n_estimators': 474, 'max_depth': 17, 'learning_rate': 0.44432389

Optimizing LightGBM...


[I 2025-01-09 13:24:38,130] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 146, 'max_depth': 12, 'learning_rate': 0.1686701013042884}. Best is trial 0 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001102 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:38,530] Trial 1 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 272, 'max_depth': 3, 'learning_rate': 0.4946585797134844}. Best is trial 0 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001082 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:38,906] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 229, 'max_depth': 17, 'learning_rate': 0.25020132975121445}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:39,375] Trial 3 finished with value: 0.84 and parameters: {'n_estimators': 398, 'max_depth': 4, 'learning_rate': 0.3570192739092551}. Best is trial 2 with value: 0.8533333333333334.




[I 2025-01-09 13:24:39,639] Trial 4 finished with value: 0.85 and parameters: {'n_estimators': 56, 'max_depth': 15, 'learning_rate': 0.36388864383647396}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001414 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:40,204] Trial 5 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 487, 'max_depth': 6, 'learning_rate': 0.20058271979851242}. Best is trial 2 with value: 0.8533333333333334.




[I 2025-01-09 13:24:40,485] Trial 6 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 80, 'max_depth': 17, 'learning_rate': 0.49753764086881835}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001466 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:40,996] Trial 7 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 356, 'max_depth': 10, 'learning_rate': 0.10969451098996963}. Best is trial 2 with value: 0.8533333333333334.




[I 2025-01-09 13:24:41,380] Trial 8 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 56, 'max_depth': 19, 'learning_rate': 0.057635999900099356}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001498 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:41,696] Trial 9 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 111, 'max_depth': 18, 'learning_rate': 0.05436350247007646}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001720 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:42,091] Trial 10 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 216, 'max_depth': 14, 'learning_rate': 0.2900447682135694}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-09 13:24:42,458] Trial 11 finished with value: 0.85 and parameters: {'n_estimators': 211, 'max_depth': 20, 'learning_rate': 0.47475371382011133}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:42,818] Trial 12 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 167, 'max_depth': 16, 'learning_rate': 0.27993364550288485}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:43,270] Trial 13 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 305, 'max_depth': 9, 'learning_rate': 0.38666517383537535}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001703 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-09 13:24:43,864] Trial 14 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 254, 'max_depth': 17, 'learning_rate': 0.20182800818043134}. Best is trial 2 with value: 0.8533333333333334.





[I 2025-01-09 13:24:44,317] Trial 15 finished with value: 0.85 and parameters: {'n_estimators': 114, 'max_depth': 14, 'learning_rate': 0.42206653192856053}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001916 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001724 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:44,940] Trial 16 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 341, 'max_depth': 12, 'learning_rate': 0.2937013283069874}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:45,669] Trial 17 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 437, 'max_depth': 20, 'learning_rate': 0.24064005041474335}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001796 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:46,366] Trial 18 finished with value: 0.85 and parameters: {'n_estimators': 204, 'max_depth': 17, 'learning_rate': 0.1316246551357278}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-09 13:24:46,792] Trial 19 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 90, 'max_depth': 8, 'learning_rate': 0.3244341962738421}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001959 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001772 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:47,297] Trial 20 finished with value: 0.84 and parameters: {'n_estimators': 157, 'max_depth': 13, 'learning_rate': 0.44186359917830453}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:47,915] Trial 21 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 310, 'max_depth': 10, 'learning_rate': 0.39386131159302873}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001404 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:48,381] Trial 22 finished with value: 0.85 and parameters: {'n_estimators': 307, 'max_depth': 9, 'learning_rate': 0.4326855046350361}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001099 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:48,858] Trial 23 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 388, 'max_depth': 6, 'learning_rate': 0.3501233158357637}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001471 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:49,273] Trial 24 finished with value: 0.85 and parameters: {'n_estimators': 244, 'max_depth': 16, 'learning_rate': 0.46773901014189884}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:49,699] Trial 25 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 294, 'max_depth': 7, 'learning_rate': 0.3954447457394246}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:50,169] Trial 26 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 340, 'max_depth': 11, 'learning_rate': 0.22946129728570633}. Best is trial 2 with value: 0.8533333333333334.




[I 2025-01-09 13:24:50,528] Trial 27 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 188, 'max_depth': 18, 'learning_rate': 0.31453956648935394}. Best is trial 2 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 13:24:51,068] Trial 28 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 242, 'max_depth': 14, 'learning_rate': 0.3944527841990647}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-09 13:24:51,383] Trial 29 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 126, 'max_depth': 12, 'learning_rate': 0.1555692878208808}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-09 13:24:51,385] A new study created in memory with name: no-name-48034aee-64c2-44f5-93d4-c51e99721853


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Optimizing AdaBoost...


[I 2025-01-09 13:25:28,244] Trial 0 finished with value: 0.8 and parameters: {'n_estimators': 492, 'learning_rate': 0.5174800324246621}. Best is trial 0 with value: 0.8.
[I 2025-01-09 13:25:34,123] Trial 1 finished with value: 0.6333333333333333 and parameters: {'n_estimators': 66, 'learning_rate': 0.08686046409721387}. Best is trial 0 with value: 0.8.
[I 2025-01-09 13:25:45,120] Trial 2 finished with value: 0.72 and parameters: {'n_estimators': 145, 'learning_rate': 0.27846833308656127}. Best is trial 0 with value: 0.8.
[I 2025-01-09 13:26:03,293] Trial 3 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 219, 'learning_rate': 0.2534902161301588}. Best is trial 0 with value: 0.8.
[I 2025-01-09 13:26:07,938] Trial 4 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 57, 'learning_rate': 0.8825913341909899}. Best is trial 0 with value: 0.8.
[I 2025-01-09 13:26:19,398] Trial 5 finished with value: 0.72 and parameters: {'n_estimators': 148, 'lea

Optimizing Neural Network...


[I 2025-01-09 13:37:14,582] Trial 0 finished with value: 0.8966666666666666 and parameters: {'hidden_layer_1': 47, 'hidden_layer_2': 90, 'learning_rate_init': 0.010882835768780751}. Best is trial 0 with value: 0.8966666666666666.
[I 2025-01-09 13:37:29,317] Trial 1 finished with value: 0.9033333333333333 and parameters: {'hidden_layer_1': 11, 'hidden_layer_2': 36, 'learning_rate_init': 0.0597010622485078}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-09 13:37:50,023] Trial 2 finished with value: 0.9 and parameters: {'hidden_layer_1': 51, 'hidden_layer_2': 54, 'learning_rate_init': 0.07957555377787122}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-09 13:38:05,934] Trial 3 finished with value: 0.9066666666666666 and parameters: {'hidden_layer_1': 49, 'hidden_layer_2': 27, 'learning_rate_init': 0.04404997770702086}. Best is trial 3 with value: 0.9066666666666666.
[I 2025-01-09 13:38:30,090] Trial 4 finished with value: 0.9066666666666666 and parameters: {'hidde

Optimizing MLP...


[I 2025-01-09 13:49:22,019] Trial 0 finished with value: 0.9166666666666666 and parameters: {'layer_1': 51, 'layer_2': 58, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.09490444687482802}. Best is trial 0 with value: 0.9166666666666666.
[I 2025-01-09 13:49:29,030] Trial 1 finished with value: 0.5 and parameters: {'layer_1': 92, 'layer_2': 140, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.015123139097679233}. Best is trial 0 with value: 0.9166666666666666.
[I 2025-01-09 13:49:43,129] Trial 2 finished with value: 0.5 and parameters: {'layer_1': 142, 'layer_2': 50, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.06833250810577617}. Best is trial 0 with value: 0.9166666666666666.
[I 2025-01-09 13:49:52,621] Trial 3 finished with value: 0.5 and parameters: {'layer_1': 87, 'layer_2': 58, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.08128143091345429}. Best is trial 0 with value: 0.9166666666666666.
[I 2025-01-09 

                  Model  Accuracy  \
0                   SVM  0.906667   
1         Decision Tree  0.866667   
2         Random Forest  0.893333   
3   Logistic Regression  0.893333   
4                  k-NN  0.883333   
5           Naive Bayes  0.866667   
6     Gradient Boosting  0.900000   
7               XGBoost  0.913333   
8              LightGBM  0.853333   
9              AdaBoost  0.886667   
10       Neural Network  0.920000   
11                  MLP  0.920000   

                                          Best Params  
0          {'C': 1.4844504033388803, 'kernel': 'rbf'}  
1           {'max_depth': 20, 'min_samples_split': 7}  
2   {'n_estimators': 61, 'max_depth': 20, 'min_sam...  
3         {'C': 9.988533805424675, 'solver': 'lbfgs'}  
4                                  {'n_neighbors': 3}  
5                                                  {}  
6   {'n_estimators': 96, 'learning_rate': 0.263784...  
7   {'n_estimators': 255, 'max_depth': 11, 'learni...  
8   {'n_estima

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score, roc_auc_score # Import confusion_matrix and other metrics

# Function to calculate metrics with model name
def calculate_metrics(y_true, y_pred, model_name=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "MCC": mcc,
        "Kappa": kappa,
        "AUC": auc,
    }

# Results storage
results = []

# Optimization function
def optimize_model_with_metrics(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        trial.set_user_attr("preds", preds)
        metrics = calculate_metrics(y_val, preds, model_name=model_name)
        return metrics["Accuracy"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the best trial metrics
    best_trial_preds = study.best_trial.user_attrs["preds"]
    best_metrics = calculate_metrics(y_val, best_trial_preds, model_name=model_name)
    best_metrics["Best Params"] = study.best_params
    results.append(best_metrics)

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model_with_metrics(model_name, model_func)

# Store the final results with selected metrics and best parameters
final_results = []
for result in results:
    final_results.append(
        {
            "Model": result["Model"],
            "Accuracy": result["Accuracy"],
            "Sensitivity": result["Sensitivity"],
            "Specificity": result["Specificity"],
            "MCC": result["MCC"],
            "Kappa": result["Kappa"],
            "AUC": result["AUC"],
            "Best Params": result["Best Params"],
        }
    )

# Convert results to a DataFrame and display it
final_results_df = pd.DataFrame(final_results)
print(final_results_df)


[I 2025-01-09 14:04:56,893] A new study created in memory with name: no-name-7bea2d43-44d0-4544-aa68-4641afb0ce71


Optimizing SVM...


[I 2025-01-09 14:05:02,800] Trial 0 finished with value: 0.8633333333333333 and parameters: {'C': 3.464289084451455, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-01-09 14:05:11,647] Trial 1 finished with value: 0.8366666666666667 and parameters: {'C': 1.4412721924465302, 'kernel': 'linear'}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-01-09 14:05:21,775] Trial 2 finished with value: 0.9033333333333333 and parameters: {'C': 6.703779296188013, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2025-01-09 14:05:26,697] Trial 3 finished with value: 0.8666666666666667 and parameters: {'C': 4.364403984502521, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.9033333333333333.
[I 2025-01-09 14:05:36,266] Trial 4 finished with value: 0.79 and parameters: {'C': 1.9154649505584882, 'kernel': 'poly'}. Best is trial 2 with value: 0.9033333333333333.
[I 2025-01-09 14:05:42,417] Trial 5 finished with value: 0.8833333333333333 and param

Optimizing Decision Tree...


[I 2025-01-09 14:09:20,992] Trial 0 finished with value: 0.81 and parameters: {'max_depth': 11, 'min_samples_split': 5}. Best is trial 0 with value: 0.81.
[I 2025-01-09 14:09:21,245] Trial 1 finished with value: 0.7933333333333333 and parameters: {'max_depth': 9, 'min_samples_split': 9}. Best is trial 0 with value: 0.81.
[I 2025-01-09 14:09:21,592] Trial 2 finished with value: 0.8566666666666667 and parameters: {'max_depth': 17, 'min_samples_split': 7}. Best is trial 2 with value: 0.8566666666666667.
[I 2025-01-09 14:09:21,827] Trial 3 finished with value: 0.7833333333333333 and parameters: {'max_depth': 8, 'min_samples_split': 7}. Best is trial 2 with value: 0.8566666666666667.
[I 2025-01-09 14:09:22,065] Trial 4 finished with value: 0.7866666666666666 and parameters: {'max_depth': 8, 'min_samples_split': 9}. Best is trial 2 with value: 0.8566666666666667.
[I 2025-01-09 14:09:22,358] Trial 5 finished with value: 0.8266666666666667 and parameters: {'max_depth': 13, 'min_samples_split':

Optimizing Random Forest...


[I 2025-01-09 14:09:37,945] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 423, 'max_depth': 9, 'min_samples_split': 3}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-09 14:09:40,733] Trial 1 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 426, 'max_depth': 17, 'min_samples_split': 10}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 14:09:44,672] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 473, 'max_depth': 16, 'min_samples_split': 7}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 14:09:47,824] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 377, 'max_depth': 16, 'min_samples_split': 2}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 14:09:50,387] Trial 4 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 355, 'max_depth': 17, 'min_samples_split': 2}. Best is trial 1 with value: 

Optimizing Logistic Regression...


[I 2025-01-09 14:10:44,121] Trial 0 finished with value: 0.8033333333333333 and parameters: {'C': 0.6760164239856044, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-09 14:10:44,359] Trial 1 finished with value: 0.8833333333333333 and parameters: {'C': 6.653849573298111, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8833333333333333.
[I 2025-01-09 14:10:44,765] Trial 2 finished with value: 0.83 and parameters: {'C': 1.2563829756465896, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8833333333333333.
[I 2025-01-09 14:10:45,241] Trial 3 finished with value: 0.8833333333333333 and parameters: {'C': 6.242116469922569, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8833333333333333.
[I 2025-01-09 14:10:45,662] Trial 4 finished with value: 0.8633333333333333 and parameters: {'C': 2.514827590362242, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8833333333333333.
[I 2025-01-09 14:10:46,094] Trial 5 finished with value: 0.8133333333333334 and par

Optimizing k-NN...


[I 2025-01-09 14:10:54,002] Trial 0 finished with value: 0.8833333333333333 and parameters: {'n_neighbors': 3}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 14:10:54,629] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 14:10:55,246] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 17}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 14:10:55,861] Trial 3 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 17}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 14:10:56,483] Trial 4 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 15}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 14:10:57,090] Trial 5 finished with value: 0.8433333333333334 and parameters: {'n_neighbors': 19}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 14:10:57,716] Trial 6

Optimizing Naive Bayes...


[I 2025-01-09 14:11:07,752] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 14:11:08,047] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 14:11:08,327] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 14:11:08,607] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 14:11:08,902] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 14:11:09,192] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 14:11:09,584] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2025-01-09 14:12:33,100] Trial 0 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 422, 'learning_rate': 0.34490417813397667, 'max_depth': 9}. Best is trial 0 with value: 0.8766666666666667.
[I 2025-01-09 14:14:43,501] Trial 1 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 387, 'learning_rate': 0.3961906292383814, 'max_depth': 19}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-09 14:16:03,077] Trial 2 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 306, 'learning_rate': 0.031192349503581435, 'max_depth': 14}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-09 14:16:44,500] Trial 3 finished with value: 0.89 and parameters: {'n_estimators': 462, 'learning_rate': 0.149484294698322, 'max_depth': 4}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-09 14:17:47,303] Trial 4 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 476, 'learning_rate': 0.4748671208395

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 14:52:43,480] Trial 0 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 426, 'max_depth': 4, 'learning_rate': 0.15019586897381337}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 14:53:06,381] Trial 1 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 359, 'max_depth': 9, 'learning_rate': 0.4667432491540712}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 14:53:16,911] Trial 2 finished with value: 0.91 and parameters: {'n_estimators': 97, 'max_depth': 19, 'learning_rate': 0.12085400584394998}. Best is trial 2 with value: 0.91.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 14:53:31,107] Trial 3 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 137, 'max_depth': 20, 'learning_rate': 0.2941229089247814}. Best 

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:10,576] Trial 0 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 158, 'max_depth': 20, 'learning_rate': 0.3284347408028104}. Best is trial 0 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:11,086] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 401, 'max_depth': 14, 'learning_rate': 0.18929386893483943}. Best is trial 1 with value: 0.8466666666666667.




[I 2025-01-09 15:02:11,447] Trial 2 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 191, 'max_depth': 18, 'learning_rate': 0.2886502185797411}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:11,990] Trial 3 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 433, 'max_depth': 15, 'learning_rate': 0.3511239949328066}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:12,515] Trial 4 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 391, 'max_depth': 19, 'learning_rate': 0.18743944022048561}. Best is trial 1 with value: 0.8466666666666667.




[I 2025-01-09 15:02:12,773] Trial 5 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 61, 'max_depth': 13, 'learning_rate': 0.28694994204771507}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:13,148] Trial 6 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 207, 'max_depth': 14, 'learning_rate': 0.1916025978231461}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:13,551] Trial 7 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 245, 'max_depth': 15, 'learning_rate': 0.44152857997773576}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:13,973] Trial 8 finished with value: 0.85 and parameters: {'n_estimators': 279, 'max_depth': 15, 'learning_rate': 0.301227804862918}. Best is trial 8 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:14,421] Trial 9 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 306, 'max_depth': 11, 'learning_rate': 0.2631828598219512}. Best is trial 8 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:14,908] Trial 10 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 320, 'max_depth': 6, 'learning_rate': 0.01208374485913244}. Best is trial 8 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001493 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:15,497] Trial 11 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 478, 'max_depth': 9, 'learning_rate': 0.11421847314360423}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:16,123] Trial 12 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 489, 'max_depth': 9, 'learning_rate': 0.041013180993714204}. Best is trial 11 with value: 0.8533333333333334.
[I 2025-01-09 15:02:16,401] Trial 13 finished with value: 0.82 and parameters: {'n_estimators': 84, 'max_depth': 3, 'learning_rate': 0.10796302167592581}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:17,172] Trial 14 finished with value: 0.85 and parameters: {'n_estimators': 490, 'max_depth': 9, 'learning_rate': 0.41792840854432217}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001763 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:17,893] Trial 15 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 350, 'max_depth': 10, 'learning_rate': 0.11847546122987017}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:18,439] Trial 16 finished with value: 0.85 and parameters: {'n_estimators': 265, 'max_depth': 7, 'learning_rate': 0.48695657397195846}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:18,965] Trial 17 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 145, 'max_depth': 17, 'learning_rate': 0.1121744969001074}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:19,679] Trial 18 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 448, 'max_depth': 7, 'learning_rate': 0.3630071411333696}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001582 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:20,376] Trial 19 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 363, 'max_depth': 12, 'learning_rate': 0.22184591671091425}. Best is trial 11 with value: 0.8533333333333334.




[I 2025-01-09 15:02:20,828] Trial 20 finished with value: 0.83 and parameters: {'n_estimators': 107, 'max_depth': 3, 'learning_rate': 0.14786041611389472}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001735 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:21,646] Trial 21 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 493, 'max_depth': 9, 'learning_rate': 0.37646795250540777}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001487 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:22,388] Trial 22 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.37029538477747637}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001703 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:22,966] Trial 23 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 448, 'max_depth': 9, 'learning_rate': 0.41182821628019955}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001456 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:23,523] Trial 24 finished with value: 0.84 and parameters: {'n_estimators': 422, 'max_depth': 11, 'learning_rate': 0.2943819319325961}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001480 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:24,112] Trial 25 finished with value: 0.84 and parameters: {'n_estimators': 468, 'max_depth': 16, 'learning_rate': 0.33308891605359825}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:24,612] Trial 26 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 357, 'max_depth': 8, 'learning_rate': 0.23187758837587533}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:25,079] Trial 27 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 300, 'max_depth': 12, 'learning_rate': 0.4527760209774691}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001437 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:25,613] Trial 28 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 408, 'max_depth': 5, 'learning_rate': 0.3836540683468755}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 15:02:26,056] Trial 29 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 247, 'max_depth': 10, 'learning_rate': 0.25313028737286875}. Best is trial 29 with value: 0.8566666666666667.
[I 2025-01-09 15:02:26,072] A new study created in memory with name: no-name-e785c572-c294-4544-8106-c21a3ff80e20


Optimizing AdaBoost...


[I 2025-01-09 15:02:35,323] Trial 0 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 123, 'learning_rate': 0.7212073919657428}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-01-09 15:02:45,391] Trial 1 finished with value: 0.6333333333333333 and parameters: {'n_estimators': 146, 'learning_rate': 0.026842955734913264}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-01-09 15:03:14,580] Trial 2 finished with value: 0.79 and parameters: {'n_estimators': 402, 'learning_rate': 0.5030764218769445}. Best is trial 2 with value: 0.79.
[I 2025-01-09 15:03:37,208] Trial 3 finished with value: 0.7433333333333333 and parameters: {'n_estimators': 301, 'learning_rate': 0.3139010887737426}. Best is trial 2 with value: 0.79.
[I 2025-01-09 15:03:57,092] Trial 4 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 283, 'learning_rate': 0.4683878638832181}. Best is trial 2 with value: 0.79.
[I 2025-01-09 15:04:13,902] Trial 5 finished with valu

Optimizing Neural Network...


[I 2025-01-09 15:13:28,772] Trial 0 finished with value: 0.9 and parameters: {'hidden_layer_1': 32, 'hidden_layer_2': 93, 'learning_rate_init': 0.013267786891709595}. Best is trial 0 with value: 0.9.
[I 2025-01-09 15:13:54,301] Trial 1 finished with value: 0.9033333333333333 and parameters: {'hidden_layer_1': 100, 'hidden_layer_2': 52, 'learning_rate_init': 0.09535007404827542}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-09 15:14:02,192] Trial 2 finished with value: 0.9 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 64, 'learning_rate_init': 0.027798667020607522}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-09 15:14:10,493] Trial 3 finished with value: 0.8966666666666666 and parameters: {'hidden_layer_1': 32, 'hidden_layer_2': 12, 'learning_rate_init': 0.044188343438543914}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-09 15:14:26,522] Trial 4 finished with value: 0.9 and parameters: {'hidden_layer_1': 74, 'hidden_layer_2': 19, 'le

Optimizing MLP...


[I 2025-01-09 15:26:03,005] Trial 0 finished with value: 0.5 and parameters: {'layer_1': 138, 'layer_2': 76, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.0745826634625593}. Best is trial 0 with value: 0.5.
[I 2025-01-09 15:26:45,165] Trial 1 finished with value: 0.9233333333333333 and parameters: {'layer_1': 74, 'layer_2': 135, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.08909100795993583}. Best is trial 1 with value: 0.9233333333333333.
[I 2025-01-09 15:28:29,774] Trial 2 finished with value: 0.9066666666666666 and parameters: {'layer_1': 116, 'layer_2': 139, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.011411535215905957}. Best is trial 1 with value: 0.9233333333333333.
[I 2025-01-09 15:28:35,128] Trial 3 finished with value: 0.5 and parameters: {'layer_1': 54, 'layer_2': 67, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.08357682460346604}. Best is trial 1 with value: 0.9233333333333333.
[I 2025-01-09 15

                  Model  Accuracy  Sensitivity  Specificity       MCC  \
0                   SVM  0.903333     0.873333     0.933333  0.808123   
1         Decision Tree  0.873333     0.780000     0.966667  0.760025   
2         Random Forest  0.886667     0.820000     0.953333  0.780300   
3   Logistic Regression  0.893333     0.853333     0.933333  0.789196   
4                  k-NN  0.883333     0.866667     0.900000  0.767093   
5           Naive Bayes  0.866667     0.880000     0.853333  0.733594   
6     Gradient Boosting  0.893333     0.873333     0.913333  0.787297   
7               XGBoost  0.910000     0.886667     0.933333  0.820894   
8              LightGBM  0.856667     0.786667     0.926667  0.720428   
9              AdaBoost  0.860000     0.760000     0.960000  0.734847   
10       Neural Network  0.926667     0.940000     0.913333  0.853637   
11                  MLP  0.926667     0.933333     0.920000  0.853409   

       Kappa       AUC                            

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Define models
models = {
    "SVM": lambda trial: SVC(probability=True, C=trial.suggest_float("C", 0.1, 10.0), kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])),
    "Decision Tree": lambda trial: DecisionTreeClassifier(max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Random Forest": lambda trial: RandomForestClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Logistic Regression": lambda trial: LogisticRegression(C=trial.suggest_float("C", 0.1, 10.0), solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])),
    "k-NN": lambda trial: KNeighborsClassifier(n_neighbors=trial.suggest_int("n_neighbors", 3, 20)),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), max_depth=trial.suggest_int("max_depth", 3, 20)),
    "XGBoost": lambda trial: XGBClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": lambda trial: LGBMClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)),
    "AdaBoost": lambda trial: AdaBoostClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)),
    "Neural Network": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("hidden_layer_1", 10, 100), trial.suggest_int("hidden_layer_2", 10, 100)), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200),
    "MLP": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("layer_1", 50, 150), trial.suggest_int("layer_2", 50, 150)), activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]), solver=trial.suggest_categorical("solver", ["adam", "sgd"]), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200, random_state=42)
}

# Prepare a dictionary to store model probabilities horizontally
probabilities = {"Target": y_val}  # Starting with the target column (y_val)

# Run optimization and compute probabilities for each model
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    study = optuna.create_study(direction="maximize")

    # Objective function for Optuna
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        return accuracy_score(y_val, model.predict(X_val))

    study.optimize(objective, n_trials=30)

    # Train the best model using the best hyperparameters
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)

    # Get predicted probabilities for the positive class (class 1)
    probs = best_model.predict_proba(X_val)[:, 1]

    # Add to the probabilities dictionary
    probabilities[model_name] = probs

# Convert the probabilities dictionary to a DataFrame
probability_df = pd.DataFrame(probabilities)

# Save the probability dataset to a CSV file
probability_df.to_csv("N_TPC_OPTUNA_probability_predictions.csv", index=False)

print("Dataset saved successfully!")


[I 2025-01-09 16:05:36,845] A new study created in memory with name: no-name-20a35274-ccab-43e0-b457-6b7d89e52530


Optimizing SVM...


[I 2025-01-09 16:06:19,886] Trial 0 finished with value: 0.9033333333333333 and parameters: {'C': 6.4235074464357504, 'kernel': 'rbf'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 16:06:40,211] Trial 1 finished with value: 0.8833333333333333 and parameters: {'C': 7.668764259807423, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 16:07:18,772] Trial 2 finished with value: 0.83 and parameters: {'C': 3.7360319976837792, 'kernel': 'poly'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 16:07:49,754] Trial 3 finished with value: 0.8933333333333333 and parameters: {'C': 6.92146927782229, 'kernel': 'linear'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 16:08:25,719] Trial 4 finished with value: 0.87 and parameters: {'C': 2.5342611510835513, 'kernel': 'linear'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-09 16:09:05,330] Trial 5 finished with value: 0.8433333333333334 and parameters: {'C': 1.

Optimizing Decision Tree...


[I 2025-01-09 16:25:43,740] Trial 0 finished with value: 0.78 and parameters: {'max_depth': 7, 'min_samples_split': 5}. Best is trial 0 with value: 0.78.
[I 2025-01-09 16:25:44,051] Trial 1 finished with value: 0.83 and parameters: {'max_depth': 14, 'min_samples_split': 2}. Best is trial 1 with value: 0.83.
[I 2025-01-09 16:25:44,269] Trial 2 finished with value: 0.7766666666666666 and parameters: {'max_depth': 7, 'min_samples_split': 7}. Best is trial 1 with value: 0.83.
[I 2025-01-09 16:25:44,620] Trial 3 finished with value: 0.85 and parameters: {'max_depth': 18, 'min_samples_split': 5}. Best is trial 3 with value: 0.85.
[I 2025-01-09 16:25:44,894] Trial 4 finished with value: 0.81 and parameters: {'max_depth': 11, 'min_samples_split': 8}. Best is trial 3 with value: 0.85.
[I 2025-01-09 16:25:45,133] Trial 5 finished with value: 0.7233333333333334 and parameters: {'max_depth': 4, 'min_samples_split': 10}. Best is trial 3 with value: 0.85.
[I 2025-01-09 16:25:45,707] Trial 6 finished

Optimizing Random Forest...


[I 2025-01-09 16:25:56,733] Trial 0 finished with value: 0.77 and parameters: {'n_estimators': 457, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 0 with value: 0.77.
[I 2025-01-09 16:25:57,185] Trial 1 finished with value: 0.82 and parameters: {'n_estimators': 87, 'max_depth': 7, 'min_samples_split': 4}. Best is trial 1 with value: 0.82.
[I 2025-01-09 16:25:58,196] Trial 2 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 253, 'max_depth': 6, 'min_samples_split': 6}. Best is trial 1 with value: 0.82.
[I 2025-01-09 16:26:02,695] Trial 3 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 482, 'max_depth': 20, 'min_samples_split': 9}. Best is trial 3 with value: 0.8733333333333333.
[I 2025-01-09 16:26:04,681] Trial 4 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 437, 'max_depth': 4, 'min_samples_split': 10}. Best is trial 3 with value: 0.8733333333333333.
[I 2025-01-09 16:26:05,229] Trial 5 finished with valu

Optimizing Logistic Regression...


[I 2025-01-09 16:27:15,655] Trial 0 finished with value: 0.8866666666666667 and parameters: {'C': 5.904372620431427, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-09 16:27:16,077] Trial 1 finished with value: 0.8833333333333333 and parameters: {'C': 6.139821733802031, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-09 16:27:16,392] Trial 2 finished with value: 0.8866666666666667 and parameters: {'C': 9.260438166352763, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-09 16:27:16,643] Trial 3 finished with value: 0.8866666666666667 and parameters: {'C': 5.709923158933572, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-09 16:27:16,886] Trial 4 finished with value: 0.8133333333333334 and parameters: {'C': 0.7697952167916379, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-09 16:27:17,265] Trial 5 finished with value: 0.813

Optimizing k-NN...


[I 2025-01-09 16:27:26,382] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-09 16:27:26,993] Trial 1 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 15}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 16:27:27,612] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 16:27:28,229] Trial 3 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 14}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 16:27:28,824] Trial 4 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 14}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 16:27:29,440] Trial 5 finished with value: 0.8433333333333334 and parameters: {'n_neighbors': 19}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-09 16:27:30,052] Trial 

Optimizing Naive Bayes...


[I 2025-01-09 16:27:39,877] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 16:27:40,140] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 16:27:40,402] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 16:27:40,736] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 16:27:41,104] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 16:27:41,459] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-09 16:27:41,823] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2025-01-09 16:28:08,566] Trial 0 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 104, 'learning_rate': 0.04618147964459654, 'max_depth': 10}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 16:28:36,853] Trial 1 finished with value: 0.88 and parameters: {'n_estimators': 106, 'learning_rate': 0.24339580155400872, 'max_depth': 14}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 16:29:20,738] Trial 2 finished with value: 0.88 and parameters: {'n_estimators': 176, 'learning_rate': 0.4293600260747484, 'max_depth': 13}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-09 16:29:25,359] Trial 3 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 56, 'learning_rate': 0.4086456238385332, 'max_depth': 3}. Best is trial 3 with value: 0.8866666666666667.
[I 2025-01-09 16:30:01,789] Trial 4 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 337, 'learning_rate': 0.3411775728870275, 'max_dept

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 17:07:33,099] Trial 0 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 429, 'max_depth': 18, 'learning_rate': 0.35366691709110454}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 17:07:39,670] Trial 1 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 68, 'max_depth': 4, 'learning_rate': 0.49781225654953903}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 17:08:11,582] Trial 2 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 438, 'max_depth': 19, 'learning_rate': 0.2562903273755281}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-09 17:08:34,412] Trial 3 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 290, 'max_depth': 18, 'learning_rate

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002911 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:00,292] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 161, 'max_depth': 20, 'learning_rate': 0.4432950713416471}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-09 17:18:00,730] Trial 1 finished with value: 0.85 and parameters: {'n_estimators': 106, 'max_depth': 17, 'learning_rate': 0.23469914228491714}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001746 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:01,308] Trial 2 finished with value: 0.85 and parameters: {'n_estimators': 268, 'max_depth': 6, 'learning_rate': 0.19707810137328058}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:01,870] Trial 3 finished with value: 0.85 and parameters: {'n_estimators': 263, 'max_depth': 17, 'learning_rate': 0.24457895866217824}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:02,395] Trial 4 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 316, 'max_depth': 3, 'learning_rate': 0.33093695624580394}. Best is trial 1 with value: 0.85.
[I 2025-01-09 17:18:02,799] Trial 5 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 61, 'max_depth': 16, 'learning_rate': 0.3033291746931892}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001783 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-09 17:18:03,436] Trial 6 finished with value: 0.84 and parameters: {'n_estimators': 263, 'max_depth': 19, 'learning_rate': 0.07094755995436222}. Best is trial 1 with value: 0.85.





[I 2025-01-09 17:18:03,858] Trial 7 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 73, 'max_depth': 7, 'learning_rate': 0.24466184594132737}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001847 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:04,305] Trial 8 finished with value: 0.84 and parameters: {'n_estimators': 74, 'max_depth': 14, 'learning_rate': 0.36636885227377736}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001799 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:04,731] Trial 9 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 156, 'max_depth': 19, 'learning_rate': 0.48788629484387436}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001520 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001475 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:05,330] Trial 10 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 480, 'max_depth': 11, 'learning_rate': 0.09440330475621161}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001704 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:05,867] Trial 11 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 389, 'max_depth': 10, 'learning_rate': 0.1615156590120656}. Best is trial 1 with value: 0.85.




[I 2025-01-09 17:18:06,357] Trial 12 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 178, 'max_depth': 6, 'learning_rate': 0.16035875148417045}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:06,871] Trial 13 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 342, 'max_depth': 13, 'learning_rate': 0.1822839025821675}. Best is trial 1 with value: 0.85.




[I 2025-01-09 17:18:07,258] Trial 14 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 189, 'max_depth': 8, 'learning_rate': 0.2058406505019704}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001519 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:07,791] Trial 15 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 440, 'max_depth': 3, 'learning_rate': 0.11443336085550965}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001472 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:08,194] Trial 16 finished with value: 0.84 and parameters: {'n_estimators': 221, 'max_depth': 9, 'learning_rate': 0.298888902087978}. Best is trial 1 with value: 0.85.
[I 2025-01-09 17:18:08,535] Trial 17 finished with value: 0.7866666666666666 and parameters: {'n_estimators': 133, 'max_depth': 5, 'learning_rate': 0.014436857950304138}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:08,871] Trial 18 finished with value: 0.84 and parameters: {'n_estimators': 114, 'max_depth': 13, 'learning_rate': 0.38538967855617123}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:09,345] Trial 19 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 327, 'max_depth': 15, 'learning_rate': 0.22062086550709048}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:09,775] Trial 20 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 231, 'max_depth': 17, 'learning_rate': 0.2835827863328237}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001558 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:10,351] Trial 21 finished with value: 0.84 and parameters: {'n_estimators': 287, 'max_depth': 17, 'learning_rate': 0.2533119991014131}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:10,887] Trial 22 finished with value: 0.85 and parameters: {'n_estimators': 384, 'max_depth': 18, 'learning_rate': 0.14024264854622448}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:11,305] Trial 23 finished with value: 0.84 and parameters: {'n_estimators': 242, 'max_depth': 12, 'learning_rate': 0.22307240259591815}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:11,756] Trial 24 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 289, 'max_depth': 15, 'learning_rate': 0.3431741006142891}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:12,263] Trial 25 finished with value: 0.85 and parameters: {'n_estimators': 375, 'max_depth': 20, 'learning_rate': 0.27120726418977015}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:12,657] Trial 26 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 207, 'max_depth': 16, 'learning_rate': 0.19347403704452693}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:13,104] Trial 27 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 269, 'max_depth': 5, 'learning_rate': 0.061550119306662404}. Best is trial 1 with value: 0.85.
[I 2025-01-09 17:18:13,429] Trial 28 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 127, 'max_depth': 11, 'learning_rate': 0.23908060535800654}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001430 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:13,880] Trial 29 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 101, 'max_depth': 20, 'learning_rate': 0.3268933009898417}. Best is trial 1 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001787 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-09 17:18:14,171] A new study created in memory with name: no-name-c8163be0-a3a9-401c-9f4b-b52f1c8771f6


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Optimizing AdaBoost...


[I 2025-01-09 17:18:35,713] Trial 0 finished with value: 0.6633333333333333 and parameters: {'n_estimators': 274, 'learning_rate': 0.04445566810215214}. Best is trial 0 with value: 0.6633333333333333.
[I 2025-01-09 17:18:59,771] Trial 1 finished with value: 0.79 and parameters: {'n_estimators': 327, 'learning_rate': 0.6007332279297912}. Best is trial 1 with value: 0.79.
[I 2025-01-09 17:19:22,024] Trial 2 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 291, 'learning_rate': 0.4227014910842193}. Best is trial 1 with value: 0.79.
[I 2025-01-09 17:19:58,698] Trial 3 finished with value: 0.79 and parameters: {'n_estimators': 491, 'learning_rate': 0.38679911591786564}. Best is trial 1 with value: 0.79.
[I 2025-01-09 17:20:22,155] Trial 4 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 303, 'learning_rate': 0.9388900015620332}. Best is trial 4 with value: 0.8466666666666667.
[I 2025-01-09 17:20:50,802] Trial 5 finished with value: 0.85 and pa

Optimizing Neural Network...


[I 2025-01-09 17:32:47,533] Trial 0 finished with value: 0.92 and parameters: {'hidden_layer_1': 60, 'hidden_layer_2': 22, 'learning_rate_init': 0.04766872956542842}. Best is trial 0 with value: 0.92.
[I 2025-01-09 17:33:08,891] Trial 1 finished with value: 0.92 and parameters: {'hidden_layer_1': 74, 'hidden_layer_2': 59, 'learning_rate_init': 0.09105616871960961}. Best is trial 0 with value: 0.92.
[I 2025-01-09 17:33:57,652] Trial 2 finished with value: 0.8966666666666666 and parameters: {'hidden_layer_1': 54, 'hidden_layer_2': 76, 'learning_rate_init': 0.0015010942069476935}. Best is trial 0 with value: 0.92.
[I 2025-01-09 17:34:14,579] Trial 3 finished with value: 0.91 and parameters: {'hidden_layer_1': 46, 'hidden_layer_2': 50, 'learning_rate_init': 0.06044859810453942}. Best is trial 0 with value: 0.92.
[I 2025-01-09 17:34:32,583] Trial 4 finished with value: 0.91 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 30, 'learning_rate_init': 0.07056557670290124}. Best is trial

Optimizing MLP...


[I 2025-01-09 17:45:19,724] Trial 0 finished with value: 0.9166666666666666 and parameters: {'layer_1': 129, 'layer_2': 81, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.06404304420935956}. Best is trial 0 with value: 0.9166666666666666.
[I 2025-01-09 17:46:02,487] Trial 1 finished with value: 0.91 and parameters: {'layer_1': 109, 'layer_2': 129, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.054499384177819056}. Best is trial 0 with value: 0.9166666666666666.
[I 2025-01-09 17:46:15,226] Trial 2 finished with value: 0.88 and parameters: {'layer_1': 76, 'layer_2': 79, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.09932092910554714}. Best is trial 0 with value: 0.9166666666666666.
[I 2025-01-09 17:47:36,379] Trial 3 finished with value: 0.9066666666666666 and parameters: {'layer_1': 75, 'layer_2': 50, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.00996740257579267}. Best is trial 0 with value: 0.9166666666666666.
[I

Dataset saved successfully!


Class Feature Vector (CFV)


In [2]:
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/positive_main_tpc.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/negative_main_tpc (1).csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/positive_validation_tpc.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/negative_validation_tpc.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]

# Define models with hyperparameter optimization (Optuna)
models = {
    "SVM": lambda trial: SVC(
         probability=True,
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}

# Initialize a list to store the CFV data
cfv_data = []

# Define the optimization and prediction function
def optimize_and_predict(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]  # Get probability for class 1 (positive)
        return accuracy_score(y_val, model.predict(X_val))

    # Perform optimization with Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Return the best model
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)
    preds = best_model.predict_proba(X_val)[:, 1]  # Using the probability for class 1 (positive)

    # Append predictions to CFV list
    return preds

# Train each model and generate predictions for CFV
for model_name, model_func in models.items():
    print(f"Training and predicting with {model_name}...")
    preds = optimize_and_predict(model_name, model_func)
    cfv_data.append(preds)

# Convert the CFV data into a DataFrame
cfv_df = pd.DataFrame(np.array(cfv_data).T, columns=models.keys())

# Optionally, add the true labels column
cfv_df["True_Label"] = y_val.values

# Save the CFV dataset to CSV
cfv_df.to_csv("CFV_TPC.csv", index=False)
print("CFV dataset created and saved!")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-16 13:55:11,762] A new study created in memory with name: no-name-d98548f2-1d11-452d-830b-a1fd9464f393


Training and predicting with SVM...


[I 2025-01-16 13:55:47,867] Trial 0 finished with value: 0.81 and parameters: {'C': 2.576322755143849, 'kernel': 'poly'}. Best is trial 0 with value: 0.81.
[I 2025-01-16 13:56:24,424] Trial 1 finished with value: 0.8333333333333334 and parameters: {'C': 4.346971155793399, 'kernel': 'poly'}. Best is trial 1 with value: 0.8333333333333334.
[I 2025-01-16 13:56:53,976] Trial 2 finished with value: 0.8833333333333333 and parameters: {'C': 4.706341750295366, 'kernel': 'linear'}. Best is trial 2 with value: 0.8833333333333333.
[I 2025-01-16 13:57:33,057] Trial 3 finished with value: 0.9033333333333333 and parameters: {'C': 8.446869128095893, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9033333333333333.
[I 2025-01-16 13:58:11,215] Trial 4 finished with value: 0.86 and parameters: {'C': 7.022367754624726, 'kernel': 'poly'}. Best is trial 3 with value: 0.9033333333333333.
[I 2025-01-16 13:58:51,146] Trial 5 finished with value: 0.9033333333333333 and parameters: {'C': 4.739543195512678, 'ker

Training and predicting with Decision Tree...


[I 2025-01-16 14:13:38,150] Trial 0 finished with value: 0.7633333333333333 and parameters: {'max_depth': 6, 'min_samples_split': 9}. Best is trial 0 with value: 0.7633333333333333.
[I 2025-01-16 14:13:38,405] Trial 1 finished with value: 0.7866666666666666 and parameters: {'max_depth': 8, 'min_samples_split': 10}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 14:13:38,646] Trial 2 finished with value: 0.77 and parameters: {'max_depth': 7, 'min_samples_split': 6}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 14:13:38,855] Trial 3 finished with value: 0.71 and parameters: {'max_depth': 3, 'min_samples_split': 9}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 14:13:39,107] Trial 4 finished with value: 0.7866666666666666 and parameters: {'max_depth': 8, 'min_samples_split': 9}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 14:13:39,418] Trial 5 finished with value: 0.8166666666666667 and parameters: {'max_depth': 13, 'min_sa

Training and predicting with Random Forest...


[I 2025-01-16 14:13:51,060] Trial 0 finished with value: 0.8033333333333333 and parameters: {'n_estimators': 417, 'max_depth': 3, 'min_samples_split': 2}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-16 14:13:52,970] Trial 1 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 323, 'max_depth': 16, 'min_samples_split': 10}. Best is trial 1 with value: 0.8533333333333334.
[I 2025-01-16 14:13:54,810] Trial 2 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 460, 'max_depth': 8, 'min_samples_split': 9}. Best is trial 1 with value: 0.8533333333333334.
[I 2025-01-16 14:13:57,851] Trial 3 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 478, 'max_depth': 17, 'min_samples_split': 4}. Best is trial 3 with value: 0.8766666666666667.
[I 2025-01-16 14:13:59,536] Trial 4 finished with value: 0.8066666666666666 and parameters: {'n_estimators': 363, 'max_depth': 4, 'min_samples_split': 5}. Best is trial 3 with value: 0.

Training and predicting with Logistic Regression...


[I 2025-01-16 14:14:51,446] Trial 0 finished with value: 0.8866666666666667 and parameters: {'C': 5.815936362145891, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-16 14:14:51,734] Trial 1 finished with value: 0.8833333333333333 and parameters: {'C': 7.427505853688792, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-16 14:14:52,011] Trial 2 finished with value: 0.8866666666666667 and parameters: {'C': 4.81228155822232, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-16 14:14:52,290] Trial 3 finished with value: 0.8833333333333333 and parameters: {'C': 6.634268101370288, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-16 14:14:52,760] Trial 4 finished with value: 0.8866666666666667 and parameters: {'C': 8.359637759158872, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-16 14:14:53,046] Trial 5 finished with value: 0.85666

Training and predicting with k-NN...


[I 2025-01-16 14:15:05,242] Trial 0 finished with value: 0.87 and parameters: {'n_neighbors': 13}. Best is trial 0 with value: 0.87.
[I 2025-01-16 14:15:05,818] Trial 1 finished with value: 0.8433333333333334 and parameters: {'n_neighbors': 7}. Best is trial 0 with value: 0.87.
[I 2025-01-16 14:15:06,406] Trial 2 finished with value: 0.87 and parameters: {'n_neighbors': 13}. Best is trial 0 with value: 0.87.
[I 2025-01-16 14:15:06,976] Trial 3 finished with value: 0.87 and parameters: {'n_neighbors': 10}. Best is trial 0 with value: 0.87.
[I 2025-01-16 14:15:07,563] Trial 4 finished with value: 0.84 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.87.
[I 2025-01-16 14:15:08,147] Trial 5 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 11}. Best is trial 0 with value: 0.87.
[I 2025-01-16 14:15:08,722] Trial 6 finished with value: 0.88 and parameters: {'n_neighbors': 9}. Best is trial 6 with value: 0.88.
[I 2025-01-16 14:15:09,308] Trial 7 finishe

Training and predicting with Naive Bayes...


[I 2025-01-16 14:15:24,525] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 14:15:24,850] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 14:15:25,159] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 14:15:25,465] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 14:15:25,780] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 14:15:26,102] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 14:15:26,414] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Training and predicting with Gradient Boosting...


[I 2025-01-16 14:16:11,835] Trial 0 finished with value: 0.89 and parameters: {'n_estimators': 295, 'learning_rate': 0.21803013102337818, 'max_depth': 6}. Best is trial 0 with value: 0.89.
[I 2025-01-16 14:16:32,233] Trial 1 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 282, 'learning_rate': 0.3847723219325019, 'max_depth': 3}. Best is trial 1 with value: 0.8966666666666666.
[I 2025-01-16 14:17:24,492] Trial 2 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 276, 'learning_rate': 0.3595763375324751, 'max_depth': 10}. Best is trial 1 with value: 0.8966666666666666.
[I 2025-01-16 14:17:39,536] Trial 3 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 121, 'learning_rate': 0.4307860767469923, 'max_depth': 6}. Best is trial 1 with value: 0.8966666666666666.
[I 2025-01-16 14:18:18,358] Trial 4 finished with value: 0.88 and parameters: {'n_estimators': 160, 'learning_rate': 0.3230232833232413, 'max_depth': 13}. Best is

Training and predicting with XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 14:37:10,150] Trial 0 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 314, 'max_depth': 16, 'learning_rate': 0.18311037236566277}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 14:37:26,160] Trial 1 finished with value: 0.9 and parameters: {'n_estimators': 211, 'max_depth': 12, 'learning_rate': 0.04604450143115096}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 14:37:55,699] Trial 2 finished with value: 0.9 and parameters: {'n_estimators': 473, 'max_depth': 15, 'learning_rate': 0.21551971670470874}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 14:38:18,977] Trial 3 finished with value: 0.9133333333333333 and parameters: {'n_estimators': 275, 'max_depth': 19, 'learning_rate': 0.052878517481328935}. B

Training and predicting with LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002321 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:08,222] Trial 0 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 123, 'max_depth': 9, 'learning_rate': 0.20857462121329798}. Best is trial 0 with value: 0.8433333333333334.
[I 2025-01-16 14:47:08,613] Trial 1 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 55, 'max_depth': 4, 'learning_rate': 0.23475850299614273}. Best is trial 0 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001905 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001670 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:09,168] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 204, 'max_depth': 18, 'learning_rate': 0.3403641677152746}. Best is trial 2 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:09,614] Trial 3 finished with value: 0.84 and parameters: {'n_estimators': 104, 'max_depth': 20, 'learning_rate': 0.4466234676570681}. Best is trial 2 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:10,100] Trial 4 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 141, 'max_depth': 4, 'learning_rate': 0.08712485319905243}. Best is trial 2 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:10,751] Trial 5 finished with value: 0.84 and parameters: {'n_estimators': 328, 'max_depth': 15, 'learning_rate': 0.39664865724878623}. Best is trial 2 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:11,593] Trial 6 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 491, 'max_depth': 8, 'learning_rate': 0.07467843478560046}. Best is trial 2 with value: 0.8466666666666667.




[I 2025-01-16 14:47:11,929] Trial 7 finished with value: 0.84 and parameters: {'n_estimators': 119, 'max_depth': 17, 'learning_rate': 0.16249813641256408}. Best is trial 2 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:12,391] Trial 8 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 169, 'max_depth': 13, 'learning_rate': 0.43503463097851497}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:12,718] Trial 9 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 131, 'max_depth': 16, 'learning_rate': 0.22773101202011645}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001310 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001750 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:13,196] Trial 10 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 303, 'max_depth': 12, 'learning_rate': 0.47224334419882835}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:13,615] Trial 11 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 222, 'max_depth': 20, 'learning_rate': 0.34406372382813205}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:14,032] Trial 12 finished with value: 0.84 and parameters: {'n_estimators': 222, 'max_depth': 13, 'learning_rate': 0.33239883627118466}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001455 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:14,429] Trial 13 finished with value: 0.84 and parameters: {'n_estimators': 211, 'max_depth': 9, 'learning_rate': 0.3180777110633147}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001466 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:14,944] Trial 14 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 377, 'max_depth': 18, 'learning_rate': 0.40767344529163396}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001454 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:15,355] Trial 15 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 192, 'max_depth': 14, 'learning_rate': 0.49808774669105493}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001433 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:15,806] Trial 16 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 266, 'max_depth': 14, 'learning_rate': 0.4968719412127685}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001758 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:16,360] Trial 17 finished with value: 0.84 and parameters: {'n_estimators': 400, 'max_depth': 11, 'learning_rate': 0.4171183506956513}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001470 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:16,872] Trial 18 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 173, 'max_depth': 11, 'learning_rate': 0.019353394919141814}. Best is trial 8 with value: 0.8566666666666667.
[I 2025-01-16 14:47:17,160] Trial 19 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 54, 'max_depth': 7, 'learning_rate': 0.27433249414540084}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:17,624] Trial 20 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 279, 'max_depth': 14, 'learning_rate': 0.49707604136949834}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:18,078] Trial 21 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 265, 'max_depth': 14, 'learning_rate': 0.49757408547712845}. Best is trial 8 with value: 0.8566666666666667.
[I 2025-01-16 14:47:18,450] Trial 22 finished with value: 0.85 and parameters: {'n_estimators': 175, 'max_depth': 12, 'learning_rate': 0.4503319847974701}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001465 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001522 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:18,955] Trial 23 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 347, 'max_depth': 15, 'learning_rate': 0.3852525087701756}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:19,418] Trial 24 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 257, 'max_depth': 10, 'learning_rate': 0.45166352776063795}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:19,892] Trial 25 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 297, 'max_depth': 16, 'learning_rate': 0.3776994074067851}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001488 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:20,487] Trial 26 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 301, 'max_depth': 17, 'learning_rate': 0.3738411752575721}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001453 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:21,060] Trial 27 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 433, 'max_depth': 16, 'learning_rate': 0.2984396021070007}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001980 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:21,540] Trial 28 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 297, 'max_depth': 13, 'learning_rate': 0.4290274529407524}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:22,217] Trial 29 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 349, 'max_depth': 19, 'learning_rate': 0.3684053552443131}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 14:47:22,672] A new study created in memory with name: no-name-c1fc6b41-b969-4d9c-b623-063296618e1b


Training and predicting with AdaBoost...


[I 2025-01-16 14:47:32,800] Trial 0 finished with value: 0.75 and parameters: {'n_estimators': 132, 'learning_rate': 0.5735340207832375}. Best is trial 0 with value: 0.75.
[I 2025-01-16 14:47:58,640] Trial 1 finished with value: 0.82 and parameters: {'n_estimators': 342, 'learning_rate': 0.7677632273591166}. Best is trial 1 with value: 0.82.
[I 2025-01-16 14:48:30,155] Trial 2 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 429, 'learning_rate': 0.8753618225123775}. Best is trial 2 with value: 0.8633333333333333.
[I 2025-01-16 14:48:42,243] Trial 3 finished with value: 0.7333333333333333 and parameters: {'n_estimators': 159, 'learning_rate': 0.4407725449835149}. Best is trial 2 with value: 0.8633333333333333.
[I 2025-01-16 14:49:09,405] Trial 4 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 363, 'learning_rate': 0.9983634376830197}. Best is trial 4 with value: 0.8666666666666667.
[I 2025-01-16 14:49:21,798] Trial 5 finished with value:

Training and predicting with Neural Network...


[I 2025-01-16 15:01:11,482] Trial 0 finished with value: 0.9 and parameters: {'hidden_layer_1': 96, 'hidden_layer_2': 95, 'learning_rate_init': 0.011396815284265085}. Best is trial 0 with value: 0.9.
[I 2025-01-16 15:01:22,796] Trial 1 finished with value: 0.8933333333333333 and parameters: {'hidden_layer_1': 50, 'hidden_layer_2': 27, 'learning_rate_init': 0.04772385042498817}. Best is trial 0 with value: 0.9.
[I 2025-01-16 15:01:54,194] Trial 2 finished with value: 0.91 and parameters: {'hidden_layer_1': 84, 'hidden_layer_2': 71, 'learning_rate_init': 0.055484974533201836}. Best is trial 2 with value: 0.91.
[I 2025-01-16 15:02:15,520] Trial 3 finished with value: 0.9066666666666666 and parameters: {'hidden_layer_1': 80, 'hidden_layer_2': 61, 'learning_rate_init': 0.02563372781870428}. Best is trial 2 with value: 0.91.
[I 2025-01-16 15:02:30,928] Trial 4 finished with value: 0.91 and parameters: {'hidden_layer_1': 62, 'hidden_layer_2': 99, 'learning_rate_init': 0.07560629746956768}. Be

Training and predicting with MLP...


[I 2025-01-16 15:11:09,796] Trial 0 finished with value: 0.9133333333333333 and parameters: {'layer_1': 112, 'layer_2': 113, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.058958334853773935}. Best is trial 0 with value: 0.9133333333333333.
[I 2025-01-16 15:11:18,777] Trial 1 finished with value: 0.5 and parameters: {'layer_1': 145, 'layer_2': 66, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.020216416881694073}. Best is trial 0 with value: 0.9133333333333333.
[I 2025-01-16 15:11:45,071] Trial 2 finished with value: 0.89 and parameters: {'layer_1': 93, 'layer_2': 91, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.09172591324497353}. Best is trial 0 with value: 0.9133333333333333.
[I 2025-01-16 15:12:35,806] Trial 3 finished with value: 0.9166666666666666 and parameters: {'layer_1': 107, 'layer_2': 113, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.08274676664271781}. Best is trial 3 with value: 0.91666666666666

CFV dataset created and saved!


CPFV (Combined Probability and Class Feature Vector)

In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Load datasets
# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/positive_main_tpc.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/negative_main_tpc (1).csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/positive_validation_tpc.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/10_TPC (Tripeptide Composition)/negative_validation_tpc.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)], ignore_index=True)
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)], ignore_index=True)

# Separate features and labels
X_train = main_data.drop(columns=["label"])
y_train = main_data["label"]
X_val = validation_data.drop(columns=["label"])
y_val = validation_data["label"]

# Initialize models with their tuned hyperparameters
trained_models = {
    "SVM": SVC(C=1.0, kernel="rbf", probability=True),  # Example parameters
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5),
    "Logistic Regression": LogisticRegression(C=1.0, solver="lbfgs"),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(n_estimators=100, max_depth=10, learning_rate=0.1),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1),
    "Neural Network (MLPClassifier)": MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200),
    "Multilayer Perceptron (Custom MLP)": MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200)
}

# Train all models on the training dataset
for model_name, model in trained_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

# Function to create CPFV dataset
def create_cpfv(models, X_data, y_data):
    cpfv_data = pd.DataFrame()
    for model_name, model in models.items():
        # Add predicted class labels
        cpfv_data[f"{model_name}_Class"] = model.predict(X_data)
        # Add predicted probabilities or decision scores
        if hasattr(model, "predict_proba"):
            cpfv_data[f"{model_name}_Prob"] = model.predict_proba(X_data)[:, 1]
        elif hasattr(model, "decision_function"):
            cpfv_data[f"{model_name}_Prob"] = model.decision_function(X_data)
        else:
            cpfv_data[f"{model_name}_Prob"] = cpfv_data[f"{model_name}_Class"]
    # Add true labels
    cpfv_data["True_Label"] = y_data.reset_index(drop=True)
    return cpfv_data

# Create CPFV dataset using validation data
cpfv_dataset = create_cpfv(trained_models, X_val, y_val)

# Save CPFV dataset to CSV
cpfv_dataset.to_csv("CPFV_TPC.csv", index=False)


Training SVM...
Training Decision Tree...
Training Random Forest...
Training Logistic Regression...
Training k-NN...
Training Naive Bayes...
Training Gradient Boosting...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training AdaBoost...
Training Neural Network (MLPClassifier)...
Training Multilayer Perceptron (Custom MLP)...
