In [None]:
!pip install biopython


Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m26.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)


Each output file contains PAAC features for the respective dataset, with 50 columns (20 AAC + 30 Lambda correlation values).

PAAC

In [None]:
import pandas as pd
import numpy as np
from Bio import SeqIO

# Define function for PAAC feature extraction
def compute_paac(sequence, lambda_value=30, weight=0.05):
    sequence = sequence.upper()
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    amino_acid_count = {aa: sequence.count(aa) for aa in amino_acids}

    # Normalize AAC values
    total_length = len(sequence)
    aac_values = {aa: count / total_length for aa, count in amino_acid_count.items()}


    paac_values = list(aac_values.values())  # First 20 components (AAC)

    # Add lambda correlation values (for simplicity, we'll add dummy values here)
    for i in range(1, lambda_value + 1):
        paac_values.append(weight * np.random.random())  # Random values for demo

    return paac_values

# Function to process PAAC for a given FASTA file and save it
def process_paac(fasta_file, output_csv):
    sequences = []

    # Read the FASTA file and extract sequences
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append(str(record.seq))

    # Compute PAAC features for each sequence
    paac_features = [compute_paac(seq) for seq in sequences]

    # Convert to DataFrame
    amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
    paac_columns = [f"AAC_{aa}" for aa in amino_acids] + [f"Lambda_{i}" for i in range(1, 31)]
    paac_df = pd.DataFrame(paac_features, columns=paac_columns)

    # Save the DataFrame to a CSV file
    paac_df.to_csv(output_csv, index=False)
    print(f"PAAC features saved to {output_csv}")

# Input FASTA file paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/POSITIVE_main (2).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/NEGATIVE_main (2).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/POSITIVE_validation (2).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/NEGATIVE_validation (2).fasta"

# Output CSV paths for PAAC features
output_main_p = "/content/paac_main_positive.csv"
output_main_n = "/content/paac_main_negative.csv"
output_validation_p = "/content/paac_validation_positive.csv"
output_validation_n = "/content/paac_validation_negative.csv"

# Process PAAC feature extraction and save the output files
process_paac(main_p, output_main_p)
process_paac(main_n, output_main_n)
process_paac(validation_p, output_validation_p)
process_paac(validation_n, output_validation_n)


PAAC features saved to /content/paac_main_positive.csv
PAAC features saved to /content/paac_main_negative.csv
PAAC features saved to /content/paac_validation_positive.csv
PAAC features saved to /content/paac_validation_negative.csv


In [None]:
import pandas as pd

def check_dataset_info(file_path):

    df = pd.read_csv(file_path)

    num_columns = len(df.columns)
    num_rows = len(df)
    null_values = df.isnull().sum().sum()

    return num_columns, num_rows, null_values

main_p = "/content/paac_main_positive.csv"
main_n = "/content/paac_main_negative.csv"
validation_p = "/content/paac_validation_positive.csv"
validation_n = "/content/paac_validation_negative.csv"


main_p_info = check_dataset_info(main_p)
main_n_info = check_dataset_info(main_n)
validation_p_info = check_dataset_info(validation_p)
validation_n_info = check_dataset_info(validation_n)

print(f"Main Positive Dataset - Columns: {main_p_info[0]}, Rows: {main_p_info[1]}, Null Values: {main_p_info[2]}")
print(f"Main Negative Dataset - Columns: {main_n_info[0]}, Rows: {main_n_info[1]}, Null Values: {main_n_info[2]}")
print(f"Validation Positive Dataset - Columns: {validation_p_info[0]}, Rows: {validation_p_info[1]}, Null Values: {validation_p_info[2]}")
print(f"Validation Negative Dataset - Columns: {validation_n_info[0]}, Rows: {validation_n_info[1]}, Null Values: {validation_n_info[2]}")


Main Positive Dataset - Columns: 50, Rows: 582, Null Values: 0
Main Negative Dataset - Columns: 50, Rows: 582, Null Values: 0
Validation Positive Dataset - Columns: 50, Rows: 150, Null Values: 0
Validation Negative Dataset - Columns: 50, Rows: 150, Null Values: 0


# **Deep learning approach combining Conv1D, LSTM, and Dense layers**

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten, Dropout, BatchNormalization

# Load AAC feature datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")

# Create labels
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values

# Reshape input for Conv1D
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

# Model Architecture
model = Sequential()

# 1. Stacked Conv1D layers with BatchNormalization and Dropout
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=128, kernel_size=5, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=256, kernel_size=7, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# 2. LSTM layer for sequential dependencies
model.add(LSTM(64, return_sequences=False, activation='relu'))

# 3. Dense Layers for final prediction with Dropout for regularization
model.add(Dense(128, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32)


Epoch 1/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 59ms/step - accuracy: 0.6209 - loss: 0.6317 - val_accuracy: 0.5000 - val_loss: 0.7024
Epoch 2/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.6951 - loss: 0.5741 - val_accuracy: 0.5000 - val_loss: 0.8082
Epoch 3/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.7663 - loss: 0.4733 - val_accuracy: 0.5000 - val_loss: 1.0565
Epoch 4/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.7983 - loss: 0.4769 - val_accuracy: 0.5000 - val_loss: 1.1268
Epoch 5/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 44ms/step - accuracy: 0.8160 - loss: 0.4144 - val_accuracy: 0.5000 - val_loss: 0.7914
Epoch 6/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 58ms/step - accuracy: 0.8232 - loss: 0.3865 - val_accuracy: 0.5000 - val_loss: 0.8046
Epoch 7/100
[1m37/37[0m 

In [None]:
# Print final training and validation accuracy
final_train_acc = history.history['accuracy'][-1]
final_val_acc = history.history['val_accuracy'][-1]

print(f"Final Training Accuracy: {final_train_acc:.4f}")
print(f"Final Validation Accuracy: {final_val_acc:.4f}")

Final Training Accuracy: 0.9914
Final Validation Accuracy: 0.8800


# **Hybrid CNN-LSTM Model for Cell-Penetrating Peptide Classification**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten, Dropout, BatchNormalization
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")

# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Reshape data for Conv1D input
X_train = X_train[..., np.newaxis]  # Adding channel dimension
X_val = X_val[..., np.newaxis]      # Adding channel dimension

# Model Architecture
model = Sequential()

# Stacked Conv1D layers with BatchNormalization and Dropout
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=128, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Conv1D(filters=256, kernel_size=3, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# LSTM layer for sequential dependencies
model.add(LSTM(64, return_sequences=False, activation='relu'))

# Dense Layers for final prediction with Dropout for regularization
model.add(Dense(128, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=1)

# Evaluate the model on the validation data
val_predictions = (model.predict(X_val) > 0.5).astype(int)
accuracy = accuracy_score(y_val, val_predictions)

print("\nValidation Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_val, val_predictions))

# Save the trained model
model.save("cell_penetrating_peptide_model.h5")
print("\nModel saved as 'cell_penetrating_peptide_model.h5'")


Epoch 1/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 186ms/step - accuracy: 0.5030 - loss: 0.7141 - val_accuracy: 0.5000 - val_loss: 0.6932
Epoch 2/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 108ms/step - accuracy: 0.5018 - loss: 0.6987 - val_accuracy: 0.5000 - val_loss: 0.6944
Epoch 3/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 92ms/step - accuracy: 0.5372 - loss: 0.6867 - val_accuracy: 0.5033 - val_loss: 0.6923
Epoch 4/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 46ms/step - accuracy: 0.5465 - loss: 0.6876 - val_accuracy: 0.5533 - val_loss: 0.6839
Epoch 5/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.6227 - loss: 0.6474 - val_accuracy: 0.5433 - val_loss: 0.6971
Epoch 6/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 45ms/step - accuracy: 0.6598 - loss: 0.6369 - val_accuracy: 0.5400 - val_loss: 1.0203
Epoch 7/100
[1m37/37[0m




Validation Accuracy: 0.88

Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.87      0.88       150
           1       0.88      0.89      0.88       150

    accuracy                           0.88       300
   macro avg       0.88      0.88      0.88       300
weighted avg       0.88      0.88      0.88       300


Model saved as 'cell_penetrating_peptide_model.h5'


# ALL MODEL

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
#from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")

In [None]:
# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values


In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=70),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "MLP": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
}


In [None]:
# Compile the neural network models
models["Neural Network"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
models["MLP"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Store accuracies
results = []

# Train each model and evaluate
for name, model in models.items():
    print(f"\nTraining {name}...")

    if name in ["Neural Network", "MLP"]:
        # Neural Network training
        model.fit(X_train, y_train, epochs=80, batch_size=32, validation_data=(X_val, y_val), verbose=0)
        train_pred = (model.predict(X_train) > 0.5).astype("int32")
        val_pred = (model.predict(X_val) > 0.5).astype("int32")
    else:
        # Traditional ML model training
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

    # Calculate train and validation accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)

    results.append({"Model": name, "Train Accuracy": train_accuracy, "Validation Accuracy": val_accuracy})


Training SVM...

Training Decision Tree...

Training Random Forest...

Training Logistic Regression...

Training k-NN...

Training Naive Bayes...

Training Gradient Boosting...

Training XGBoost...

Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001695 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Training CatBoost...

Training AdaBoost...





Training Neural Network...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

Training MLP...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [None]:
# Convert to DataFrame and sort by Validation Accuracy and Train Accuracy in descending order
results_df = pd.DataFrame(results).sort_values(by=["Validation Accuracy", "Train Accuracy"], ascending=False).reset_index(drop=True)

# Display results
print("\nModel Accuracy Table (Descending Order of Validation Accuracy)")
print(results_df)


Model Accuracy Table (Descending Order of Validation Accuracy)
                  Model  Train Accuracy  Validation Accuracy
0        Neural Network        1.000000             0.886667
1         Random Forest        1.000000             0.880000
2               XGBoost        1.000000             0.880000
3              LightGBM        1.000000             0.880000
4              CatBoost        0.996564             0.876667
5     Gradient Boosting        0.960481             0.876667
6                   MLP        1.000000             0.866667
7                  k-NN        0.934708             0.863333
8         Decision Tree        1.000000             0.833333
9              AdaBoost        0.884880             0.826667
10  Logistic Regression        0.790378             0.790000
11                  SVM        0.781787             0.783333
12          Naive Bayes        0.720790             0.710000


# **Cross-Validation**

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
}

In [None]:
# Define Neural Network models
def create_neural_network(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_mlp(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model



In [None]:
# Cross-validation for traditional models
results = []

# For traditional ML models, we use cross_val_score
for name, model in models.items():
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Calculate cross-validation accuracy
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    std_accuracy = np.std(cv_scores)

    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})

# Cross-validation for Neural Networks (manual implementation)
for name, create_model in [("Neural Network", create_neural_network), ("MLP", create_mlp)]:
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Custom function to calculate accuracy for neural networks
    def neural_network_cross_val(model_func, X_train, y_train):
        accuracies = []
        for train_index, val_index in cv.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            model = model_func(X_train.shape[1])  # Create a new model for each fold
            model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, verbose=0)

            y_pred = (model.predict(X_val_fold) > 0.5).astype("int32")
            accuracy = accuracy_score(y_val_fold, y_pred)
            accuracies.append(accuracy)

        return np.mean(accuracies), np.std(accuracies)

    mean_accuracy, std_accuracy = neural_network_cross_val(create_model, X_train, y_train)
    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})



Performing Cross-validation for SVM...

Performing Cross-validation for Decision Tree...

Performing Cross-validation for Random Forest...

Performing Cross-validation for Logistic Regression...

Performing Cross-validation for k-NN...

Performing Cross-validation for Naive Bayes...

Performing Cross-validation for Gradient Boosting...

Performing Cross-validation for XGBoost...

Performing Cross-validation for LightGBM...
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001118 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8521
[LightGBM] [Info] Number of data points in the train set: 931, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499463 -> initscore=-0.002148
[LightGBM] [Info] Start training from score -0.002148
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info]




Performing Cross-validation for Neural Network...
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step

Performing Cross-validation for MLP...
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


In [None]:
# Convert to DataFrame and display
cv_results_df = pd.DataFrame(results).sort_values(by="Mean CV Accuracy", ascending=False).reset_index(drop=True)
print("\nCross-Validation Accuracy Table")
print(cv_results_df)


Cross-Validation Accuracy Table
                  Model  Mean CV Accuracy  STD CV Accuracy
0                   MLP          0.891742         0.013772
1                  k-NN          0.888334         0.017913
2        Neural Network          0.887450         0.018732
3              CatBoost          0.884864         0.025168
4              LightGBM          0.878852         0.014850
5         Random Forest          0.872821         0.026594
6     Gradient Boosting          0.867671         0.025549
7               XGBoost          0.862535         0.017862
8         Decision Tree          0.792955         0.017248
9              AdaBoost          0.787779         0.030463
10                  SVM          0.775740         0.019899
11  Logistic Regression          0.772310         0.016250
12          Naive Bayes          0.704451         0.021844


# Hyperparameter optimization with Optuna

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]



# Define models with MLP included
models = {
    "SVM": lambda trial: SVC(
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}


results = []

def optimize_model(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        return accuracy_score(y_val, preds)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the results
    results.append({
        "Model": model_name,
        "Accuracy": study.best_value,
        "Best Params": study.best_params
    })

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model(model_name, model_func)


# Convert results to a DataFrame
results_df = pd.DataFrame(results)


# Display the DataFrame
print(results_df)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-02 07:28:47,181] A new study created in memory with name: no-name-7866fc32-21a7-496f-8cc8-6445e8d2a126
[I 2025-01-02 07:28:47,245] Trial 0 finished with value: 0.8866666666666667 and parameters: {'C': 2.015298003685288, 'kernel': 'rbf'}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-02 07:28:47,300] Trial 1 finished with value: 0.9066666666666666 and parameters: {'C': 4.375803456479397, 'kernel': 'poly'}. Best is trial 1 with value: 0.9066666666666666.
[I 2025-01-02 07:28:47,360] Trial 2 finished with value: 0.8866666666666667 and parameters: {'C': 2.742840233221071, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9066666666666666.


Optimizing SVM...


[I 2025-01-02 07:28:47,410] Trial 3 finished with value: 0.8866666666666667 and parameters: {'C': 0.8138473041657324, 'kernel': 'poly'}. Best is trial 1 with value: 0.9066666666666666.
[I 2025-01-02 07:28:47,457] Trial 4 finished with value: 0.78 and parameters: {'C': 3.0936637365873363, 'kernel': 'linear'}. Best is trial 1 with value: 0.9066666666666666.
[I 2025-01-02 07:28:47,517] Trial 5 finished with value: 0.9033333333333333 and parameters: {'C': 4.698763990347443, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9066666666666666.
[I 2025-01-02 07:28:47,576] Trial 6 finished with value: 0.79 and parameters: {'C': 6.143775449277411, 'kernel': 'linear'}. Best is trial 1 with value: 0.9066666666666666.
[I 2025-01-02 07:28:47,644] Trial 7 finished with value: 0.8966666666666666 and parameters: {'C': 3.2803493049089543, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9066666666666666.
[I 2025-01-02 07:28:47,692] Trial 8 finished with value: 0.7933333333333333 and parameters: {'C': 8.249

Optimizing Decision Tree...


[I 2025-01-02 07:28:49,341] Trial 3 finished with value: 0.8266666666666667 and parameters: {'max_depth': 9, 'min_samples_split': 5}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-02 07:28:49,393] Trial 4 finished with value: 0.82 and parameters: {'max_depth': 8, 'min_samples_split': 8}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-02 07:28:49,456] Trial 5 finished with value: 0.8266666666666667 and parameters: {'max_depth': 14, 'min_samples_split': 5}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-02 07:28:49,521] Trial 6 finished with value: 0.8233333333333334 and parameters: {'max_depth': 18, 'min_samples_split': 5}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-02 07:28:49,594] Trial 7 finished with value: 0.82 and parameters: {'max_depth': 17, 'min_samples_split': 9}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-02 07:28:49,666] Trial 8 finished with value: 0.8166666666666667 and parameters: {'max_depth': 16, 'min_

Optimizing Random Forest...


[I 2025-01-02 07:28:51,590] Trial 0 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 114, 'max_depth': 7, 'min_samples_split': 10}. Best is trial 0 with value: 0.8766666666666667.
[I 2025-01-02 07:28:52,331] Trial 1 finished with value: 0.86 and parameters: {'n_estimators': 73, 'max_depth': 15, 'min_samples_split': 6}. Best is trial 0 with value: 0.8766666666666667.
[I 2025-01-02 07:28:54,080] Trial 2 finished with value: 0.86 and parameters: {'n_estimators': 247, 'max_depth': 5, 'min_samples_split': 9}. Best is trial 0 with value: 0.8766666666666667.
[I 2025-01-02 07:28:55,971] Trial 3 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 149, 'max_depth': 11, 'min_samples_split': 9}. Best is trial 0 with value: 0.8766666666666667.
[I 2025-01-02 07:28:58,829] Trial 4 finished with value: 0.85 and parameters: {'n_estimators': 436, 'max_depth': 3, 'min_samples_split': 7}. Best is trial 0 with value: 0.8766666666666667.
[I 2025-01-02 07:29:01,98

Optimizing Logistic Regression...


[I 2025-01-02 07:30:25,549] Trial 5 finished with value: 0.78 and parameters: {'C': 4.103964805601349, 'solver': 'liblinear'}. Best is trial 1 with value: 0.7833333333333333.
[I 2025-01-02 07:30:25,598] Trial 6 finished with value: 0.7866666666666666 and parameters: {'C': 9.980258327932841, 'solver': 'liblinear'}. Best is trial 6 with value: 0.7866666666666666.
[I 2025-01-02 07:30:25,643] Trial 7 finished with value: 0.78 and parameters: {'C': 3.7409754871371304, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.7866666666666666.
[I 2025-01-02 07:30:25,682] Trial 8 finished with value: 0.78 and parameters: {'C': 4.679191557757792, 'solver': 'liblinear'}. Best is trial 6 with value: 0.7866666666666666.
[I 2025-01-02 07:30:25,718] Trial 9 finished with value: 0.7833333333333333 and parameters: {'C': 2.8260804816357115, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.7866666666666666.
[I 2025-01-02 07:30:25,779] Trial 10 finished with value: 0.7833333333333333 and parameters: {'C': 9.83

Optimizing k-NN...


[I 2025-01-02 07:30:27,044] Trial 4 finished with value: 0.83 and parameters: {'n_neighbors': 18}. Best is trial 2 with value: 0.8666666666666667.
[I 2025-01-02 07:30:27,073] Trial 5 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 10}. Best is trial 2 with value: 0.8666666666666667.
[I 2025-01-02 07:30:27,104] Trial 6 finished with value: 0.83 and parameters: {'n_neighbors': 18}. Best is trial 2 with value: 0.8666666666666667.
[I 2025-01-02 07:30:27,132] Trial 7 finished with value: 0.8333333333333334 and parameters: {'n_neighbors': 15}. Best is trial 2 with value: 0.8666666666666667.
[I 2025-01-02 07:30:27,163] Trial 8 finished with value: 0.8633333333333333 and parameters: {'n_neighbors': 6}. Best is trial 2 with value: 0.8666666666666667.
[I 2025-01-02 07:30:27,195] Trial 9 finished with value: 0.8333333333333334 and parameters: {'n_neighbors': 15}. Best is trial 2 with value: 0.8666666666666667.
[I 2025-01-02 07:30:27,232] Trial 10 finished with value: 0.866

Optimizing Naive Bayes...


[I 2025-01-02 07:30:28,154] Trial 19 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,163] Trial 20 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,172] Trial 21 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,182] Trial 22 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,195] Trial 23 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,205] Trial 24 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,215] Trial 25 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,225] Trial 26 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:30:28,235] Trial 27 finished with value: 0.71 a

Optimizing Gradient Boosting...


[I 2025-01-02 07:30:36,522] Trial 0 finished with value: 0.86 and parameters: {'n_estimators': 129, 'learning_rate': 0.03927604157328333, 'max_depth': 11}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:30:49,446] Trial 1 finished with value: 0.85 and parameters: {'n_estimators': 250, 'learning_rate': 0.08968245594143809, 'max_depth': 18}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:30:53,311] Trial 2 finished with value: 0.84 and parameters: {'n_estimators': 255, 'learning_rate': 0.3201183811516586, 'max_depth': 13}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:30:58,258] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 330, 'learning_rate': 0.25075586094916275, 'max_depth': 19}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:31:02,692] Trial 4 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 492, 'learning_rate': 0.28695709796409863, 'max_depth': 20}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:31:12,

Optimizing XGBoost...


[I 2025-01-02 07:33:47,933] Trial 0 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 450, 'max_depth': 18, 'learning_rate': 0.18703797911897965}. Best is trial 0 with value: 0.8933333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:33:50,032] Trial 1 finished with value: 0.88 and parameters: {'n_estimators': 172, 'max_depth': 17, 'learning_rate': 0.47131814114636944}. Best is trial 0 with value: 0.8933333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:33:52,041] Trial 2 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 172, 'max_depth': 16, 'learning_rate': 0.40231286798215865}. Best is trial 0 with value: 0.8933333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:33:52,750] Trial 3 finished with value: 0.88 and parameters: {'n_estimators': 419, 'max_depth': 10, 'learning_rate': 0.4941450962740417}. Best is trial 0 with value: 0.8933333333333333.
Para

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001024 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:21,503] Trial 0 finished with value: 0.87 and parameters: {'n_estimators': 89, 'max_depth': 13, 'learning_rate': 0.4143011540480338}. Best is trial 0 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:23,066] Trial 1 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 451, 'max_depth': 10, 'learning_rate': 0.058894476758534584}. Best is trial 1 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000608 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:23,908] Trial 2 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 301, 'max_depth': 20, 'learning_rate': 0.1138988375797593}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:24,607] Trial 3 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 391, 'max_depth': 5, 'learning_rate': 0.1606674025732877}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000598 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:25,144] Trial 4 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 430, 'max_depth': 4, 'learning_rate': 0.39077537479610386}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001231 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:25,476] Trial 5 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 93, 'max_depth': 10, 'learning_rate': 0.03235204523695605}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000592 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:25,873] Trial 6 finished with value: 0.86 and parameters: {'n_estimators': 186, 'max_depth': 16, 'learning_rate': 0.3449205294500998}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001462 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:26,524] Trial 7 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 472, 'max_depth': 8, 'learning_rate': 0.4556296998527133}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001573 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:27,161] Trial 8 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 333, 'max_depth': 11, 'learning_rate': 0.2740968914331638}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:27,727] Trial 9 finished with value: 0.87 and parameters: {'n_estimators': 467, 'max_depth': 8, 'learning_rate': 0.44878204688920215}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:28,329] Trial 10 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 241, 'max_depth': 19, 'learning_rate': 0.18658137314465958}. Best is trial 2 with value: 0.8866666666666667.




[I 2025-01-02 07:34:28,530] Trial 11 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 52, 'max_depth': 20, 'learning_rate': 0.010973624971408552}. Best is trial 2 with value: 0.8866666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:29,093] Trial 12 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 163, 'max_depth': 16, 'learning_rate': 0.10725844771036801}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000574 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:29,682] Trial 13 finished with value: 0.87 and parameters: {'n_estimators': 182, 'max_depth': 17, 'learning_rate': 0.12828934721866092}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000624 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:30,668] Trial 14 finished with value: 0.87 and parameters: {'n_estimators': 292, 'max_depth': 15, 'learning_rate': 0.11473100128225966}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000647 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:31,505] Trial 15 finished with value: 0.87 and parameters: {'n_estimators': 196, 'max_depth': 18, 'learning_rate': 0.23251549756979745}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:33,740] Trial 16 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 349, 'max_depth': 14, 'learning_rate': 0.09648021572887247}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001861 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:34:34,684] Trial 17 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 250, 'max_depth': 20, 'learning_rate': 0.23849786642901355}. Best is trial 12 with value: 0.8933333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:35,335] Trial 18 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 150, 'max_depth': 17, 'learning_rate': 0.30996247338513006}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:36,044] Trial 19 finished with value: 0.87 and parameters: {'n_estimators': 294, 'max_depth': 13, 'learning_rate': 0.18740521627904677}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:36,508] Trial 20 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 139, 'max_depth': 18, 'learning_rate': 0.08300654895456888}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:36,932] Trial 21 finished with value: 0.88 and parameters: {'n_estimators': 126, 'max_depth': 18, 'learning_rate': 0.07433996517946334}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:37,613] Trial 22 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 234, 'max_depth': 16, 'learning_rate': 0.14581700745950218}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:38,083] Trial 23 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 142, 'max_depth': 20, 'learning_rate': 0.06081632422767957}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000632 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:38,759] Trial 24 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 333, 'max_depth': 18, 'learning_rate': 0.20153996651105793}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:39,529] Trial 25 finished with value: 0.88 and parameters: {'n_estimators': 212, 'max_depth': 15, 'learning_rate': 0.10307975923085735}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:40,216] Trial 26 finished with value: 0.87 and parameters: {'n_estimators': 265, 'max_depth': 19, 'learning_rate': 0.15440367265648916}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000586 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:40,578] Trial 27 finished with value: 0.87 and parameters: {'n_estimators': 103, 'max_depth': 16, 'learning_rate': 0.01946528101905065}. Best is trial 12 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:41,095] Trial 28 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 163, 'max_depth': 13, 'learning_rate': 0.07560228363442593}. Best is trial 12 with value: 0.8933333333333333.
[I 2025-01-02 07:34:41,307] Trial 29 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 58, 'max_depth': 19, 'learning_rate': 0.21066376496293843}. Best is trial 12 with value: 0.8933333333333333.
[I 2025-01-02 07:34:41,309] A new study created in memory with name: no-name-35e7593f-54a3-4580-8cdb-611601e195d0


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000584 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Optimizing AdaBoost...


[I 2025-01-02 07:34:45,873] Trial 0 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 464, 'learning_rate': 0.47050982626737464}. Best is trial 0 with value: 0.8133333333333334.
[I 2025-01-02 07:34:50,894] Trial 1 finished with value: 0.8033333333333333 and parameters: {'n_estimators': 258, 'learning_rate': 0.825162029000773}. Best is trial 0 with value: 0.8133333333333334.
[I 2025-01-02 07:34:53,989] Trial 2 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 322, 'learning_rate': 0.5461651939919313}. Best is trial 2 with value: 0.8166666666666667.
[I 2025-01-02 07:34:56,361] Trial 3 finished with value: 0.7933333333333333 and parameters: {'n_estimators': 248, 'learning_rate': 0.06234539887731644}. Best is trial 2 with value: 0.8166666666666667.
[I 2025-01-02 07:34:57,330] Trial 4 finished with value: 0.82 and parameters: {'n_estimators': 101, 'learning_rate': 0.46263319712864337}. Best is trial 4 with value: 0.82.
[I 2025-01-02 07:35:02,516

Optimizing Neural Network...


[I 2025-01-02 07:36:04,135] Trial 0 finished with value: 0.85 and parameters: {'hidden_layer_1': 11, 'hidden_layer_2': 92, 'learning_rate_init': 0.026985914487780146}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:36:05,192] Trial 1 finished with value: 0.8933333333333333 and parameters: {'hidden_layer_1': 52, 'hidden_layer_2': 54, 'learning_rate_init': 0.0679636994229523}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-02 07:36:05,655] Trial 2 finished with value: 0.8333333333333334 and parameters: {'hidden_layer_1': 31, 'hidden_layer_2': 80, 'learning_rate_init': 0.08338125907000161}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-02 07:36:06,223] Trial 3 finished with value: 0.8566666666666667 and parameters: {'hidden_layer_1': 65, 'hidden_layer_2': 17, 'learning_rate_init': 0.02985051530618334}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-02 07:36:06,998] Trial 4 finished with value: 0.87 and parameters: {'hidden_layer_1': 68, 'hidden_lay

Optimizing MLP...


[I 2025-01-02 07:36:36,107] Trial 0 finished with value: 0.5 and parameters: {'layer_1': 111, 'layer_2': 52, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.01854184739021776}. Best is trial 0 with value: 0.5.
[I 2025-01-02 07:36:38,505] Trial 1 finished with value: 0.7866666666666666 and parameters: {'layer_1': 120, 'layer_2': 141, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.004300557005871995}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-02 07:36:41,197] Trial 2 finished with value: 0.8033333333333333 and parameters: {'layer_1': 108, 'layer_2': 75, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.0058784043651973}. Best is trial 2 with value: 0.8033333333333333.
[I 2025-01-02 07:36:41,663] Trial 3 finished with value: 0.69 and parameters: {'layer_1': 62, 'layer_2': 55, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.07044000346280245}. Best is trial 2 with value: 0.8033333333333333.
[I 2025-01-02 0

                  Model  Accuracy  \
0                   SVM  0.906667   
1         Decision Tree  0.846667   
2         Random Forest  0.880000   
3   Logistic Regression  0.786667   
4                  k-NN  0.866667   
5           Naive Bayes  0.710000   
6     Gradient Boosting  0.896667   
7               XGBoost  0.900000   
8              LightGBM  0.893333   
9              AdaBoost  0.833333   
10       Neural Network  0.893333   
11                  MLP  0.893333   

                                          Best Params  
0          {'C': 4.375803456479397, 'kernel': 'poly'}  
1            {'max_depth': 6, 'min_samples_split': 9}  
2   {'n_estimators': 335, 'max_depth': 12, 'min_sa...  
3     {'C': 9.980258327932841, 'solver': 'liblinear'}  
4                                  {'n_neighbors': 3}  
5                                                  {}  
6   {'n_estimators': 111, 'learning_rate': 0.49963...  
7   {'n_estimators': 50, 'max_depth': 15, 'learnin...  
8   {'n_estima

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score, roc_auc_score # Import confusion_matrix and other metrics

# Function to calculate metrics with model name
def calculate_metrics(y_true, y_pred, model_name=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "MCC": mcc,
        "Kappa": kappa,
        "AUC": auc,
    }

# Results storage
results = []

# Optimization function
def optimize_model_with_metrics(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        trial.set_user_attr("preds", preds)
        metrics = calculate_metrics(y_val, preds, model_name=model_name)
        return metrics["Accuracy"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the best trial metrics
    best_trial_preds = study.best_trial.user_attrs["preds"]
    best_metrics = calculate_metrics(y_val, best_trial_preds, model_name=model_name)
    best_metrics["Best Params"] = study.best_params
    results.append(best_metrics)

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model_with_metrics(model_name, model_func)

# Store the final results with selected metrics and best parameters
final_results = []
for result in results:
    final_results.append(
        {
            "Model": result["Model"],
            "Accuracy": result["Accuracy"],
            "Sensitivity": result["Sensitivity"],
            "Specificity": result["Specificity"],
            "MCC": result["MCC"],
            "Kappa": result["Kappa"],
            "AUC": result["AUC"],
            "Best Params": result["Best Params"],
        }
    )

# Convert results to a DataFrame and display it
final_results_df = pd.DataFrame(final_results)
print(final_results_df)


[I 2025-01-02 07:38:09,311] A new study created in memory with name: no-name-a807480a-4a2f-4596-966d-3e6dc8da8fdb
[I 2025-01-02 07:38:09,424] Trial 0 finished with value: 0.9033333333333333 and parameters: {'C': 8.774741793871659, 'kernel': 'poly'}. Best is trial 0 with value: 0.9033333333333333.


Optimizing SVM...


[I 2025-01-02 07:38:09,523] Trial 1 finished with value: 0.79 and parameters: {'C': 8.550193649346493, 'kernel': 'linear'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-02 07:38:09,617] Trial 2 finished with value: 0.79 and parameters: {'C': 9.619891985244493, 'kernel': 'linear'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-02 07:38:09,713] Trial 3 finished with value: 0.8933333333333333 and parameters: {'C': 3.042375164940768, 'kernel': 'poly'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-02 07:38:09,821] Trial 4 finished with value: 0.6733333333333333 and parameters: {'C': 7.993237936410627, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-02 07:38:09,920] Trial 5 finished with value: 0.7933333333333333 and parameters: {'C': 9.00898505401366, 'kernel': 'linear'}. Best is trial 0 with value: 0.9033333333333333.
[I 2025-01-02 07:38:10,057] Trial 6 finished with value: 0.9066666666666666 and parameters: {'C': 9.

Optimizing Decision Tree...


[I 2025-01-02 07:38:12,149] Trial 2 finished with value: 0.8333333333333334 and parameters: {'max_depth': 10, 'min_samples_split': 10}. Best is trial 0 with value: 0.84.
[I 2025-01-02 07:38:12,217] Trial 3 finished with value: 0.8233333333333334 and parameters: {'max_depth': 17, 'min_samples_split': 8}. Best is trial 0 with value: 0.84.
[I 2025-01-02 07:38:12,284] Trial 4 finished with value: 0.8333333333333334 and parameters: {'max_depth': 13, 'min_samples_split': 7}. Best is trial 0 with value: 0.84.
[I 2025-01-02 07:38:12,317] Trial 5 finished with value: 0.7266666666666667 and parameters: {'max_depth': 3, 'min_samples_split': 2}. Best is trial 0 with value: 0.84.
[I 2025-01-02 07:38:12,381] Trial 6 finished with value: 0.83 and parameters: {'max_depth': 11, 'min_samples_split': 5}. Best is trial 0 with value: 0.84.
[I 2025-01-02 07:38:12,445] Trial 7 finished with value: 0.83 and parameters: {'max_depth': 10, 'min_samples_split': 9}. Best is trial 0 with value: 0.84.
[I 2025-01-02 

Optimizing Random Forest...


[I 2025-01-02 07:38:16,442] Trial 0 finished with value: 0.87 and parameters: {'n_estimators': 342, 'max_depth': 18, 'min_samples_split': 4}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:38:17,752] Trial 1 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 360, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:38:18,534] Trial 2 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 112, 'max_depth': 12, 'min_samples_split': 4}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:38:19,973] Trial 3 finished with value: 0.87 and parameters: {'n_estimators': 205, 'max_depth': 14, 'min_samples_split': 8}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:38:22,531] Trial 4 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 277, 'max_depth': 20, 'min_samples_split': 4}. Best is trial 4 with value: 0.8733333333333333.
[I 2025-01-02 07:38:26,083] Trial 5 finished with value: 0.866666

Optimizing Logistic Regression...


[I 2025-01-02 07:39:09,345] Trial 6 finished with value: 0.78 and parameters: {'C': 4.543564955715679, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.79.
[I 2025-01-02 07:39:09,375] Trial 7 finished with value: 0.7833333333333333 and parameters: {'C': 8.001399545488628, 'solver': 'liblinear'}. Best is trial 4 with value: 0.79.
[I 2025-01-02 07:39:09,411] Trial 8 finished with value: 0.78 and parameters: {'C': 5.07568388860714, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.79.
[I 2025-01-02 07:39:09,440] Trial 9 finished with value: 0.7866666666666666 and parameters: {'C': 8.331257753355414, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.79.
[I 2025-01-02 07:39:09,479] Trial 10 finished with value: 0.7833333333333333 and parameters: {'C': 0.3696089660828745, 'solver': 'liblinear'}. Best is trial 4 with value: 0.79.
[I 2025-01-02 07:39:09,525] Trial 11 finished with value: 0.7833333333333333 and parameters: {'C': 9.857108858707177, 'solver': 'liblinear'}. Best is trial 4 with v

Optimizing k-NN...


[I 2025-01-02 07:39:10,420] Trial 4 finished with value: 0.82 and parameters: {'n_neighbors': 17}. Best is trial 2 with value: 0.86.
[I 2025-01-02 07:39:10,460] Trial 5 finished with value: 0.8233333333333334 and parameters: {'n_neighbors': 19}. Best is trial 2 with value: 0.86.
[I 2025-01-02 07:39:10,498] Trial 6 finished with value: 0.84 and parameters: {'n_neighbors': 14}. Best is trial 2 with value: 0.86.
[I 2025-01-02 07:39:10,538] Trial 7 finished with value: 0.84 and parameters: {'n_neighbors': 14}. Best is trial 2 with value: 0.86.
[I 2025-01-02 07:39:10,583] Trial 8 finished with value: 0.85 and parameters: {'n_neighbors': 11}. Best is trial 2 with value: 0.86.
[I 2025-01-02 07:39:10,626] Trial 9 finished with value: 0.84 and parameters: {'n_neighbors': 14}. Best is trial 2 with value: 0.86.
[I 2025-01-02 07:39:10,678] Trial 10 finished with value: 0.8633333333333333 and parameters: {'n_neighbors': 5}. Best is trial 10 with value: 0.8633333333333333.
[I 2025-01-02 07:39:10,719

Optimizing Naive Bayes...


[I 2025-01-02 07:39:11,686] Trial 10 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,708] Trial 11 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,724] Trial 12 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,742] Trial 13 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,758] Trial 14 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,773] Trial 15 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,788] Trial 16 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,804] Trial 17 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:39:11,818] Trial 18 finished with value: 0.71 a

Optimizing Gradient Boosting...


[I 2025-01-02 07:39:19,878] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 401, 'learning_rate': 0.25674990850074825, 'max_depth': 10}. Best is trial 0 with value: 0.88.
[I 2025-01-02 07:39:25,619] Trial 1 finished with value: 0.89 and parameters: {'n_estimators': 240, 'learning_rate': 0.11182780618566959, 'max_depth': 3}. Best is trial 1 with value: 0.89.
[I 2025-01-02 07:39:28,369] Trial 2 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 488, 'learning_rate': 0.4545136325443514, 'max_depth': 17}. Best is trial 1 with value: 0.89.
[I 2025-01-02 07:39:35,910] Trial 3 finished with value: 0.9 and parameters: {'n_estimators': 159, 'learning_rate': 0.1251340389269347, 'max_depth': 7}. Best is trial 3 with value: 0.9.
[I 2025-01-02 07:39:45,433] Trial 4 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 284, 'learning_rate': 0.4017028219819577, 'max_depth': 7}. Best is trial 3 with value: 0.9.
[I 2025-01-02 07:39:58,498] Tri

Optimizing XGBoost...


[I 2025-01-02 07:44:12,298] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 452, 'max_depth': 12, 'learning_rate': 0.17503716559781773}. Best is trial 0 with value: 0.88.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:44:12,783] Trial 1 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 134, 'max_depth': 4, 'learning_rate': 0.08966378388134832}. Best is trial 1 with value: 0.8833333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:44:14,358] Trial 2 finished with value: 0.89 and parameters: {'n_estimators': 382, 'max_depth': 7, 'learning_rate': 0.10167355701458032}. Best is trial 2 with value: 0.89.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:44:17,289] Trial 3 finished with value: 0.88 and parameters: {'n_estimators': 436, 'max_depth': 6, 'learning_rate': 0.4007225135230026}. Best is trial 2 with value: 0.89.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-0

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000620 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:09,397] Trial 0 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 406, 'max_depth': 17, 'learning_rate': 0.45294762078853285}. Best is trial 0 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:10,087] Trial 1 finished with value: 0.88 and parameters: {'n_estimators': 489, 'max_depth': 5, 'learning_rate': 0.27229317169304207}. Best is trial 1 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:10,392] Trial 2 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 87, 'max_depth': 10, 'learning_rate': 0.27270876577023334}. Best is trial 1 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:11,157] Trial 3 finished with value: 0.87 and parameters: {'n_estimators': 330, 'max_depth': 8, 'learning_rate': 0.18205588200261352}. Best is trial 1 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:11,407] Trial 4 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 201, 'max_depth': 3, 'learning_rate': 0.29582341767844805}. Best is trial 1 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:11,985] Trial 5 finished with value: 0.86 and parameters: {'n_estimators': 490, 'max_depth': 14, 'learning_rate': 0.47253634436995334}. Best is trial 1 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001762 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:13,207] Trial 6 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 484, 'max_depth': 13, 'learning_rate': 0.2363901216848384}. Best is trial 1 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:45:14,076] Trial 7 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 223, 'max_depth': 16, 'learning_rate': 0.22615394537300193}. Best is trial 1 with value: 0.88.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:15,581] Trial 8 finished with value: 0.89 and parameters: {'n_estimators': 295, 'max_depth': 20, 'learning_rate': 0.03094224495724663}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003505 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:16,862] Trial 9 finished with value: 0.87 and parameters: {'n_estimators': 414, 'max_depth': 17, 'learning_rate': 0.19401210789159376}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:17,358] Trial 10 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 108, 'max_depth': 20, 'learning_rate': 0.010818854050480983}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:17,966] Trial 11 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 309, 'max_depth': 5, 'learning_rate': 0.031584634788536785}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000596 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:18,900] Trial 12 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 321, 'max_depth': 7, 'learning_rate': 0.011621689161441334}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000606 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:19,720] Trial 13 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 262, 'max_depth': 10, 'learning_rate': 0.09418170708044366}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000633 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:20,677] Trial 14 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 340, 'max_depth': 20, 'learning_rate': 0.10650761176209725}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000613 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:21,065] Trial 15 finished with value: 0.88 and parameters: {'n_estimators': 184, 'max_depth': 5, 'learning_rate': 0.1004220470475721}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000641 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:22,307] Trial 16 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 380, 'max_depth': 12, 'learning_rate': 0.062195127339059575}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:22,644] Trial 17 finished with value: 0.86 and parameters: {'n_estimators': 271, 'max_depth': 3, 'learning_rate': 0.33826829694393046}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000590 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:23,154] Trial 18 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 145, 'max_depth': 15, 'learning_rate': 0.1530764840668518}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000585 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:23,643] Trial 19 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 298, 'max_depth': 10, 'learning_rate': 0.3822520510740162}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000610 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:24,308] Trial 20 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 232, 'max_depth': 7, 'learning_rate': 0.05004949915802212}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:25,024] Trial 21 finished with value: 0.89 and parameters: {'n_estimators': 248, 'max_depth': 7, 'learning_rate': 0.0486459791433875}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000628 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:25,756] Trial 22 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 242, 'max_depth': 9, 'learning_rate': 0.06323520671746798}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001347 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:26,270] Trial 23 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 158, 'max_depth': 7, 'learning_rate': 0.14675513786670202}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000599 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:26,892] Trial 24 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 224, 'max_depth': 7, 'learning_rate': 0.06460079725379206}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:28,041] Trial 25 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 283, 'max_depth': 12, 'learning_rate': 0.14216067407258665}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000716 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:29,957] Trial 26 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 362, 'max_depth': 18, 'learning_rate': 0.050279886643714154}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000669 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:30,218] Trial 27 finished with value: 0.88 and parameters: {'n_estimators': 52, 'max_depth': 6, 'learning_rate': 0.10235479266115385}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000708 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:31,235] Trial 28 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 180, 'max_depth': 11, 'learning_rate': 0.12769478593758513}. Best is trial 8 with value: 0.89.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:45:32,154] Trial 29 finished with value: 0.85 and parameters: {'n_estimators': 421, 'max_depth': 8, 'learning_rate': 0.425361265986393}. Best is trial 8 with value: 0.89.
[I 2025-01-02 07:45:32,173] A new study created in memory with name: no-name-564e1a41-2a2c-495a-b2dd-1ec10b986ebf


Optimizing AdaBoost...


[I 2025-01-02 07:45:36,659] Trial 0 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 456, 'learning_rate': 0.32794729196784583}. Best is trial 0 with value: 0.8166666666666667.
[I 2025-01-02 07:45:37,451] Trial 1 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 82, 'learning_rate': 0.42403347632064364}. Best is trial 0 with value: 0.8166666666666667.
[I 2025-01-02 07:45:41,359] Trial 2 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 404, 'learning_rate': 0.34485816272336395}. Best is trial 2 with value: 0.8266666666666667.
[I 2025-01-02 07:45:46,509] Trial 3 finished with value: 0.8033333333333333 and parameters: {'n_estimators': 413, 'learning_rate': 0.6494762199828652}. Best is trial 2 with value: 0.8266666666666667.
[I 2025-01-02 07:45:50,180] Trial 4 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 356, 'learning_rate': 0.4189049010342941}. Best is trial 2 with value: 0.8266666666666667

Optimizing Neural Network...


[I 2025-01-02 07:47:14,328] Trial 0 finished with value: 0.8333333333333334 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 60, 'learning_rate_init': 0.06028467566749665}. Best is trial 0 with value: 0.8333333333333334.
[I 2025-01-02 07:47:16,036] Trial 1 finished with value: 0.8933333333333333 and parameters: {'hidden_layer_1': 99, 'hidden_layer_2': 72, 'learning_rate_init': 0.01785507223685701}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-02 07:47:16,449] Trial 2 finished with value: 0.8733333333333333 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 24, 'learning_rate_init': 0.049453940814349465}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-02 07:47:17,762] Trial 3 finished with value: 0.83 and parameters: {'hidden_layer_1': 86, 'hidden_layer_2': 84, 'learning_rate_init': 0.06593344126791638}. Best is trial 1 with value: 0.8933333333333333.
[I 2025-01-02 07:47:18,360] Trial 4 finished with value: 0.84 and parameters: {'hidden_layer_1': 

Optimizing MLP...


[I 2025-01-02 07:47:56,958] Trial 0 finished with value: 0.8833333333333333 and parameters: {'layer_1': 148, 'layer_2': 129, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.04951794528228063}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-02 07:47:57,842] Trial 1 finished with value: 0.5 and parameters: {'layer_1': 116, 'layer_2': 108, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.08613417191121342}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-02 07:48:01,311] Trial 2 finished with value: 0.7866666666666666 and parameters: {'layer_1': 94, 'layer_2': 83, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.009694453685212968}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-02 07:48:01,998] Trial 3 finished with value: 0.5 and parameters: {'layer_1': 84, 'layer_2': 134, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.0805462087555347}. Best is trial 0 with value: 0.8833333333333333

                  Model  Accuracy  Sensitivity  Specificity       MCC  \
0                   SVM  0.910000     0.933333     0.886667  0.820894   
1         Decision Tree  0.853333     0.893333     0.813333  0.708939   
2         Random Forest  0.883333     0.913333     0.853333  0.768050   
3   Logistic Regression  0.790000     0.806667     0.773333  0.580322   
4                  k-NN  0.866667     0.920000     0.813333  0.737541   
5           Naive Bayes  0.710000     0.726667     0.693333  0.420234   
6     Gradient Boosting  0.900000     0.933333     0.866667  0.801784   
7               XGBoost  0.903333     0.933333     0.873333  0.808123   
8              LightGBM  0.890000     0.926667     0.853333  0.782106   
9              AdaBoost  0.833333     0.900000     0.766667  0.672673   
10       Neural Network  0.910000     0.933333     0.886667  0.820894   
11                  MLP  0.896667     0.940000     0.853333  0.796330   

       Kappa       AUC                            

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Define models
models = {
    "SVM": lambda trial: SVC(probability=True, C=trial.suggest_float("C", 0.1, 10.0), kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])),
    "Decision Tree": lambda trial: DecisionTreeClassifier(max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Random Forest": lambda trial: RandomForestClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Logistic Regression": lambda trial: LogisticRegression(C=trial.suggest_float("C", 0.1, 10.0), solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])),
    "k-NN": lambda trial: KNeighborsClassifier(n_neighbors=trial.suggest_int("n_neighbors", 3, 20)),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), max_depth=trial.suggest_int("max_depth", 3, 20)),
    "XGBoost": lambda trial: XGBClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": lambda trial: LGBMClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)),
    "AdaBoost": lambda trial: AdaBoostClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)),
    "Neural Network": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("hidden_layer_1", 10, 100), trial.suggest_int("hidden_layer_2", 10, 100)), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200),
    "MLP": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("layer_1", 50, 150), trial.suggest_int("layer_2", 50, 150)), activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]), solver=trial.suggest_categorical("solver", ["adam", "sgd"]), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200, random_state=42)
}

# Prepare a dictionary to store model probabilities horizontally
probabilities = {"Target": y_val}  # Starting with the target column (y_val)

# Run optimization and compute probabilities for each model
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    study = optuna.create_study(direction="maximize")

    # Objective function for Optuna
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        return accuracy_score(y_val, model.predict(X_val))

    study.optimize(objective, n_trials=30)

    # Train the best model using the best hyperparameters
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)

    # Get predicted probabilities for the positive class (class 1)
    probs = best_model.predict_proba(X_val)[:, 1]

    # Add to the probabilities dictionary
    probabilities[model_name] = probs

# Convert the probabilities dictionary to a DataFrame
probability_df = pd.DataFrame(probabilities)

# Save the probability dataset to a CSV file
probability_df.to_csv("N_PAAC_OPTUNA_probability_predictions.csv", index=False)

print("Dataset saved successfully!")


[I 2025-01-02 07:49:29,956] A new study created in memory with name: no-name-0f4a7977-78c9-4bec-8d98-6d273f1d910a
[I 2025-01-02 07:49:30,138] Trial 0 finished with value: 0.7866666666666666 and parameters: {'C': 5.134602479704268, 'kernel': 'linear'}. Best is trial 0 with value: 0.7866666666666666.


Optimizing SVM...


[I 2025-01-02 07:49:30,343] Trial 1 finished with value: 0.7933333333333333 and parameters: {'C': 8.134146402513686, 'kernel': 'linear'}. Best is trial 1 with value: 0.7933333333333333.
[I 2025-01-02 07:49:30,620] Trial 2 finished with value: 0.67 and parameters: {'C': 6.27932211275458, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.7933333333333333.
[I 2025-01-02 07:49:30,846] Trial 3 finished with value: 0.9 and parameters: {'C': 8.439959365455097, 'kernel': 'poly'}. Best is trial 3 with value: 0.9.
[I 2025-01-02 07:49:31,210] Trial 4 finished with value: 0.8966666666666666 and parameters: {'C': 3.193481861115826, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9.
[I 2025-01-02 07:49:31,556] Trial 5 finished with value: 0.8933333333333333 and parameters: {'C': 3.3927967475547454, 'kernel': 'poly'}. Best is trial 3 with value: 0.9.
[I 2025-01-02 07:49:31,921] Trial 6 finished with value: 0.68 and parameters: {'C': 5.030119573642122, 'kernel': 'sigmoid'}. Best is trial 3 with valu

Optimizing Decision Tree...


[I 2025-01-02 07:49:39,364] Trial 3 finished with value: 0.7866666666666666 and parameters: {'max_depth': 5, 'min_samples_split': 4}. Best is trial 2 with value: 0.8433333333333334.
[I 2025-01-02 07:49:39,402] Trial 4 finished with value: 0.7433333333333333 and parameters: {'max_depth': 4, 'min_samples_split': 9}. Best is trial 2 with value: 0.8433333333333334.
[I 2025-01-02 07:49:39,436] Trial 5 finished with value: 0.7433333333333333 and parameters: {'max_depth': 4, 'min_samples_split': 5}. Best is trial 2 with value: 0.8433333333333334.
[I 2025-01-02 07:49:39,490] Trial 6 finished with value: 0.8233333333333334 and parameters: {'max_depth': 9, 'min_samples_split': 9}. Best is trial 2 with value: 0.8433333333333334.
[I 2025-01-02 07:49:39,557] Trial 7 finished with value: 0.8266666666666667 and parameters: {'max_depth': 17, 'min_samples_split': 6}. Best is trial 2 with value: 0.8433333333333334.
[I 2025-01-02 07:49:39,624] Trial 8 finished with value: 0.8366666666666667 and parameter

Optimizing Random Forest...


[I 2025-01-02 07:49:42,059] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 182, 'max_depth': 7, 'min_samples_split': 2}. Best is trial 0 with value: 0.88.
[I 2025-01-02 07:49:43,474] Trial 1 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 306, 'max_depth': 5, 'min_samples_split': 10}. Best is trial 0 with value: 0.88.
[I 2025-01-02 07:49:46,952] Trial 2 finished with value: 0.87 and parameters: {'n_estimators': 460, 'max_depth': 15, 'min_samples_split': 2}. Best is trial 0 with value: 0.88.
[I 2025-01-02 07:49:48,033] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 138, 'max_depth': 7, 'min_samples_split': 6}. Best is trial 0 with value: 0.88.
[I 2025-01-02 07:49:50,882] Trial 4 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 482, 'max_depth': 4, 'min_samples_split': 5}. Best is trial 0 with value: 0.88.
[I 2025-01-02 07:49:51,842] Trial 5 finished with value: 0.8666666666666667 and p

Optimizing Logistic Regression...


[I 2025-01-02 07:50:24,536] Trial 8 finished with value: 0.78 and parameters: {'C': 0.44129025433329083, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7866666666666666.
[I 2025-01-02 07:50:24,558] Trial 9 finished with value: 0.78 and parameters: {'C': 3.4312839529055448, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7866666666666666.
[I 2025-01-02 07:50:24,586] Trial 10 finished with value: 0.7766666666666666 and parameters: {'C': 0.1635109007697364, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7866666666666666.
[I 2025-01-02 07:50:24,616] Trial 11 finished with value: 0.79 and parameters: {'C': 2.4309036111624325, 'solver': 'lbfgs'}. Best is trial 11 with value: 0.79.
[I 2025-01-02 07:50:24,660] Trial 12 finished with value: 0.7866666666666666 and parameters: {'C': 2.2891839844739885, 'solver': 'lbfgs'}. Best is trial 11 with value: 0.79.
[I 2025-01-02 07:50:24,689] Trial 13 finished with value: 0.7866666666666666 and parameters: {'C': 1.8414140560839063, 'so

Optimizing k-NN...


[I 2025-01-02 07:50:25,433] Trial 5 finished with value: 0.86 and parameters: {'n_neighbors': 9}. Best is trial 4 with value: 0.8666666666666667.
[I 2025-01-02 07:50:25,478] Trial 6 finished with value: 0.8633333333333333 and parameters: {'n_neighbors': 6}. Best is trial 4 with value: 0.8666666666666667.
[I 2025-01-02 07:50:25,504] Trial 7 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 12}. Best is trial 4 with value: 0.8666666666666667.
[I 2025-01-02 07:50:25,545] Trial 8 finished with value: 0.84 and parameters: {'n_neighbors': 14}. Best is trial 4 with value: 0.8666666666666667.
[I 2025-01-02 07:50:25,582] Trial 9 finished with value: 0.8133333333333334 and parameters: {'n_neighbors': 20}. Best is trial 4 with value: 0.8666666666666667.
[I 2025-01-02 07:50:25,623] Trial 10 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 3}. Best is trial 4 with value: 0.8666666666666667.
[I 2025-01-02 07:50:25,668] Trial 11 finished with value: 0.8666

Optimizing Naive Bayes...


[I 2025-01-02 07:50:26,507] Trial 20 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,521] Trial 21 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,528] Trial 22 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,537] Trial 23 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,547] Trial 24 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,565] Trial 25 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,577] Trial 26 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,589] Trial 27 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-02 07:50:26,599] Trial 28 finished with value: 0.71 a

Optimizing Gradient Boosting...


[I 2025-01-02 07:50:54,357] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 430, 'learning_rate': 0.03453425383473727, 'max_depth': 18}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-02 07:51:05,545] Trial 1 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 392, 'learning_rate': 0.3316809096438821, 'max_depth': 4}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-02 07:51:09,914] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 375, 'learning_rate': 0.25840808117026076, 'max_depth': 16}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-02 07:51:12,372] Trial 3 finished with value: 0.83 and parameters: {'n_estimators': 292, 'learning_rate': 0.4698268551567289, 'max_depth': 17}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-02 07:51:16,180] Trial 4 finished with value: 0.84 and parameters: {'n_estimators': 413, 'learning_rate': 0.348113848285959, 'max_dept

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:54:07,729] Trial 0 finished with value: 0.87 and parameters: {'n_estimators': 354, 'max_depth': 4, 'learning_rate': 0.19376777122536}. Best is trial 0 with value: 0.87.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:54:09,590] Trial 1 finished with value: 0.89 and parameters: {'n_estimators': 286, 'max_depth': 9, 'learning_rate': 0.068721609692426}. Best is trial 1 with value: 0.89.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:54:10,236] Trial 2 finished with value: 0.88 and parameters: {'n_estimators': 328, 'max_depth': 20, 'learning_rate': 0.4899260021305173}. Best is trial 1 with value: 0.89.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:54:11,154] Trial 3 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 459, 'max_depth': 17, 'learning_rate': 0.32265388521250166}. Best is trial 1 with value: 0.89.
Parameters: { "use_label_encod

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000602 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:08,527] Trial 0 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 280, 'max_depth': 15, 'learning_rate': 0.1297100875917099}. Best is trial 0 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000689 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:08,885] Trial 1 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 236, 'max_depth': 4, 'learning_rate': 0.16723792269626186}. Best is trial 0 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:09,420] Trial 2 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 480, 'max_depth': 3, 'learning_rate': 0.23724302745014192}. Best is trial 0 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000609 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:10,179] Trial 3 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 240, 'max_depth': 15, 'learning_rate': 0.1273580067502732}. Best is trial 3 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:10,392] Trial 4 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 56, 'max_depth': 11, 'learning_rate': 0.1807865494531705}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000582 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:11,238] Trial 5 finished with value: 0.88 and parameters: {'n_estimators': 386, 'max_depth': 12, 'learning_rate': 0.14035788423651846}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:11,865] Trial 6 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 448, 'max_depth': 9, 'learning_rate': 0.286904877073607}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:12,503] Trial 7 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 342, 'max_depth': 17, 'learning_rate': 0.44571734100457616}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000728 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:13,111] Trial 8 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 204, 'max_depth': 11, 'learning_rate': 0.4126968201494993}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:13,471] Trial 9 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 69, 'max_depth': 18, 'learning_rate': 0.15184639249485177}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:13,784] Trial 10 finished with value: 0.86 and parameters: {'n_estimators': 50, 'max_depth': 7, 'learning_rate': 0.014404513092262816}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000675 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:14,148] Trial 11 finished with value: 0.88 and parameters: {'n_estimators': 58, 'max_depth': 20, 'learning_rate': 0.2983145588085947}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000702 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:14,840] Trial 12 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 126, 'max_depth': 20, 'learning_rate': 0.029841779507957675}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:15,535] Trial 13 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 136, 'max_depth': 13, 'learning_rate': 0.01481067378703773}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:16,261] Trial 14 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 141, 'max_depth': 8, 'learning_rate': 0.07373400282105944}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000672 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:17,002] Trial 15 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 139, 'max_depth': 20, 'learning_rate': 0.22273565140544807}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:17,684] Trial 16 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 181, 'max_depth': 10, 'learning_rate': 0.23021011230696362}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000621 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:17,993] Trial 17 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 100, 'max_depth': 14, 'learning_rate': 0.36541234801963}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000583 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:18,336] Trial 18 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 174, 'max_depth': 5, 'learning_rate': 0.20664315664481292}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:18,868] Trial 19 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 313, 'max_depth': 17, 'learning_rate': 0.3345350368398363}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:19,126] Trial 20 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 96, 'max_depth': 6, 'learning_rate': 0.4858096116065381}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000581 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:19,358] Trial 21 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 85, 'max_depth': 6, 'learning_rate': 0.2854174534755565}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:19,685] Trial 22 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 104, 'max_depth': 9, 'learning_rate': 0.46543631901931887}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000607 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:20,008] Trial 23 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 165, 'max_depth': 11, 'learning_rate': 0.4963381422766906}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000616 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:20,355] Trial 24 finished with value: 0.89 and parameters: {'n_estimators': 117, 'max_depth': 7, 'learning_rate': 0.19500586913262774}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000612 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:20,769] Trial 25 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 213, 'max_depth': 5, 'learning_rate': 0.0768507657916838}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:21,089] Trial 26 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 94, 'max_depth': 13, 'learning_rate': 0.33490944856624905}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:21,512] Trial 27 finished with value: 0.87 and parameters: {'n_estimators': 151, 'max_depth': 18, 'learning_rate': 0.2570719721639727}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001616 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:21,845] Trial 28 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 82, 'max_depth': 9, 'learning_rate': 0.3938600356829763}. Best is trial 28 with value: 0.8966666666666666.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000604 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:22,090] Trial 29 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 67, 'max_depth': 10, 'learning_rate': 0.3728075928357554}. Best is trial 28 with value: 0.8966666666666666.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000601 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:55:22,351] A new study created in memory with name: no-name-fb5a22f2-0ea6-4f35-9fa6-d8a5b9a62a4d


Optimizing AdaBoost...


[I 2025-01-02 07:55:24,922] Trial 0 finished with value: 0.82 and parameters: {'n_estimators': 263, 'learning_rate': 0.5960106098482704}. Best is trial 0 with value: 0.82.
[I 2025-01-02 07:55:28,792] Trial 1 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 366, 'learning_rate': 0.39862371348623327}. Best is trial 0 with value: 0.82.
[I 2025-01-02 07:55:30,020] Trial 2 finished with value: 0.7566666666666667 and parameters: {'n_estimators': 92, 'learning_rate': 0.020495305033678177}. Best is trial 0 with value: 0.82.
[I 2025-01-02 07:55:31,240] Trial 3 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 90, 'learning_rate': 0.580800352173079}. Best is trial 3 with value: 0.8233333333333334.
[I 2025-01-02 07:55:35,383] Trial 4 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 398, 'learning_rate': 0.2922539553352511}. Best is trial 4 with value: 0.8266666666666667.
[I 2025-01-02 07:55:37,043] Trial 5 finished with value:

Optimizing Neural Network...


[I 2025-01-02 07:56:39,574] Trial 0 finished with value: 0.8166666666666667 and parameters: {'hidden_layer_1': 55, 'hidden_layer_2': 51, 'learning_rate_init': 0.09242725773089078}. Best is trial 0 with value: 0.8166666666666667.
[I 2025-01-02 07:56:40,869] Trial 1 finished with value: 0.8766666666666667 and parameters: {'hidden_layer_1': 71, 'hidden_layer_2': 13, 'learning_rate_init': 0.015638447751676898}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-02 07:56:41,327] Trial 2 finished with value: 0.8233333333333334 and parameters: {'hidden_layer_1': 20, 'hidden_layer_2': 90, 'learning_rate_init': 0.08109900863383332}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-02 07:56:42,425] Trial 3 finished with value: 0.87 and parameters: {'hidden_layer_1': 71, 'hidden_layer_2': 80, 'learning_rate_init': 0.07200324580376906}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-02 07:56:44,314] Trial 4 finished with value: 0.8733333333333333 and parameters: {'hid

Optimizing MLP...


[I 2025-01-02 07:57:22,100] Trial 0 finished with value: 0.8266666666666667 and parameters: {'layer_1': 123, 'layer_2': 135, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.014070187310378336}. Best is trial 0 with value: 0.8266666666666667.
[I 2025-01-02 07:57:24,355] Trial 1 finished with value: 0.8433333333333334 and parameters: {'layer_1': 88, 'layer_2': 64, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.0328146227502567}. Best is trial 1 with value: 0.8433333333333334.
[I 2025-01-02 07:57:27,145] Trial 2 finished with value: 0.7966666666666666 and parameters: {'layer_1': 68, 'layer_2': 138, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.03506980705740404}. Best is trial 1 with value: 0.8433333333333334.
[I 2025-01-02 07:57:30,868] Trial 3 finished with value: 0.79 and parameters: {'layer_1': 78, 'layer_2': 80, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.01464451019097305}. Best is trial 1 with value: 0.8433333333333

Dataset saved successfully!




Class Feature Vector (CFV)

In [2]:
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]

# Define models with hyperparameter optimization (Optuna)
models = {
    "SVM": lambda trial: SVC(
         probability=True,
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}

# Initialize a list to store the CFV data
cfv_data = []

# Define the optimization and prediction function
def optimize_and_predict(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]  # Get probability for class 1 (positive)
        return accuracy_score(y_val, model.predict(X_val))

    # Perform optimization with Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Return the best model
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)
    preds = best_model.predict_proba(X_val)[:, 1]  # Using the probability for class 1 (positive)

    # Append predictions to CFV list
    return preds

# Train each model and generate predictions for CFV
for model_name, model_func in models.items():
    print(f"Training and predicting with {model_name}...")
    preds = optimize_and_predict(model_name, model_func)
    cfv_data.append(preds)

# Convert the CFV data into a DataFrame
cfv_df = pd.DataFrame(np.array(cfv_data).T, columns=models.keys())

# Optionally, add the true labels column
cfv_df["True_Label"] = y_val.values

# Save the CFV dataset to CSV
cfv_df.to_csv("CFV_PAAC.csv", index=False)
print("CFV dataset created and saved!")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-16 03:40:18,952] A new study created in memory with name: no-name-f876b2ea-6384-46df-8582-22ffca8091d4


Training and predicting with SVM...


[I 2025-01-16 03:40:19,646] Trial 0 finished with value: 0.9066666666666666 and parameters: {'C': 9.839031875532108, 'kernel': 'rbf'}. Best is trial 0 with value: 0.9066666666666666.
[I 2025-01-16 03:40:20,123] Trial 1 finished with value: 0.67 and parameters: {'C': 6.220918675800033, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.9066666666666666.
[I 2025-01-16 03:40:20,537] Trial 2 finished with value: 0.66 and parameters: {'C': 8.554369301232157, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.9066666666666666.
[I 2025-01-16 03:40:21,047] Trial 3 finished with value: 0.67 and parameters: {'C': 9.842970320742072, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.9066666666666666.
[I 2025-01-16 03:40:21,661] Trial 4 finished with value: 0.7866666666666666 and parameters: {'C': 3.97475539590276, 'kernel': 'linear'}. Best is trial 0 with value: 0.9066666666666666.
[I 2025-01-16 03:40:22,556] Trial 5 finished with value: 0.8966666666666666 and parameters: {'C': 3.1236420002572

Training and predicting with Decision Tree...


[I 2025-01-16 03:40:31,181] Trial 3 finished with value: 0.85 and parameters: {'max_depth': 7, 'min_samples_split': 5}. Best is trial 3 with value: 0.85.
[I 2025-01-16 03:40:31,248] Trial 4 finished with value: 0.8166666666666667 and parameters: {'max_depth': 20, 'min_samples_split': 3}. Best is trial 3 with value: 0.85.
[I 2025-01-16 03:40:31,313] Trial 5 finished with value: 0.8233333333333334 and parameters: {'max_depth': 17, 'min_samples_split': 10}. Best is trial 3 with value: 0.85.
[I 2025-01-16 03:40:31,376] Trial 6 finished with value: 0.83 and parameters: {'max_depth': 16, 'min_samples_split': 9}. Best is trial 3 with value: 0.85.
[I 2025-01-16 03:40:31,410] Trial 7 finished with value: 0.7433333333333333 and parameters: {'max_depth': 4, 'min_samples_split': 9}. Best is trial 3 with value: 0.85.
[I 2025-01-16 03:40:31,467] Trial 8 finished with value: 0.8233333333333334 and parameters: {'max_depth': 9, 'min_samples_split': 2}. Best is trial 3 with value: 0.85.
[I 2025-01-16 03

Training and predicting with Random Forest...


[I 2025-01-16 03:40:33,906] Trial 0 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 295, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 0 with value: 0.8433333333333334.
[I 2025-01-16 03:40:36,286] Trial 1 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 455, 'max_depth': 6, 'min_samples_split': 8}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-16 03:40:38,679] Trial 2 finished with value: 0.86 and parameters: {'n_estimators': 299, 'max_depth': 9, 'min_samples_split': 4}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-16 03:40:42,446] Trial 3 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 419, 'max_depth': 13, 'min_samples_split': 7}. Best is trial 1 with value: 0.8766666666666667.
[I 2025-01-16 03:40:42,796] Trial 4 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 51, 'max_depth': 12, 'min_samples_split': 8}. Best is trial 1 with value: 0.8766666666666667

Training and predicting with Logistic Regression...


[I 2025-01-16 03:41:47,054] Trial 7 finished with value: 0.78 and parameters: {'C': 2.987739806074568, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:41:47,078] Trial 8 finished with value: 0.7833333333333333 and parameters: {'C': 6.442369420579783, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:41:47,100] Trial 9 finished with value: 0.7833333333333333 and parameters: {'C': 9.246418832353806, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:41:47,136] Trial 10 finished with value: 0.7833333333333333 and parameters: {'C': 9.788799010578703, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:41:47,165] Trial 11 finished with value: 0.7833333333333333 and parameters: {'C': 8.072610597839297, 'solver': 'liblinear'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:41:47,194] Trial 12 finished with value: 0.78 and paramete

Training and predicting with k-NN...


[I 2025-01-16 03:41:48,008] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_neighbors': 5}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-16 03:41:48,046] Trial 4 finished with value: 0.82 and parameters: {'n_neighbors': 17}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-16 03:41:48,077] Trial 5 finished with value: 0.8633333333333333 and parameters: {'n_neighbors': 5}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-16 03:41:48,112] Trial 6 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 10}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-16 03:41:48,146] Trial 7 finished with value: 0.83 and parameters: {'n_neighbors': 18}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-16 03:41:48,180] Trial 8 finished with value: 0.84 and parameters: {'n_neighbors': 14}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-16 03:41:48,212] Trial 9 finished with value: 0.86 and parameters: 

Training and predicting with Naive Bayes...


[I 2025-01-16 03:41:49,357] Trial 16 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,368] Trial 17 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,378] Trial 18 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,388] Trial 19 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,399] Trial 20 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,409] Trial 21 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,426] Trial 22 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,440] Trial 23 finished with value: 0.71 and parameters: {}. Best is trial 0 with value: 0.71.
[I 2025-01-16 03:41:49,449] Trial 24 finished with value: 0.71 a

Training and predicting with Gradient Boosting...


[I 2025-01-16 03:41:52,595] Trial 0 finished with value: 0.83 and parameters: {'n_estimators': 169, 'learning_rate': 0.4586433694082628, 'max_depth': 17}. Best is trial 0 with value: 0.83.
[I 2025-01-16 03:41:58,187] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 375, 'learning_rate': 0.21362037994768202, 'max_depth': 16}. Best is trial 1 with value: 0.8466666666666667.
[I 2025-01-16 03:42:03,617] Trial 2 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 267, 'learning_rate': 0.3065783176519324, 'max_depth': 11}. Best is trial 2 with value: 0.8766666666666667.
[I 2025-01-16 03:42:08,718] Trial 3 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 96, 'learning_rate': 0.2543994588574834, 'max_depth': 7}. Best is trial 3 with value: 0.8866666666666667.
[I 2025-01-16 03:42:16,300] Trial 4 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 276, 'learning_rate': 0.28108475492468304, 'max_dept

Training and predicting with XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:46:55,541] Trial 0 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 107, 'max_depth': 14, 'learning_rate': 0.20076425351385715}. Best is trial 0 with value: 0.8866666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:46:56,300] Trial 1 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 451, 'max_depth': 5, 'learning_rate': 0.377595986820574}. Best is trial 0 with value: 0.8866666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:46:57,092] Trial 2 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 289, 'max_depth': 5, 'learning_rate': 0.22811998302136718}. Best is trial 0 with value: 0.8866666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:46:58,421] Trial 3 finished with value: 0.88 and parameters: {'n_estimators': 431, 'max_depth': 8, 'learning_rate': 0.26053971686

Training and predicting with LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001139 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:01,106] Trial 0 finished with value: 0.87 and parameters: {'n_estimators': 319, 'max_depth': 11, 'learning_rate': 0.33913018440501036}. Best is trial 0 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000618 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:01,793] Trial 1 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 217, 'max_depth': 9, 'learning_rate': 0.09914105441377757}. Best is trial 1 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001266 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:02,438] Trial 2 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 428, 'max_depth': 5, 'learning_rate': 0.37272480696139326}. Best is trial 1 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.004204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:03,639] Trial 3 finished with value: 0.87 and parameters: {'n_estimators': 198, 'max_depth': 12, 'learning_rate': 0.08173608775293148}. Best is trial 1 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001757 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:05,400] Trial 4 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 370, 'max_depth': 7, 'learning_rate': 0.03436732076690853}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001624 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-16 03:48:05,996] Trial 5 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 121, 'max_depth': 11, 'learning_rate': 0.2794129643675613}. Best is trial 4 with value: 0.8933333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001785 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:06,791] Trial 6 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 210, 'max_depth': 19, 'learning_rate': 0.2982384452725771}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:07,336] Trial 7 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 113, 'max_depth': 9, 'learning_rate': 0.3023721784254562}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000691 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:08,195] Trial 8 finished with value: 0.88 and parameters: {'n_estimators': 440, 'max_depth': 10, 'learning_rate': 0.2666276164618259}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:08,870] Trial 9 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 418, 'max_depth': 16, 'learning_rate': 0.35611754379944927}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001516 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:09,323] Trial 10 finished with value: 0.87 and parameters: {'n_estimators': 350, 'max_depth': 3, 'learning_rate': 0.49809520022489157}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001343 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:10,277] Trial 11 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 482, 'max_depth': 6, 'learning_rate': 0.18196813317795515}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000632 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:11,528] Trial 12 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 494, 'max_depth': 6, 'learning_rate': 0.015030376892459354}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:12,688] Trial 13 finished with value: 0.88 and parameters: {'n_estimators': 372, 'max_depth': 6, 'learning_rate': 0.010802142505535524}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001326 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:13,373] Trial 14 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 496, 'max_depth': 3, 'learning_rate': 0.014355809010510083}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:14,125] Trial 15 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 287, 'max_depth': 14, 'learning_rate': 0.15629227496794934}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000600 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:15,154] Trial 16 finished with value: 0.88 and parameters: {'n_estimators': 375, 'max_depth': 7, 'learning_rate': 0.08163508624150698}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000704 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:16,018] Trial 17 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 454, 'max_depth': 7, 'learning_rate': 0.19512753986343168}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001441 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:16,678] Trial 18 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 392, 'max_depth': 4, 'learning_rate': 0.12257952588658683}. Best is trial 4 with value: 0.8933333333333333.
[I 2025-01-16 03:48:16,896] Trial 19 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 60, 'max_depth': 8, 'learning_rate': 0.050031267025588695}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000614 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000603 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:17,563] Trial 20 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 279, 'max_depth': 13, 'learning_rate': 0.1992126332646999}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:18,770] Trial 21 finished with value: 0.87 and parameters: {'n_estimators': 461, 'max_depth': 7, 'learning_rate': 0.22870410207165756}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-16 03:48:20,200] Trial 22 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 498, 'max_depth': 5, 'learning_rate': 0.148175824702479}. Best is trial 4 with value: 0.8933333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000732 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:22,420] Trial 23 finished with value: 0.88 and parameters: {'n_estimators': 450, 'max_depth': 8, 'learning_rate': 0.049299284159677735}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000671 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:23,518] Trial 24 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 402, 'max_depth': 5, 'learning_rate': 0.05094486810921322}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000593 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:24,438] Trial 25 finished with value: 0.88 and parameters: {'n_estimators': 335, 'max_depth': 7, 'learning_rate': 0.12473262497053725}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000577 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:25,067] Trial 26 finished with value: 0.86 and parameters: {'n_estimators': 464, 'max_depth': 9, 'learning_rate': 0.40975381833031416}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000711 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:25,820] Trial 27 finished with value: 0.87 and parameters: {'n_estimators': 412, 'max_depth': 16, 'learning_rate': 0.22291915400433082}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000638 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:26,398] Trial 28 finished with value: 0.87 and parameters: {'n_estimators': 356, 'max_depth': 4, 'learning_rate': 0.04182175821686461}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001384 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:26,961] Trial 29 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 311, 'max_depth': 11, 'learning_rate': 0.4290850409210061}. Best is trial 4 with value: 0.8933333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000591 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:48:28,007] A new study created in memory with name: no-name-eaf9365e-75ee-4119-a93f-37f78e049a08


Training and predicting with AdaBoost...


[I 2025-01-16 03:48:30,139] Trial 0 finished with value: 0.81 and parameters: {'n_estimators': 220, 'learning_rate': 0.6275716819283644}. Best is trial 0 with value: 0.81.
[I 2025-01-16 03:48:32,879] Trial 1 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 281, 'learning_rate': 0.24775804925581457}. Best is trial 1 with value: 0.8166666666666667.
[I 2025-01-16 03:48:37,790] Trial 2 finished with value: 0.81 and parameters: {'n_estimators': 385, 'learning_rate': 0.9429541290344976}. Best is trial 1 with value: 0.8166666666666667.
[I 2025-01-16 03:48:42,476] Trial 3 finished with value: 0.83 and parameters: {'n_estimators': 484, 'learning_rate': 0.6649174439913462}. Best is trial 3 with value: 0.83.
[I 2025-01-16 03:48:44,474] Trial 4 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 206, 'learning_rate': 0.8423528475586113}. Best is trial 3 with value: 0.83.
[I 2025-01-16 03:48:47,653] Trial 5 finished with value: 0.81 and parameters: {'n_e

Training and predicting with Neural Network...


[I 2025-01-16 03:50:11,648] Trial 0 finished with value: 0.88 and parameters: {'hidden_layer_1': 71, 'hidden_layer_2': 97, 'learning_rate_init': 0.03076786811814386}. Best is trial 0 with value: 0.88.
[I 2025-01-16 03:50:14,209] Trial 1 finished with value: 0.88 and parameters: {'hidden_layer_1': 97, 'hidden_layer_2': 60, 'learning_rate_init': 0.002460126574964733}. Best is trial 0 with value: 0.88.
[I 2025-01-16 03:50:15,147] Trial 2 finished with value: 0.8633333333333333 and parameters: {'hidden_layer_1': 92, 'hidden_layer_2': 61, 'learning_rate_init': 0.09627868758327336}. Best is trial 0 with value: 0.88.
[I 2025-01-16 03:50:15,749] Trial 3 finished with value: 0.89 and parameters: {'hidden_layer_1': 36, 'hidden_layer_2': 18, 'learning_rate_init': 0.05819915764973432}. Best is trial 3 with value: 0.89.
[I 2025-01-16 03:50:16,187] Trial 4 finished with value: 0.8466666666666667 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 18, 'learning_rate_init': 0.05316492876115966}. 

Training and predicting with MLP...


[I 2025-01-16 03:50:46,025] Trial 0 finished with value: 0.8833333333333333 and parameters: {'layer_1': 120, 'layer_2': 124, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.09552782850456322}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-16 03:50:46,427] Trial 1 finished with value: 0.5 and parameters: {'layer_1': 77, 'layer_2': 66, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.02875729407652898}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-16 03:50:50,483] Trial 2 finished with value: 0.8566666666666667 and parameters: {'layer_1': 56, 'layer_2': 123, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.023915772509062395}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-16 03:50:53,599] Trial 3 finished with value: 0.8933333333333333 and parameters: {'layer_1': 145, 'layer_2': 98, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.04947049148273258}. Best is trial 3 with value:

CFV dataset created and saved!


CPFV (Combined Probability and Class Feature Vector)

In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/8_PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")


# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)], ignore_index=True)
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)], ignore_index=True)

# Separate features and labels
X_train = main_data.drop(columns=["label"])
y_train = main_data["label"]
X_val = validation_data.drop(columns=["label"])
y_val = validation_data["label"]

# Initialize models with their tuned hyperparameters
trained_models = {
    "SVM": SVC(C=1.0, kernel="rbf", probability=True),  # Example parameters
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5),
    "Logistic Regression": LogisticRegression(C=1.0, solver="lbfgs"),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(n_estimators=100, max_depth=10, learning_rate=0.1),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1),
    "Neural Network (MLPClassifier)": MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200),
    "Multilayer Perceptron (Custom MLP)": MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200)
}

# Train all models on the training dataset
for model_name, model in trained_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

# Function to create CPFV dataset
def create_cpfv(models, X_data, y_data):
    cpfv_data = pd.DataFrame()
    for model_name, model in models.items():
        # Add predicted class labels
        cpfv_data[f"{model_name}_Class"] = model.predict(X_data)
        # Add predicted probabilities or decision scores
        if hasattr(model, "predict_proba"):
            cpfv_data[f"{model_name}_Prob"] = model.predict_proba(X_data)[:, 1]
        elif hasattr(model, "decision_function"):
            cpfv_data[f"{model_name}_Prob"] = model.decision_function(X_data)
        else:
            cpfv_data[f"{model_name}_Prob"] = cpfv_data[f"{model_name}_Class"]
    # Add true labels
    cpfv_data["True_Label"] = y_data.reset_index(drop=True)
    return cpfv_data

# Create CPFV dataset using validation data
cpfv_dataset = create_cpfv(trained_models, X_val, y_val)

# Save CPFV dataset to CSV
cpfv_dataset.to_csv("CPFV_PAAC.csv", index=False)

Training SVM...
Training Decision Tree...
Training Random Forest...
Training Logistic Regression...
Training k-NN...
Training Naive Bayes...
Training Gradient Boosting...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training AdaBoost...
Training Neural Network (MLPClassifier)...
Training Multilayer Perceptron (Custom MLP)...


# **Hyperperameter tune for RandomizedSearchCV**

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier # Import path for KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for RandomizedSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply random search
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing RandomizedSearchCV for SVM...
Performing RandomizedSearchCV for Decision Tree...
Performing RandomizedSearchCV for Random Forest...


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklear

Performing RandomizedSearchCV for Logistic Regression...




Performing RandomizedSearchCV for k-NN...
Performing RandomizedSearchCV for Naive Bayes...
Performing RandomizedSearchCV for Gradient Boosting...
Performing RandomizedSearchCV for XGBoost...
Performing RandomizedSearchCV for LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001279 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing RandomizedSearchCV for CatBoost...
Performing RandomizedSearchCV for AdaBoost...




Performing RandomizedSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.895194   
1         Decision Tree    0.821289   
2         Random Forest    0.881427   
3   Logistic Regression    0.774893   
4                  k-NN    0.915821   
5           Naive Bayes    0.741379   
6     Gradient Boosting    0.894332   
7               XGBoost    0.880579   
8              LightGBM    0.884868   
9              CatBoost    0.889156   
10             AdaBoost    0.803241   
11       Neural Network    0.821296   

                                      Best Parameters  
0        {'kernel': 'rbf', 'gamma': 'scale', 'C': 10}  
1   {'min_samples_split': 2, 'min_samples_leaf': 2...  
2   {'n_estimators': 500, 'min_samples_split': 5, ...  
3   {'solver': 'liblinear', 'penalty': 'l2', 'C': 10}  
4   {'weights': 'distance', 'n_neighbors': 3, 'met...  
5                              {'var_smoothing': 0.1}  
6   {'n_estimators': 200, 'max_depth': 7, 'learnin...  
7   {'n_estimators': 100, 'max_depth': 5, 'l

In [None]:
# Storage for predictions and target column
probability_datasets = pd.DataFrame(y_train, columns=['Target'])

# Loop through models, perform random search, and save probabilities
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model,
                                       param_grid,
                                       n_iter=10,
                                       cv=cv,
                                       scoring='accuracy',
                                       n_jobs=-1,
                                       random_state=42)

    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)



    # Get probability predictions (if supported)
    if hasattr(random_search.best_estimator_, "predict_proba"):
        probabilities = random_search.best_estimator_.predict_proba(X_train)[:, 1]  # Probability for the positive class
        probability_datasets[f"{model_name}_Probabilities"] = probabilities
    else:
        # Fallback if probability prediction isn't supported
        predictions = random_search.best_estimator_.predict(X_train)
        probability_datasets[f"{model_name}_Predictions"] = predictions



# Display final dataset with probabilities
print(probability_datasets.head())

# Save the probability dataset to a CSV file
probability_datasets.to_csv("PAAC_Randomsearch_tune.csv", index=False)
print("Probability dataset saved to 'PAAC_Randomsearch_tune.csv'.")


# **Hyperparameter grids for GridSearchCV**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/PAAC (Pseudo Amino Acid Composition)/paac_validation_negative.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for GridSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing GridSearchCV for SVM...
Performing GridSearchCV for Decision Tree...
Performing GridSearchCV for Random Forest...


135 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

Performing GridSearchCV for Logistic Regression...
Performing GridSearchCV for k-NN...


  _data = np.array(data, dtype=dtype, copy=copy,


Performing GridSearchCV for Naive Bayes...
Performing GridSearchCV for Gradient Boosting...
Performing GridSearchCV for XGBoost...
Performing GridSearchCV for LightGBM...


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8616
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 50
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing GridSearchCV for CatBoost...
Performing GridSearchCV for AdaBoost...




Performing GridSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.899497   
1         Decision Tree    0.825596   
2         Random Forest    0.888297   
3   Logistic Regression    0.774893   
4                  k-NN    0.915821   
5           Naive Bayes    0.741379   
6     Gradient Boosting    0.894321   
7               XGBoost    0.882289   
8              LightGBM    0.885726   
9              CatBoost    0.900326   
10             AdaBoost    0.803241   
11       Neural Network    0.847073   

                                      Best Parameters  
0       {'C': 100, 'gamma': 'scale', 'kernel': 'rbf'}  
1   {'max_depth': 10, 'min_samples_leaf': 2, 'min_...  
2   {'max_depth': 20, 'max_features': 'sqrt', 'min...  
3   {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}  
4   {'metric': 'manhattan', 'n_neighbors': 3, 'wei...  
5                              {'var_smoothing': 0.1}  
6   {'learning_rate': 0.2, 'max_depth': 7, 'n_esti...  
7   {'learning_rate': 0.1, 'max_depth': 7, '

In [None]:
# Prepare the final dataset with probabilities and target
all_probabilities = []
all_targets = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

    # Predict probabilities using the best estimator
    best_model = grid_search.best_estimator_
    probabilities = best_model.predict_proba(X_train)[:, 1]  # Positive class probabilities

    # Append probabilities and targets for this model
    all_probabilities.append(probabilities)
    all_targets.append(y_train)

    # Combine probabilities, features, and target into a DataFrame
    model_data = X_train.copy()
    model_data[f"{model_name}_probability"] = probabilities
    model_data['target'] = y_train

    # Save to CSV
    output_path = f"/content/{model_name}_probabilities.csv"
    model_data.to_csv(output_path, index=False)
    print(f"Saved probabilities for {model_name} to {output_path}")

# Combine all model probabilities into a single DataFrame (optional)
final_dataset = pd.DataFrame({'target': y_train})
for idx, model_name in enumerate(models.keys()):
    final_dataset[f"{model_name}_probability"] = all_probabilities[idx]

# Save the combined dataset
final_output_path = "/content/combined_probabilities.csv"
final_dataset.to_csv(final_output_path, index=False)
print(f"Saved combined dataset to {final_output_path}")
