In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m3.2/3.2 MB[0m [31m169.9 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m77.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


## GDC

Grouping the Amino Acids
Below are some common groupings based on physicochemical properties:

Hydrophobic (H): A, F, G, I, L, M, P, V, W
Hydrophilic (P): D, E, K, N, Q, R
Neutral (N): C, H, S, T, Y

In [None]:
import pandas as pd
from Bio import SeqIO

# Define amino acid groups (e.g., based on hydrophobicity or charge)
amino_acid_groups = {
    'H': ['A', 'F', 'G', 'I', 'L', 'M', 'P', 'V', 'W'],  # Hydrophobic
    'P': ['D', 'E', 'K', 'N', 'Q', 'R'],                # Hydrophilic
    'N': ['C', 'H', 'S', 'T', 'Y']                      # Neutral
}

# Create all possible group dipeptides from the groups defined above
groups = list(amino_acid_groups.keys())
group_dipeptides = [g1 + g2 for g1 in groups for g2 in groups]

# Function to map an amino acid to its group
def map_to_group(aa):
    for group, aa_list in amino_acid_groups.items():
        if aa in aa_list:
            return group
    return None

# Function to compute GDC features
def compute_gdc_features(sequence):
    sequence = sequence.upper()
    seq_len = len(sequence)

    # Initialize the GDC vector for each group dipeptide
    gdc_vector = {dipeptide: 0 for dipeptide in group_dipeptides}

    # Convert each amino acid in the sequence to its group and calculate dipeptide composition
    for i in range(seq_len - 1):
        aa1, aa2 = sequence[i], sequence[i+1]
        group1, group2 = map_to_group(aa1), map_to_group(aa2)

        if group1 and group2:
            dipeptide = group1 + group2
            gdc_vector[dipeptide] += 1

    # Normalize the counts by the total number of dipeptides in the sequence
    total_dipeptides = seq_len - 1
    for dipeptide in gdc_vector:
        gdc_vector[dipeptide] /= total_dipeptides

    return list(gdc_vector.values())

# Function to process a FASTA file and extract GDC features, saving them to a CSV
def process_gdc(fasta_file, output_csv):
    sequences = []

    # Read the sequences from the FASTA file
    for record in SeqIO.parse(fasta_file, "fasta"):
        sequences.append(str(record.seq))

    # Compute GDC features for each sequence
    gdc_features = [compute_gdc_features(seq) for seq in sequences]

    # Convert GDC features to DataFrame
    gdc_df = pd.DataFrame(gdc_features, columns=group_dipeptides)

    # Save to CSV
    gdc_df.to_csv(output_csv, index=False)
    print(f"GDC features saved to {output_csv}")

# Input FASTA file paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/POSITIVE_main (1).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/NEGATIVE_main (1).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/POSITIVE_validation (1).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/NEGATIVE_validation (1).fasta"

# Output CSV file paths
output_main_p = "/content/gdc_main_positive.csv"
output_main_n = "/content/gdc_main_negative.csv"
output_validation_p = "/content/gdc_validation_positive.csv"
output_validation_n = "/content/gdc_validation_negative.csv"

# Process and save GDC features for each dataset
process_gdc(main_p, output_main_p)
process_gdc(main_n, output_main_n)
process_gdc(validation_p, output_validation_p)
process_gdc(validation_n, output_validation_n)


GDC features saved to /content/gdc_main_positive.csv
GDC features saved to /content/gdc_main_negative.csv
GDC features saved to /content/gdc_validation_positive.csv
GDC features saved to /content/gdc_validation_negative.csv


In [None]:
import pandas as pd

# Define function to check columns, length, and null values in the dataset
def check_dataset_info(file_path):
    try:
        # Load the CSV file
        df = pd.read_csv(file_path)

        # Get the number of columns and length (number of rows)
        num_columns = len(df.columns)
        num_rows = len(df)

        # Check for null values
        null_values = df.isnull().sum().sum()  # Count total number of null values

        # Return the information
        return num_columns, num_rows, null_values
    except FileNotFoundError:
        return "File not found: " + file_path

# Paths for input CSV files (update these paths if necessary)
main_p = "/content/gdc_main_positive.csv"
main_n = "/content/gdc_main_negative.csv"
validation_p = "/content/gdc_validation_positive.csv"
validation_n = "/content/gdc_validation_negative.csv"

# Check columns, length, and null values for each file
main_p_info = check_dataset_info(main_p)
main_n_info = check_dataset_info(main_n)
validation_p_info = check_dataset_info(validation_p)
validation_n_info = check_dataset_info(validation_n)

# Print the results
print(f"Main Positive Dataset - Columns: {main_p_info[0]}, Rows: {main_p_info[1]}, Null Values: {main_p_info[2]}")
print(f"Main Negative Dataset - Columns: {main_n_info[0]}, Rows: {main_n_info[1]}, Null Values: {main_n_info[2]}")
print(f"Validation Positive Dataset - Columns: {validation_p_info[0]}, Rows: {validation_p_info[1]}, Null Values: {validation_p_info[2]}")
print(f"Validation Negative Dataset - Columns: {validation_n_info[0]}, Rows: {validation_n_info[1]}, Null Values: {validation_n_info[2]}")


Main Positive Dataset - Columns: 9, Rows: 582, Null Values: 0
Main Negative Dataset - Columns: 9, Rows: 582, Null Values: 0
Validation Positive Dataset - Columns: 9, Rows: 150, Null Values: 0
Validation Negative Dataset - Columns: 9, Rows: 150, Null Values: 0


# **Deep learning approach combining Conv1D, LSTM, and Dense layers**

In [None]:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten, Dropout, BatchNormalization

# Load AAC feature datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Create labels
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values

# Reshape input for Conv1D
X_train = X_train.reshape(X_train.shape[0], X_train.shape[1], 1)
X_val = X_val.reshape(X_val.shape[0], X_val.shape[1], 1)

# Model Architecture
model = Sequential()

# 1. Stacked Conv1D layers with BatchNormalization and Dropout
model.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(X_train.shape[1], 1), padding='same'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# Adjusted Conv1D to handle small input size with padding
model.add(Conv1D(filters=128, kernel_size=3, activation='relu', padding='same'))
model.add(BatchNormalization())
# Removed MaxPooling1D here to prevent reduction of the dimension further
model.add(Dropout(0.3))

model.add(Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'))
model.add(BatchNormalization())
# Removed MaxPooling1D here to prevent further reduction of the dimension
model.add(Dropout(0.3))

# 2. LSTM layer for sequential dependencies
model.add(LSTM(64, return_sequences=False, activation='relu'))

# 3. Dense Layers for final prediction with Dropout for regularization
model.add(Dense(128, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32)


Epoch 1/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 232ms/step - accuracy: 0.5657 - loss: 0.6525 - val_accuracy: 0.5000 - val_loss: 0.6920
Epoch 2/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.6734 - loss: 0.5804 - val_accuracy: 0.5000 - val_loss: 0.6881
Epoch 3/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7306 - loss: 0.5320 - val_accuracy: 0.5400 - val_loss: 0.6849
Epoch 4/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6986 - loss: 0.5453 - val_accuracy: 0.5700 - val_loss: 0.6712
Epoch 5/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7113 - loss: 0.5161 - val_accuracy: 0.5000 - val_loss: 0.9036
Epoch 6/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7502 - loss: 0.4960 - val_accuracy: 0.5767 - val_loss: 0.6808
Epoch 7/100
[1m37/37[0m [32m

In [None]:
# Print final training and validation accuracy
final_train_acc = history.history['accuracy'][-1]
final_val_acc = history.history['val_accuracy'][-1]

print(f"Final Training Accuracy: {final_train_acc:.4f}")
print(f"Final Validation Accuracy: {final_val_acc:.4f}")

Final Training Accuracy: 0.9227
Final Validation Accuracy: 0.8367


# **Hybrid CNN-LSTM Model for Cell-Penetrating Peptide Classification**

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Flatten, Dropout, BatchNormalization
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Reshape data for Conv1D input
X_train = X_train[..., np.newaxis]  # Adding channel dimension
X_val = X_val[..., np.newaxis]      # Adding channel dimension

# Model Architecture
model = Sequential()

# Conv1D layers with reduced kernel sizes and fewer pooling layers
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(X_train.shape[1], 1)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

model.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Conv1D(filters=256, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# LSTM layer for sequential dependencies
model.add(LSTM(64, return_sequences=False, activation='relu'))

# Dense Layers for final prediction with Dropout for regularization
model.add(Dense(128, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(64, activation='swish'))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))  # Output layer for binary classification

# Compile the Model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Model Summary
model.summary()

# Train the model
history = model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=100, batch_size=32, verbose=1)

# Evaluate the model on the validation data
val_predictions = (model.predict(X_val) > 0.5).astype(int)
accuracy = accuracy_score(y_val, val_predictions)

print("\nValidation Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_val, val_predictions))

# Save the trained model
model.save("cell_penetrating_peptide_model.h5")
print("\nModel saved as 'cell_penetrating_peptide_model.h5'")


Epoch 1/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 201ms/step - accuracy: 0.5835 - loss: 0.6732 - val_accuracy: 0.6433 - val_loss: 0.6870
Epoch 2/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.6771 - loss: 0.5803 - val_accuracy: 0.5967 - val_loss: 0.6774
Epoch 3/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6913 - loss: 0.5775 - val_accuracy: 0.5800 - val_loss: 0.6640
Epoch 4/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7250 - loss: 0.5328 - val_accuracy: 0.6033 - val_loss: 0.6357
Epoch 5/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.7325 - loss: 0.5069 - val_accuracy: 0.6333 - val_loss: 0.6068
Epoch 6/100
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.7429 - loss: 0.5149 - val_accuracy: 0.7300 - val_loss: 0.5516
Epoch 7/100
[1m37/37[0m [32m




Validation Accuracy: 0.8433333333333334

Classification Report:
               precision    recall  f1-score   support

           0       0.82      0.87      0.85       150
           1       0.87      0.81      0.84       150

    accuracy                           0.84       300
   macro avg       0.84      0.84      0.84       300
weighted avg       0.84      0.84      0.84       300


Model saved as 'cell_penetrating_peptide_model.h5'


# ALL Algorithm for this Dataset

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m8.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam



Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")


In [None]:
# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values


In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "MLP": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
}


In [None]:
# Compile the neural network models
models["Neural Network"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
models["MLP"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Store accuracies
results = []

# Train each model and evaluate
for name, model in models.items():
    print(f"\nTraining {name}...")

    if name in ["Neural Network", "MLP"]:
        # Neural Network training
        model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=0)
        train_pred = (model.predict(X_train) > 0.5).astype("int32")
        val_pred = (model.predict(X_val) > 0.5).astype("int32")
    else:
        # Traditional ML model training
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

    # Calculate train and validation accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)

    # Append results
    results.append({"Model": name, "Train Accuracy": train_accuracy, "Validation Accuracy": val_accuracy})



Training SVM...

Training Decision Tree...

Training Random Forest...

Training Logistic Regression...

Training k-NN...

Training Naive Bayes...

Training Gradient Boosting...

Training XGBoost...

Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000690 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Training CatBoost...

Training AdaBoost...

Training Neural Network...




[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 

Training MLP...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 


In [None]:
# Convert to DataFrame and sort by Validation Accuracy and Train Accuracy in descending order
results_df = pd.DataFrame(results).sort_values(by=["Validation Accuracy", "Train Accuracy"], ascending=False).reset_index(drop=True)

# Display results
print("\nModel Accuracy Table (Descending Order of Validation Accuracy)")
print(results_df)


Model Accuracy Table (Descending Order of Validation Accuracy)
                  Model  Train Accuracy  Validation Accuracy
0               XGBoost        0.979381             0.876667
1              LightGBM        0.977663             0.863333
2         Random Forest        0.981100             0.856667
3                  k-NN        0.871993             0.850000
4         Decision Tree        0.981100             0.840000
5              CatBoost        0.943299             0.833333
6     Gradient Boosting        0.902921             0.826667
7        Neural Network        0.851375             0.823333
8                   MLP        0.918385             0.816667
9              AdaBoost        0.823024             0.763333
10                  SVM        0.704467             0.703333
11  Logistic Regression        0.687285             0.683333
12          Naive Bayes        0.676976             0.680000


# Code for Cross-Validation

In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
}


In [None]:
# Define Neural Network models
def create_neural_network(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_mlp(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model



In [None]:
# Cross-validation for traditional models
results = []

# For traditional ML models, we use cross_val_score
for name, model in models.items():
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Calculate cross-validation accuracy
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    std_accuracy = np.std(cv_scores)

    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})

# Cross-validation for Neural Networks (manual implementation)
for name, create_model in [("Neural Network", create_neural_network), ("MLP", create_mlp)]:
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Custom function to calculate accuracy for neural networks
    def neural_network_cross_val(model_func, X_train, y_train):
        accuracies = []
        for train_index, val_index in cv.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            model = model_func(X_train.shape[1])  # Create a new model for each fold
            model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, verbose=0)

            y_pred = (model.predict(X_val_fold) > 0.5).astype("int32")
            accuracy = accuracy_score(y_val_fold, y_pred)
            accuracies.append(accuracy)

        return np.mean(accuracies), np.std(accuracies)

    mean_accuracy, std_accuracy = neural_network_cross_val(create_model, X_train, y_train)
    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})



Performing Cross-validation for SVM...

Performing Cross-validation for Decision Tree...

Performing Cross-validation for Random Forest...

Performing Cross-validation for Logistic Regression...

Performing Cross-validation for k-NN...

Performing Cross-validation for Naive Bayes...

Performing Cross-validation for Gradient Boosting...

Performing Cross-validation for XGBoost...

Performing Cross-validation for LightGBM...
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 552
[LightGBM] [Info] Number of data points in the train set: 931, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499463 -> initscore=-0.002148
[LightGBM] [Info] Start training from score -0.002148
[LightGBM] [Info] Numb




Performing Cross-validation for Neural Network...
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step




[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step

Performing Cross-validation for MLP...
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step


In [None]:
# Convert to DataFrame and display
cv_results_df = pd.DataFrame(results).sort_values(by="Mean CV Accuracy", ascending=False).reset_index(drop=True)
print("\nCross-Validation Accuracy Table")
print(cv_results_df)


Cross-Validation Accuracy Table
                  Model  Mean CV Accuracy  STD CV Accuracy
0               XGBoost          0.865983         0.011274
1              LightGBM          0.860830         0.009173
2         Random Forest          0.853955         0.003708
3              CatBoost          0.845360         0.006075
4     Gradient Boosting          0.833336         0.017650
5         Decision Tree          0.829040         0.008702
6                   MLP          0.822998         0.019541
7                  k-NN          0.804965         0.016424
8        Neural Network          0.786048         0.021675
9              AdaBoost          0.780032         0.026457
10                  SVM          0.694990         0.016600
11  Logistic Regression          0.687232         0.035055
12          Naive Bayes          0.679521         0.021140


# hyperparameter optimization with Optuna

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]



# Define models with MLP included
models = {
    "SVM": lambda trial: SVC(
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}


results = []

def optimize_model(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        return accuracy_score(y_val, preds)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the results
    results.append({
        "Model": model_name,
        "Accuracy": study.best_value,
        "Best Params": study.best_params
    })

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model(model_name, model_func)


# Convert results to a DataFrame
results_df = pd.DataFrame(results)


# Display the DataFrame
print(results_df)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-02 07:34:35,234] A new study created in memory with name: no-name-2cdf44ca-757a-4436-aa34-283560313131
[I 2025-01-02 07:34:35,297] Trial 0 finished with value: 0.7866666666666666 and parameters: {'C': 1.5587163393417423, 'kernel': 'rbf'}. Best is trial 0 with value: 0.7866666666666666.
[I 2025-01-02 07:34:35,371] Trial 1 finished with value: 0.6233333333333333 and parameters: {'C': 4.947158447036206, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.7866666666666666.
[I 2025-01-02 07:34:35,420] Trial 2 finished with value: 0.7033333333333334 and parameters: {'C': 0.33221625597117066, 'kernel': 'linear'}. Best is trial 0 with value: 0.7866666666666666.


Optimizing SVM...


[I 2025-01-02 07:34:35,505] Trial 3 finished with value: 0.7833333333333333 and parameters: {'C': 9.183702801710174, 'kernel': 'poly'}. Best is trial 0 with value: 0.7866666666666666.
[I 2025-01-02 07:34:35,581] Trial 4 finished with value: 0.6333333333333333 and parameters: {'C': 0.3388479693620935, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.7866666666666666.
[I 2025-01-02 07:34:35,655] Trial 5 finished with value: 0.79 and parameters: {'C': 7.430929552803818, 'kernel': 'poly'}. Best is trial 5 with value: 0.79.
[I 2025-01-02 07:34:35,729] Trial 6 finished with value: 0.7766666666666666 and parameters: {'C': 3.429998415859077, 'kernel': 'rbf'}. Best is trial 5 with value: 0.79.
[I 2025-01-02 07:34:35,777] Trial 7 finished with value: 0.73 and parameters: {'C': 8.239638347026967, 'kernel': 'linear'}. Best is trial 5 with value: 0.79.
[I 2025-01-02 07:34:35,838] Trial 8 finished with value: 0.6233333333333333 and parameters: {'C': 5.461408171643127, 'kernel': 'sigmoid'}. Best i

Optimizing Decision Tree...


[I 2025-01-02 07:34:37,732] Trial 11 finished with value: 0.85 and parameters: {'max_depth': 15, 'min_samples_split': 5}. Best is trial 11 with value: 0.85.
[I 2025-01-02 07:34:37,764] Trial 12 finished with value: 0.8266666666666667 and parameters: {'max_depth': 14, 'min_samples_split': 6}. Best is trial 11 with value: 0.85.
[I 2025-01-02 07:34:37,791] Trial 13 finished with value: 0.8466666666666667 and parameters: {'max_depth': 16, 'min_samples_split': 3}. Best is trial 11 with value: 0.85.
[I 2025-01-02 07:34:37,819] Trial 14 finished with value: 0.8433333333333334 and parameters: {'max_depth': 16, 'min_samples_split': 3}. Best is trial 11 with value: 0.85.
[I 2025-01-02 07:34:37,843] Trial 15 finished with value: 0.8366666666666667 and parameters: {'max_depth': 17, 'min_samples_split': 10}. Best is trial 11 with value: 0.85.
[I 2025-01-02 07:34:37,870] Trial 16 finished with value: 0.8366666666666667 and parameters: {'max_depth': 13, 'min_samples_split': 5}. Best is trial 11 with 

Optimizing Random Forest...


[I 2025-01-02 07:34:39,223] Trial 0 finished with value: 0.82 and parameters: {'n_estimators': 386, 'max_depth': 5, 'min_samples_split': 6}. Best is trial 0 with value: 0.82.
[I 2025-01-02 07:34:40,566] Trial 1 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 434, 'max_depth': 19, 'min_samples_split': 3}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-02 07:34:41,520] Trial 2 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 311, 'max_depth': 15, 'min_samples_split': 5}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-02 07:34:42,342] Trial 3 finished with value: 0.7533333333333333 and parameters: {'n_estimators': 379, 'max_depth': 3, 'min_samples_split': 10}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01-02 07:34:43,498] Trial 4 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 343, 'max_depth': 9, 'min_samples_split': 9}. Best is trial 1 with value: 0.8666666666666667.
[I 2025-01

Optimizing Logistic Regression...


[I 2025-01-02 07:35:07,866] Trial 11 finished with value: 0.7066666666666667 and parameters: {'C': 9.78296996615968, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.7066666666666667.
[I 2025-01-02 07:35:07,895] Trial 12 finished with value: 0.7066666666666667 and parameters: {'C': 7.583475888730705, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.7066666666666667.
[I 2025-01-02 07:35:07,925] Trial 13 finished with value: 0.7 and parameters: {'C': 9.69378722736192, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.7066666666666667.
[I 2025-01-02 07:35:07,959] Trial 14 finished with value: 0.7033333333333334 and parameters: {'C': 7.343021491482241, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.7066666666666667.
[I 2025-01-02 07:35:07,989] Trial 15 finished with value: 0.7066666666666667 and parameters: {'C': 9.994105216574017, 'solver': 'lbfgs'}. Best is trial 4 with value: 0.7066666666666667.
[I 2025-01-02 07:35:08,013] Trial 16 finished with value: 0.7066666666666667 and parame

Optimizing k-NN...


[I 2025-01-02 07:35:08,604] Trial 5 finished with value: 0.7866666666666666 and parameters: {'n_neighbors': 20}. Best is trial 1 with value: 0.82.
[I 2025-01-02 07:35:08,646] Trial 6 finished with value: 0.84 and parameters: {'n_neighbors': 6}. Best is trial 6 with value: 0.84.
[I 2025-01-02 07:35:08,679] Trial 7 finished with value: 0.85 and parameters: {'n_neighbors': 4}. Best is trial 7 with value: 0.85.
[I 2025-01-02 07:35:08,720] Trial 8 finished with value: 0.8033333333333333 and parameters: {'n_neighbors': 16}. Best is trial 7 with value: 0.85.
[I 2025-01-02 07:35:08,756] Trial 9 finished with value: 0.82 and parameters: {'n_neighbors': 11}. Best is trial 7 with value: 0.85.
[I 2025-01-02 07:35:08,798] Trial 10 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.8533333333333334.
[I 2025-01-02 07:35:08,841] Trial 11 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.853

Optimizing Naive Bayes...


[I 2025-01-02 07:35:09,690] Trial 21 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,702] Trial 22 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,711] Trial 23 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,721] Trial 24 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,729] Trial 25 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,739] Trial 26 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,749] Trial 27 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,758] Trial 28 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:35:09,768] Trial 29 finished with value: 0.68 a

Optimizing Gradient Boosting...


[I 2025-01-02 07:35:14,665] Trial 0 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 484, 'learning_rate': 0.44349977160117665, 'max_depth': 12}. Best is trial 0 with value: 0.8533333333333334.
[I 2025-01-02 07:35:18,657] Trial 1 finished with value: 0.86 and parameters: {'n_estimators': 295, 'learning_rate': 0.3454389926595417, 'max_depth': 11}. Best is trial 1 with value: 0.86.
[I 2025-01-02 07:35:20,330] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 148, 'learning_rate': 0.47718499950837473, 'max_depth': 19}. Best is trial 1 with value: 0.86.
[I 2025-01-02 07:35:21,297] Trial 3 finished with value: 0.85 and parameters: {'n_estimators': 156, 'learning_rate': 0.10844900890594372, 'max_depth': 6}. Best is trial 1 with value: 0.86.
[I 2025-01-02 07:35:24,451] Trial 4 finished with value: 0.87 and parameters: {'n_estimators': 406, 'learning_rate': 0.18357356478489018, 'max_depth': 7}. Best is trial 4 with value: 0.87.
[I 2025-01-

Optimizing XGBoost...


[I 2025-01-02 07:36:37,480] Trial 0 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 381, 'max_depth': 8, 'learning_rate': 0.3351686330097642}. Best is trial 0 with value: 0.8733333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:36:37,817] Trial 1 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 381, 'max_depth': 13, 'learning_rate': 0.26018241233540335}. Best is trial 0 with value: 0.8733333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:36:37,962] Trial 2 finished with value: 0.87 and parameters: {'n_estimators': 126, 'max_depth': 8, 'learning_rate': 0.14755771202196577}. Best is trial 0 with value: 0.8733333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:36:38,145] Trial 3 finished with value: 0.87 and parameters: {'n_estimators': 195, 'max_depth': 9, 'learning_rate': 0.31509929668959746}. Best is trial 0 with value: 0.8733333333333333.
Paramet

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001070 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> init

[I 2025-01-02 07:36:48,321] Trial 1 finished with value: 0.87 and parameters: {'n_estimators': 218, 'max_depth': 15, 'learning_rate': 0.4548962608014512}. Best is trial 1 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:48,596] Trial 2 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 363, 'max_depth': 11, 'learning_rate': 0.06711066051305971}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:48,790] Trial 3 finished with value: 0.87 and parameters: {'n_estimators': 240, 'max_depth': 15, 'learning_rate': 0.23834364751427697}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:49,052] Trial 4 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 338, 'max_depth': 9, 'learning_rate': 0.18216239271625628}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000243 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:49,366] Trial 5 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 358, 'max_depth': 8, 'learning_rate': 0.25608648856871485}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:49,480] Trial 6 finished with value: 0.86 and parameters: {'n_estimators': 147, 'max_depth': 17, 'learning_rate': 0.40787863184532575}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:49,731] Trial 7 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 313, 'max_depth': 10, 'learning_rate': 0.08642939654537085}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:49,987] Trial 8 finished with value: 0.86 and parameters: {'n_estimators': 352, 'max_depth': 3, 'learning_rate': 0.2669834768419611}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:50,277] Trial 9 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 357, 'max_depth': 10, 'learning_rate': 0.18945676791928404}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:50,627] Trial 10 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 500, 'max_depth': 19, 'learning_rate': 0.022332408660918464}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:50,992] Trial 11 finished with value: 0.87 and parameters: {'n_estimators': 460, 'max_depth': 13, 'learning_rate': 0.09876607554004094}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:51,335] Trial 12 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 432, 'max_depth': 12, 'learning_rate': 0.11546998879101546}. Best is trial 2 with value: 0.8766666666666667.
[I 2025-01-02 07:36:51,559] Trial 13 finished with value: 0.86 and parameters: {'n_estimators': 272, 'max_depth': 7, 'learning_rate': 0.021521045614429135}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000341 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:51,982] Trial 14 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 418, 'max_depth': 11, 'learning_rate': 0.3524602791511147}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:52,258] Trial 15 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 299, 'max_depth': 6, 'learning_rate': 0.1497420830845116}. Best is trial 2 with value: 0.8766666666666667.
[I 2025-01-02 07:36:52,421] Trial 16 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 186, 'max_depth': 13, 'learning_rate': 0.06898488018112983}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000358 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:52,799] Trial 17 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 394, 'max_depth': 15, 'learning_rate': 0.3240454163055303}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:53,029] Trial 18 finished with value: 0.87 and parameters: {'n_estimators': 298, 'max_depth': 10, 'learning_rate': 0.1382611431173265}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:53,148] Trial 19 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 138, 'max_depth': 20, 'learning_rate': 0.011254841605317228}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:53,384] Trial 20 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 282, 'max_depth': 8, 'learning_rate': 0.19662156624031343}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000355 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:53,809] Trial 21 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 426, 'max_depth': 11, 'learning_rate': 0.3391986999760598}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:54,153] Trial 22 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 398, 'max_depth': 12, 'learning_rate': 0.3378622214402739}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:54,585] Trial 23 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 482, 'max_depth': 10, 'learning_rate': 0.3996318168444342}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:54,955] Trial 24 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 396, 'max_depth': 13, 'learning_rate': 0.49022908281672717}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:55,228] Trial 25 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 324, 'max_depth': 11, 'learning_rate': 0.29598424825116715}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:55,596] Trial 26 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 437, 'max_depth': 5, 'learning_rate': 0.3719934023078216}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:55,911] Trial 27 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 383, 'max_depth': 14, 'learning_rate': 0.0660660201336439}. Best is trial 2 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:36:56,232] Trial 28 finished with value: 0.87 and parameters: {'n_estimators': 252, 'max_depth': 17, 'learning_rate': 0.22663330552468997}. Best is trial 2 with value: 0.8766666666666667.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:36:56,378] Trial 29 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 84, 'max_depth': 8, 'learning_rate': 0.07125396534008169}. Best is trial 2 with value: 0.8766666666666667.
[I 2025-01-02 07:36:56,382] A new study created in memory with name: no-name-5ead2720-5e6a-47f3-8953-a48af5049e39


Optimizing AdaBoost...


[I 2025-01-02 07:36:58,004] Trial 0 finished with value: 0.7566666666666667 and parameters: {'n_estimators': 363, 'learning_rate': 0.9632024081309768}. Best is trial 0 with value: 0.7566666666666667.
[I 2025-01-02 07:37:00,865] Trial 1 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 430, 'learning_rate': 0.6130604018183939}. Best is trial 0 with value: 0.7566666666666667.
[I 2025-01-02 07:37:02,460] Trial 2 finished with value: 0.7233333333333334 and parameters: {'n_estimators': 487, 'learning_rate': 0.16685625030808243}. Best is trial 0 with value: 0.7566666666666667.
[I 2025-01-02 07:37:02,936] Trial 3 finished with value: 0.7333333333333333 and parameters: {'n_estimators': 164, 'learning_rate': 0.25893423052813497}. Best is trial 0 with value: 0.7566666666666667.
[I 2025-01-02 07:37:03,221] Trial 4 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 92, 'learning_rate': 0.4565322624376687}. Best is trial 0 with value: 0.7566666666666667.

Optimizing Neural Network...


[I 2025-01-02 07:37:31,001] Trial 0 finished with value: 0.75 and parameters: {'hidden_layer_1': 63, 'hidden_layer_2': 36, 'learning_rate_init': 0.03524606650272839}. Best is trial 0 with value: 0.75.
[I 2025-01-02 07:37:31,882] Trial 1 finished with value: 0.7933333333333333 and parameters: {'hidden_layer_1': 36, 'hidden_layer_2': 15, 'learning_rate_init': 0.010132694609414562}. Best is trial 1 with value: 0.7933333333333333.
[I 2025-01-02 07:37:32,787] Trial 2 finished with value: 0.7866666666666666 and parameters: {'hidden_layer_1': 73, 'hidden_layer_2': 35, 'learning_rate_init': 0.088116520008631}. Best is trial 1 with value: 0.7933333333333333.
[I 2025-01-02 07:37:34,248] Trial 3 finished with value: 0.76 and parameters: {'hidden_layer_1': 17, 'hidden_layer_2': 61, 'learning_rate_init': 0.007294480290683855}. Best is trial 1 with value: 0.7933333333333333.
[I 2025-01-02 07:37:35,051] Trial 4 finished with value: 0.76 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 16, 'le

Optimizing MLP...


[I 2025-01-02 07:38:04,000] Trial 0 finished with value: 0.7133333333333334 and parameters: {'layer_1': 64, 'layer_2': 52, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.015444342040856079}. Best is trial 0 with value: 0.7133333333333334.
[I 2025-01-02 07:38:06,868] Trial 1 finished with value: 0.7766666666666666 and parameters: {'layer_1': 60, 'layer_2': 106, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.08864550864066753}. Best is trial 1 with value: 0.7766666666666666.
[I 2025-01-02 07:38:11,242] Trial 2 finished with value: 0.6966666666666667 and parameters: {'layer_1': 117, 'layer_2': 53, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.014964933328409945}. Best is trial 1 with value: 0.7766666666666666.
[I 2025-01-02 07:38:14,193] Trial 3 finished with value: 0.8033333333333333 and parameters: {'layer_1': 108, 'layer_2': 84, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.07248205294208769}. Best is trial 3 

                  Model  Accuracy  \
0                   SVM  0.790000   
1         Decision Tree  0.850000   
2         Random Forest  0.866667   
3   Logistic Regression  0.706667   
4                  k-NN  0.853333   
5           Naive Bayes  0.680000   
6     Gradient Boosting  0.870000   
7               XGBoost  0.886667   
8              LightGBM  0.876667   
9              AdaBoost  0.760000   
10       Neural Network  0.836667   
11                  MLP  0.840000   

                                          Best Params  
0          {'C': 7.430929552803818, 'kernel': 'poly'}  
1           {'max_depth': 15, 'min_samples_split': 5}  
2   {'n_estimators': 434, 'max_depth': 19, 'min_sa...  
3     {'C': 7.596524581447028, 'solver': 'liblinear'}  
4                                  {'n_neighbors': 3}  
5                                                  {}  
6   {'n_estimators': 406, 'learning_rate': 0.18357...  
7   {'n_estimators': 333, 'max_depth': 6, 'learnin...  
8   {'n_estima

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score, roc_auc_score # Import confusion_matrix and other metrics

# Function to calculate metrics with model name
def calculate_metrics(y_true, y_pred, model_name=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "MCC": mcc,
        "Kappa": kappa,
        "AUC": auc,
    }

# Results storage
results = []

# Optimization function
def optimize_model_with_metrics(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        trial.set_user_attr("preds", preds)
        metrics = calculate_metrics(y_val, preds, model_name=model_name)
        return metrics["Accuracy"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the best trial metrics
    best_trial_preds = study.best_trial.user_attrs["preds"]
    best_metrics = calculate_metrics(y_val, best_trial_preds, model_name=model_name)
    best_metrics["Best Params"] = study.best_params
    results.append(best_metrics)

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model_with_metrics(model_name, model_func)

# Store the final results with selected metrics and best parameters
final_results = []
for result in results:
    final_results.append(
        {
            "Model": result["Model"],
            "Accuracy": result["Accuracy"],
            "Sensitivity": result["Sensitivity"],
            "Specificity": result["Specificity"],
            "MCC": result["MCC"],
            "Kappa": result["Kappa"],
            "AUC": result["AUC"],
            "Best Params": result["Best Params"],
        }
    )

# Convert results to a DataFrame and display it
final_results_df = pd.DataFrame(final_results)
print(final_results_df)


[I 2025-01-02 07:39:39,532] A new study created in memory with name: no-name-08d7851b-85a1-4cda-a43a-0dcefd3eb883
[I 2025-01-02 07:39:39,601] Trial 0 finished with value: 0.6266666666666667 and parameters: {'C': 6.717374225234966, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.6266666666666667.
[I 2025-01-02 07:39:39,684] Trial 1 finished with value: 0.6166666666666667 and parameters: {'C': 2.8225806330375063, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.6266666666666667.


Optimizing SVM...


[I 2025-01-02 07:39:39,754] Trial 2 finished with value: 0.62 and parameters: {'C': 7.7116843707125176, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.6266666666666667.
[I 2025-01-02 07:39:39,809] Trial 3 finished with value: 0.6933333333333334 and parameters: {'C': 0.14732256675530456, 'kernel': 'linear'}. Best is trial 3 with value: 0.6933333333333334.
[I 2025-01-02 07:39:39,874] Trial 4 finished with value: 0.7833333333333333 and parameters: {'C': 3.016066253630342, 'kernel': 'poly'}. Best is trial 4 with value: 0.7833333333333333.
[I 2025-01-02 07:39:39,928] Trial 5 finished with value: 0.73 and parameters: {'C': 8.073220604081383, 'kernel': 'linear'}. Best is trial 4 with value: 0.7833333333333333.
[I 2025-01-02 07:39:39,994] Trial 6 finished with value: 0.7866666666666666 and parameters: {'C': 6.156471117558458, 'kernel': 'rbf'}. Best is trial 6 with value: 0.7866666666666666.
[I 2025-01-02 07:39:40,051] Trial 7 finished with value: 0.73 and parameters: {'C': 7.9982842345332

Optimizing Decision Tree...


[I 2025-01-02 07:39:42,220] Trial 7 finished with value: 0.84 and parameters: {'max_depth': 18, 'min_samples_split': 3}. Best is trial 1 with value: 0.8466666666666667.
[I 2025-01-02 07:39:42,244] Trial 8 finished with value: 0.83 and parameters: {'max_depth': 19, 'min_samples_split': 7}. Best is trial 1 with value: 0.8466666666666667.
[I 2025-01-02 07:39:42,275] Trial 9 finished with value: 0.8266666666666667 and parameters: {'max_depth': 7, 'min_samples_split': 8}. Best is trial 1 with value: 0.8466666666666667.
[I 2025-01-02 07:39:42,334] Trial 10 finished with value: 0.8666666666666667 and parameters: {'max_depth': 11, 'min_samples_split': 2}. Best is trial 10 with value: 0.8666666666666667.
[I 2025-01-02 07:39:42,378] Trial 11 finished with value: 0.81 and parameters: {'max_depth': 11, 'min_samples_split': 10}. Best is trial 10 with value: 0.8666666666666667.
[I 2025-01-02 07:39:42,421] Trial 12 finished with value: 0.8566666666666667 and parameters: {'max_depth': 11, 'min_samples

Optimizing Random Forest...


[I 2025-01-02 07:39:45,288] Trial 0 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 428, 'max_depth': 14, 'min_samples_split': 7}. Best is trial 0 with value: 0.8566666666666667.
[I 2025-01-02 07:39:46,506] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 248, 'max_depth': 15, 'min_samples_split': 8}. Best is trial 0 with value: 0.8566666666666667.
[I 2025-01-02 07:39:47,059] Trial 2 finished with value: 0.77 and parameters: {'n_estimators': 169, 'max_depth': 4, 'min_samples_split': 8}. Best is trial 0 with value: 0.8566666666666667.
[I 2025-01-02 07:39:48,427] Trial 3 finished with value: 0.86 and parameters: {'n_estimators': 425, 'max_depth': 19, 'min_samples_split': 5}. Best is trial 3 with value: 0.86.
[I 2025-01-02 07:39:48,893] Trial 4 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 153, 'max_depth': 19, 'min_samples_split': 10}. Best is trial 3 with value: 0.86.
[I 2025-01-02 07:39:49,032] Trial 5 

Optimizing Logistic Regression...


[I 2025-01-02 07:40:20,121] Trial 8 finished with value: 0.7 and parameters: {'C': 2.6058231853390024, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-02 07:40:20,150] Trial 9 finished with value: 0.7066666666666667 and parameters: {'C': 9.047825569410945, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-02 07:40:20,183] Trial 10 finished with value: 0.7066666666666667 and parameters: {'C': 9.576497862329807, 'solver': 'liblinear'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-02 07:40:20,221] Trial 11 finished with value: 0.7 and parameters: {'C': 9.95408130582828, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-02 07:40:20,253] Trial 12 finished with value: 0.7066666666666667 and parameters: {'C': 7.830464901019001, 'solver': 'liblinear'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-02 07:40:20,292] Trial 13 finished with value: 0.7033333333333334 and parameters: {

Optimizing k-NN...


[I 2025-01-02 07:40:21,109] Trial 4 finished with value: 0.83 and parameters: {'n_neighbors': 8}. Best is trial 4 with value: 0.83.
[I 2025-01-02 07:40:21,145] Trial 5 finished with value: 0.7966666666666666 and parameters: {'n_neighbors': 17}. Best is trial 4 with value: 0.83.
[I 2025-01-02 07:40:21,178] Trial 6 finished with value: 0.82 and parameters: {'n_neighbors': 13}. Best is trial 4 with value: 0.83.
[I 2025-01-02 07:40:21,220] Trial 7 finished with value: 0.85 and parameters: {'n_neighbors': 5}. Best is trial 7 with value: 0.85.
[I 2025-01-02 07:40:21,254] Trial 8 finished with value: 0.85 and parameters: {'n_neighbors': 4}. Best is trial 7 with value: 0.85.
[I 2025-01-02 07:40:21,293] Trial 9 finished with value: 0.8033333333333333 and parameters: {'n_neighbors': 16}. Best is trial 7 with value: 0.85.
[I 2025-01-02 07:40:21,337] Trial 10 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.8533333333333334.
[I 2025-01-02 0

Optimizing Naive Bayes...


[I 2025-01-02 07:40:22,431] Trial 13 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,446] Trial 14 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,460] Trial 15 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,474] Trial 16 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,494] Trial 17 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,510] Trial 18 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,524] Trial 19 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,545] Trial 20 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:40:22,570] Trial 21 finished with value: 0.68 a

Optimizing Gradient Boosting...


[I 2025-01-02 07:40:25,093] Trial 0 finished with value: 0.87 and parameters: {'n_estimators': 235, 'learning_rate': 0.45947426604946706, 'max_depth': 13}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:40:30,493] Trial 1 finished with value: 0.86 and parameters: {'n_estimators': 278, 'learning_rate': 0.034808956933589125, 'max_depth': 13}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:40:33,820] Trial 2 finished with value: 0.87 and parameters: {'n_estimators': 462, 'learning_rate': 0.40392834654004134, 'max_depth': 7}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:40:35,639] Trial 3 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 122, 'learning_rate': 0.12336233680712244, 'max_depth': 14}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:40:37,194] Trial 4 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 409, 'learning_rate': 0.06724356173092091, 'max_depth': 4}. Best is trial 0 with value: 0.87.
[I 2025-01-02 07:40:40,

Optimizing XGBoost...


[I 2025-01-02 07:41:49,253] Trial 0 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 117, 'max_depth': 9, 'learning_rate': 0.04344737292219013}. Best is trial 0 with value: 0.8766666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:41:49,642] Trial 1 finished with value: 0.87 and parameters: {'n_estimators': 288, 'max_depth': 15, 'learning_rate': 0.07521790677105743}. Best is trial 0 with value: 0.8766666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:41:49,876] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 185, 'max_depth': 17, 'learning_rate': 0.22845468736513752}. Best is trial 0 with value: 0.8766666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:41:50,002] Trial 3 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 113, 'max_depth': 8, 'learning_rate': 0.4189098766019848}. Best is trial 0 with value: 0.8766666666

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.


[I 2025-01-02 07:41:59,936] Trial 1 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 56, 'max_depth': 17, 'learning_rate': 0.06614979441737587}. Best is trial 1 with value: 0.8633333333333333.


[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:00,142] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 251, 'max_depth': 4, 'learning_rate': 0.07203890553000852}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-02 07:42:00,233] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 114, 'max_depth': 13, 'learning_rate': 0.022195995089324405}. Best is trial 1 with value: 0.8633333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000203 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000354 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:00,704] Trial 4 finished with value: 0.88 and parameters: {'n_estimators': 492, 'max_depth': 9, 'learning_rate': 0.14497872568690334}. Best is trial 4 with value: 0.88.




[I 2025-01-02 07:42:00,806] Trial 5 finished with value: 0.87 and parameters: {'n_estimators': 116, 'max_depth': 3, 'learning_rate': 0.4939895956788042}. Best is trial 4 with value: 0.88.
[I 2025-01-02 07:42:00,865] Trial 6 finished with value: 0.86 and parameters: {'n_estimators': 56, 'max_depth': 11, 'learning_rate': 0.4205887687299038}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGB

[I 2025-01-02 07:42:01,155] Trial 7 finished with value: 0.86 and parameters: {'n_estimators': 339, 'max_depth': 11, 'learning_rate': 0.4641593873364719}. Best is trial 4 with value: 0.88.
[I 2025-01-02 07:42:01,264] Trial 8 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 138, 'max_depth': 11, 'learning_rate': 0.05272970212327956}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:42:01,692] Trial 9 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 477, 'max_depth': 8, 'learning_rate': 0.21916355698485104}. Best is trial 4 with value: 0.88.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:02,152] Trial 10 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 495, 'max_depth': 7, 'learning_rate': 0.19248648646843503}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:02,468] Trial 11 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 376, 'max_depth': 14, 'learning_rate': 0.1432825285960333}. Best is trial 4 with value: 0.88.
[I 2025-01-02 07:42:02,647] Trial 12 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 183, 'max_depth': 8, 'learning_rate': 0.13713573438174295}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000140 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:02,863] Trial 13 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 214, 'max_depth': 7, 'learning_rate': 0.2874106254773202}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:03,208] Trial 14 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 393, 'max_depth': 8, 'learning_rate': 0.14459548704731956}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:03,394] Trial 15 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 187, 'max_depth': 5, 'learning_rate': 0.13521773396894918}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:03,689] Trial 16 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 316, 'max_depth': 9, 'learning_rate': 0.28300175364049973}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000364 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:04,072] Trial 17 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 414, 'max_depth': 15, 'learning_rate': 0.18768135767506133}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:04,354] Trial 18 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 300, 'max_depth': 6, 'learning_rate': 0.11388577638893863}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:04,764] Trial 19 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 442, 'max_depth': 9, 'learning_rate': 0.3389908272914961}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582


[I 2025-01-02 07:42:04,924] Trial 20 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 165, 'max_depth': 10, 'learning_rate': 0.23631693571541218}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:05,208] Trial 21 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 324, 'max_depth': 13, 'learning_rate': 0.2846400738259873}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:05,463] Trial 22 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 268, 'max_depth': 9, 'learning_rate': 0.323741859068051}. Best is trial 4 with value: 0.88.
[I 2025-01-02 07:42:05,672] Trial 23 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 212, 'max_depth': 5, 'learning_rate': 0.18873055860413865}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000193 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:06,012] Trial 24 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 350, 'max_depth': 9, 'learning_rate': 0.39135553369023396}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:06,409] Trial 25 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 453, 'max_depth': 7, 'learning_rate': 0.26842107932085496}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:06,649] Trial 26 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 297, 'max_depth': 12, 'learning_rate': 0.10131915838480923}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:06,855] Trial 27 finished with value: 0.88 and parameters: {'n_estimators': 206, 'max_depth': 10, 'learning_rate': 0.22079549686477656}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000397 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:07,056] Trial 28 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 222, 'max_depth': 16, 'learning_rate': 0.15940648600476798}. Best is trial 4 with value: 0.88.
[I 2025-01-02 07:42:07,205] Trial 29 finished with value: 0.87 and parameters: {'n_estimators': 161, 'max_depth': 10, 'learning_rate': 0.2198971184115842}. Best is trial 4 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:42:07,224] A new study created in memory with name: no-name-a0f9943d-5e62-459d-a4ab-010cca0fe29f


Optimizing AdaBoost...


[I 2025-01-02 07:42:08,070] Trial 0 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 273, 'learning_rate': 0.17496817257677608}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-02 07:42:09,162] Trial 1 finished with value: 0.7333333333333333 and parameters: {'n_estimators': 372, 'learning_rate': 0.12899268757000856}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-02 07:42:10,258] Trial 2 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 277, 'learning_rate': 0.471469455482111}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-02 07:42:10,758] Trial 3 finished with value: 0.74 and parameters: {'n_estimators': 115, 'learning_rate': 0.254325103815148}. Best is trial 3 with value: 0.74.
[I 2025-01-02 07:42:12,047] Trial 4 finished with value: 0.7333333333333333 and parameters: {'n_estimators': 286, 'learning_rate': 0.8991567005575007}. Best is trial 3 with value: 0.74.
[I 2025-01-02 07:42:12,851] Trial 5 finish

Optimizing Neural Network...


[I 2025-01-02 07:42:40,917] Trial 0 finished with value: 0.7966666666666666 and parameters: {'hidden_layer_1': 35, 'hidden_layer_2': 17, 'learning_rate_init': 0.019642110483302454}. Best is trial 0 with value: 0.7966666666666666.
[I 2025-01-02 07:42:43,780] Trial 1 finished with value: 0.77 and parameters: {'hidden_layer_1': 45, 'hidden_layer_2': 61, 'learning_rate_init': 0.01648006148294367}. Best is trial 0 with value: 0.7966666666666666.
[I 2025-01-02 07:42:44,840] Trial 2 finished with value: 0.7766666666666666 and parameters: {'hidden_layer_1': 10, 'hidden_layer_2': 20, 'learning_rate_init': 0.013716018794592645}. Best is trial 0 with value: 0.7966666666666666.
[I 2025-01-02 07:42:45,665] Trial 3 finished with value: 0.81 and parameters: {'hidden_layer_1': 99, 'hidden_layer_2': 40, 'learning_rate_init': 0.052260790715027074}. Best is trial 3 with value: 0.81.
[I 2025-01-02 07:42:46,635] Trial 4 finished with value: 0.77 and parameters: {'hidden_layer_1': 58, 'hidden_layer_2': 84, 

Optimizing MLP...


[I 2025-01-02 07:43:32,163] Trial 0 finished with value: 0.8166666666666667 and parameters: {'layer_1': 63, 'layer_2': 106, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.04365631190271952}. Best is trial 0 with value: 0.8166666666666667.
[I 2025-01-02 07:43:32,774] Trial 1 finished with value: 0.5 and parameters: {'layer_1': 76, 'layer_2': 121, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.08925264091097065}. Best is trial 0 with value: 0.8166666666666667.
[I 2025-01-02 07:43:34,628] Trial 2 finished with value: 0.7533333333333333 and parameters: {'layer_1': 58, 'layer_2': 139, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.08242562471049965}. Best is trial 0 with value: 0.8166666666666667.
[I 2025-01-02 07:43:35,629] Trial 3 finished with value: 0.77 and parameters: {'layer_1': 92, 'layer_2': 55, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.08288551283410249}. Best is trial 0 with value: 0.8166666666666667.
[I 2

                  Model  Accuracy  Sensitivity  Specificity       MCC  \
0                   SVM  0.790000     0.713333     0.866667  0.586941   
1         Decision Tree  0.866667     0.853333     0.880000  0.733594   
2         Random Forest  0.866667     0.840000     0.893333  0.734379   
3   Logistic Regression  0.706667     0.680000     0.733333  0.413922   
4                  k-NN  0.853333     0.840000     0.866667  0.706918   
5           Naive Bayes  0.680000     0.660000     0.700000  0.360288   
6     Gradient Boosting  0.873333     0.860000     0.886667  0.746932   
7               XGBoost  0.880000     0.860000     0.900000  0.760609   
8              LightGBM  0.880000     0.853333     0.906667  0.761083   
9              AdaBoost  0.763333     0.726667     0.800000  0.528089   
10       Neural Network  0.840000     0.793333     0.886667  0.682981   
11                  MLP  0.850000     0.833333     0.866667  0.700389   

       Kappa       AUC                            

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Define models
models = {
    "SVM": lambda trial: SVC(probability=True, C=trial.suggest_float("C", 0.1, 10.0), kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])),
    "Decision Tree": lambda trial: DecisionTreeClassifier(max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Random Forest": lambda trial: RandomForestClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Logistic Regression": lambda trial: LogisticRegression(C=trial.suggest_float("C", 0.1, 10.0), solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])),
    "k-NN": lambda trial: KNeighborsClassifier(n_neighbors=trial.suggest_int("n_neighbors", 3, 20)),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), max_depth=trial.suggest_int("max_depth", 3, 20)),
    "XGBoost": lambda trial: XGBClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": lambda trial: LGBMClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)),
    "AdaBoost": lambda trial: AdaBoostClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)),
    "Neural Network": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("hidden_layer_1", 10, 100), trial.suggest_int("hidden_layer_2", 10, 100)), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200),
    "MLP": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("layer_1", 50, 150), trial.suggest_int("layer_2", 50, 150)), activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]), solver=trial.suggest_categorical("solver", ["adam", "sgd"]), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200, random_state=42)
}

# Prepare a dictionary to store model probabilities horizontally
probabilities = {"Target": y_val}  # Starting with the target column (y_val)

# Run optimization and compute probabilities for each model
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    study = optuna.create_study(direction="maximize")

    # Objective function for Optuna
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        return accuracy_score(y_val, model.predict(X_val))

    study.optimize(objective, n_trials=30)

    # Train the best model using the best hyperparameters
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)

    # Get predicted probabilities for the positive class (class 1)
    probs = best_model.predict_proba(X_val)[:, 1]

    # Add to the probabilities dictionary
    probabilities[model_name] = probs

# Convert the probabilities dictionary to a DataFrame
probability_df = pd.DataFrame(probabilities)

# Save the probability dataset to a CSV file
probability_df.to_csv("N_GDC_OPTUNA_probability_predictions.csv", index=False)

print("Dataset saved successfully!")


[I 2025-01-02 07:45:04,782] A new study created in memory with name: no-name-060fe927-db82-4829-9670-0abae4329b95


Optimizing SVM...


[I 2025-01-02 07:45:05,390] Trial 0 finished with value: 0.78 and parameters: {'C': 9.435187062312181, 'kernel': 'poly'}. Best is trial 0 with value: 0.78.
[I 2025-01-02 07:45:05,625] Trial 1 finished with value: 0.78 and parameters: {'C': 5.402191374444786, 'kernel': 'rbf'}. Best is trial 0 with value: 0.78.
[I 2025-01-02 07:45:05,858] Trial 2 finished with value: 0.62 and parameters: {'C': 8.33048583802585, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.78.
[I 2025-01-02 07:45:06,027] Trial 3 finished with value: 0.7366666666666667 and parameters: {'C': 9.517428581832885, 'kernel': 'linear'}. Best is trial 0 with value: 0.78.
[I 2025-01-02 07:45:06,195] Trial 4 finished with value: 0.71 and parameters: {'C': 2.0784494759671523, 'kernel': 'linear'}. Best is trial 0 with value: 0.78.
[I 2025-01-02 07:45:06,490] Trial 5 finished with value: 0.79 and parameters: {'C': 5.094289375863873, 'kernel': 'poly'}. Best is trial 5 with value: 0.79.
[I 2025-01-02 07:45:06,756] Trial 6 finished

Optimizing Decision Tree...


[I 2025-01-02 07:45:13,332] Trial 12 finished with value: 0.8533333333333334 and parameters: {'max_depth': 20, 'min_samples_split': 2}. Best is trial 11 with value: 0.8533333333333334.
[I 2025-01-02 07:45:13,368] Trial 13 finished with value: 0.85 and parameters: {'max_depth': 13, 'min_samples_split': 2}. Best is trial 11 with value: 0.8533333333333334.
[I 2025-01-02 07:45:13,393] Trial 14 finished with value: 0.85 and parameters: {'max_depth': 20, 'min_samples_split': 3}. Best is trial 11 with value: 0.8533333333333334.
[I 2025-01-02 07:45:13,419] Trial 15 finished with value: 0.86 and parameters: {'max_depth': 13, 'min_samples_split': 3}. Best is trial 15 with value: 0.86.
[I 2025-01-02 07:45:13,444] Trial 16 finished with value: 0.8433333333333334 and parameters: {'max_depth': 12, 'min_samples_split': 4}. Best is trial 15 with value: 0.86.
[I 2025-01-02 07:45:13,469] Trial 17 finished with value: 0.83 and parameters: {'max_depth': 9, 'min_samples_split': 3}. Best is trial 15 with va

Optimizing Random Forest...


[I 2025-01-02 07:45:14,416] Trial 0 finished with value: 0.7533333333333333 and parameters: {'n_estimators': 250, 'max_depth': 3, 'min_samples_split': 4}. Best is trial 0 with value: 0.7533333333333333.
[I 2025-01-02 07:45:14,681] Trial 1 finished with value: 0.81 and parameters: {'n_estimators': 104, 'max_depth': 5, 'min_samples_split': 9}. Best is trial 1 with value: 0.81.
[I 2025-01-02 07:45:15,345] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 218, 'max_depth': 14, 'min_samples_split': 7}. Best is trial 2 with value: 0.8566666666666667.
[I 2025-01-02 07:45:16,505] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 275, 'max_depth': 8, 'min_samples_split': 5}. Best is trial 2 with value: 0.8566666666666667.
[I 2025-01-02 07:45:18,116] Trial 4 finished with value: 0.7533333333333333 and parameters: {'n_estimators': 498, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 2 with value: 0.8566666666666667.
[I 2025-01-0

Optimizing Logistic Regression...


[I 2025-01-02 07:45:37,890] Trial 10 finished with value: 0.7 and parameters: {'C': 9.98064003450217, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7066666666666667.
[I 2025-01-02 07:45:37,924] Trial 11 finished with value: 0.7 and parameters: {'C': 9.96905486555056, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7066666666666667.
[I 2025-01-02 07:45:37,957] Trial 12 finished with value: 0.7066666666666667 and parameters: {'C': 8.383269263456494, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7066666666666667.
[I 2025-01-02 07:45:37,986] Trial 13 finished with value: 0.7066666666666667 and parameters: {'C': 8.364670229197966, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7066666666666667.
[I 2025-01-02 07:45:38,017] Trial 14 finished with value: 0.7066666666666667 and parameters: {'C': 8.614980988306952, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7066666666666667.
[I 2025-01-02 07:45:38,049] Trial 15 finished with value: 0.7033333333333334 and parameters: {'C': 6.4

Optimizing k-NN...


[I 2025-01-02 07:45:38,736] Trial 4 finished with value: 0.8033333333333333 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:45:38,773] Trial 5 finished with value: 0.85 and parameters: {'n_neighbors': 4}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:45:38,808] Trial 6 finished with value: 0.82 and parameters: {'n_neighbors': 9}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:45:38,851] Trial 7 finished with value: 0.82 and parameters: {'n_neighbors': 14}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:45:38,887] Trial 8 finished with value: 0.82 and parameters: {'n_neighbors': 14}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:45:38,916] Trial 9 finished with value: 0.83 and parameters: {'n_neighbors': 8}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:45:38,958] Trial 10 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.8533333333333334.
[I 2025-01-02 07:45:38,994] T

Optimizing Naive Bayes...


[I 2025-01-02 07:45:39,846] Trial 20 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,855] Trial 21 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,865] Trial 22 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,878] Trial 23 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,890] Trial 24 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,898] Trial 25 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,907] Trial 26 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,915] Trial 27 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-02 07:45:39,922] Trial 28 finished with value: 0.68 a

Optimizing Gradient Boosting...


[I 2025-01-02 07:45:41,091] Trial 0 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 392, 'learning_rate': 0.24723327792365632, 'max_depth': 3}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-02 07:45:44,537] Trial 1 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 419, 'learning_rate': 0.4131979222524481, 'max_depth': 5}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-02 07:45:49,963] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 153, 'learning_rate': 0.4330870188220721, 'max_depth': 19}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-02 07:45:52,821] Trial 3 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 207, 'learning_rate': 0.18021984007611486, 'max_depth': 16}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-02 07:45:57,799] Trial 4 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 454, 'learning_rate': 0

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:47:16,646] Trial 0 finished with value: 0.87 and parameters: {'n_estimators': 142, 'max_depth': 6, 'learning_rate': 0.1544151777739625}. Best is trial 0 with value: 0.87.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:47:18,228] Trial 1 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 374, 'max_depth': 12, 'learning_rate': 0.24019139189326852}. Best is trial 0 with value: 0.87.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:47:19,327] Trial 2 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 324, 'max_depth': 3, 'learning_rate': 0.3056696751645805}. Best is trial 0 with value: 0.87.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:47:19,383] Trial 3 finished with value: 0.83 and parameters: {'n_estimators': 54, 'max_depth': 4, 'learning_rate': 0.2697421429540633}. Best is trial 0 with value: 0.87.
Parameters: { "

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> init

[I 2025-01-02 07:47:27,771] Trial 1 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 358, 'max_depth': 13, 'learning_rate': 0.47139772738552693}. Best is trial 1 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000179 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:28,170] Trial 2 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 472, 'max_depth': 17, 'learning_rate': 0.2966082403556544}. Best is trial 1 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:28,319] Trial 3 finished with value: 0.87 and parameters: {'n_estimators': 199, 'max_depth': 15, 'learning_rate': 0.36691432892209347}. Best is trial 1 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:28,555] Trial 4 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 301, 'max_depth': 19, 'learning_rate': 0.4891577663150275}. Best is trial 4 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:28,733] Trial 5 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 213, 'max_depth': 9, 'learning_rate': 0.24300615950442336}. Best is trial 4 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:28,907] Trial 6 finished with value: 0.88 and parameters: {'n_estimators': 268, 'max_depth': 16, 'learning_rate': 0.05164845397817469}. Best is trial 6 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:29,283] Trial 7 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 439, 'max_depth': 6, 'learning_rate': 0.34820978947439535}. Best is trial 6 with value: 0.88.
[I 2025-01-02 07:47:29,499] Trial 8 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 174, 'max_depth': 10, 'learning_rate': 0.24189619415002425}. Best is trial 6 with value: 0.88.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:47:29,828] Trial 9 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 278, 'max_depth': 15, 'learning_rate': 0.41003127059602684}. Best is trial 9 with value: 0.8833333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:29,925] Trial 10 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 59, 'max_depth': 20, 'learning_rate': 0.12044496029357499}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:30,300] Trial 11 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 315, 'max_depth': 14, 'learning_rate': 0.013891920099821459}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:47:30,755] Trial 12 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 392, 'max_depth': 16, 'learning_rate': 0.1341743006762384}. Best is trial 9 with value: 0.8833333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:31,051] Trial 13 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 250, 'max_depth': 12, 'learning_rate': 0.016806115186585313}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:31,229] Trial 14 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 120, 'max_depth': 18, 'learning_rate': 0.14254570180258036}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:47:31,569] Trial 15 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 265, 'max_depth': 10, 'learning_rate': 0.41968056228706885}. Best is trial 9 with value: 0.8833333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:32,021] Trial 16 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 349, 'max_depth': 16, 'learning_rate': 0.17936585570332186}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:32,311] Trial 17 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 245, 'max_depth': 14, 'learning_rate': 0.06364093016417473}. Best is trial 9 with value: 0.8833333333333333.
[I 2025-01-02 07:47:32,491] Trial 18 finished with value: 0.87 and parameters: {'n_estimators': 126, 'max_depth': 8, 'learning_rate': 0.3330388129230795}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:33,008] Trial 19 finished with value: 0.87 and parameters: {'n_estimators': 409, 'max_depth': 13, 'learning_rate': 0.4216955010372807}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000351 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:47:33,309] Trial 20 finished with value: 0.87 and parameters: {'n_estimators': 225, 'max_depth': 11, 'learning_rate': 0.18949954441930789}. Best is trial 9 with value: 0.8833333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:33,640] Trial 21 finished with value: 0.87 and parameters: {'n_estimators': 290, 'max_depth': 15, 'learning_rate': 0.07006608079007613}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000184 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:34,048] Trial 22 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 327, 'max_depth': 18, 'learning_rate': 0.07655554367568437}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:34,357] Trial 23 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 250, 'max_depth': 14, 'learning_rate': 0.0797667222443629}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000173 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:34,614] Trial 24 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 274, 'max_depth': 17, 'learning_rate': 0.19336008680949257}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:34,923] Trial 25 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 376, 'max_depth': 12, 'learning_rate': 0.0578668296453813}. Best is trial 9 with value: 0.8833333333333333.
[I 2025-01-02 07:47:35,037] Trial 26 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 137, 'max_depth': 15, 'learning_rate': 0.2979829832799959}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:35,246] Trial 27 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 233, 'max_depth': 13, 'learning_rate': 0.40551266907874955}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:35,512] Trial 28 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 335, 'max_depth': 17, 'learning_rate': 0.10970621091008846}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:35,690] Trial 29 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 172, 'max_depth': 5, 'learning_rate': 0.04204349622956343}. Best is trial 9 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:47:35,906] A new study created in memory with name: no-name-378b81b9-66a7-4b46-97f8-a4a07ca59741


Optimizing AdaBoost...


[I 2025-01-02 07:47:36,320] Trial 0 finished with value: 0.7433333333333333 and parameters: {'n_estimators': 139, 'learning_rate': 0.8556733909496829}. Best is trial 0 with value: 0.7433333333333333.
[I 2025-01-02 07:47:36,521] Trial 1 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 61, 'learning_rate': 0.7261327223729138}. Best is trial 0 with value: 0.7433333333333333.
[I 2025-01-02 07:47:36,960] Trial 2 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 149, 'learning_rate': 0.5979168312658678}. Best is trial 0 with value: 0.7433333333333333.
[I 2025-01-02 07:47:37,162] Trial 3 finished with value: 0.74 and parameters: {'n_estimators': 65, 'learning_rate': 0.48497317842097304}. Best is trial 0 with value: 0.7433333333333333.
[I 2025-01-02 07:47:37,848] Trial 4 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 228, 'learning_rate': 0.8963398963608571}. Best is trial 0 with value: 0.7433333333333333.
[I 2025-01-02 0

Optimizing Neural Network...


[I 2025-01-02 07:48:01,219] Trial 0 finished with value: 0.8 and parameters: {'hidden_layer_1': 48, 'hidden_layer_2': 77, 'learning_rate_init': 0.017188045534543082}. Best is trial 0 with value: 0.8.
[I 2025-01-02 07:48:01,723] Trial 1 finished with value: 0.7233333333333334 and parameters: {'hidden_layer_1': 15, 'hidden_layer_2': 43, 'learning_rate_init': 0.09324404520329169}. Best is trial 0 with value: 0.8.
[I 2025-01-02 07:48:02,673] Trial 2 finished with value: 0.7433333333333333 and parameters: {'hidden_layer_1': 28, 'hidden_layer_2': 83, 'learning_rate_init': 0.0905217768494729}. Best is trial 0 with value: 0.8.
[I 2025-01-02 07:48:03,804] Trial 3 finished with value: 0.7633333333333333 and parameters: {'hidden_layer_1': 38, 'hidden_layer_2': 66, 'learning_rate_init': 0.0860274652392535}. Best is trial 0 with value: 0.8.
[I 2025-01-02 07:48:05,419] Trial 4 finished with value: 0.7666666666666667 and parameters: {'hidden_layer_1': 86, 'hidden_layer_2': 33, 'learning_rate_init': 0

Optimizing MLP...


[I 2025-01-02 07:48:46,060] Trial 0 finished with value: 0.7666666666666667 and parameters: {'layer_1': 54, 'layer_2': 128, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.07080354687620669}. Best is trial 0 with value: 0.7666666666666667.
[I 2025-01-02 07:48:47,203] Trial 1 finished with value: 0.7666666666666667 and parameters: {'layer_1': 110, 'layer_2': 92, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.09446692243342501}. Best is trial 0 with value: 0.7666666666666667.
[I 2025-01-02 07:48:47,786] Trial 2 finished with value: 0.7133333333333334 and parameters: {'layer_1': 63, 'layer_2': 63, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.09420947307452292}. Best is trial 0 with value: 0.7666666666666667.
[I 2025-01-02 07:48:50,029] Trial 3 finished with value: 0.8266666666666667 and parameters: {'layer_1': 123, 'layer_2': 64, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.06545938076796519}. Best is trial 3 with valu

Dataset saved successfully!




Class Feature Vector (CFV)

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]

# Define models with hyperparameter optimization (Optuna)
models = {
    "SVM": lambda trial: SVC(
         probability=True,
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}

# Initialize a list to store the CFV data
cfv_data = []

# Define the optimization and prediction function
def optimize_and_predict(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]  # Get probability for class 1 (positive)
        return accuracy_score(y_val, model.predict(X_val))

    # Perform optimization with Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Return the best model
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)
    preds = best_model.predict_proba(X_val)[:, 1]  # Using the probability for class 1 (positive)

    # Append predictions to CFV list
    return preds

# Train each model and generate predictions for CFV
for model_name, model_func in models.items():
    print(f"Training and predicting with {model_name}...")
    preds = optimize_and_predict(model_name, model_func)
    cfv_data.append(preds)

# Convert the CFV data into a DataFrame
cfv_df = pd.DataFrame(np.array(cfv_data).T, columns=models.keys())

# Optionally, add the true labels column
cfv_df["True_Label"] = y_val.values

# Save the CFV dataset to CSV
cfv_df.to_csv("CFV_GDC.csv", index=False)
print("CFV dataset created and saved!")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-16 03:42:52,545] A new study created in memory with name: no-name-5bd4801d-c6a6-4cb8-888a-4f534057a7e7


Training and predicting with SVM...


[I 2025-01-16 03:42:52,806] Trial 0 finished with value: 0.7833333333333333 and parameters: {'C': 4.374651706952948, 'kernel': 'rbf'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:42:53,048] Trial 1 finished with value: 0.7833333333333333 and parameters: {'C': 4.58689453320545, 'kernel': 'rbf'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:42:53,296] Trial 2 finished with value: 0.78 and parameters: {'C': 3.3710181330853315, 'kernel': 'poly'}. Best is trial 0 with value: 0.7833333333333333.
[I 2025-01-16 03:42:53,602] Trial 3 finished with value: 0.7866666666666666 and parameters: {'C': 6.504792719554885, 'kernel': 'poly'}. Best is trial 3 with value: 0.7866666666666666.
[I 2025-01-16 03:42:53,836] Trial 4 finished with value: 0.7533333333333333 and parameters: {'C': 0.4787956273302435, 'kernel': 'rbf'}. Best is trial 3 with value: 0.7866666666666666.
[I 2025-01-16 03:42:54,087] Trial 5 finished with value: 0.62 and parameters: {'C': 8.3468823620

Training and predicting with Decision Tree...


[I 2025-01-16 03:43:01,559] Trial 12 finished with value: 0.85 and parameters: {'max_depth': 17, 'min_samples_split': 4}. Best is trial 11 with value: 0.8566666666666667.
[I 2025-01-16 03:43:01,585] Trial 13 finished with value: 0.84 and parameters: {'max_depth': 17, 'min_samples_split': 4}. Best is trial 11 with value: 0.8566666666666667.
[I 2025-01-16 03:43:01,610] Trial 14 finished with value: 0.8266666666666667 and parameters: {'max_depth': 18, 'min_samples_split': 4}. Best is trial 11 with value: 0.8566666666666667.
[I 2025-01-16 03:43:01,635] Trial 15 finished with value: 0.8366666666666667 and parameters: {'max_depth': 9, 'min_samples_split': 3}. Best is trial 11 with value: 0.8566666666666667.
[I 2025-01-16 03:43:01,660] Trial 16 finished with value: 0.8333333333333334 and parameters: {'max_depth': 9, 'min_samples_split': 4}. Best is trial 11 with value: 0.8566666666666667.
[I 2025-01-16 03:43:01,686] Trial 17 finished with value: 0.8566666666666667 and parameters: {'max_depth'

Training and predicting with Random Forest...


[I 2025-01-16 03:43:04,234] Trial 0 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 464, 'max_depth': 14, 'min_samples_split': 6}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 03:43:06,127] Trial 1 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 240, 'max_depth': 10, 'min_samples_split': 3}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 03:43:07,519] Trial 2 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 142, 'max_depth': 9, 'min_samples_split': 9}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 03:43:07,775] Trial 3 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 97, 'max_depth': 5, 'min_samples_split': 4}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-16 03:43:08,277] Trial 4 finished with value: 0.77 and parameters: {'n_estimators': 213, 'max_depth': 4, 'min_samples_split': 4}. Best is trial 0 with value: 0.8666666666666667

Training and predicting with Logistic Regression...


[I 2025-01-16 03:43:35,595] Trial 5 finished with value: 0.7033333333333334 and parameters: {'C': 7.216729216758223, 'solver': 'liblinear'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-16 03:43:35,629] Trial 6 finished with value: 0.6966666666666667 and parameters: {'C': 4.189263713944772, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-16 03:43:35,671] Trial 7 finished with value: 0.7066666666666667 and parameters: {'C': 8.908624191281149, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-16 03:43:35,710] Trial 8 finished with value: 0.7 and parameters: {'C': 4.297944736475218, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-16 03:43:35,735] Trial 9 finished with value: 0.7033333333333334 and parameters: {'C': 7.240368798478537, 'solver': 'liblinear'}. Best is trial 3 with value: 0.7066666666666667.
[I 2025-01-16 03:43:35,786] Trial 10 finished with value: 0.7033333333333334 and p

Training and predicting with k-NN...


[I 2025-01-16 03:43:36,888] Trial 4 finished with value: 0.82 and parameters: {'n_neighbors': 13}. Best is trial 0 with value: 0.85.
[I 2025-01-16 03:43:36,929] Trial 5 finished with value: 0.82 and parameters: {'n_neighbors': 14}. Best is trial 0 with value: 0.85.
[I 2025-01-16 03:43:36,972] Trial 6 finished with value: 0.8033333333333333 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.85.
[I 2025-01-16 03:43:37,013] Trial 7 finished with value: 0.82 and parameters: {'n_neighbors': 13}. Best is trial 0 with value: 0.85.
[I 2025-01-16 03:43:37,048] Trial 8 finished with value: 0.82 and parameters: {'n_neighbors': 9}. Best is trial 0 with value: 0.85.
[I 2025-01-16 03:43:37,078] Trial 9 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 3}. Best is trial 9 with value: 0.8533333333333334.
[I 2025-01-16 03:43:37,121] Trial 10 finished with value: 0.82 and parameters: {'n_neighbors': 9}. Best is trial 9 with value: 0.8533333333333334.
[I 2025-01-16 0

Training and predicting with Naive Bayes...


[I 2025-01-16 03:43:38,101] Trial 16 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,115] Trial 17 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,126] Trial 18 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,137] Trial 19 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,147] Trial 20 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,158] Trial 21 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,173] Trial 22 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,185] Trial 23 finished with value: 0.68 and parameters: {}. Best is trial 0 with value: 0.68.
[I 2025-01-16 03:43:38,196] Trial 24 finished with value: 0.68 a

Training and predicting with Gradient Boosting...


[I 2025-01-16 03:43:43,308] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 456, 'learning_rate': 0.12579723141646001, 'max_depth': 16}. Best is trial 0 with value: 0.8466666666666667.
[I 2025-01-16 03:43:44,094] Trial 1 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 58, 'learning_rate': 0.2865946143407616, 'max_depth': 15}. Best is trial 1 with value: 0.8533333333333334.
[I 2025-01-16 03:43:47,204] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 274, 'learning_rate': 0.22427368103689088, 'max_depth': 17}. Best is trial 1 with value: 0.8533333333333334.
[I 2025-01-16 03:43:51,543] Trial 3 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 295, 'learning_rate': 0.39882409512900147, 'max_depth': 18}. Best is trial 1 with value: 0.8533333333333334.
[I 2025-01-16 03:43:53,157] Trial 4 finished with value: 0.87 and parameters: {'n_estimators': 174, 'learning_rate': 0.11029407854

Training and predicting with XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:44:50,666] Trial 0 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 460, 'max_depth': 6, 'learning_rate': 0.04685740059946166}. Best is trial 0 with value: 0.8666666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:44:50,786] Trial 1 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 71, 'max_depth': 16, 'learning_rate': 0.27651621512480096}. Best is trial 1 with value: 0.8766666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:44:51,038] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 245, 'max_depth': 16, 'learning_rate': 0.4597181332128624}. Best is trial 1 with value: 0.8766666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:44:51,169] Trial 3 finished with value: 0.86 and parameters: {'n_estimators': 269, 'max_depth': 3, 'learning_rate': 0.4830282188

Training and predicting with LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001023 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:44:59,124] Trial 0 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 306, 'max_depth': 18, 'learning_rate': 0.23536108299118907}. Best is trial 0 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:44:59,356] Trial 1 finished with value: 0.86 and parameters: {'n_estimators': 284, 'max_depth': 15, 'learning_rate': 0.23221630118737518}. Best is trial 1 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:44:59,703] Trial 2 finished with value: 0.86 and parameters: {'n_estimators': 415, 'max_depth': 4, 'learning_rate': 0.07144955539908283}. Best is trial 1 with value: 0.86.
[I 2025-01-16 03:44:59,763] Trial 3 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 67, 'max_depth': 17, 'learning_rate': 0.2917622570182837}. Best is trial 3 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:00,078] Trial 4 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 277, 'max_depth': 8, 'learning_rate': 0.1130798200598343}. Best is trial 3 with value: 0.8766666666666667.
[I 2025-01-16 03:45:00,256] Trial 5 finished with value: 0.87 and parameters: {'n_estimators': 144, 'max_depth': 18, 'learning_rate': 0.3191187613998777}. Best is trial 3 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:00,774] Trial 6 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 399, 'max_depth': 12, 'learning_rate': 0.4108480316965715}. Best is trial 3 with value: 0.8766666666666667.
[I 2025-01-16 03:45:00,868] Trial 7 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 73, 'max_depth': 16, 'learning_rate': 0.14709348689028323}. Best is trial 3 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:01,187] Trial 8 finished with value: 0.86 and parameters: {'n_estimators': 370, 'max_depth': 3, 'learning_rate': 0.4024268650558796}. Best is trial 3 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000360 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:01,718] Trial 9 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 387, 'max_depth': 11, 'learning_rate': 0.04050694065677521}. Best is trial 3 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:02,005] Trial 10 finished with value: 0.88 and parameters: {'n_estimators': 173, 'max_depth': 20, 'learning_rate': 0.48725691933912707}. Best is trial 10 with value: 0.88.
[I 2025-01-16 03:45:02,229] Trial 11 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 158, 'max_depth': 20, 'learning_rate': 0.4788959263979424}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:02,486] Trial 12 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 181, 'max_depth': 20, 'learning_rate': 0.4995433485140168}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:02,734] Trial 13 finished with value: 0.87 and parameters: {'n_estimators': 182, 'max_depth': 20, 'learning_rate': 0.49538829126389855}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:03,026] Trial 14 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 208, 'max_depth': 13, 'learning_rate': 0.4004335410342684}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000196 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:03,710] Trial 15 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 500, 'max_depth': 9, 'learning_rate': 0.4575275866646884}. Best is trial 11 with value: 0.8833333333333333.
[I 2025-01-16 03:45:03,887] Trial 16 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 120, 'max_depth': 20, 'learning_rate': 0.3401335550960639}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000183 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:04,233] Trial 17 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 219, 'max_depth': 15, 'learning_rate': 0.43731215931620415}. Best is trial 11 with value: 0.8833333333333333.
[I 2025-01-16 03:45:04,409] Trial 18 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 120, 'max_depth': 18, 'learning_rate': 0.36763088903799107}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000171 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:04,761] Trial 19 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 234, 'max_depth': 6, 'learning_rate': 0.17183705685583367}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:05,135] Trial 20 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 323, 'max_depth': 14, 'learning_rate': 0.4601899861526539}. Best is trial 11 with value: 0.8833333333333333.
[I 2025-01-16 03:45:05,202] Trial 21 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 51, 'max_depth': 17, 'learning_rate': 0.30187710362494097}. Best is trial 11 with value: 0.8833333333333333.
[I 2025-01-16 03:45:05,297] Trial 22 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 96, 'max_depth': 19, 'learning_rate': 0.27452226006197006}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000180 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGB

[I 2025-01-16 03:45:05,429] Trial 23 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 138, 'max_depth': 17, 'learning_rate': 0.36965327415959287}. Best is trial 11 with value: 0.8833333333333333.
[I 2025-01-16 03:45:05,568] Trial 24 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 169, 'max_depth': 20, 'learning_rate': 0.17673432292778504}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:05,662] Trial 25 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 84, 'max_depth': 17, 'learning_rate': 0.47024132791418943}. Best is trial 11 with value: 0.8833333333333333.
[I 2025-01-16 03:45:05,802] Trial 26 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 157, 'max_depth': 19, 'learning_rate': 0.35545724949188034}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:06,037] Trial 27 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 240, 'max_depth': 16, 'learning_rate': 0.2721891180787755}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:06,249] Trial 28 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 242, 'max_depth': 15, 'learning_rate': 0.20510890719011604}. Best is trial 11 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:45:06,551] Trial 29 finished with value: 0.86 and parameters: {'n_estimators': 313, 'max_depth': 19, 'learning_rate': 0.43235009718295225}. Best is trial 11 with value: 0.8833333333333333.
[I 2025-01-16 03:45:06,682] A new study created in memory with name: no-name-a496eb14-80f4-4d9a-831b-18beee37dd28


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000177 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training and predicting with AdaBoost...


[I 2025-01-16 03:45:07,210] Trial 0 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 157, 'learning_rate': 0.07195361046721979}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-16 03:45:07,665] Trial 1 finished with value: 0.73 and parameters: {'n_estimators': 145, 'learning_rate': 0.4080878195368856}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-16 03:45:08,681] Trial 2 finished with value: 0.73 and parameters: {'n_estimators': 316, 'learning_rate': 0.31120797246555454}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-16 03:45:09,916] Trial 3 finished with value: 0.7233333333333334 and parameters: {'n_estimators': 408, 'learning_rate': 0.2058335847953856}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-16 03:45:11,228] Trial 4 finished with value: 0.7366666666666667 and parameters: {'n_estimators': 433, 'learning_rate': 0.5487586100648937}. Best is trial 0 with value: 0.7366666666666667.
[I 2025-01-16 03:45:12,312

Training and predicting with Neural Network...


[I 2025-01-16 03:45:40,436] Trial 0 finished with value: 0.8033333333333333 and parameters: {'hidden_layer_1': 62, 'hidden_layer_2': 60, 'learning_rate_init': 0.08224088138725932}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-16 03:45:41,335] Trial 1 finished with value: 0.7966666666666666 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 99, 'learning_rate_init': 0.05833172345210263}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-16 03:45:41,873] Trial 2 finished with value: 0.7966666666666666 and parameters: {'hidden_layer_1': 63, 'hidden_layer_2': 62, 'learning_rate_init': 0.0748348403102436}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-16 03:45:43,245] Trial 3 finished with value: 0.8133333333333334 and parameters: {'hidden_layer_1': 98, 'hidden_layer_2': 74, 'learning_rate_init': 0.021630466402413503}. Best is trial 3 with value: 0.8133333333333334.
[I 2025-01-16 03:45:45,215] Trial 4 finished with value: 0.8066666666666666 and para

Training and predicting with MLP...


[I 2025-01-16 03:46:27,862] Trial 0 finished with value: 0.5 and parameters: {'layer_1': 150, 'layer_2': 142, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.08039020041500193}. Best is trial 0 with value: 0.5.
[I 2025-01-16 03:46:29,717] Trial 1 finished with value: 0.7133333333333334 and parameters: {'layer_1': 52, 'layer_2': 65, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.041531712880350846}. Best is trial 1 with value: 0.7133333333333334.
[I 2025-01-16 03:46:29,991] Trial 2 finished with value: 0.5 and parameters: {'layer_1': 112, 'layer_2': 88, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.08164991180756852}. Best is trial 1 with value: 0.7133333333333334.
[I 2025-01-16 03:46:31,096] Trial 3 finished with value: 0.7066666666666667 and parameters: {'layer_1': 68, 'layer_2': 56, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.004714162155213947}. Best is trial 1 with value: 0.7133333333333334.
[I 2025-01-16 

CFV dataset created and saved!


CPFV (Combined Probability and Class Feature Vector)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier


# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)], ignore_index=True)
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)], ignore_index=True)

# Separate features and labels
X_train = main_data.drop(columns=["label"])
y_train = main_data["label"]
X_val = validation_data.drop(columns=["label"])
y_val = validation_data["label"]

# Initialize models with their tuned hyperparameters
trained_models = {
    "SVM": SVC(C=1.0, kernel="rbf", probability=True),  # Example parameters
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5),
    "Logistic Regression": LogisticRegression(C=1.0, solver="lbfgs"),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(n_estimators=100, max_depth=10, learning_rate=0.1),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1),
    "Neural Network (MLPClassifier)": MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200),
    "Multilayer Perceptron (Custom MLP)": MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200)
}

# Train all models on the training dataset
for model_name, model in trained_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

# Function to create CPFV dataset
def create_cpfv(models, X_data, y_data):
    cpfv_data = pd.DataFrame()
    for model_name, model in models.items():
        # Add predicted class labels
        cpfv_data[f"{model_name}_Class"] = model.predict(X_data)
        # Add predicted probabilities or decision scores
        if hasattr(model, "predict_proba"):
            cpfv_data[f"{model_name}_Prob"] = model.predict_proba(X_data)[:, 1]
        elif hasattr(model, "decision_function"):
            cpfv_data[f"{model_name}_Prob"] = model.decision_function(X_data)
        else:
            cpfv_data[f"{model_name}_Prob"] = cpfv_data[f"{model_name}_Class"]
    # Add true labels
    cpfv_data["True_Label"] = y_data.reset_index(drop=True)
    return cpfv_data

# Create CPFV dataset using validation data
cpfv_dataset = create_cpfv(trained_models, X_val, y_val)

# Save CPFV dataset to CSV
cpfv_dataset.to_csv("CPFV_GDC.csv", index=False)

Training SVM...
Training Decision Tree...
Training Random Forest...
Training Logistic Regression...
Training k-NN...
Training Naive Bayes...
Training Gradient Boosting...
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.



Training AdaBoost...
Training Neural Network (MLPClassifier)...
Training Multilayer Perceptron (Custom MLP)...


grids for RandomizedSearchCV

In [None]:
#import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier # Import path for KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for RandomizedSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply random search
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing RandomizedSearchCV for SVM...
Performing RandomizedSearchCV for Decision Tree...
Performing RandomizedSearchCV for Random Forest...


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

Performing RandomizedSearchCV for Logistic Regression...




Performing RandomizedSearchCV for k-NN...
Performing RandomizedSearchCV for Naive Bayes...
Performing RandomizedSearchCV for Gradient Boosting...
Performing RandomizedSearchCV for XGBoost...
Performing RandomizedSearchCV for LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing RandomizedSearchCV for CatBoost...
Performing RandomizedSearchCV for AdaBoost...




Performing RandomizedSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.792079   
1         Decision Tree    0.831638   
2         Random Forest    0.853974   
3   Logistic Regression    0.719032   
4                  k-NN    0.847081   
5           Naive Bayes    0.683824   
6     Gradient Boosting    0.858254   
7               XGBoost    0.860837   
8              LightGBM    0.859113   
9              CatBoost    0.862561   
10             AdaBoost    0.795471   
11       Neural Network    0.709564   

                                      Best Parameters  
0        {'kernel': 'rbf', 'gamma': 'scale', 'C': 10}  
1   {'min_samples_split': 2, 'min_samples_leaf': 2...  
2   {'n_estimators': 200, 'min_samples_split': 5, ...  
3   {'solver': 'liblinear', 'penalty': 'l2', 'C': ...  
4   {'weights': 'distance', 'n_neighbors': 3, 'met...  
5             {'var_smoothing': 0.012915496650148827}  
6   {'n_estimators': 200, 'max_depth': 7, 'learnin...  
7   {'n_estimators': 200, 'max_depth': 7, 'l

In [None]:
# Storage for predictions and target column
probability_datasets = pd.DataFrame(y_train, columns=['Target'])

# Loop through models, perform random search, and save probabilities
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model,
                                       param_grid,
                                       n_iter=10,
                                       cv=cv,
                                       scoring='accuracy',
                                       n_jobs=-1,
                                       random_state=42)

    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)



    # Get probability predictions (if supported)
    if hasattr(random_search.best_estimator_, "predict_proba"):
        probabilities = random_search.best_estimator_.predict_proba(X_train)[:, 1]  # Probability for the positive class
        probability_datasets[f"{model_name}_Probabilities"] = probabilities
    else:
        # Fallback if probability prediction isn't supported
        predictions = random_search.best_estimator_.predict(X_train)
        probability_datasets[f"{model_name}_Predictions"] = predictions



# Display final dataset with probabilities
print(probability_datasets.head())

# Save the probability dataset to a CSV file
probability_datasets.to_csv("model_probabilities_with_target GDC in Randomsearch .csv", index=False)
print("Probability dataset saved to 'model_probabilities_with_target.csv'.")


Performing RandomizedSearchCV for SVM...
Performing RandomizedSearchCV for Decision Tree...
Performing RandomizedSearchCV for Random Forest...


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

Performing RandomizedSearchCV for Logistic Regression...




Performing RandomizedSearchCV for k-NN...
Performing RandomizedSearchCV for Naive Bayes...
Performing RandomizedSearchCV for Gradient Boosting...
Performing RandomizedSearchCV for XGBoost...
Performing RandomizedSearchCV for LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000334 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing RandomizedSearchCV for CatBoost...
Performing RandomizedSearchCV for AdaBoost...




Performing RandomizedSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


   Target  SVM_Predictions  Decision Tree_Probabilities  \
0     1.0              0.0                     1.000000   
1     1.0              1.0                     0.973684   
2     1.0              1.0                     1.000000   
3     1.0              0.0                     0.666667   
4     1.0              1.0                     0.973684   

   Random Forest_Probabilities  Logistic Regression_Probabilities  \
0                     0.997857                           0.317050   
1                     0.972599                           0.990989   
2                     0.997500                           0.972281   
3                     0.725478                           0.473788   
4                     0.972599                           0.990989   

   k-NN_Probabilities  Naive Bayes_Probabilities  \
0                 1.0                   0.262839   
1                 1.0                   1.000000   
2                 1.0                   1.000000   
3                 1.0 

# parameter grids for GridSearchCV

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for GridSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing GridSearchCV for SVM...
Performing GridSearchCV for Decision Tree...
Performing GridSearchCV for Random Forest...


135 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Performing GridSearchCV for Logistic Regression...
Performing GridSearchCV for k-NN...
Performing GridSearchCV for Naive Bayes...
Performing GridSearchCV for Gradient Boosting...
Performing GridSearchCV for XGBoost...




Performing GridSearchCV for LightGBM...


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing GridSearchCV for CatBoost...
Performing GridSearchCV for AdaBoost...




Performing GridSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.807533   
1         Decision Tree    0.829051   
2         Random Forest    0.859120   
3   Logistic Regression    0.719032   
4                  k-NN    0.847081   
5           Naive Bayes    0.683824   
6     Gradient Boosting    0.857407   
7               XGBoost    0.860837   
8              LightGBM    0.860830   
9              CatBoost    0.862561   
10             AdaBoost    0.795471   
11       Neural Network    0.717341   

                                      Best Parameters  
0      {'C': 100, 'gamma': 'scale', 'kernel': 'poly'}  
1   {'max_depth': 30, 'min_samples_leaf': 2, 'min_...  
2   {'max_depth': None, 'max_features': 'sqrt', 'm...  
3   {'C': 100, 'penalty': 'l2', 'solver': 'libline...  
4   {'metric': 'manhattan', 'n_neighbors': 3, 'wei...  
5             {'var_smoothing': 0.012915496650148827}  
6   {'learning_rate': 0.1, 'max_depth': 7, 'n_esti...  
7   {'learning_rate': 0.1, 'max_depth': 7, '

In [None]:
# Prepare the final dataset with probabilities and target
all_probabilities = []
all_targets = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")

    # Special handling for SVC: enable probability estimation
    if model_name == "SVM":
        model.probability = True  # Enable probability for SVC

    # Get the parameter grid for the current model
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

    # Predict probabilities using the best estimator
    best_model = grid_search.best_estimator_
    if hasattr(best_model, "predict_proba"):
        probabilities = best_model.predict_proba(X_train)[:, 1]  # Positive class probabilities
    else:
        # Fallback for models without predict_proba (e.g., SVM with linear kernel)
        probabilities = best_model.decision_function(X_train)
        probabilities = (probabilities - probabilities.min()) / (probabilities.max() - probabilities.min())

    # Append probabilities and targets for this model
    all_probabilities.append(probabilities)
    all_targets.append(y_train)

    # Combine probabilities, features, and target into a DataFrame
    model_data = pd.DataFrame(X_train, columns=main_p.columns)  # Ensure column consistency
    model_data[f"{model_name}_probability"] = probabilities
    model_data['target'] = y_train

    # Save to CSV
    output_path = f"/content/{model_name}_probabilities.csv"
    model_data.to_csv(output_path, index=False)
    print(f"Saved probabilities for {model_name} to {output_path}")

# Combine all model probabilities into a single DataFrame (optional)
final_dataset = pd.DataFrame({'target': y_train})
for idx, model_name in enumerate(models.keys()):
    final_dataset[f"{model_name}_probability"] = all_probabilities[idx]

# Save the combined dataset
final_output_path = "/content/combined_probabilities_GridSearchCV.csv"
final_dataset.to_csv(final_output_path, index=False)
print(f"Saved combined dataset to {final_output_path}")


Performing GridSearchCV for SVM...
Saved probabilities for SVM to /content/SVM_probabilities.csv
Performing GridSearchCV for Decision Tree...
Saved probabilities for Decision Tree to /content/Decision Tree_probabilities.csv
Performing GridSearchCV for Random Forest...


135 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
70 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
skl

Saved probabilities for Random Forest to /content/Random Forest_probabilities.csv
Performing GridSearchCV for Logistic Regression...
Saved probabilities for Logistic Regression to /content/Logistic Regression_probabilities.csv
Performing GridSearchCV for k-NN...
Saved probabilities for k-NN to /content/k-NN_probabilities.csv
Performing GridSearchCV for Naive Bayes...
Saved probabilities for Naive Bayes to /content/Naive Bayes_probabilities.csv
Performing GridSearchCV for Gradient Boosting...
Saved probabilities for Gradient Boosting to /content/Gradient Boosting_probabilities.csv
Performing GridSearchCV for XGBoost...
Saved probabilities for XGBoost to /content/XGBoost_probabilities.csv
Performing GridSearchCV for LightGBM...


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000249 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 598
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 9
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Saved probabilities for LightGBM to /content/LightGBM_probabilities.csv
Performing GridSearchCV for CatBoost...
Saved probabilities for CatBoost to /content/CatBoost_probabilities.csv
Performing GridSearchCV for AdaBoost...




Saved probabilities for AdaBoost to /content/AdaBoost_probabilities.csv
Performing GridSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Saved probabilities for Neural Network to /content/Neural Network_probabilities.csv
Saved combined dataset to /content/combined_probabilities_GridSearchCV.csv


proposed model for GDC

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                            matthews_corrcoef, cohen_kappa_score, roc_auc_score)

# Load all datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Add target labels (1 for positive, 0 for negative)
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

# Combine main datasets for training and validation datasets for testing
train_data = pd.concat([main_p, main_n], axis=0)
validation_data = pd.concat([validation_p, validation_n], axis=0)

# Shuffle the datasets
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
validation_data = validation_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check for and drop any non-numeric columns (except Target)
non_numeric_cols = train_data.select_dtypes(exclude=[np.number]).columns.tolist()
if 'Target' in non_numeric_cols:
    non_numeric_cols.remove('Target')
if non_numeric_cols:
    print(f"Dropping non-numeric columns: {non_numeric_cols}")
    train_data = train_data.drop(columns=non_numeric_cols)
    validation_data = validation_data.drop(columns=non_numeric_cols)

# Separate features and labels
X_train = train_data.drop(columns=['Target']).values
y_train = train_data['Target'].values
X_val = validation_data.drop(columns=['Target']).values
y_val = validation_data['Target'].values

# Check feature dimensions
print(f"Training data shape: {X_train.shape}")
print(f"Validation data shape: {X_val.shape}")

if X_train.shape[1] == 0:
    raise ValueError("No features found! Check your input data.")

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# For GDC features, we'll use a Dense network instead of Conv1D
model = Sequential()

# Input layer
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Hidden layers
model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the Model
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# Model Summary
model.summary()

# Add early stopping to prevent overfitting
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    patience=10,
    restore_best_weights=True
)

# Train the model
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=20,
    batch_size=32,
    verbose=1,
    callbacks=[early_stopping]
)

# Evaluate the model on the validation data
val_probabilities = model.predict(X_val).flatten()
val_predictions = (val_probabilities > 0.5).astype(int)

# Calculate all metrics
accuracy = accuracy_score(y_val, val_predictions)
tn, fp, fn, tp = confusion_matrix(y_val, val_predictions).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
mcc = matthews_corrcoef(y_val, val_predictions)
kappa = cohen_kappa_score(y_val, val_predictions)
auc = roc_auc_score(y_val, val_probabilities)

# Print all metrics
print("\nValidation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"Area Under Curve (AUC): {auc:.4f}")

# Classification report
print("\nClassification Report:\n", classification_report(y_val, val_predictions))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_predictions))

Training data shape: (1164, 9)
Validation data shape: (300, 9)


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 24ms/step - accuracy: 0.6165 - loss: 0.7807 - val_accuracy: 0.6433 - val_loss: 0.6308
Epoch 2/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.6920 - loss: 0.6160 - val_accuracy: 0.7167 - val_loss: 0.5996
Epoch 3/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - accuracy: 0.7240 - loss: 0.5420 - val_accuracy: 0.7333 - val_loss: 0.5828
Epoch 4/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7045 - loss: 0.5402 - val_accuracy: 0.7433 - val_loss: 0.5539
Epoch 5/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7342 - loss: 0.5540 - val_accuracy: 0.7533 - val_loss: 0.5303
Epoch 6/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7450 - loss: 0.5177 - val_accuracy: 0.7400 - val_loss: 0.5180
Epoch 7/20
[1m37/37[0m [32m━━━━━━━

In [2]:
#cross validation 5 fold

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, matthews_corrcoef,
                           cohen_kappa_score, roc_auc_score)

# Load all datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_positive.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_main_negative.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_positive.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/7_GDC (Grouped Dipeptide Composition)/GDC_validation_negative.csv")

# Add target labels (1 for positive, 0 for negative)
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

# Combine all data for cross-validation
all_data = pd.concat([main_p, main_n, validation_p, validation_n])
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check and drop non-numeric columns
non_numeric_cols = all_data.select_dtypes(exclude=[np.number]).columns.tolist()
if 'Target' in non_numeric_cols:
    non_numeric_cols.remove('Target')
if non_numeric_cols:
    print(f"Dropping non-numeric columns: {non_numeric_cols}")
    all_data = all_data.drop(columns=non_numeric_cols)

# Separate features and labels
X = all_data.drop(columns=['Target']).values
y = all_data['Target'].values

# Initialize 5-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
results = []

def build_model(input_dim):
    model = Sequential([
        Dense(256, activation='relu', input_shape=(input_dim,), kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.5),

        Dense(128, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.4),

        Dense(64, activation='relu', kernel_regularizer=l2(0.01)),
        BatchNormalization(),
        Dropout(0.3),

        Dense(1, activation='sigmoid')
    ])

    optimizer = Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer,
                 loss='binary_crossentropy',
                 metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])
    return model

for train_idx, val_idx in kfold.split(X, y):
    print(f'\n{"="*40}')
    print(f'Training fold {fold_no}')
    print(f'{"="*40}')

    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Normalize features
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)

    # Build model
    model = build_model(X_train.shape[1])

    # Callbacks
    early_stop = EarlyStopping(monitor='val_auc', patience=15, mode='max', restore_best_weights=True)
    checkpoint = ModelCheckpoint(f'best_model_fold{fold_no}.h5', monitor='val_auc', save_best_only=True, mode='max')

    # Class weighting for imbalanced data
    class_counts = np.bincount(y_train)
    class_weight = {0: 1/class_counts[0], 1: 1/class_counts[1]}

    # Train model
    history = model.fit(
        X_train, y_train,
        validation_data=(X_val, y_val),
        epochs=20,
        batch_size=32,
        callbacks=[early_stop, checkpoint],
        class_weight=class_weight,
        verbose=1
    )

    # Load best model
    model.load_weights(f'best_model_fold{fold_no}.h5')

    # Evaluate
    val_probabilities = model.predict(X_val).flatten()
    val_predictions = (val_probabilities > 0.5).astype(int)

    # Calculate metrics
    tn, fp, fn, tp = confusion_matrix(y_val, val_predictions).ravel()
    metrics = {
        'fold': fold_no,
        'accuracy': accuracy_score(y_val, val_predictions),
        'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0,
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
        'mcc': matthews_corrcoef(y_val, val_predictions),
        'kappa': cohen_kappa_score(y_val, val_predictions),
        'auc': roc_auc_score(y_val, val_probabilities)
    }

    results.append(metrics)

    print(f'\nFold {fold_no} Results:')
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"MCC: {metrics['mcc']:.4f}")
    print(f"Sensitivity: {metrics['sensitivity']:.4f}")
    print(f"Specificity: {metrics['specificity']:.4f}")

    fold_no += 1

# Calculate average metrics
print('\n' + '='*50)
print('Cross-Validation Summary')
print('='*50)

avg_metrics = {
    'accuracy': np.mean([r['accuracy'] for r in results]),
    'sensitivity': np.mean([r['sensitivity'] for r in results]),
    'specificity': np.mean([r['specificity'] for r in results]),
    'mcc': np.mean([r['mcc'] for r in results]),
    'kappa': np.mean([r['kappa'] for r in results]),
    'auc': np.mean([r['auc'] for r in results])
}

print(f"\nAverage Metrics Across All Folds:")
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Sensitivity: {avg_metrics['sensitivity']:.4f}")
print(f"Specificity: {avg_metrics['specificity']:.4f}")
print(f"MCC: {avg_metrics['mcc']:.4f}")
print(f"Cohen's Kappa: {avg_metrics['kappa']:.4f}")
print(f"AUC: {avg_metrics['auc']:.4f}")

# Train final model on all data
print("\nTraining final model on all data...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

final_model = build_model(X_scaled.shape[1])

# Final class weights
class_counts = np.bincount(y)
final_class_weight = {0: 1/class_counts[0], 1: 1/class_counts[1]}

history = final_model.fit(
    X_scaled, y,
    epochs=20,
    batch_size=32,
    class_weight=final_class_weight,
    verbose=1
)

# Save final model
final_model.save('gdc_final_model.h5')
print("Final model training complete and saved!")

# Evaluate final model
final_probabilities = final_model.predict(X_scaled).flatten()
final_predictions = (final_probabilities > 0.5).astype(int)

# Calculate final metrics
tn, fp, fn, tp = confusion_matrix(y, final_predictions).ravel()
final_metrics = {
    'accuracy': accuracy_score(y, final_predictions),
    'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0,
    'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
    'mcc': matthews_corrcoef(y, final_predictions),
    'kappa': cohen_kappa_score(y, final_predictions),
    'auc': roc_auc_score(y, final_probabilities)
}

print("\nFinal Model Metrics:")
print(f"Accuracy: {final_metrics['accuracy']:.4f}")
print(f"AUC: {final_metrics['auc']:.4f}")
print(f"MCC: {final_metrics['mcc']:.4f}")
print(f"Sensitivity: {final_metrics['sensitivity']:.4f}")
print(f"Specificity: {final_metrics['specificity']:.4f}")


Training fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/20
[1m34/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.5534 - auc: 0.5730 - loss: 2.5179



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 22ms/step - accuracy: 0.5535 - auc: 0.5736 - loss: 2.4926 - val_accuracy: 0.5154 - val_auc: 0.6323 - val_loss: 2.5022
Epoch 2/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5470 - auc: 0.5998 - loss: 1.6401 - val_accuracy: 0.5017 - val_auc: 0.6194 - val_loss: 1.8794
Epoch 3/20
[1m27/37[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.5726 - auc: 0.6121 - loss: 1.0965



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5719 - auc: 0.6119 - loss: 1.0640 - val_accuracy: 0.5017 - val_auc: 0.6765 - val_loss: 1.4652
Epoch 4/20
[1m35/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.5837 - auc: 0.6064 - loss: 0.6879



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5857 - auc: 0.6088 - loss: 0.6823 - val_accuracy: 0.5017 - val_auc: 0.7333 - val_loss: 1.1939
Epoch 5/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.6209 - auc: 0.6762 - loss: 0.4345



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6207 - auc: 0.6764 - loss: 0.4321 - val_accuracy: 0.5017 - val_auc: 0.7596 - val_loss: 1.0160
Epoch 6/20
[1m32/37[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 5ms/step - accuracy: 0.6363 - auc: 0.6923 - loss: 0.2753



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6373 - auc: 0.6937 - loss: 0.2705 - val_accuracy: 0.5017 - val_auc: 0.7691 - val_loss: 0.8956
Epoch 7/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6716 - auc: 0.7261 - loss: 0.1676 - val_accuracy: 0.5017 - val_auc: 0.7620 - val_loss: 0.8217
Epoch 8/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6612 - auc: 0.7375 - loss: 0.1033



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6617 - auc: 0.7379 - loss: 0.1030 - val_accuracy: 0.5017 - val_auc: 0.7725 - val_loss: 0.7656
Epoch 9/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6920 - auc: 0.7558 - loss: 0.0630 - val_accuracy: 0.5017 - val_auc: 0.7517 - val_loss: 0.7417
Epoch 10/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7040 - auc: 0.7760 - loss: 0.0385 - val_accuracy: 0.5017 - val_auc: 0.7527 - val_loss: 0.7203
Epoch 11/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7319 - auc: 0.8092 - loss: 0.0235 - val_accuracy: 0.5017 - val_auc: 0.7606 - val_loss: 0.7100
Epoch 12/20
[1m25/37[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.6751 - auc: 0.7751 - loss: 0.0151



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6789 - auc: 0.7757 - loss: 0.0146 - val_accuracy: 0.5017 - val_auc: 0.7789 - val_loss: 0.7020
Epoch 13/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6841 - auc: 0.7562 - loss: 0.0091 - val_accuracy: 0.5017 - val_auc: 0.7670 - val_loss: 0.6930
Epoch 14/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6960 - auc: 0.7810 - loss: 0.0058 - val_accuracy: 0.5017 - val_auc: 0.7557 - val_loss: 0.6898
Epoch 15/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7203 - auc: 0.7836 - loss: 0.0038 - val_accuracy: 0.5324 - val_auc: 0.7682 - val_loss: 0.6791
Epoch 16/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7248 - auc: 0.7987 - loss: 0.0026 - val_accuracy: 0.54

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.5418 - auc: 0.5359 - loss: 2.4697



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.5409 - auc: 0.5356 - loss: 2.4575 - val_accuracy: 0.5188 - val_auc: 0.6333 - val_loss: 2.4727
Epoch 2/20
[1m30/37[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.4885 - auc: 0.4808 - loss: 1.6469



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.4909 - auc: 0.4859 - loss: 1.6121 - val_accuracy: 0.5017 - val_auc: 0.6567 - val_loss: 1.8467
Epoch 3/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.5114 - auc: 0.5208 - loss: 1.0478



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5124 - auc: 0.5223 - loss: 1.0423 - val_accuracy: 0.5017 - val_auc: 0.6840 - val_loss: 1.4346
Epoch 4/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5942 - auc: 0.6360 - loss: 0.6676



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5939 - auc: 0.6359 - loss: 0.6658 - val_accuracy: 0.5017 - val_auc: 0.7214 - val_loss: 1.1665
Epoch 5/20
[1m30/37[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5874 - auc: 0.6354 - loss: 0.4300



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5846 - auc: 0.6333 - loss: 0.4202 - val_accuracy: 0.5017 - val_auc: 0.7358 - val_loss: 0.9984
Epoch 6/20
[1m34/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.6294 - auc: 0.6925 - loss: 0.2650



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6315 - auc: 0.6941 - loss: 0.2619 - val_accuracy: 0.5017 - val_auc: 0.7589 - val_loss: 0.8932
Epoch 7/20
[1m31/37[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 5ms/step - accuracy: 0.6362 - auc: 0.6973 - loss: 0.1650



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6357 - auc: 0.6962 - loss: 0.1616 - val_accuracy: 0.5017 - val_auc: 0.7668 - val_loss: 0.8269
Epoch 8/20
[1m35/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.6714 - auc: 0.7352 - loss: 0.0997



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6707 - auc: 0.7343 - loss: 0.0989 - val_accuracy: 0.5017 - val_auc: 0.7679 - val_loss: 0.7866
Epoch 9/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6774 - auc: 0.7532 - loss: 0.0602 - val_accuracy: 0.5017 - val_auc: 0.7594 - val_loss: 0.7615
Epoch 10/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6913 - auc: 0.7500 - loss: 0.0367 - val_accuracy: 0.5017 - val_auc: 0.7212 - val_loss: 0.7430
Epoch 11/20
[1m34/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.7133 - auc: 0.7949 - loss: 0.0226



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7126 - auc: 0.7925 - loss: 0.0223 - val_accuracy: 0.5017 - val_auc: 0.7741 - val_loss: 0.7271
Epoch 12/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6956 - auc: 0.7687 - loss: 0.0138 - val_accuracy: 0.5017 - val_auc: 0.7688 - val_loss: 0.7281
Epoch 13/20
[1m33/37[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 5ms/step - accuracy: 0.6782 - auc: 0.7506 - loss: 0.0087



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6798 - auc: 0.7525 - loss: 0.0086 - val_accuracy: 0.5017 - val_auc: 0.7804 - val_loss: 0.7190
Epoch 14/20
[1m33/37[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 5ms/step - accuracy: 0.6672 - auc: 0.7458 - loss: 0.0055



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6706 - auc: 0.7489 - loss: 0.0055 - val_accuracy: 0.5017 - val_auc: 0.7821 - val_loss: 0.7046
Epoch 15/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.7046 - auc: 0.7791 - loss: 0.0035



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7047 - auc: 0.7791 - loss: 0.0035 - val_accuracy: 0.5017 - val_auc: 0.7874 - val_loss: 0.6910
Epoch 16/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7068 - auc: 0.7900 - loss: 0.0024 - val_accuracy: 0.5017 - val_auc: 0.7420 - val_loss: 0.6889
Epoch 17/20
[1m35/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.6912 - auc: 0.7513 - loss: 0.0019



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6928 - auc: 0.7537 - loss: 0.0019 - val_accuracy: 0.5392 - val_auc: 0.7906 - val_loss: 0.6609
Epoch 18/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6843 - auc: 0.7727 - loss: 0.0015 - val_accuracy: 0.5495 - val_auc: 0.7858 - val_loss: 0.6610
Epoch 19/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6896 - auc: 0.7746 - loss: 0.0013 - val_accuracy: 0.5188 - val_auc: 0.7849 - val_loss: 0.6601
Epoch 20/20
[1m31/37[0m [32m━━━━━━━━━━━━━━━━[0m[37m━━━━[0m [1m0s[0m 7ms/step - accuracy: 0.7117 - auc: 0.7983 - loss: 0.0012



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.7097 - auc: 0.7941 - loss: 0.0012 - val_accuracy: 0.6724 - val_auc: 0.7930 - val_loss: 0.6104
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step

Fold 2 Results:
Accuracy: 0.6724
AUC: 0.7937
MCC: 0.3499
Sensitivity: 0.7619
Specificity: 0.5822

Training fold 3
Epoch 1/20


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m26/37[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.4394 - auc: 0.4514 - loss: 2.5664



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.4500 - auc: 0.4581 - loss: 2.4885 - val_accuracy: 0.4881 - val_auc: 0.4877 - val_loss: 2.5097
Epoch 2/20
[1m32/37[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 5ms/step - accuracy: 0.5160 - auc: 0.5325 - loss: 1.6631



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5174 - auc: 0.5314 - loss: 1.6370 - val_accuracy: 0.4983 - val_auc: 0.5589 - val_loss: 1.8737
Epoch 3/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.5415 - auc: 0.5537 - loss: 1.0677



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5421 - auc: 0.5543 - loss: 1.0621 - val_accuracy: 0.4983 - val_auc: 0.6226 - val_loss: 1.4554
Epoch 4/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5798 - auc: 0.6162 - loss: 0.6830



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.5798 - auc: 0.6163 - loss: 0.6812 - val_accuracy: 0.4983 - val_auc: 0.7135 - val_loss: 1.1838
Epoch 5/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.6192 - auc: 0.6583 - loss: 0.4341



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6186 - auc: 0.6577 - loss: 0.4317 - val_accuracy: 0.4983 - val_auc: 0.7639 - val_loss: 1.0151
Epoch 6/20
[1m35/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 4ms/step - accuracy: 0.6504 - auc: 0.6944 - loss: 0.2727



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6491 - auc: 0.6935 - loss: 0.2704 - val_accuracy: 0.4983 - val_auc: 0.8091 - val_loss: 0.9120
Epoch 7/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6427 - auc: 0.7065 - loss: 0.1676 - val_accuracy: 0.4983 - val_auc: 0.7830 - val_loss: 0.8485
Epoch 8/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6523 - auc: 0.7151 - loss: 0.1031 - val_accuracy: 0.4983 - val_auc: 0.7996 - val_loss: 0.8139
Epoch 9/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6714 - auc: 0.7410 - loss: 0.0631 - val_accuracy: 0.4983 - val_auc: 0.7891 - val_loss: 0.7855
Epoch 10/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6837 - auc: 0.7652 - loss: 0.0386 - val_accuracy: 0.4983 

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m27/37[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.4973 - auc: 0.5047 - loss: 2.5572



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 21ms/step - accuracy: 0.4964 - auc: 0.5031 - loss: 2.4859 - val_accuracy: 0.5495 - val_auc: 0.5911 - val_loss: 2.4970
Epoch 2/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5284 - auc: 0.5181 - loss: 1.6383



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5282 - auc: 0.5184 - loss: 1.6342 - val_accuracy: 0.6553 - val_auc: 0.7148 - val_loss: 1.8568
Epoch 3/20
[1m25/37[0m [32m━━━━━━━━━━━━━[0m[37m━━━━━━━[0m [1m0s[0m 4ms/step - accuracy: 0.5667 - auc: 0.5697 - loss: 1.0976



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5635 - auc: 0.5688 - loss: 1.0591 - val_accuracy: 0.5870 - val_auc: 0.7808 - val_loss: 1.4347
Epoch 4/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.5564 - auc: 0.5692 - loss: 0.6822



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5573 - auc: 0.5710 - loss: 0.6785 - val_accuracy: 0.5119 - val_auc: 0.7930 - val_loss: 1.1593
Epoch 5/20
[1m29/37[0m [32m━━━━━━━━━━━━━━━[0m[37m━━━━━[0m [1m0s[0m 6ms/step - accuracy: 0.5632 - auc: 0.6187 - loss: 0.4407



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.5707 - auc: 0.6233 - loss: 0.4295 - val_accuracy: 0.4983 - val_auc: 0.8095 - val_loss: 0.9813
Epoch 6/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6377 - auc: 0.7080 - loss: 0.2686 - val_accuracy: 0.4983 - val_auc: 0.7975 - val_loss: 0.8680
Epoch 7/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.6243 - auc: 0.7002 - loss: 0.1673



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6251 - auc: 0.7006 - loss: 0.1664 - val_accuracy: 0.4983 - val_auc: 0.8148 - val_loss: 0.7982
Epoch 8/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.6445 - auc: 0.6959 - loss: 0.1023 - val_accuracy: 0.4983 - val_auc: 0.8132 - val_loss: 0.7570
Epoch 9/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.6639 - auc: 0.7325 - loss: 0.0629



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6644 - auc: 0.7329 - loss: 0.0626 - val_accuracy: 0.4983 - val_auc: 0.8315 - val_loss: 0.7296
Epoch 10/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6733 - auc: 0.7509 - loss: 0.0382 - val_accuracy: 0.4983 - val_auc: 0.7733 - val_loss: 0.7190
Epoch 11/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6751 - auc: 0.7503 - loss: 0.0235 - val_accuracy: 0.4983 - val_auc: 0.7317 - val_loss: 0.7062
Epoch 12/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7183 - auc: 0.8046 - loss: 0.0144 - val_accuracy: 0.4983 - val_auc: 0.7715 - val_loss: 0.7006
Epoch 13/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.6704 - auc: 0.7728 - loss: 0.0090 - val_accuracy: 0.49

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 5ms/step - accuracy: 0.5039 - auc: 0.5026 - loss: 2.5079



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 21ms/step - accuracy: 0.5049 - auc: 0.5042 - loss: 2.4956 - val_accuracy: 0.5000 - val_auc: 0.6943 - val_loss: 2.5096
Epoch 2/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 4ms/step - accuracy: 0.5395 - auc: 0.5509 - loss: 1.6511



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.5410 - auc: 0.5526 - loss: 1.6427 - val_accuracy: 0.5000 - val_auc: 0.7218 - val_loss: 1.8876
Epoch 3/20
[1m35/37[0m [32m━━━━━━━━━━━━━━━━━━[0m[37m━━[0m [1m0s[0m 5ms/step - accuracy: 0.5673 - auc: 0.5963 - loss: 1.0749



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5673 - auc: 0.5966 - loss: 1.0664 - val_accuracy: 0.5000 - val_auc: 0.7425 - val_loss: 1.4861
Epoch 4/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.5760 - auc: 0.6137 - loss: 0.6864



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5763 - auc: 0.6138 - loss: 0.6846 - val_accuracy: 0.5000 - val_auc: 0.7629 - val_loss: 1.2225
Epoch 5/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5930 - auc: 0.6230 - loss: 0.4343 - val_accuracy: 0.5000 - val_auc: 0.7581 - val_loss: 1.0525
Epoch 6/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - accuracy: 0.6610 - auc: 0.7161 - loss: 0.2730



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6607 - auc: 0.7159 - loss: 0.2723 - val_accuracy: 0.5000 - val_auc: 0.7690 - val_loss: 0.9441
Epoch 7/20
[1m33/37[0m [32m━━━━━━━━━━━━━━━━━[0m[37m━━━[0m [1m0s[0m 5ms/step - accuracy: 0.6355 - auc: 0.7048 - loss: 0.1715



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6393 - auc: 0.7094 - loss: 0.1690 - val_accuracy: 0.5000 - val_auc: 0.7859 - val_loss: 0.8689
Epoch 8/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.6827 - auc: 0.7440 - loss: 0.1040 - val_accuracy: 0.5000 - val_auc: 0.7748 - val_loss: 0.8265
Epoch 9/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.6917 - auc: 0.7656 - loss: 0.0637 - val_accuracy: 0.5000 - val_auc: 0.7733 - val_loss: 0.7902
Epoch 10/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.7036 - auc: 0.7842 - loss: 0.0389 - val_accuracy: 0.5000 - val_auc: 0.7570 - val_loss: 0.7641
Epoch 11/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.6764 - auc: 0.7534 - loss: 0.0239 - val_accuracy: 0.5000



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6656 - auc: 0.7504 - loss: 0.0092 - val_accuracy: 0.5000 - val_auc: 0.7914 - val_loss: 0.7145
Epoch 14/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.7213 - auc: 0.7817 - loss: 0.0058 - val_accuracy: 0.5000 - val_auc: 0.7627 - val_loss: 0.7036
Epoch 15/20
[1m36/37[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 8ms/step - accuracy: 0.6899 - auc: 0.7717 - loss: 0.0038



[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - accuracy: 0.6902 - auc: 0.7721 - loss: 0.0038 - val_accuracy: 0.5000 - val_auc: 0.7953 - val_loss: 0.6977
Epoch 16/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - accuracy: 0.6981 - auc: 0.7708 - loss: 0.0026 - val_accuracy: 0.5000 - val_auc: 0.7711 - val_loss: 0.6830
Epoch 17/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7095 - auc: 0.7917 - loss: 0.0019 - val_accuracy: 0.5445 - val_auc: 0.7535 - val_loss: 0.6630
Epoch 18/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.7143 - auc: 0.7876 - loss: 0.0015 - val_accuracy: 0.6644 - val_auc: 0.7661 - val_loss: 0.6576
Epoch 19/20
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 8ms/step - accuracy: 0.7011 - auc: 0.7665 - loss: 0.0013 - val_accuracy: 0.

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 5ms/step - accuracy: 0.4913 - auc: 0.4969 - loss: 2.4218
Epoch 2/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5255 - auc: 0.5055 - loss: 1.4304
Epoch 3/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5420 - auc: 0.5711 - loss: 0.8279
Epoch 4/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.5721 - auc: 0.6144 - loss: 0.4706
Epoch 5/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6173 - auc: 0.6567 - loss: 0.2626
Epoch 6/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6492 - auc: 0.7073 - loss: 0.1442
Epoch 7/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - accuracy: 0.6899 - auc: 0.7433 - loss: 0.0782
Epoch 8/20
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - a



Final model training complete and saved!
[1m46/46[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step

Final Model Metrics:
Accuracy: 0.7322
AUC: 0.7811
MCC: 0.4657
Sensitivity: 0.6967
Specificity: 0.7678
