In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
import numpy as np
import pandas as pd
from Bio import SeqIO

# Amino acid alphabet
AminoAcids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Mapping of amino acid characters to integers
aa_dict = {aa: i for i, aa in enumerate(AminoAcids)}

# Function to calculate the CTDD feature (correlation of tri-peptides)
def calculate_ctdd(sequence):
    tri_peptide_count = np.zeros((20, 20, 20))  # 3D array for counting tri-peptide frequencies
    for i in range(len(sequence) - 2):
        first_aa = sequence[i]
        second_aa = sequence[i + 1]
        third_aa = sequence[i + 2]

        if first_aa in aa_dict and second_aa in aa_dict and third_aa in aa_dict:
            tri_peptide_count[aa_dict[first_aa], aa_dict[second_aa], aa_dict[third_aa]] += 1

    total_tri_peptides = np.sum(tri_peptide_count)

    if total_tri_peptides > 0:
        tri_peptide_count = tri_peptide_count / total_tri_peptides  # Normalize by total count

    # Flatten the 3D array into a 1D feature vector
    return tri_peptide_count.flatten()

# Function to extract CTDD features from sequences in FASTA format
def extract_ctdd_features(fasta_path):
    features = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        seq = str(record.seq)
        ctdd_features = calculate_ctdd(seq)
        features.append(ctdd_features)

    # Convert the list of feature vectors into a DataFrame
    df = pd.DataFrame(features)
    return df

# Define file paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_main (2) (1).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_main (2) (1).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_validation (2) (1).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_validation (2) (1).fasta"

# Output directories for each file
output_dirs = {
    "main_positive": "/content/ctdd_main_positive_features.csv",
    "main_negative": "/content/ctdd_main_negative_features.csv",
    "validation_positive": "/content/ctdd_validation_positive_features.csv",
    "validation_negative": "/content/ctdd_validation_negative_features.csv"
}

# Function to ensure the directory exists
def ensure_directory_exists(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

# Main workflow function
def main():
    # File paths for input FASTA files
    datasets = [
        (main_p, output_dirs["main_positive"]),
        (main_n, output_dirs["main_negative"]),
        (validation_p, output_dirs["validation_positive"]),
        (validation_n, output_dirs["validation_negative"])
    ]

    # Extract features and save to CSV
    for fasta_path, output_csv in datasets:
        print(f"Processing {fasta_path}...")

        # Ensure the directory for saving the output file exists
        ensure_directory_exists(output_csv)

        # Extract features and save to CSV
        features_df = extract_ctdd_features(fasta_path)
        features_df.to_csv(output_csv, index=False)
        print(f"CTDD features saved to {output_csv}")

if __name__ == "__main__":
    main()


Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_main (2) (1).fasta...
CTDD features saved to /content/ctdd_main_positive_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_main (2) (1).fasta...
CTDD features saved to /content/ctdd_main_negative_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_validation (2) (1).fasta...
CTDD features saved to /content/ctdd_validation_positive_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_validation (2) (1).fasta...
CTDD features saved to /content/ctdd_validation_negative_features.csv


In [None]:
!pip install pfeature



In [None]:
import os
import numpy as np
import pandas as pd
from Bio import SeqIO

# Amino acid alphabet
AminoAcids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Mapping of amino acid characters to integers
aa_dict = {aa: i for i, aa in enumerate(AminoAcids)}

# Generate all possible tri-peptide combinations as feature names
tri_peptide_names = [f"{a1}{a2}{a3}" for a1 in AminoAcids for a2 in AminoAcids for a3 in AminoAcids]

# Function to calculate the CTDD feature (correlation of tri-peptides)
def calculate_ctdd(sequence):
    tri_peptide_count = np.zeros((20, 20, 20))  # 3D array for counting tri-peptide frequencies
    for i in range(len(sequence) - 2):
        first_aa = sequence[i]
        second_aa = sequence[i + 1]
        third_aa = sequence[i + 2]

        if first_aa in aa_dict and second_aa in aa_dict and third_aa in aa_dict:
            tri_peptide_count[aa_dict[first_aa], aa_dict[second_aa], aa_dict[third_aa]] += 1

    total_tri_peptides = np.sum(tri_peptide_count)

    if total_tri_peptides > 0:
        tri_peptide_count = tri_peptide_count / total_tri_peptides  # Normalize by total count

    # Flatten the 3D array into a 1D feature vector
    return tri_peptide_count.flatten()

# Function to extract CTDD features from sequences in FASTA format
def extract_ctdd_features(fasta_path):
    features = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        seq = str(record.seq)
        ctdd_features = calculate_ctdd(seq)
        features.append(ctdd_features)

    # Convert the list of feature vectors into a DataFrame
    df = pd.DataFrame(features, columns=tri_peptide_names)
    return df

# Define file paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_main (2) (1).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_main (2) (1).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_validation (2) (1).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_validation (2) (1).fasta"

# Output directories for each file
output_dirs = {
    "main_positive": "/content/ctdd_main_positive_features.csv",
    "main_negative": "/content/ctdd_main_negative_features.csv",
    "validation_positive": "/content/ctdd_validation_positive_features.csv",
    "validation_negative": "/content/ctdd_validation_negative_features.csv"
}

# Function to ensure the directory exists
def ensure_directory_exists(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

# Main workflow function
def main():
    # File paths for input FASTA files
    datasets = [
        (main_p, output_dirs["main_positive"]),
        (main_n, output_dirs["main_negative"]),
        (validation_p, output_dirs["validation_positive"]),
        (validation_n, output_dirs["validation_negative"])
    ]

    # Extract features and save to CSV
    for fasta_path, output_csv in datasets:
        print(f"Processing {fasta_path}...")

        # Ensure the directory for saving the output file exists
        ensure_directory_exists(output_csv)

        # Extract features and save to CSV
        features_df = extract_ctdd_features(fasta_path)
        features_df.to_csv(output_csv, index=False)
        print(f"CTDD features saved to {output_csv}")

if __name__ == "__main__":
    main()


Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_main (2) (1).fasta...
CTDD features saved to /content/ctdd_main_positive_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_main (2) (1).fasta...
CTDD features saved to /content/ctdd_main_negative_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_validation (2) (1).fasta...
CTDD features saved to /content/ctdd_validation_positive_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_validation (2) (1).fasta...
CTDD features saved to /content/ctdd_validation_negative_features.csv


In [None]:
!pip show pfeature  # Check if pfeature is installed and its location
!pip install pfeature --upgrade #Update the library incase there's an updated version that has the PyProtein class.

Name: pfeature
Version: 1.4
Summary: A tool to compute the features of protein and peptide sequences
Home-page: https://github.com/raghavagps/pfeature
Author: 
Author-email: 
License: 
Location: /usr/local/lib/python3.11/dist-packages
Requires: numpy, pandas
Required-by: 


In [None]:
import pandas as pd
from pfeature import PyProtein

# Define the input file paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_main (2) (1).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_main (2) (1).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/POSITIVE_validation (2) (1).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/NEGATIVE_validation (2) (1).fasta"

# Output file paths
output_main_p = "/content/CTDD_features_positive_main.csv"
output_main_n = "/content/CTDD_features_negative_main.csv"
output_validation_p = "/content/CTDD_features_positive_validation.csv"
output_validation_n = "/content/CTDD_features_negative_validation.csv"

# Function to extract CTDD features from a FASTA file
def extract_ctdd_features(fasta_file, output_file):
    features = []
    labels = []
    with open(fasta_file, "r") as f:
        lines = f.readlines()
        for i in range(0, len(lines), 2):  # FASTA format alternates between headers and sequences
            sequence = lines[i + 1].strip()
            protein = PyProtein(sequence)
            ctdd_features = protein.GetCTDD()
            features.append(list(ctdd_features.values()))
            labels.append(lines[i].strip())  # Save header as label

    # Convert features into a DataFrame
    df = pd.DataFrame(features, columns=ctdd_features.keys())
    df.insert(0, "Sequence_ID", labels)  # Insert Sequence ID
    df.to_csv(output_file, index=False)
    print(f"CTDD features saved to {output_file}")

# Extract and save features for all datasets
extract_ctdd_features(main_p, output_main_p)
extract_ctdd_features(main_n, output_main_n)
extract_ctdd_features(validation_p, output_validation_p)
extract_ctdd_features(validation_n, output_validation_n)


ImportError: cannot import name 'PyProtein' from 'pfeature' (unknown location)

# ***All Algorithm ***

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ CTDD_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_negative_features.csv")

In [None]:
# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values


In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "MLP": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the neural network models
models["Neural Network"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
models["MLP"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Store accuracies
results = []

# Train each model and evaluate
for name, model in models.items():
    print(f"\nTraining {name}...")

    if name in ["Neural Network", "MLP"]:
        # Neural Network training
        model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=0)
        train_pred = (model.predict(X_train) > 0.5).astype("int32")
        val_pred = (model.predict(X_val) > 0.5).astype("int32")
    else:
        # Traditional ML model training
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

    # Calculate train and validation accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)

    results.append({"Model": name, "Train Accuracy": train_accuracy, "Validation Accuracy": val_accuracy})


Training SVM...

Training Decision Tree...

Training Random Forest...

Training Logistic Regression...

Training k-NN...

Training Naive Bayes...

Training Gradient Boosting...

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.




Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Training CatBoost...

Training AdaBoost...





Training Neural Network...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

Training MLP...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [None]:
# Convert to DataFrame and sort by Validation Accuracy and Train Accuracy in descending order
results_df = pd.DataFrame(results).sort_values(by=["Validation Accuracy", "Train Accuracy"], ascending=False).reset_index(drop=True)

# Display results
print("\nModel Accuracy Table (Descending Order of Validation Accuracy)")
print(results_df)


Model Accuracy Table (Descending Order of Validation Accuracy)
                  Model  Train Accuracy  Validation Accuracy
0         Random Forest        0.995704             0.913333
1        Neural Network        0.995704             0.903333
2                   MLP        0.995704             0.900000
3              CatBoost        0.970790             0.900000
4               XGBoost        0.963918             0.896667
5         Decision Tree        0.995704             0.893333
6              AdaBoost        0.939863             0.890000
7     Gradient Boosting        0.933849             0.876667
8           Naive Bayes        0.959622             0.866667
9                  k-NN        0.928694             0.856667
10             LightGBM        0.887457             0.840000
11  Logistic Regression        0.849656             0.823333
12                  SVM        0.809278             0.773333


# **CROSS VALIDATION**

In [None]:
!pip install catboost



In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
}


In [None]:
# Define Neural Network models
def create_neural_network(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_mlp(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model



In [None]:
# Cross-validation for traditional models
results = []

# For traditional ML models, we use cross_val_score
for name, model in models.items():
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Calculate cross-validation accuracy
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    std_accuracy = np.std(cv_scores)

    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})

# Cross-validation for Neural Networks (manual implementation)
for name, create_model in [("Neural Network", create_neural_network), ("MLP", create_mlp)]:
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Custom function to calculate accuracy for neural networks
    def neural_network_cross_val(model_func, X_train, y_train):
        accuracies = []
        for train_index, val_index in cv.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            model = model_func(X_train.shape[1])  # Create a new model for each fold
            model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, verbose=0)

            y_pred = (model.predict(X_val_fold) > 0.5).astype("int32")
            accuracy = accuracy_score(y_val_fold, y_pred)
            accuracies.append(accuracy)

        return np.mean(accuracies), np.std(accuracies)

    mean_accuracy, std_accuracy = neural_network_cross_val(create_model, X_train, y_train)
    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})



Performing Cross-validation for SVM...

Performing Cross-validation for Decision Tree...

Performing Cross-validation for Random Forest...

Performing Cross-validation for Logistic Regression...

Performing Cross-validation for k-NN...

Performing Cross-validation for Naive Bayes...

Performing Cross-validation for Gradient Boosting...

Performing Cross-validation for XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Performing Cross-validation for LightGBM...
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000819 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 631
[LightGBM] [Info] Number of data points in the train set: 931, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499463 -> initscore=-0.002148
[LightGBM] [Info] Start training from score -0.002148
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000804 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 610
[LightGBM] [Info] Number of data points in the train set: 931, number of used 




Performing Cross-validation for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step 


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step

Performing Cross-validation for MLP...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


In [None]:
# Convert to DataFrame and display
cv_results_df = pd.DataFrame(results).sort_values(by="Mean CV Accuracy", ascending=False).reset_index(drop=True)
print("\nCross-Validation Accuracy Table")
print(cv_results_df)


Cross-Validation Accuracy Table
                  Model  Mean CV Accuracy  STD CV Accuracy
0                   MLP          0.927834         0.011328
1        Neural Network          0.927830         0.006918
2         Random Forest          0.920105         0.007461
3              CatBoost          0.898613         0.018209
4           Naive Bayes          0.892648         0.025944
5               XGBoost          0.888320         0.013813
6     Gradient Boosting          0.873705         0.018966
7         Decision Tree          0.871134         0.019762
8              AdaBoost          0.867704         0.015870
9                  k-NN          0.856504         0.022010
10  Logistic Regression          0.804976         0.016045
11             LightGBM          0.784372         0.013287
12                  SVM          0.766320         0.012068


# **Hyperparameter optimization with Optuna**

In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: M

In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ CTDD_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_negative_features.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]



# Define models with MLP included
models = {
    "SVM": lambda trial: SVC(
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}


results = []

def optimize_model(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        return accuracy_score(y_val, preds)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the results
    results.append({
        "Model": model_name,
        "Accuracy": study.best_value,
        "Best Params": study.best_params
    })

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model(model_name, model_func)


# Convert results to a DataFrame
results_df = pd.DataFrame(results)


# Display the DataFrame
print(results_df)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2024-12-27 09:44:27,427] A new study created in memory with name: no-name-4c32d2a6-ce43-40c2-afc0-1ade4c571ba1


Optimizing SVM...


[I 2024-12-27 09:44:37,139] Trial 0 finished with value: 0.8633333333333333 and parameters: {'C': 9.48374615820333, 'kernel': 'poly'}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-27 09:44:46,433] Trial 1 finished with value: 0.8566666666666667 and parameters: {'C': 0.27425940547801, 'kernel': 'rbf'}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-27 09:44:55,568] Trial 2 finished with value: 0.8633333333333333 and parameters: {'C': 9.456950246446223, 'kernel': 'poly'}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-27 09:45:00,964] Trial 3 finished with value: 0.89 and parameters: {'C': 6.128819341645765, 'kernel': 'sigmoid'}. Best is trial 3 with value: 0.89.
[I 2024-12-27 09:45:11,342] Trial 4 finished with value: 0.84 and parameters: {'C': 4.9571273035859695, 'kernel': 'poly'}. Best is trial 3 with value: 0.89.
[I 2024-12-27 09:45:19,389] Trial 5 finished with value: 0.8666666666666667 and parameters: {'C': 2.228288113038584, 'kernel': 'linear'

Optimizing Decision Tree...


[I 2024-12-27 09:48:36,424] Trial 1 finished with value: 0.8433333333333334 and parameters: {'max_depth': 15, 'min_samples_split': 5}. Best is trial 1 with value: 0.8433333333333334.
[I 2024-12-27 09:48:36,754] Trial 2 finished with value: 0.85 and parameters: {'max_depth': 16, 'min_samples_split': 10}. Best is trial 2 with value: 0.85.
[I 2024-12-27 09:48:37,054] Trial 3 finished with value: 0.8333333333333334 and parameters: {'max_depth': 14, 'min_samples_split': 5}. Best is trial 2 with value: 0.85.
[I 2024-12-27 09:48:37,363] Trial 4 finished with value: 0.8233333333333334 and parameters: {'max_depth': 14, 'min_samples_split': 6}. Best is trial 2 with value: 0.85.
[I 2024-12-27 09:48:37,709] Trial 5 finished with value: 0.8566666666666667 and parameters: {'max_depth': 19, 'min_samples_split': 10}. Best is trial 5 with value: 0.8566666666666667.
[I 2024-12-27 09:48:38,238] Trial 6 finished with value: 0.8633333333333333 and parameters: {'max_depth': 20, 'min_samples_split': 2}. Best

Optimizing Random Forest...


[I 2024-12-27 09:48:49,503] Trial 0 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 402, 'max_depth': 14, 'min_samples_split': 4}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-27 09:48:50,011] Trial 1 finished with value: 0.7566666666666667 and parameters: {'n_estimators': 161, 'max_depth': 3, 'min_samples_split': 9}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-27 09:48:52,617] Trial 2 finished with value: 0.86 and parameters: {'n_estimators': 432, 'max_depth': 16, 'min_samples_split': 7}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-27 09:48:56,973] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 483, 'max_depth': 14, 'min_samples_split': 2}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-27 09:48:59,194] Trial 4 finished with value: 0.85 and parameters: {'n_estimators': 473, 'max_depth': 11, 'min_samples_split': 7}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-1

Optimizing Logistic Regression...


[I 2024-12-27 09:49:44,548] Trial 0 finished with value: 0.8866666666666667 and parameters: {'C': 5.66531771641788, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 09:49:44,785] Trial 1 finished with value: 0.8533333333333334 and parameters: {'C': 1.8817306496067852, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 09:49:45,094] Trial 2 finished with value: 0.86 and parameters: {'C': 2.9934116528700856, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 09:49:45,534] Trial 3 finished with value: 0.8633333333333333 and parameters: {'C': 3.0577018752653746, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 09:49:46,028] Trial 4 finished with value: 0.8733333333333333 and parameters: {'C': 4.222293523927825, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 09:49:46,441] Trial 5 finished with value: 0.8833333333333333 and

Optimizing k-NN...


[I 2024-12-27 09:49:57,993] Trial 0 finished with value: 0.8566666666666667 and parameters: {'n_neighbors': 5}. Best is trial 0 with value: 0.8566666666666667.
[I 2024-12-27 09:49:58,366] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 0 with value: 0.8566666666666667.
[I 2024-12-27 09:49:58,730] Trial 2 finished with value: 0.86 and parameters: {'n_neighbors': 4}. Best is trial 2 with value: 0.86.
[I 2024-12-27 09:49:59,105] Trial 3 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 2 with value: 0.86.
[I 2024-12-27 09:49:59,464] Trial 4 finished with value: 0.84 and parameters: {'n_neighbors': 18}. Best is trial 2 with value: 0.86.
[I 2024-12-27 09:49:59,824] Trial 5 finished with value: 0.87 and parameters: {'n_neighbors': 10}. Best is trial 5 with value: 0.87.
[I 2024-12-27 09:50:00,189] Trial 6 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 14}. Best is trial 5 with 

Optimizing Naive Bayes...


[I 2024-12-27 09:50:11,065] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 09:50:11,389] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 09:50:11,662] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 09:50:11,933] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 09:50:12,211] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 09:50:12,475] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 09:50:12,763] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2024-12-27 09:50:35,050] Trial 0 finished with value: 0.89 and parameters: {'n_estimators': 232, 'learning_rate': 0.3072125263944971, 'max_depth': 3}. Best is trial 0 with value: 0.89.
[I 2024-12-27 09:51:56,381] Trial 1 finished with value: 0.88 and parameters: {'n_estimators': 386, 'learning_rate': 0.1838602134415169, 'max_depth': 11}. Best is trial 0 with value: 0.89.
[I 2024-12-27 09:54:05,265] Trial 2 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 380, 'learning_rate': 0.37504103957927293, 'max_depth': 20}. Best is trial 2 with value: 0.8966666666666666.
[I 2024-12-27 09:54:47,574] Trial 3 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 166, 'learning_rate': 0.11423545236477257, 'max_depth': 14}. Best is trial 2 with value: 0.8966666666666666.
[I 2024-12-27 09:55:58,964] Trial 4 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 375, 'learning_rate': 0.12062385405413355, 'max_depth': 10}. Best is trial 2 w

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 10:16:37,035] Trial 0 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 354, 'max_depth': 12, 'learning_rate': 0.489615601539429}. Best is trial 0 with value: 0.8966666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 10:17:03,715] Trial 1 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 320, 'max_depth': 20, 'learning_rate': 0.07145972604866886}. Best is trial 1 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 10:17:14,809] Trial 2 finished with value: 0.91 and parameters: {'n_estimators': 91, 'max_depth': 9, 'learning_rate': 0.3543502329483138}. Best is trial 2 with value: 0.91.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 10:17:29,275] Trial 3 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 143, 'max_depth': 18, 'learning_rate': 0.4829042283271115}. Best i

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001549 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:20,442] Trial 0 finished with value: 0.84 and parameters: {'n_estimators': 273, 'max_depth': 16, 'learning_rate': 0.03519117209169544}. Best is trial 0 with value: 0.84.




[I 2024-12-27 10:27:20,799] Trial 1 finished with value: 0.84 and parameters: {'n_estimators': 182, 'max_depth': 9, 'learning_rate': 0.15053284847105747}. Best is trial 0 with value: 0.84.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001154 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:21,072] Trial 2 finished with value: 0.79 and parameters: {'n_estimators': 86, 'max_depth': 5, 'learning_rate': 0.023420926727606355}. Best is trial 0 with value: 0.84.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:21,367] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 94, 'max_depth': 16, 'learning_rate': 0.11780664357526285}. Best is trial 3 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:21,851] Trial 4 finished with value: 0.84 and parameters: {'n_estimators': 346, 'max_depth': 9, 'learning_rate': 0.18578563759392544}. Best is trial 3 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001385 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:22,449] Trial 5 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 358, 'max_depth': 14, 'learning_rate': 0.11960116281685745}. Best is trial 3 with value: 0.8433333333333334.




[I 2024-12-27 10:27:22,698] Trial 6 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 67, 'max_depth': 17, 'learning_rate': 0.49127146979377745}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001405 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:23,039] Trial 7 finished with value: 0.84 and parameters: {'n_estimators': 168, 'max_depth': 11, 'learning_rate': 0.1482209004753622}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:23,590] Trial 8 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 466, 'max_depth': 19, 'learning_rate': 0.15537987534470718}. Best is trial 6 with value: 0.8533333333333334.




[I 2024-12-27 10:27:23,948] Trial 9 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 164, 'max_depth': 17, 'learning_rate': 0.04353517923865583}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:24,377] Trial 10 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 252, 'max_depth': 20, 'learning_rate': 0.4871811950846715}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:24,985] Trial 11 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 500, 'max_depth': 20, 'learning_rate': 0.34303600487708297}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:25,576] Trial 12 finished with value: 0.85 and parameters: {'n_estimators': 493, 'max_depth': 20, 'learning_rate': 0.4303218595265374}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:26,101] Trial 13 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 396, 'max_depth': 13, 'learning_rate': 0.3356613519568823}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001840 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:26,746] Trial 14 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 403, 'max_depth': 18, 'learning_rate': 0.35125282114726863}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:27,132] Trial 15 finished with value: 0.84 and parameters: {'n_estimators': 219, 'max_depth': 15, 'learning_rate': 0.3022517675745126}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:27,609] Trial 16 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 298, 'max_depth': 3, 'learning_rate': 0.41768439984051586}. Best is trial 6 with value: 0.8533333333333334.
[I 2024-12-27 10:27:28,016] Trial 17 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 62, 'max_depth': 11, 'learning_rate': 0.4966406230020526}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001667 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:28,507] Trial 18 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 128, 'max_depth': 18, 'learning_rate': 0.25713979171410006}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001650 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:29,253] Trial 19 finished with value: 0.85 and parameters: {'n_estimators': 443, 'max_depth': 13, 'learning_rate': 0.40532154719578084}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001648 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:29,887] Trial 20 finished with value: 0.85 and parameters: {'n_estimators': 323, 'max_depth': 20, 'learning_rate': 0.239828739316355}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001686 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2024-12-27 10:27:30,581] Trial 21 finished with value: 0.85 and parameters: {'n_estimators': 419, 'max_depth': 18, 'learning_rate': 0.34645127762960387}. Best is trial 6 with value: 0.8533333333333334.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001817 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:31,511] Trial 22 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 488, 'max_depth': 18, 'learning_rate': 0.3495445119892945}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001739 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:32,244] Trial 23 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 399, 'max_depth': 16, 'learning_rate': 0.3876656040749049}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001495 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:32,800] Trial 24 finished with value: 0.85 and parameters: {'n_estimators': 442, 'max_depth': 18, 'learning_rate': 0.26336459275771706}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:33,341] Trial 25 finished with value: 0.85 and parameters: {'n_estimators': 385, 'max_depth': 20, 'learning_rate': 0.45483891661794457}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001474 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:33,781] Trial 26 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 218, 'max_depth': 17, 'learning_rate': 0.30595284002528683}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:34,390] Trial 27 finished with value: 0.85 and parameters: {'n_estimators': 499, 'max_depth': 14, 'learning_rate': 0.3801567835135996}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:34,945] Trial 28 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 447, 'max_depth': 19, 'learning_rate': 0.463931621699485}. Best is trial 6 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 10:27:35,400] Trial 29 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 259, 'max_depth': 15, 'learning_rate': 0.21375590608389772}. Best is trial 6 with value: 0.8533333333333334.
[I 2024-12-27 10:27:35,406] A new study created in memory with name: no-name-8e563d69-5072-4695-81c0-e9ecf3ff9fd1


Optimizing AdaBoost...


[I 2024-12-27 10:27:59,025] Trial 0 finished with value: 0.7433333333333333 and parameters: {'n_estimators': 329, 'learning_rate': 0.26022732552691485}. Best is trial 0 with value: 0.7433333333333333.
[I 2024-12-27 10:28:12,061] Trial 1 finished with value: 0.6933333333333334 and parameters: {'n_estimators': 179, 'learning_rate': 0.19074909345015237}. Best is trial 0 with value: 0.7433333333333333.
[I 2024-12-27 10:28:24,622] Trial 2 finished with value: 0.81 and parameters: {'n_estimators': 169, 'learning_rate': 0.9759684112763864}. Best is trial 2 with value: 0.81.
[I 2024-12-27 10:28:56,168] Trial 3 finished with value: 0.84 and parameters: {'n_estimators': 434, 'learning_rate': 0.7497285892889262}. Best is trial 3 with value: 0.84.
[I 2024-12-27 10:29:24,110] Trial 4 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 392, 'learning_rate': 0.2036151773922928}. Best is trial 3 with value: 0.84.
[I 2024-12-27 10:29:37,261] Trial 5 finished with value: 0.663333333

Optimizing Neural Network...


[I 2024-12-27 10:40:22,683] Trial 0 finished with value: 0.8966666666666666 and parameters: {'hidden_layer_1': 65, 'hidden_layer_2': 48, 'learning_rate_init': 0.010208593746490243}. Best is trial 0 with value: 0.8966666666666666.
[I 2024-12-27 10:41:01,357] Trial 1 finished with value: 0.8966666666666666 and parameters: {'hidden_layer_1': 86, 'hidden_layer_2': 89, 'learning_rate_init': 0.054178009400851226}. Best is trial 0 with value: 0.8966666666666666.
[I 2024-12-27 10:41:40,850] Trial 2 finished with value: 0.9166666666666666 and parameters: {'hidden_layer_1': 96, 'hidden_layer_2': 87, 'learning_rate_init': 0.09278249388092663}. Best is trial 2 with value: 0.9166666666666666.
[I 2024-12-27 10:41:58,180] Trial 3 finished with value: 0.9166666666666666 and parameters: {'hidden_layer_1': 91, 'hidden_layer_2': 37, 'learning_rate_init': 0.05239655816861249}. Best is trial 2 with value: 0.9166666666666666.
[I 2024-12-27 10:42:15,619] Trial 4 finished with value: 0.91 and parameters: {'hi

Optimizing MLP...


[I 2024-12-27 10:50:47,665] Trial 0 finished with value: 0.9233333333333333 and parameters: {'layer_1': 85, 'layer_2': 53, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.05618522834368167}. Best is trial 0 with value: 0.9233333333333333.
[I 2024-12-27 10:50:58,086] Trial 1 finished with value: 0.9 and parameters: {'layer_1': 61, 'layer_2': 132, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.03666585848227141}. Best is trial 0 with value: 0.9233333333333333.
[I 2024-12-27 10:51:45,242] Trial 2 finished with value: 0.8733333333333333 and parameters: {'layer_1': 101, 'layer_2': 97, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.04009653095661493}. Best is trial 0 with value: 0.9233333333333333.
[I 2024-12-27 10:52:31,166] Trial 3 finished with value: 0.9066666666666666 and parameters: {'layer_1': 142, 'layer_2': 103, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.03540527915881024}. Best is trial 0 with value: 0.92333333333

                  Model  Accuracy  \
0                   SVM  0.903333   
1         Decision Tree  0.870000   
2         Random Forest  0.893333   
3   Logistic Regression  0.886667   
4                  k-NN  0.883333   
5           Naive Bayes  0.866667   
6     Gradient Boosting  0.903333   
7               XGBoost  0.910000   
8              LightGBM  0.853333   
9              AdaBoost  0.886667   
10       Neural Network  0.933333   
11                  MLP  0.923333   

                                          Best Params  
0           {'C': 6.085864540540266, 'kernel': 'rbf'}  
1           {'max_depth': 19, 'min_samples_split': 3}  
2   {'n_estimators': 205, 'max_depth': 18, 'min_sa...  
3          {'C': 5.66531771641788, 'solver': 'lbfgs'}  
4                                  {'n_neighbors': 3}  
5                                                  {}  
6   {'n_estimators': 153, 'learning_rate': 0.01358...  
7   {'n_estimators': 91, 'max_depth': 9, 'learning...  
8   {'n_estima

In [None]:
!pip install scikit-learn



In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score, roc_auc_score # Import confusion_matrix and other metrics

# Function to calculate metrics with model name
def calculate_metrics(y_true, y_pred, model_name=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "MCC": mcc,
        "Kappa": kappa,
        "AUC": auc,
    }

# Results storage
results = []

# Optimization function
def optimize_model_with_metrics(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        trial.set_user_attr("preds", preds)
        metrics = calculate_metrics(y_val, preds, model_name=model_name)
        return metrics["Accuracy"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the best trial metrics
    best_trial_preds = study.best_trial.user_attrs["preds"]
    best_metrics = calculate_metrics(y_val, best_trial_preds, model_name=model_name)
    best_metrics["Best Params"] = study.best_params
    results.append(best_metrics)

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model_with_metrics(model_name, model_func)

# Store the final results with selected metrics and best parameters
final_results = []
for result in results:
    final_results.append(
        {
            "Model": result["Model"],
            "Accuracy": result["Accuracy"],
            "Sensitivity": result["Sensitivity"],
            "Specificity": result["Specificity"],
            "MCC": result["MCC"],
            "Kappa": result["Kappa"],
            "AUC": result["AUC"],
            "Best Params": result["Best Params"],
        }
    )

# Convert results to a DataFrame and display it
final_results_df = pd.DataFrame(final_results)
print(final_results_df)


[I 2024-12-27 11:08:25,592] A new study created in memory with name: no-name-9d2e186a-9924-42f1-a76a-c11b2f89ff83


Optimizing SVM...


[I 2024-12-27 11:08:34,233] Trial 0 finished with value: 0.89 and parameters: {'C': 5.528009860655345, 'kernel': 'linear'}. Best is trial 0 with value: 0.89.
[I 2024-12-27 11:08:42,921] Trial 1 finished with value: 0.8566666666666667 and parameters: {'C': 1.882477897081928, 'kernel': 'linear'}. Best is trial 0 with value: 0.89.
[I 2024-12-27 11:08:51,264] Trial 2 finished with value: 0.9033333333333333 and parameters: {'C': 3.8008971659568984, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-27 11:08:58,112] Trial 3 finished with value: 0.8766666666666667 and parameters: {'C': 4.204386891705302, 'kernel': 'linear'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-27 11:09:05,495] Trial 4 finished with value: 0.8966666666666666 and parameters: {'C': 0.9912576605297914, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-27 11:09:14,271] Trial 5 finished with value: 0.75 and parameters: {'C': 1.0766126215230611, 'kernel': 'p

Optimizing Decision Tree...


[I 2024-12-27 11:12:27,216] Trial 1 finished with value: 0.8533333333333334 and parameters: {'max_depth': 19, 'min_samples_split': 5}. Best is trial 1 with value: 0.8533333333333334.
[I 2024-12-27 11:12:27,439] Trial 2 finished with value: 0.77 and parameters: {'max_depth': 7, 'min_samples_split': 7}. Best is trial 1 with value: 0.8533333333333334.
[I 2024-12-27 11:12:27,718] Trial 3 finished with value: 0.83 and parameters: {'max_depth': 13, 'min_samples_split': 7}. Best is trial 1 with value: 0.8533333333333334.
[I 2024-12-27 11:12:28,091] Trial 4 finished with value: 0.86 and parameters: {'max_depth': 20, 'min_samples_split': 5}. Best is trial 4 with value: 0.86.
[I 2024-12-27 11:12:28,316] Trial 5 finished with value: 0.78 and parameters: {'max_depth': 7, 'min_samples_split': 5}. Best is trial 4 with value: 0.86.
[I 2024-12-27 11:12:28,570] Trial 6 finished with value: 0.8066666666666666 and parameters: {'max_depth': 10, 'min_samples_split': 8}. Best is trial 4 with value: 0.86.
[I

Optimizing Random Forest...


[I 2024-12-27 11:12:39,784] Trial 0 finished with value: 0.87 and parameters: {'n_estimators': 283, 'max_depth': 19, 'min_samples_split': 8}. Best is trial 0 with value: 0.87.
[I 2024-12-27 11:12:41,925] Trial 1 finished with value: 0.85 and parameters: {'n_estimators': 451, 'max_depth': 12, 'min_samples_split': 8}. Best is trial 0 with value: 0.87.
[I 2024-12-27 11:12:42,491] Trial 2 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 93, 'max_depth': 13, 'min_samples_split': 7}. Best is trial 0 with value: 0.87.
[I 2024-12-27 11:12:44,206] Trial 3 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 270, 'max_depth': 12, 'min_samples_split': 4}. Best is trial 0 with value: 0.87.
[I 2024-12-27 11:12:47,757] Trial 4 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 391, 'max_depth': 16, 'min_samples_split': 7}. Best is trial 0 with value: 0.87.
[I 2024-12-27 11:12:49,040] Trial 5 finished with value: 0.7933333333333333 and

Optimizing Logistic Regression...


[I 2024-12-27 11:13:29,904] Trial 0 finished with value: 0.79 and parameters: {'C': 0.5795293005238256, 'solver': 'liblinear'}. Best is trial 0 with value: 0.79.
[I 2024-12-27 11:13:30,254] Trial 1 finished with value: 0.8866666666666667 and parameters: {'C': 8.34888059318696, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2024-12-27 11:13:30,780] Trial 2 finished with value: 0.8533333333333334 and parameters: {'C': 2.271608684536865, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8866666666666667.
[I 2024-12-27 11:13:31,139] Trial 3 finished with value: 0.8266666666666667 and parameters: {'C': 1.1232680920816671, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8866666666666667.
[I 2024-12-27 11:13:31,526] Trial 4 finished with value: 0.8833333333333333 and parameters: {'C': 4.622768559039366, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8866666666666667.
[I 2024-12-27 11:13:31,971] Trial 5 finished with value: 0.8833333333333333 and parameters: {'

Optimizing k-NN...


[I 2024-12-27 11:13:39,698] Trial 0 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 14}. Best is trial 0 with value: 0.8533333333333334.
[I 2024-12-27 11:13:40,063] Trial 1 finished with value: 0.8433333333333334 and parameters: {'n_neighbors': 7}. Best is trial 0 with value: 0.8533333333333334.
[I 2024-12-27 11:13:40,418] Trial 2 finished with value: 0.84 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.8533333333333334.
[I 2024-12-27 11:13:40,803] Trial 3 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 0 with value: 0.8533333333333334.
[I 2024-12-27 11:13:41,369] Trial 4 finished with value: 0.84 and parameters: {'n_neighbors': 16}. Best is trial 0 with value: 0.8533333333333334.
[I 2024-12-27 11:13:41,956] Trial 5 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 0 with value: 0.8533333333333334.
[I 2024-12-27 11:13:42,523] Trial 6 finished with value: 0.8466

Optimizing Naive Bayes...


[I 2024-12-27 11:13:52,335] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 11:13:52,613] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 11:13:52,892] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 11:13:53,168] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 11:13:53,449] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 11:13:53,728] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 11:13:54,004] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2024-12-27 11:14:38,972] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 435, 'learning_rate': 0.04349909640361378, 'max_depth': 4}. Best is trial 0 with value: 0.88.
[I 2024-12-27 11:15:00,063] Trial 1 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 143, 'learning_rate': 0.0494399357365585, 'max_depth': 7}. Best is trial 1 with value: 0.8833333333333333.
[I 2024-12-27 11:16:37,544] Trial 2 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 384, 'learning_rate': 0.0414966285536507, 'max_depth': 14}. Best is trial 1 with value: 0.8833333333333333.
[I 2024-12-27 11:18:59,723] Trial 3 finished with value: 0.89 and parameters: {'n_estimators': 457, 'learning_rate': 0.4770438123429577, 'max_depth': 20}. Best is trial 3 with value: 0.89.
[I 2024-12-27 11:19:39,292] Trial 4 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 206, 'learning_rate': 0.0720647048790072, 'max_depth': 10}. Best is trial 3 with

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 11:51:44,075] Trial 0 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 303, 'max_depth': 7, 'learning_rate': 0.3467772509100809}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 11:52:12,220] Trial 1 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 429, 'max_depth': 10, 'learning_rate': 0.03690306722083159}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 11:52:30,781] Trial 2 finished with value: 0.9 and parameters: {'n_estimators': 336, 'max_depth': 3, 'learning_rate': 0.09603889073571087}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 11:52:54,154] Trial 3 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 295, 'max_depth': 11, 'learning_rate': 0.0227822254

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:15,570] Trial 0 finished with value: 0.84 and parameters: {'n_estimators': 109, 'max_depth': 20, 'learning_rate': 0.4380068374433838}. Best is trial 0 with value: 0.84.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:16,103] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 428, 'max_depth': 8, 'learning_rate': 0.13814652026793692}. Best is trial 1 with value: 0.8466666666666667.




[I 2024-12-27 11:59:16,417] Trial 2 finished with value: 0.84 and parameters: {'n_estimators': 138, 'max_depth': 9, 'learning_rate': 0.4687598446591448}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:17,034] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 408, 'max_depth': 4, 'learning_rate': 0.2587201965124136}. Best is trial 1 with value: 0.8466666666666667.




[I 2024-12-27 11:59:17,393] Trial 4 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 201, 'max_depth': 16, 'learning_rate': 0.2869174349129075}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:17,679] Trial 5 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 92, 'max_depth': 12, 'learning_rate': 0.028907888736585202}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001435 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:17,944] Trial 6 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 64, 'max_depth': 16, 'learning_rate': 0.4108675434632906}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001400 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:18,231] Trial 7 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 89, 'max_depth': 8, 'learning_rate': 0.04954412962341742}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:18,572] Trial 8 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 194, 'max_depth': 7, 'learning_rate': 0.4543269207608086}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001623 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:18,968] Trial 9 finished with value: 0.85 and parameters: {'n_estimators': 252, 'max_depth': 6, 'learning_rate': 0.22704654871053417}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001286 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:19,417] Trial 10 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 338, 'max_depth': 3, 'learning_rate': 0.35793807669547295}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001696 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:19,932] Trial 11 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 257, 'max_depth': 6, 'learning_rate': 0.18852231444516132}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:20,336] Trial 12 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 218, 'max_depth': 11, 'learning_rate': 0.3194557531170806}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:20,797] Trial 13 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 335, 'max_depth': 6, 'learning_rate': 0.17392118372229168}. Best is trial 8 with value: 0.8566666666666667.




[I 2024-12-27 11:59:21,161] Trial 14 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 169, 'max_depth': 6, 'learning_rate': 0.4974378352427852}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001399 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:21,613] Trial 15 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 303, 'max_depth': 12, 'learning_rate': 0.3673579304174624}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:22,040] Trial 16 finished with value: 0.85 and parameters: {'n_estimators': 255, 'max_depth': 10, 'learning_rate': 0.20707837844119015}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001459 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:22,556] Trial 17 finished with value: 0.84 and parameters: {'n_estimators': 377, 'max_depth': 5, 'learning_rate': 0.1052241379319932}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001683 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:23,089] Trial 18 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 196, 'max_depth': 14, 'learning_rate': 0.25436038924082416}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001882 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:23,876] Trial 19 finished with value: 0.84 and parameters: {'n_estimators': 495, 'max_depth': 16, 'learning_rate': 0.3051496931695457}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:24,546] Trial 20 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 175, 'max_depth': 14, 'learning_rate': 0.38434741976229947}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001661 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:25,087] Trial 21 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 219, 'max_depth': 14, 'learning_rate': 0.2289589422630043}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001730 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:25,697] Trial 22 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 287, 'max_depth': 19, 'learning_rate': 0.25202007294704487}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002222 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:26,209] Trial 23 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 144, 'max_depth': 8, 'learning_rate': 0.1427524039741288}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001731 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:26,832] Trial 24 finished with value: 0.84 and parameters: {'n_estimators': 249, 'max_depth': 14, 'learning_rate': 0.33761615328987654}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001781 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:27,329] Trial 25 finished with value: 0.84 and parameters: {'n_estimators': 191, 'max_depth': 3, 'learning_rate': 0.2703625783993483}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:27,794] Trial 26 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 230, 'max_depth': 7, 'learning_rate': 0.23230317212209511}. Best is trial 8 with value: 0.8566666666666667.
[I 2024-12-27 11:59:28,131] Trial 27 finished with value: 0.84 and parameters: {'n_estimators': 154, 'max_depth': 10, 'learning_rate': 0.08561361064636441}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001374 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:28,717] Trial 28 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 315, 'max_depth': 18, 'learning_rate': 0.16433482514408226}. Best is trial 8 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 11:59:29,158] Trial 29 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 272, 'max_depth': 13, 'learning_rate': 0.4230530147622986}. Best is trial 8 with value: 0.8566666666666667.
[I 2024-12-27 11:59:29,175] A new study created in memory with name: no-name-1d96b2e6-08b0-4d76-9ce5-d95c1718a748


Optimizing AdaBoost...


[I 2024-12-27 11:59:43,822] Trial 0 finished with value: 0.79 and parameters: {'n_estimators': 209, 'learning_rate': 0.7547857752634063}. Best is trial 0 with value: 0.79.
[I 2024-12-27 11:59:49,469] Trial 1 finished with value: 0.7 and parameters: {'n_estimators': 88, 'learning_rate': 0.35025561262667915}. Best is trial 0 with value: 0.79.
[I 2024-12-27 12:00:12,819] Trial 2 finished with value: 0.83 and parameters: {'n_estimators': 323, 'learning_rate': 0.8513002491616882}. Best is trial 2 with value: 0.83.
[I 2024-12-27 12:00:43,058] Trial 3 finished with value: 0.79 and parameters: {'n_estimators': 437, 'learning_rate': 0.5113327295602126}. Best is trial 2 with value: 0.83.
[I 2024-12-27 12:00:47,894] Trial 4 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 76, 'learning_rate': 0.6673497972008017}. Best is trial 2 with value: 0.83.
[I 2024-12-27 12:01:19,403] Trial 5 finished with value: 0.79 and parameters: {'n_estimators': 459, 'learning_rate': 0.495051725

Optimizing Neural Network...


[I 2024-12-27 12:11:45,791] Trial 0 finished with value: 0.8966666666666666 and parameters: {'hidden_layer_1': 87, 'hidden_layer_2': 51, 'learning_rate_init': 0.07844955491454592}. Best is trial 0 with value: 0.8966666666666666.
[I 2024-12-27 12:12:01,211] Trial 1 finished with value: 0.9133333333333333 and parameters: {'hidden_layer_1': 99, 'hidden_layer_2': 67, 'learning_rate_init': 0.080559818301646}. Best is trial 1 with value: 0.9133333333333333.
[I 2024-12-27 12:12:25,357] Trial 2 finished with value: 0.8933333333333333 and parameters: {'hidden_layer_1': 86, 'hidden_layer_2': 55, 'learning_rate_init': 0.08959087760286319}. Best is trial 1 with value: 0.9133333333333333.
[I 2024-12-27 12:12:47,499] Trial 3 finished with value: 0.9066666666666666 and parameters: {'hidden_layer_1': 88, 'hidden_layer_2': 90, 'learning_rate_init': 0.02407576792191377}. Best is trial 1 with value: 0.9133333333333333.
[I 2024-12-27 12:13:18,116] Trial 4 finished with value: 0.89 and parameters: {'hidden

Optimizing MLP...


[I 2024-12-27 12:22:42,183] Trial 0 finished with value: 0.9166666666666666 and parameters: {'layer_1': 59, 'layer_2': 146, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.02849652858494237}. Best is trial 0 with value: 0.9166666666666666.
[I 2024-12-27 12:23:30,383] Trial 1 finished with value: 0.9133333333333333 and parameters: {'layer_1': 142, 'layer_2': 86, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.08386283683949947}. Best is trial 0 with value: 0.9166666666666666.
[I 2024-12-27 12:24:35,513] Trial 2 finished with value: 0.92 and parameters: {'layer_1': 50, 'layer_2': 122, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.02018636080410494}. Best is trial 2 with value: 0.92.
[I 2024-12-27 12:24:57,369] Trial 3 finished with value: 0.9166666666666666 and parameters: {'layer_1': 55, 'layer_2': 115, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.04592974504634596}. Best is trial 2 with value: 0.92.
[I 2024-12-27 12:

                  Model  Accuracy  Sensitivity  Specificity       MCC  \
0                   SVM  0.903333     0.873333     0.933333  0.808123   
1         Decision Tree  0.870000     0.780000     0.960000  0.752287   
2         Random Forest  0.883333     0.813333     0.953333  0.774292   
3   Logistic Regression  0.886667     0.853333     0.920000  0.775058   
4                  k-NN  0.883333     0.866667     0.900000  0.767093   
5           Naive Bayes  0.866667     0.880000     0.853333  0.733594   
6     Gradient Boosting  0.896667     0.866667     0.926667  0.794765   
7               XGBoost  0.910000     0.886667     0.933333  0.820894   
8              LightGBM  0.856667     0.793333     0.920000  0.719126   
9              AdaBoost  0.870000     0.793333     0.946667  0.748856   
10       Neural Network  0.923333     0.946667     0.900000  0.847590   
11                  MLP  0.930000     0.926667     0.933333  0.860019   

       Kappa       AUC                            

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Define models
models = {
    "SVM": lambda trial: SVC(probability=True, C=trial.suggest_float("C", 0.1, 10.0), kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])),
    "Decision Tree": lambda trial: DecisionTreeClassifier(max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Random Forest": lambda trial: RandomForestClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Logistic Regression": lambda trial: LogisticRegression(C=trial.suggest_float("C", 0.1, 10.0), solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])),
    "k-NN": lambda trial: KNeighborsClassifier(n_neighbors=trial.suggest_int("n_neighbors", 3, 20)),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), max_depth=trial.suggest_int("max_depth", 3, 20)),
    "XGBoost": lambda trial: XGBClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": lambda trial: LGBMClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)),
    "AdaBoost": lambda trial: AdaBoostClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)),
    "Neural Network": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("hidden_layer_1", 10, 100), trial.suggest_int("hidden_layer_2", 10, 100)), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200),
    "MLP": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("layer_1", 50, 150), trial.suggest_int("layer_2", 50, 150)), activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]), solver=trial.suggest_categorical("solver", ["adam", "sgd"]), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200, random_state=42)
}

# Prepare a dictionary to store model probabilities horizontally
probabilities = {"Target": y_val}  # Starting with the target column (y_val)

# Run optimization and compute probabilities for each model
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    study = optuna.create_study(direction="maximize")

    # Objective function for Optuna
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        return accuracy_score(y_val, model.predict(X_val))

    study.optimize(objective, n_trials=30)

    # Train the best model using the best hyperparameters
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)

    # Get predicted probabilities for the positive class (class 1)
    probs = best_model.predict_proba(X_val)[:, 1]

    # Add to the probabilities dictionary
    probabilities[model_name] = probs

# Convert the probabilities dictionary to a DataFrame
probability_df = pd.DataFrame(probabilities)

# Save the probability dataset to a CSV file
probability_df.to_csv("N_CTDD_OPTUNA_probability_predictions.csv", index=False)

print("Dataset saved successfully!")


[I 2024-12-27 12:42:15,487] A new study created in memory with name: no-name-dbb6ed62-df20-4645-9bda-74467a25d832


Optimizing SVM...


[I 2024-12-27 12:42:33,051] Trial 0 finished with value: 0.8866666666666667 and parameters: {'C': 6.522197047368653, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 12:43:06,665] Trial 1 finished with value: 0.8166666666666667 and parameters: {'C': 2.8208025533999015, 'kernel': 'poly'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 12:43:24,502] Trial 2 finished with value: 0.88 and parameters: {'C': 9.488685047879896, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 12:44:01,434] Trial 3 finished with value: 0.7133333333333334 and parameters: {'C': 0.7049368695613077, 'kernel': 'poly'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 12:44:39,589] Trial 4 finished with value: 0.9033333333333333 and parameters: {'C': 3.115673376766238, 'kernel': 'rbf'}. Best is trial 4 with value: 0.9033333333333333.
[I 2024-12-27 12:44:58,993] Trial 5 finished with value: 0.87 and parameters: {'C': 4.2

Optimizing Decision Tree...


[I 2024-12-27 13:00:07,505] Trial 0 finished with value: 0.8466666666666667 and parameters: {'max_depth': 16, 'min_samples_split': 3}. Best is trial 0 with value: 0.8466666666666667.
[I 2024-12-27 13:00:07,914] Trial 1 finished with value: 0.85 and parameters: {'max_depth': 19, 'min_samples_split': 10}. Best is trial 1 with value: 0.85.
[I 2024-12-27 13:00:08,092] Trial 2 finished with value: 0.71 and parameters: {'max_depth': 3, 'min_samples_split': 3}. Best is trial 1 with value: 0.85.
[I 2024-12-27 13:00:08,413] Trial 3 finished with value: 0.8433333333333334 and parameters: {'max_depth': 15, 'min_samples_split': 6}. Best is trial 1 with value: 0.85.
[I 2024-12-27 13:00:08,590] Trial 4 finished with value: 0.71 and parameters: {'max_depth': 3, 'min_samples_split': 3}. Best is trial 1 with value: 0.85.
[I 2024-12-27 13:00:08,878] Trial 5 finished with value: 0.7866666666666666 and parameters: {'max_depth': 8, 'min_samples_split': 8}. Best is trial 1 with value: 0.85.
[I 2024-12-27 13

Optimizing Random Forest...


[I 2024-12-27 13:00:22,525] Trial 0 finished with value: 0.86 and parameters: {'n_estimators': 496, 'max_depth': 14, 'min_samples_split': 3}. Best is trial 0 with value: 0.86.
[I 2024-12-27 13:00:25,824] Trial 1 finished with value: 0.88 and parameters: {'n_estimators': 444, 'max_depth': 20, 'min_samples_split': 7}. Best is trial 1 with value: 0.88.
[I 2024-12-27 13:00:29,635] Trial 2 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 370, 'max_depth': 18, 'min_samples_split': 7}. Best is trial 1 with value: 0.88.
[I 2024-12-27 13:00:31,320] Trial 3 finished with value: 0.85 and parameters: {'n_estimators': 316, 'max_depth': 10, 'min_samples_split': 4}. Best is trial 1 with value: 0.88.
[I 2024-12-27 13:00:32,072] Trial 4 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 256, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 1 with value: 0.88.
[I 2024-12-27 13:00:33,420] Trial 5 finished with value: 0.8433333333333334 and parameters: {

Optimizing Logistic Regression...


[I 2024-12-27 13:01:38,455] Trial 0 finished with value: 0.8866666666666667 and parameters: {'C': 8.519156777583722, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 13:01:39,035] Trial 1 finished with value: 0.8866666666666667 and parameters: {'C': 8.368781280885706, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 13:01:39,383] Trial 2 finished with value: 0.83 and parameters: {'C': 1.2252468319554448, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 13:01:39,697] Trial 3 finished with value: 0.78 and parameters: {'C': 0.45914246425044736, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 13:01:40,064] Trial 4 finished with value: 0.8866666666666667 and parameters: {'C': 9.88962285952172, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-27 13:01:40,410] Trial 5 finished with value: 0.8866666666666667 and parameter

Optimizing k-NN...


[I 2024-12-27 13:01:50,866] Trial 0 finished with value: 0.8566666666666667 and parameters: {'n_neighbors': 5}. Best is trial 0 with value: 0.8566666666666667.
[I 2024-12-27 13:01:51,225] Trial 1 finished with value: 0.8733333333333333 and parameters: {'n_neighbors': 8}. Best is trial 1 with value: 0.8733333333333333.
[I 2024-12-27 13:01:51,582] Trial 2 finished with value: 0.87 and parameters: {'n_neighbors': 13}. Best is trial 1 with value: 0.8733333333333333.
[I 2024-12-27 13:01:51,962] Trial 3 finished with value: 0.8733333333333333 and parameters: {'n_neighbors': 8}. Best is trial 1 with value: 0.8733333333333333.
[I 2024-12-27 13:01:52,318] Trial 4 finished with value: 0.8433333333333334 and parameters: {'n_neighbors': 7}. Best is trial 1 with value: 0.8733333333333333.
[I 2024-12-27 13:01:52,675] Trial 5 finished with value: 0.8833333333333333 and parameters: {'n_neighbors': 3}. Best is trial 5 with value: 0.8833333333333333.
[I 2024-12-27 13:01:53,054] Trial 6 finished with val

Optimizing Naive Bayes...


[I 2024-12-27 13:02:04,068] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 13:02:04,347] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 13:02:04,610] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 13:02:04,886] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 13:02:05,155] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 13:02:05,432] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-27 13:02:05,699] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2024-12-27 13:03:43,135] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 406, 'learning_rate': 0.012814936522365638, 'max_depth': 12}. Best is trial 0 with value: 0.88.
[I 2024-12-27 13:04:35,009] Trial 1 finished with value: 0.88 and parameters: {'n_estimators': 424, 'learning_rate': 0.4087957233124397, 'max_depth': 6}. Best is trial 0 with value: 0.88.
[I 2024-12-27 13:05:18,863] Trial 2 finished with value: 0.88 and parameters: {'n_estimators': 354, 'learning_rate': 0.28794746172113833, 'max_depth': 6}. Best is trial 0 with value: 0.88.
[I 2024-12-27 13:07:27,260] Trial 3 finished with value: 0.89 and parameters: {'n_estimators': 455, 'learning_rate': 0.31593965942704205, 'max_depth': 15}. Best is trial 3 with value: 0.89.
[I 2024-12-27 13:08:14,345] Trial 4 finished with value: 0.88 and parameters: {'n_estimators': 298, 'learning_rate': 0.4250822505634833, 'max_depth': 8}. Best is trial 3 with value: 0.89.
[I 2024-12-27 13:08:40,926] Trial 5 finished with valu

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 13:30:22,192] Trial 0 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 491, 'max_depth': 18, 'learning_rate': 0.12871272012978754}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 13:30:34,360] Trial 1 finished with value: 0.89 and parameters: {'n_estimators': 98, 'max_depth': 18, 'learning_rate': 0.4341950696763331}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 13:30:53,299] Trial 2 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 326, 'max_depth': 5, 'learning_rate': 0.09887981231758973}. Best is trial 0 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-27 13:31:26,933] Trial 3 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 494, 'max_depth': 14, 'learning_rate': 0.355902420

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:50,517] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 169, 'max_depth': 18, 'learning_rate': 0.13510140056884196}. Best is trial 0 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001440 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:51,174] Trial 1 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 442, 'max_depth': 14, 'learning_rate': 0.018644483791667117}. Best is trial 0 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001673 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:52,010] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 324, 'max_depth': 18, 'learning_rate': 0.07968577396220107}. Best is trial 0 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:52,758] Trial 3 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 496, 'max_depth': 11, 'learning_rate': 0.2970685164088263}. Best is trial 0 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001660 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:53,253] Trial 4 finished with value: 0.84 and parameters: {'n_estimators': 187, 'max_depth': 18, 'learning_rate': 0.42574345628125704}. Best is trial 0 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001685 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:53,835] Trial 5 finished with value: 0.85 and parameters: {'n_estimators': 321, 'max_depth': 9, 'learning_rate': 0.2964494672363349}. Best is trial 5 with value: 0.85.




[I 2024-12-27 13:42:54,223] Trial 6 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 59, 'max_depth': 19, 'learning_rate': 0.11314927162517875}. Best is trial 5 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001772 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:54,864] Trial 7 finished with value: 0.85 and parameters: {'n_estimators': 357, 'max_depth': 19, 'learning_rate': 0.4442399354126752}. Best is trial 5 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:55,508] Trial 8 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 390, 'max_depth': 6, 'learning_rate': 0.26084331454148746}. Best is trial 5 with value: 0.85.




[I 2024-12-27 13:42:55,846] Trial 9 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 154, 'max_depth': 16, 'learning_rate': 0.16753843852915373}. Best is trial 5 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001512 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:56,238] Trial 10 finished with value: 0.84 and parameters: {'n_estimators': 251, 'max_depth': 3, 'learning_rate': 0.34089462011483246}. Best is trial 5 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001485 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:56,825] Trial 11 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 337, 'max_depth': 10, 'learning_rate': 0.4959329602757284}. Best is trial 5 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001431 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:57,264] Trial 12 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 283, 'max_depth': 9, 'learning_rate': 0.381242722529995}. Best is trial 5 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:57,764] Trial 13 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 380, 'max_depth': 13, 'learning_rate': 0.48535515500222826}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001756 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:58,298] Trial 14 finished with value: 0.84 and parameters: {'n_estimators': 415, 'max_depth': 13, 'learning_rate': 0.20211551105519704}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001449 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:58,707] Trial 15 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 259, 'max_depth': 7, 'learning_rate': 0.4972100117837402}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001499 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:59,256] Trial 16 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 445, 'max_depth': 14, 'learning_rate': 0.34592203989226744}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:42:59,693] Trial 17 finished with value: 0.85 and parameters: {'n_estimators': 290, 'max_depth': 8, 'learning_rate': 0.21430713100337917}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001410 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:00,265] Trial 18 finished with value: 0.84 and parameters: {'n_estimators': 500, 'max_depth': 5, 'learning_rate': 0.39788945198610315}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:00,783] Trial 19 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 221, 'max_depth': 12, 'learning_rate': 0.3044344959806906}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001483 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:01,276] Trial 20 finished with value: 0.85 and parameters: {'n_estimators': 371, 'max_depth': 16, 'learning_rate': 0.44821969147545876}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:01,748] Trial 21 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 336, 'max_depth': 20, 'learning_rate': 0.45787799462591355}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:02,246] Trial 22 finished with value: 0.85 and parameters: {'n_estimators': 366, 'max_depth': 15, 'learning_rate': 0.3836151363705165}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:02,718] Trial 23 finished with value: 0.85 and parameters: {'n_estimators': 315, 'max_depth': 11, 'learning_rate': 0.46104445342143535}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:03,261] Trial 24 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 412, 'max_depth': 9, 'learning_rate': 0.32888571679618}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001096 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:03,756] Trial 25 finished with value: 0.85 and parameters: {'n_estimators': 361, 'max_depth': 5, 'learning_rate': 0.40614255588929404}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2024-12-27 13:43:04,320] Trial 26 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 452, 'max_depth': 12, 'learning_rate': 0.25907517489518517}. Best is trial 13 with value: 0.8566666666666667.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:04,893] Trial 27 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 465, 'max_depth': 12, 'learning_rate': 0.2397854996477728}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:05,537] Trial 28 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 397, 'max_depth': 9, 'learning_rate': 0.266722692213302}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001694 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:06,332] Trial 29 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 461, 'max_depth': 13, 'learning_rate': 0.16257255191807424}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001685 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-27 13:43:06,978] A new study created in memory with name: no-name-4c004ee2-1657-4e59-9857-33e602ce5af9


Optimizing AdaBoost...


[I 2024-12-27 13:43:29,549] Trial 0 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 318, 'learning_rate': 0.4430061071078897}. Best is trial 0 with value: 0.7733333333333333.
[I 2024-12-27 13:43:42,971] Trial 1 finished with value: 0.69 and parameters: {'n_estimators': 189, 'learning_rate': 0.14470766620577785}. Best is trial 0 with value: 0.7733333333333333.
[I 2024-12-27 13:44:02,987] Trial 2 finished with value: 0.79 and parameters: {'n_estimators': 294, 'learning_rate': 0.5945155430412854}. Best is trial 2 with value: 0.79.
[I 2024-12-27 13:44:22,274] Trial 3 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 267, 'learning_rate': 0.9184748850442154}. Best is trial 3 with value: 0.8166666666666667.
[I 2024-12-27 13:44:28,501] Trial 4 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 89, 'learning_rate': 0.9213674786748874}. Best is trial 3 with value: 0.8166666666666667.
[I 2024-12-27 13:44:43,502] Trial 5 finish

Optimizing Neural Network...


[I 2024-12-27 13:56:11,840] Trial 0 finished with value: 0.9133333333333333 and parameters: {'hidden_layer_1': 65, 'hidden_layer_2': 52, 'learning_rate_init': 0.0854995331262163}. Best is trial 0 with value: 0.9133333333333333.
[I 2024-12-27 13:56:32,265] Trial 1 finished with value: 0.92 and parameters: {'hidden_layer_1': 92, 'hidden_layer_2': 65, 'learning_rate_init': 0.053156026165303315}. Best is trial 1 with value: 0.92.
[I 2024-12-27 13:56:50,967] Trial 2 finished with value: 0.8933333333333333 and parameters: {'hidden_layer_1': 20, 'hidden_layer_2': 16, 'learning_rate_init': 0.009201654591059066}. Best is trial 1 with value: 0.92.
[I 2024-12-27 13:57:15,230] Trial 3 finished with value: 0.9066666666666666 and parameters: {'hidden_layer_1': 67, 'hidden_layer_2': 95, 'learning_rate_init': 0.03672685483015423}. Best is trial 1 with value: 0.92.
[I 2024-12-27 13:57:26,452] Trial 4 finished with value: 0.9033333333333333 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 25, 'l

Optimizing MLP...


[I 2024-12-27 14:08:05,909] Trial 0 finished with value: 0.9 and parameters: {'layer_1': 147, 'layer_2': 130, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.07026376160387535}. Best is trial 0 with value: 0.9.
[I 2024-12-27 14:08:45,301] Trial 1 finished with value: 0.89 and parameters: {'layer_1': 148, 'layer_2': 91, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.06551448739697524}. Best is trial 0 with value: 0.9.
[I 2024-12-27 14:09:33,937] Trial 2 finished with value: 0.9133333333333333 and parameters: {'layer_1': 125, 'layer_2': 54, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.05472908060290152}. Best is trial 2 with value: 0.9133333333333333.
[I 2024-12-27 14:09:59,973] Trial 3 finished with value: 0.9066666666666666 and parameters: {'layer_1': 59, 'layer_2': 149, 'activation': 'relu', 'solver': 'adam', 'learning_rate_init': 0.055146588458306355}. Best is trial 2 with value: 0.9133333333333333.
[I 2024-12-27 14:10:28,554] Trial 4

Dataset saved successfully!


Class Feature Vector (CFV)

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ CTDD_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_negative_features.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]

# Define models with hyperparameter optimization (Optuna)
models = {
    "SVM": lambda trial: SVC(
         probability=True,
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}

# Initialize a list to store the CFV data
cfv_data = []

# Define the optimization and prediction function
def optimize_and_predict(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]  # Get probability for class 1 (positive)
        return accuracy_score(y_val, model.predict(X_val))

    # Perform optimization with Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Return the best model
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)
    preds = best_model.predict_proba(X_val)[:, 1]  # Using the probability for class 1 (positive)

    # Append predictions to CFV list
    return preds

# Train each model and generate predictions for CFV
for model_name, model_func in models.items():
    print(f"Training and predicting with {model_name}...")
    preds = optimize_and_predict(model_name, model_func)
    cfv_data.append(preds)

# Convert the CFV data into a DataFrame
cfv_df = pd.DataFrame(np.array(cfv_data).T, columns=models.keys())

# Optionally, add the true labels column
cfv_df["True_Label"] = y_val.values

# Save the CFV dataset to CSV
cfv_df.to_csv("CFV_CTDD.csv", index=False)
print("CFV dataset created and saved!")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-15 12:03:24,608] A new study created in memory with name: no-name-6990331d-0838-4aa2-8fb8-8c2ed1c3b1c4


Training and predicting with SVM...


[I 2025-01-15 12:03:53,296] Trial 0 finished with value: 0.8833333333333333 and parameters: {'C': 8.63059532325755, 'kernel': 'linear'}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-15 12:04:28,222] Trial 1 finished with value: 0.8833333333333333 and parameters: {'C': 0.5434157206769458, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-15 12:05:11,592] Trial 2 finished with value: 0.9033333333333333 and parameters: {'C': 3.6730715107125906, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2025-01-15 12:05:53,994] Trial 3 finished with value: 0.9033333333333333 and parameters: {'C': 6.5165850420915294, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2025-01-15 12:06:36,620] Trial 4 finished with value: 0.9033333333333333 and parameters: {'C': 2.657868985539425, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2025-01-15 12:07:10,040] Trial 5 finished with value: 0.8833333333333333 

Training and predicting with Decision Tree...


[I 2025-01-15 12:23:15,302] Trial 0 finished with value: 0.7233333333333334 and parameters: {'max_depth': 4, 'min_samples_split': 4}. Best is trial 0 with value: 0.7233333333333334.
[I 2025-01-15 12:23:15,583] Trial 1 finished with value: 0.78 and parameters: {'max_depth': 8, 'min_samples_split': 4}. Best is trial 1 with value: 0.78.
[I 2025-01-15 12:23:16,008] Trial 2 finished with value: 0.8533333333333334 and parameters: {'max_depth': 20, 'min_samples_split': 5}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-15 12:23:16,323] Trial 3 finished with value: 0.7933333333333333 and parameters: {'max_depth': 9, 'min_samples_split': 7}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-15 12:23:16,577] Trial 4 finished with value: 0.71 and parameters: {'max_depth': 3, 'min_samples_split': 10}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-15 12:23:16,917] Trial 5 finished with value: 0.78 and parameters: {'max_depth': 7, 'min_samples_split': 7}. Best is tr

Training and predicting with Random Forest...


[I 2025-01-15 12:23:29,911] Trial 0 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 133, 'max_depth': 19, 'min_samples_split': 5}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-15 12:23:30,422] Trial 1 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 67, 'max_depth': 13, 'min_samples_split': 7}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-15 12:23:31,598] Trial 2 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 388, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-15 12:23:32,553] Trial 3 finished with value: 0.8033333333333333 and parameters: {'n_estimators': 238, 'max_depth': 3, 'min_samples_split': 3}. Best is trial 0 with value: 0.8866666666666667.
[I 2025-01-15 12:23:33,347] Trial 4 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 86, 'max_depth': 7, 'min_samples_split': 10}. Best is trial 0 with value: 0.88

Training and predicting with Logistic Regression...


[I 2025-01-15 12:24:15,979] Trial 0 finished with value: 0.86 and parameters: {'C': 2.369984250079421, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.86.
[I 2025-01-15 12:24:16,471] Trial 1 finished with value: 0.8866666666666667 and parameters: {'C': 5.031442743941689, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 12:24:16,933] Trial 2 finished with value: 0.8833333333333333 and parameters: {'C': 7.334113223963511, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 12:24:17,391] Trial 3 finished with value: 0.8366666666666667 and parameters: {'C': 1.4154594267286449, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 12:24:17,853] Trial 4 finished with value: 0.8266666666666667 and parameters: {'C': 1.1438467278077997, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 12:24:18,637] Trial 5 finished with value: 0.8866666666666667 and param

Training and predicting with k-NN...


[I 2025-01-15 12:24:29,582] Trial 0 finished with value: 0.84 and parameters: {'n_neighbors': 18}. Best is trial 0 with value: 0.84.
[I 2025-01-15 12:24:30,457] Trial 1 finished with value: 0.87 and parameters: {'n_neighbors': 10}. Best is trial 1 with value: 0.87.
[I 2025-01-15 12:24:31,423] Trial 2 finished with value: 0.88 and parameters: {'n_neighbors': 9}. Best is trial 2 with value: 0.88.
[I 2025-01-15 12:24:32,535] Trial 3 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 15}. Best is trial 2 with value: 0.88.
[I 2025-01-15 12:24:33,708] Trial 4 finished with value: 0.8433333333333334 and parameters: {'n_neighbors': 7}. Best is trial 2 with value: 0.88.
[I 2025-01-15 12:24:35,071] Trial 5 finished with value: 0.8833333333333333 and parameters: {'n_neighbors': 3}. Best is trial 5 with value: 0.8833333333333333.
[I 2025-01-15 12:24:36,052] Trial 6 finished with value: 0.87 and parameters: {'n_neighbors': 10}. Best is trial 5 with value: 0.8833333333333333.
[I

Training and predicting with Naive Bayes...


[I 2025-01-15 12:24:54,544] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 12:24:54,891] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 12:24:55,237] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 12:24:55,585] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 12:24:55,931] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 12:24:56,273] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 12:24:56,623] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Training and predicting with Gradient Boosting...


[I 2025-01-15 12:25:49,856] Trial 0 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 144, 'learning_rate': 0.4787747769126398, 'max_depth': 16}. Best is trial 0 with value: 0.8833333333333333.
[I 2025-01-15 12:25:56,310] Trial 1 finished with value: 0.9 and parameters: {'n_estimators': 51, 'learning_rate': 0.10572701535835176, 'max_depth': 7}. Best is trial 1 with value: 0.9.
[I 2025-01-15 12:26:26,185] Trial 2 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 335, 'learning_rate': 0.49953541920024247, 'max_depth': 4}. Best is trial 1 with value: 0.9.
[I 2025-01-15 12:26:39,247] Trial 3 finished with value: 0.89 and parameters: {'n_estimators': 60, 'learning_rate': 0.3975353660484763, 'max_depth': 11}. Best is trial 1 with value: 0.9.
[I 2025-01-15 12:28:18,410] Trial 4 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 268, 'learning_rate': 0.23813951876138856, 'max_depth': 20}. Best is trial 1 with value: 0.9.
[I 20

Training and predicting with XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 12:55:11,440] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 165, 'max_depth': 8, 'learning_rate': 0.02496536066505519}. Best is trial 0 with value: 0.8466666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 12:55:39,993] Trial 1 finished with value: 0.91 and parameters: {'n_estimators': 399, 'max_depth': 12, 'learning_rate': 0.34408487653655345}. Best is trial 1 with value: 0.91.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 12:55:54,794] Trial 2 finished with value: 0.9 and parameters: {'n_estimators': 163, 'max_depth': 8, 'learning_rate': 0.10553230482031642}. Best is trial 1 with value: 0.91.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 12:56:17,352] Trial 3 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 330, 'max_depth': 6, 'learning_rate': 0.20291030427563742}. Best is trial 1 with value: 0.91

Training and predicting with LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:53,624] Trial 0 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 382, 'max_depth': 13, 'learning_rate': 0.038752157517595164}. Best is trial 0 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001684 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:54,356] Trial 1 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 369, 'max_depth': 17, 'learning_rate': 0.26619353283811836}. Best is trial 0 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001759 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:54,848] Trial 2 finished with value: 0.83 and parameters: {'n_estimators': 154, 'max_depth': 3, 'learning_rate': 0.16600139739699257}. Best is trial 0 with value: 0.8433333333333334.
[I 2025-01-15 13:07:55,195] Trial 3 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 156, 'max_depth': 15, 'learning_rate': 0.44265547691289187}. Best is trial 3 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001450 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001769 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:55,585] Trial 4 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 187, 'max_depth': 17, 'learning_rate': 0.280592314932083}. Best is trial 3 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001407 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:55,965] Trial 5 finished with value: 0.84 and parameters: {'n_estimators': 194, 'max_depth': 11, 'learning_rate': 0.0730200738379904}. Best is trial 3 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001424 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:56,383] Trial 6 finished with value: 0.85 and parameters: {'n_estimators': 241, 'max_depth': 19, 'learning_rate': 0.17686388618616108}. Best is trial 6 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:57,069] Trial 7 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 464, 'max_depth': 7, 'learning_rate': 0.270429477596015}. Best is trial 6 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:57,544] Trial 8 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 346, 'max_depth': 3, 'learning_rate': 0.08937988389754589}. Best is trial 6 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:58,072] Trial 9 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 409, 'max_depth': 9, 'learning_rate': 0.3674541516814243}. Best is trial 6 with value: 0.85.




[I 2025-01-15 13:07:58,379] Trial 10 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 74, 'max_depth': 20, 'learning_rate': 0.16852023062430957}. Best is trial 6 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001473 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:58,818] Trial 11 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 245, 'max_depth': 16, 'learning_rate': 0.4781808712652478}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001437 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:59,261] Trial 12 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 267, 'max_depth': 20, 'learning_rate': 0.4962960224484029}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001438 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:07:59,712] Trial 13 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 267, 'max_depth': 17, 'learning_rate': 0.36542366156694495}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:00,179] Trial 14 finished with value: 0.85 and parameters: {'n_estimators': 295, 'max_depth': 14, 'learning_rate': 0.3786642750957714}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001421 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:01,081] Trial 15 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 304, 'max_depth': 16, 'learning_rate': 0.3761466651059144}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:01,507] Trial 16 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 232, 'max_depth': 12, 'learning_rate': 0.49575562667790857}. Best is trial 13 with value: 0.8566666666666667.
[I 2025-01-15 13:08:01,845] Trial 17 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 110, 'max_depth': 10, 'learning_rate': 0.4141324549700156}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:02,332] Trial 18 finished with value: 0.85 and parameters: {'n_estimators': 325, 'max_depth': 8, 'learning_rate': 0.3317579223536459}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:02,764] Trial 19 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 220, 'max_depth': 12, 'learning_rate': 0.44922852892017745}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001413 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:03,319] Trial 20 finished with value: 0.84 and parameters: {'n_estimators': 434, 'max_depth': 6, 'learning_rate': 0.3192858479016335}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:03,800] Trial 21 finished with value: 0.85 and parameters: {'n_estimators': 262, 'max_depth': 18, 'learning_rate': 0.493682388701261}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001461 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:04,216] Trial 22 finished with value: 0.85 and parameters: {'n_estimators': 221, 'max_depth': 14, 'learning_rate': 0.4494247842267607}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001443 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:04,678] Trial 23 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 283, 'max_depth': 16, 'learning_rate': 0.4977288294312773}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:05,276] Trial 24 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 138, 'max_depth': 13, 'learning_rate': 0.4136142852036361}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001775 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:05,849] Trial 25 finished with value: 0.84 and parameters: {'n_estimators': 209, 'max_depth': 15, 'learning_rate': 0.41383293735206744}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001722 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:06,452] Trial 26 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 252, 'max_depth': 18, 'learning_rate': 0.4625490406045055}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001701 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:07,157] Trial 27 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 324, 'max_depth': 11, 'learning_rate': 0.31688097209067084}. Best is trial 13 with value: 0.8566666666666667.
[I 2025-01-15 13:08:07,586] Trial 28 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 51, 'max_depth': 13, 'learning_rate': 0.36270276764550313}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001699 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:08,158] Trial 29 finished with value: 0.85 and parameters: {'n_estimators': 176, 'max_depth': 15, 'learning_rate': 0.22605849051365862}. Best is trial 13 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001743 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 13:08:08,710] A new study created in memory with name: no-name-f4caf2ef-0392-4ae0-b0de-36842dfe50db


Training and predicting with AdaBoost...


[I 2025-01-15 13:08:23,811] Trial 0 finished with value: 0.72 and parameters: {'n_estimators': 187, 'learning_rate': 0.2221364855585346}. Best is trial 0 with value: 0.72.
[I 2025-01-15 13:08:29,541] Trial 1 finished with value: 0.7666666666666667 and parameters: {'n_estimators': 77, 'learning_rate': 0.8238301548059878}. Best is trial 1 with value: 0.7666666666666667.
[I 2025-01-15 13:09:00,323] Trial 2 finished with value: 0.7866666666666666 and parameters: {'n_estimators': 387, 'learning_rate': 0.4449350663873341}. Best is trial 2 with value: 0.7866666666666666.
[I 2025-01-15 13:09:24,632] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 296, 'learning_rate': 0.8904237576448794}. Best is trial 3 with value: 0.8433333333333334.
[I 2025-01-15 13:09:40,707] Trial 4 finished with value: 0.79 and parameters: {'n_estimators': 201, 'learning_rate': 0.8287700266510682}. Best is trial 3 with value: 0.8433333333333334.
[I 2025-01-15 13:10:19,936] Trial 5 finishe

Training and predicting with Neural Network...


[I 2025-01-15 13:22:30,920] Trial 0 finished with value: 0.8966666666666666 and parameters: {'hidden_layer_1': 52, 'hidden_layer_2': 18, 'learning_rate_init': 0.016588702168304912}. Best is trial 0 with value: 0.8966666666666666.
[I 2025-01-15 13:22:46,237] Trial 1 finished with value: 0.9133333333333333 and parameters: {'hidden_layer_1': 73, 'hidden_layer_2': 40, 'learning_rate_init': 0.07247407292591374}. Best is trial 1 with value: 0.9133333333333333.
[I 2025-01-15 13:23:15,969] Trial 2 finished with value: 0.8833333333333333 and parameters: {'hidden_layer_1': 94, 'hidden_layer_2': 72, 'learning_rate_init': 0.05607440893373081}. Best is trial 1 with value: 0.9133333333333333.
[I 2025-01-15 13:23:33,457] Trial 3 finished with value: 0.9166666666666666 and parameters: {'hidden_layer_1': 76, 'hidden_layer_2': 74, 'learning_rate_init': 0.061624666064036916}. Best is trial 3 with value: 0.9166666666666666.
[I 2025-01-15 13:23:41,223] Trial 4 finished with value: 0.9066666666666666 and pa

Training and predicting with MLP...


[I 2025-01-15 13:35:19,046] Trial 0 finished with value: 0.9133333333333333 and parameters: {'layer_1': 61, 'layer_2': 133, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.02569048255583432}. Best is trial 0 with value: 0.9133333333333333.
[I 2025-01-15 13:36:13,922] Trial 1 finished with value: 0.8666666666666667 and parameters: {'layer_1': 148, 'layer_2': 51, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.05298967473630021}. Best is trial 0 with value: 0.9133333333333333.
[I 2025-01-15 13:36:33,630] Trial 2 finished with value: 0.89 and parameters: {'layer_1': 115, 'layer_2': 100, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.00833720816874194}. Best is trial 0 with value: 0.9133333333333333.
[I 2025-01-15 13:37:09,887] Trial 3 finished with value: 0.9033333333333333 and parameters: {'layer_1': 145, 'layer_2': 110, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.08389354522460025}. Best is trial 0 with value: 0.91333333

CPFV (Combined Probability and Class Feature Vector)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ CTDD_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_negative_features.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)], ignore_index=True)
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)], ignore_index=True)

# Separate features and labels
X_train = main_data.drop(columns=["label"])
y_train = main_data["label"]
X_val = validation_data.drop(columns=["label"])
y_val = validation_data["label"]

# Initialize models with their tuned hyperparameters
trained_models = {
    "SVM": SVC(C=1.0, kernel="rbf", probability=True),  # Example parameters
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5),
    "Logistic Regression": LogisticRegression(C=1.0, solver="lbfgs"),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(n_estimators=100, max_depth=10, learning_rate=0.1),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1),
    "Neural Network (MLPClassifier)": MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200),
    "Multilayer Perceptron (Custom MLP)": MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200)
}

# Train all models on the training dataset
for model_name, model in trained_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

# Function to create CPFV dataset
def create_cpfv(models, X_data, y_data):
    cpfv_data = pd.DataFrame()
    for model_name, model in models.items():
        # Add predicted class labels
        cpfv_data[f"{model_name}_Class"] = model.predict(X_data)
        # Add predicted probabilities or decision scores
        if hasattr(model, "predict_proba"):
            cpfv_data[f"{model_name}_Prob"] = model.predict_proba(X_data)[:, 1]
        elif hasattr(model, "decision_function"):
            cpfv_data[f"{model_name}_Prob"] = model.decision_function(X_data)
        else:
            cpfv_data[f"{model_name}_Prob"] = cpfv_data[f"{model_name}_Class"]
    # Add true labels
    cpfv_data["True_Label"] = y_data.reset_index(drop=True)
    return cpfv_data

# Create CPFV dataset using validation data
cpfv_dataset = create_cpfv(trained_models, X_val, y_val)

# Save CPFV dataset to CSV
cpfv_dataset.to_csv("CPFV_CTDD.csv", index=False)




Training SVM...
Training Decision Tree...
Training Random Forest...
Training Logistic Regression...
Training k-NN...
Training Naive Bayes...
Training Gradient Boosting...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001713 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training AdaBoost...
Training Neural Network (MLPClassifier)...
Training Multilayer Perceptron (Custom MLP)...


# **Hyperparameter grids for RandomizedSearchCV**

In [None]:
#import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier # Import path for KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ CTDD_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_negative_features.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for RandomizedSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply random search
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


ModuleNotFoundError: No module named 'optuna'

In [None]:
# Storage for predictions and target column
probability_datasets = pd.DataFrame(y_train, columns=['Target'])

# Loop through models, perform random search, and save probabilities
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model,
                                       param_grid,
                                       n_iter=10,
                                       cv=cv,
                                       scoring='accuracy',
                                       n_jobs=-1,
                                       random_state=42)

    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)



    # Get probability predictions (if supported)
    if hasattr(random_search.best_estimator_, "predict_proba"):
        probabilities = random_search.best_estimator_.predict_proba(X_train)[:, 1]  # Probability for the positive class
        probability_datasets[f"{model_name}_Probabilities"] = probabilities
    else:
        # Fallback if probability prediction isn't supported
        predictions = random_search.best_estimator_.predict(X_train)
        probability_datasets[f"{model_name}_Predictions"] = predictions



# Display final dataset with probabilities
print(probability_datasets.head())

# Save the probability dataset to a CSV file
probability_datasets.to_csv("CTDD_Randomsearch .csv", index=False)
print("Probability dataset saved to 'CTDD_Randomsearch.csv'.")


# **Hyperparameter grids for GridSearchCV**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ CTDD_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/CTDD_validation_negative_features.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for GridSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


In [None]:
# Prepare the final dataset with probabilities and target
all_probabilities = []
all_targets = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")

    # Special handling for SVC: enable probability estimation
    if model_name == "SVM":
        model.probability = True  # Enable probability for SVC

    # Get the parameter grid for the current model
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

    # Predict probabilities using the best estimator
    best_model = grid_search.best_estimator_
    if hasattr(best_model, "predict_proba"):
        probabilities = best_model.predict_proba(X_train)[:, 1]  # Positive class probabilities
    else:
        # Fallback for models without predict_proba (e.g., SVM with linear kernel)
        probabilities = best_model.decision_function(X_train)
        probabilities = (probabilities - probabilities.min()) / (probabilities.max() - probabilities.min())

    # Append probabilities and targets for this model
    all_probabilities.append(probabilities)
    all_targets.append(y_train)

    # Combine probabilities, features, and target into a DataFrame
    model_data = pd.DataFrame(X_train, columns=main_p.columns)  # Ensure column consistency
    model_data[f"{model_name}_probability"] = probabilities
    model_data['target'] = y_train

    # Save to CSV
    output_path = f"/content/{model_name}_probabilities.csv"
    model_data.to_csv(output_path, index=False)
    print(f"Saved probabilities for {model_name} to {output_path}")

# Combine all model probabilities into a single DataFrame (optional)
final_dataset = pd.DataFrame({'target': y_train})
for idx, model_name in enumerate(models.keys()):
    final_dataset[f"{model_name}_probability"] = all_probabilities[idx]

# Save the combined dataset
final_output_path = "/content/combined_probabilities CTDD_GridSearchCV.csv"
final_dataset.to_csv(final_output_path, index=False)
print(f"Saved combined dataset to {final_output_path}")


proposed model for CTDD

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report, confusion_matrix,
                            matthews_corrcoef, cohen_kappa_score, roc_auc_score)

# Load all datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_main_positive_features (3).csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_validation_negative_features.csv")

# Add target labels (1 for positive, 0 for negative)
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

# Combine main datasets for training and validation datasets for testing
train_data = pd.concat([main_p, main_n], axis=0)
validation_data = pd.concat([validation_p, validation_n], axis=0)

# Shuffle the datasets
train_data = train_data.sample(frac=1, random_state=42).reset_index(drop=True)
validation_data = validation_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check for non-numeric columns and remove them
non_numeric_cols = train_data.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric_cols:
    print(f"Dropping non-numeric columns: {non_numeric_cols}")
    train_data = train_data.drop(columns=non_numeric_cols)
    validation_data = validation_data.drop(columns=non_numeric_cols)

# Separate features and labels
X_train = train_data.drop(columns=['Target']).values
y_train = train_data['Target'].values
X_val = validation_data.drop(columns=['Target']).values
y_val = validation_data['Target'].values

# Check for NaN or infinite values
print(f"NaN in X_train: {np.isnan(X_train).any()}")
print(f"Inf in X_train: {np.isinf(X_train).any()}")
print(f"NaN in X_val: {np.isnan(X_val).any()}")
print(f"Inf in X_val: {np.isinf(X_val).any()}")

# Replace any NaN or infinite values
X_train = np.nan_to_num(X_train)
X_val = np.nan_to_num(X_val)

# Normalize features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Reshape data for Conv1D input
X_train = X_train[..., np.newaxis]
X_val = X_val[..., np.newaxis]

# Model Architecture with improvements
model = Sequential()

# First Conv1D layer with input specification
model.add(Conv1D(filters=32, kernel_size=3, activation='relu',
                input_shape=(X_train.shape[1], 1),
                kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

# Second Conv1D layer
model.add(Conv1D(filters=64, kernel_size=3, activation='relu',
                kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(Dropout(0.3))

# Third Conv1D layer
model.add(Conv1D(filters=128, kernel_size=3, activation='relu',
                kernel_regularizer=l2(0.001)))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# LSTM layer with tanh activation (more stable than relu for RNNs)
model.add(LSTM(32, return_sequences=False, activation='tanh'))

# Dense layers with reduced complexity
model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.001)))
model.add(Dropout(0.3))
model.add(Dense(1, activation='sigmoid'))

# Compile with lower learning rate and gradient clipping
optimizer = Adam(learning_rate=0.0001, clipvalue=0.5)
model.compile(optimizer=optimizer,
             loss='binary_crossentropy',
             metrics=['accuracy'])

# Model Summary
model.summary()

# Early stopping callback
early_stopping = EarlyStopping(monitor='val_loss',
                             patience=10,
                             restore_best_weights=True)

# Train the model
history = model.fit(X_train, y_train,
                   validation_data=(X_val, y_val),
                   epochs=10,
                   batch_size=32,
                   verbose=1,
                   callbacks=[early_stopping])

# Evaluate the model
val_probabilities = model.predict(X_val).flatten()
val_predictions = (val_probabilities > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_val, val_predictions)
tn, fp, fn, tp = confusion_matrix(y_val, val_predictions).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
mcc = matthews_corrcoef(y_val, val_predictions)
kappa = cohen_kappa_score(y_val, val_predictions)
auc = roc_auc_score(y_val, val_probabilities)

# Print metrics
print("\nValidation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"Matthews Correlation Coefficient (MCC): {mcc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"Area Under Curve (AUC): {auc:.4f}")

# Classification report
print("\nClassification Report:\n", classification_report(y_val, val_predictions))

# Confusion matrix
print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_predictions))

NaN in X_train: False
Inf in X_train: False
NaN in X_val: False
Inf in X_val: False


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m125s[0m 3s/step - accuracy: 0.5247 - loss: 0.9081 - val_accuracy: 0.5000 - val_loss: 0.9064
Epoch 2/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 3s/step - accuracy: 0.4895 - loss: 0.9068 - val_accuracy: 0.5000 - val_loss: 0.9025
Epoch 3/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m117s[0m 3s/step - accuracy: 0.5241 - loss: 0.9002 - val_accuracy: 0.5000 - val_loss: 0.8989
Epoch 4/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 3s/step - accuracy: 0.4911 - loss: 0.8988 - val_accuracy: 0.5000 - val_loss: 0.8953
Epoch 5/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 3s/step - accuracy: 0.4740 - loss: 0.8966 - val_accuracy: 0.5033 - val_loss: 0.8918
Epoch 6/10
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m144s[0m 3s/step - accuracy: 0.4838 - loss: 0.8927 - val_accuracy: 0.5033 - val_loss: 0.8882
Epoch 7/10
[1m37/37[0m [32m━━━━

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, matthews_corrcoef,
                           cohen_kappa_score, roc_auc_score)

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_main_positive_features (3).csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_validation_negative_features.csv")

# Add target labels
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

# Combine and shuffle data
train_data = pd.concat([main_p, main_n]).sample(frac=1, random_state=42)
validation_data = pd.concat([validation_p, validation_n]).sample(frac=1, random_state=42)

# Check class balance
print("Class distribution in training set:")
print(train_data['Target'].value_counts())
print("\nClass distribution in validation set:")
print(validation_data['Target'].value_counts())

# Drop non-numeric columns
non_numeric_cols = train_data.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric_cols:
    print(f"Dropping non-numeric columns: {non_numeric_cols}")
    train_data = train_data.drop(columns=non_numeric_cols)
    validation_data = validation_data.drop(columns=non_numeric_cols)

# Separate features and labels
X_train = train_data.drop(columns=['Target']).values
y_train = train_data['Target'].values
X_val = validation_data.drop(columns=['Target']).values
y_val = validation_data['Target'].values

# Handle NaN/inf
X_train = np.nan_to_num(X_train)
X_val = np.nan_to_num(X_val)

# Normalize with clipping
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_train = np.clip(X_train, -5, 5)
X_val = np.clip(X_val, -5, 5)

# Build Dense network (appropriate for CTDD features)
model = Sequential()
model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],)))
model.add(BatchNormalization())
model.add(Dropout(0.5))

model.add(Dense(128, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.4))

model.add(Dense(64, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.3))

model.add(Dense(1, activation='sigmoid'))

# Compile with class weighting if imbalanced
optimizer = Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy',
                      tf.keras.metrics.AUC(name='auc'),
                      tf.keras.metrics.Recall(name='recall'),
                      tf.keras.metrics.Precision(name='precision')])

# Model summary
model.summary()

# Early stopping
early_stop = EarlyStopping(monitor='val_auc',
                          patience=15,
                          mode='max',
                          restore_best_weights=True)

# Train model
history = model.fit(X_train, y_train,
                   validation_data=(X_val, y_val),
                   epochs=10,
                   batch_size=64,
                   callbacks=[early_stop],
                   verbose=1)

# Evaluate
val_probabilities = model.predict(X_val).flatten()
val_predictions = (val_probabilities > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_val, val_predictions)
tn, fp, fn, tp = confusion_matrix(y_val, val_predictions).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
mcc = matthews_corrcoef(y_val, val_predictions)
kappa = cohen_kappa_score(y_val, val_predictions)
auc = roc_auc_score(y_val, val_probabilities)

# Print metrics
print("\nValidation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"MCC: {mcc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"AUC: {auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, val_predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_predictions))

Class distribution in training set:
Target
0    582
1    582
Name: count, dtype: int64

Class distribution in validation set:
Target
0    150
1    150
Name: count, dtype: int64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 124ms/step - accuracy: 0.5528 - auc: 0.5932 - loss: 0.8716 - precision: 0.5551 - recall: 0.5175 - val_accuracy: 0.7700 - val_auc: 0.8686 - val_loss: 0.6276 - val_precision: 0.8240 - val_recall: 0.6867
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 88ms/step - accuracy: 0.7365 - auc: 0.8269 - loss: 0.5164 - precision: 0.7563 - recall: 0.6960 - val_accuracy: 0.8600 - val_auc: 0.9293 - val_loss: 0.5431 - val_precision: 0.8375 - val_recall: 0.8933
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 66ms/step - accuracy: 0.8575 - auc: 0.9306 - loss: 0.3509 - precision: 0.8687 - recall: 0.8593 - val_accuracy: 0.8667 - val_auc: 0.9431 - val_loss: 0.4753 - val_precision: 0.8571 - val_recall: 0.8800
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.8964 - auc: 0.9626 - loss: 0.2597 - precision: 0.8960 - recall: 0.8949 - va

In [None]:
#cross validation 5 fold

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, matthews_corrcoef,
                           cohen_kappa_score, roc_auc_score)

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_main_positive_features (3).csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD/ctdd_validation_negative_features.csv")

# Add target labels
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

# Combine all data for cross-validation
all_data = pd.concat([main_p, main_n, validation_p, validation_n])
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check class balance
print("Class distribution:")
print(all_data['Target'].value_counts())

# Drop non-numeric columns
non_numeric_cols = all_data.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric_cols:
    print(f"\nDropping non-numeric columns: {non_numeric_cols}")
    all_data = all_data.drop(columns=non_numeric_cols)

# Separate features and labels
X = all_data.drop(columns=['Target']).values
y = all_data['Target'].values

# Initialize 5-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
results = []

for train_idx, val_idx in kfold.split(X, y):
    print(f'\n{"="*40}')
    print(f'Training fold {fold_no}')
    print(f'{"="*40}')

    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Handle NaN/inf and normalize with clipping
    X_train = np.nan_to_num(X_train)
    X_val = np.nan_to_num(X_val)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_train = np.clip(X_train, -5, 5)
    X_val = np.clip(X_val, -5, 5)

    # Build model with regularization
    model = Sequential()
    model.add(Dense(256, activation='relu', input_shape=(X_train.shape[1],),
                    kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.5))

    model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.4))

    model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
    model.add(BatchNormalization())
    model.add(Dropout(0.3))

    model.add(Dense(1, activation='sigmoid'))

    # Compile with adjusted learning rate
    optimizer = Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    # Early stopping
    early_stop = EarlyStopping(monitor='val_auc',
                              patience=15,
                              mode='max',
                              restore_best_weights=True,
                              verbose=1)

    # Class weighting if imbalanced
    class_counts = np.bincount(y_train)
    class_weight = {0: 1/class_counts[0], 1: 1/class_counts[1]}

    # Train model
    history = model.fit(X_train, y_train,
                       validation_data=(X_val, y_val),
                       epochs=100,
                       batch_size=64,
                       callbacks=[early_stop],
                       class_weight=class_weight,
                       verbose=1)

    # Evaluate model
    val_probabilities = model.predict(X_val).flatten()
    val_predictions = (val_probabilities > 0.5).astype(int)

    # Calculate metrics
    tn, fp, fn, tp = confusion_matrix(y_val, val_predictions).ravel()
    metrics = {
        'fold': fold_no,
        'accuracy': accuracy_score(y_val, val_predictions),
        'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0,
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
        'mcc': matthews_corrcoef(y_val, val_predictions),
        'kappa': cohen_kappa_score(y_val, val_predictions),
        'auc': roc_auc_score(y_val, val_probabilities)
    }

    results.append(metrics)

    print(f'\nFold {fold_no} Results:')
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"MCC: {metrics['mcc']:.4f}")

    fold_no += 1

# Calculate and display average metrics
print('\n' + '='*50)
print('Cross-Validation Summary')
print('='*50)

avg_metrics = {
    'accuracy': np.mean([r['accuracy'] for r in results]),
    'sensitivity': np.mean([r['sensitivity'] for r in results]),
    'specificity': np.mean([r['specificity'] for r in results]),
    'mcc': np.mean([r['mcc'] for r in results]),
    'kappa': np.mean([r['kappa'] for r in results]),
    'auc': np.mean([r['auc'] for r in results])
}

print(f"\nAverage Metrics Across All Folds:")
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Sensitivity: {avg_metrics['sensitivity']:.4f}")
print(f"Specificity: {avg_metrics['specificity']:.4f}")
print(f"MCC: {avg_metrics['mcc']:.4f}")
print(f"Cohen's Kappa: {avg_metrics['kappa']:.4f}")
print(f"AUC: {avg_metrics['auc']:.4f}")

# Train final model on all data
print("\nTraining final model on all data...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = np.clip(X_scaled, -5, 5)

final_model = Sequential()
final_model.add(Dense(256, activation='relu', input_shape=(X_scaled.shape[1],),
                     kernel_regularizer=l2(0.001)))
final_model.add(BatchNormalization())
final_model.add(Dropout(0.5))

final_model.add(Dense(128, activation='relu', kernel_regularizer=l2(0.001)))
final_model.add(BatchNormalization())
final_model.add(Dropout(0.4))

final_model.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
final_model.add(BatchNormalization())
final_model.add(Dropout(0.3))

final_model.add(Dense(1, activation='sigmoid'))

optimizer = Adam(learning_rate=0.0005)
final_model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Final class weights
class_counts = np.bincount(y)
final_class_weight = {0: 1/class_counts[0], 1: 1/class_counts[1]}

history = final_model.fit(X_scaled, y,
               epochs=100,
               batch_size=64,
               class_weight=final_class_weight,
               verbose=1)

# Save the final model
final_model.save('ctdd_final_model.h5')
print("Final model training complete and saved!")

Class distribution:
Target
1    732
0    732
Name: count, dtype: int64

Training fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 76ms/step - accuracy: 0.5512 - auc: 0.5443 - loss: 0.6409 - val_accuracy: 0.8157 - val_auc: 0.9065 - val_loss: 1.0238
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.7637 - auc: 0.8372 - loss: 0.3200 - val_accuracy: 0.8771 - val_auc: 0.9575 - val_loss: 0.8549
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.8376 - auc: 0.9231 - loss: 0.1900 - val_accuracy: 0.9078 - val_auc: 0.9672 - val_loss: 0.7846
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 72ms/step - accuracy: 0.8659 - auc: 0.9489 - loss: 0.1365 - val_accuracy: 0.8908 - val_auc: 0.9680 - val_loss: 0.7480
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - accuracy: 0.9024 - auc: 0.9681 - loss: 0.1064 - val_accuracy: 0.8976 - val_auc: 0.9671 - val_loss: 0.7214
Epoch 6/100
[1m19/19[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 74ms/step - accuracy: 0.5109 - auc: 0.5312 - loss: 0.6389 - val_accuracy: 0.7235 - val_auc: 0.8369 - val_loss: 1.0160
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 56ms/step - accuracy: 0.7048 - auc: 0.7722 - loss: 0.3191 - val_accuracy: 0.6212 - val_auc: 0.9562 - val_loss: 0.8482
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 71ms/step - accuracy: 0.8315 - auc: 0.9123 - loss: 0.1893 - val_accuracy: 0.5358 - val_auc: 0.9699 - val_loss: 0.7971
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.8817 - auc: 0.9555 - loss: 0.1356 - val_accuracy: 0.4983 - val_auc: 0.9705 - val_loss: 0.7810
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.9034 - auc: 0.9648 - loss: 0.1054 - val_accuracy: 0.4983 - val_auc: 0.9675 - val_loss: 0.7624
Epoch 6/100
[1m19/19[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 74ms/step - accuracy: 0.4767 - auc: 0.4909 - loss: 0.6402 - val_accuracy: 0.7031 - val_auc: 0.8951 - val_loss: 1.0265
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 49ms/step - accuracy: 0.7166 - auc: 0.7891 - loss: 0.3201 - val_accuracy: 0.5700 - val_auc: 0.9308 - val_loss: 0.8678
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - accuracy: 0.8224 - auc: 0.9062 - loss: 0.1901 - val_accuracy: 0.5017 - val_auc: 0.9491 - val_loss: 0.8110
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.8612 - auc: 0.9370 - loss: 0.1363 - val_accuracy: 0.5017 - val_auc: 0.9553 - val_loss: 0.7859
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.9114 - auc: 0.9682 - loss: 0.1060 - val_accuracy: 0.5017 - val_auc: 0.9591 - val_loss: 0.7686
Epoch 6/100
[1m19/19[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 82ms/step - accuracy: 0.5316 - auc: 0.5452 - loss: 0.6411 - val_accuracy: 0.6246 - val_auc: 0.9128 - val_loss: 1.0253
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.7405 - auc: 0.8202 - loss: 0.3205 - val_accuracy: 0.5870 - val_auc: 0.9609 - val_loss: 0.8570
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.8491 - auc: 0.9303 - loss: 0.1904 - val_accuracy: 0.6553 - val_auc: 0.9722 - val_loss: 0.7913
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 52ms/step - accuracy: 0.8747 - auc: 0.9471 - loss: 0.1366 - val_accuracy: 0.6621 - val_auc: 0.9746 - val_loss: 0.7551
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 53ms/step - accuracy: 0.9209 - auc: 0.9715 - loss: 0.1063 - val_accuracy: 0.7133 - val_auc: 0.9734 - val_loss: 0.7301
Epoch 6/100
[1m19/19[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 71ms/step - accuracy: 0.4931 - auc: 0.5018 - loss: 0.6424 - val_accuracy: 0.5925 - val_auc: 0.8956 - val_loss: 1.0305
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.7393 - auc: 0.8134 - loss: 0.3215 - val_accuracy: 0.5034 - val_auc: 0.9442 - val_loss: 0.8790
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 47ms/step - accuracy: 0.8225 - auc: 0.9076 - loss: 0.1912 - val_accuracy: 0.5000 - val_auc: 0.9590 - val_loss: 0.8239
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step - accuracy: 0.8805 - auc: 0.9526 - loss: 0.1373 - val_accuracy: 0.5000 - val_auc: 0.9593 - val_loss: 0.7940
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 73ms/step - accuracy: 0.9235 - auc: 0.9746 - loss: 0.1070 - val_accuracy: 0.5000 - val_auc: 0.9630 - val_loss: 0.7703
Epoch 6/100
[1m19/19[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 40ms/step - accuracy: 0.5451 - auc: 0.5593 - loss: 0.6206
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.7471 - auc: 0.8233 - loss: 0.2723
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.8132 - auc: 0.9057 - loss: 0.1584
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.8822 - auc: 0.9479 - loss: 0.1140
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 61ms/step - accuracy: 0.9002 - auc: 0.9558 - loss: 0.0864
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.9175 - auc: 0.9704 - loss: 0.0656
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 42ms/step - accuracy: 0.9364 - auc: 0.9790 - loss: 0.0498
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[



Final model training complete and saved!
