In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m18.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
import os
import numpy as np
import pandas as pd
from Bio import SeqIO

# Amino acid alphabet
AminoAcids = ['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P', 'Q', 'R', 'S', 'T', 'V', 'W', 'Y']

# Generate all possible tri-peptide combinations as feature names (e.g., 'AAA', 'AAC', 'AAG', ...)
tri_peptide_names = [f"{a1}{a2}{a3}" for a1 in AminoAcids for a2 in AminoAcids for a3 in AminoAcids]

# Mapping of amino acid characters to integers (for counting occurrences)
aa_dict = {aa: i for i, aa in enumerate(AminoAcids)}

# Function to calculate the CTDT feature (correlation of tri-peptides)
def calculate_ctdt(sequence):
    tri_peptide_count = np.zeros((20, 20, 20))  # 3D array for counting tri-peptide transitions
    for i in range(len(sequence) - 2):
        first_aa = sequence[i]
        second_aa = sequence[i + 1]
        third_aa = sequence[i + 2]

        if first_aa in aa_dict and second_aa in aa_dict and third_aa in aa_dict:
            tri_peptide_count[aa_dict[first_aa], aa_dict[second_aa], aa_dict[third_aa]] += 1

    total_tri_peptides = np.sum(tri_peptide_count)

    if total_tri_peptides > 0:
        tri_peptide_count = tri_peptide_count / total_tri_peptides  # Normalize by total count

    # Flatten the 3D array into a 1D feature vector
    return tri_peptide_count.flatten()

# Function to extract CTDT features from sequences in FASTA format
def extract_ctdt_features(fasta_path):
    features = []
    for record in SeqIO.parse(fasta_path, "fasta"):
        seq = str(record.seq)
        ctdt_features = calculate_ctdt(seq)
        features.append(ctdt_features)

    # Convert the list of feature vectors into a DataFrame with letter-based feature names
    df = pd.DataFrame(features, columns=tri_peptide_names)
    return df

# Define file paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/POSITIVE_main (2) (1).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/POSITIVE_main (2) (1).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/POSITIVE_validation (2) (1).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/NEGATIVE_validation (2) (1).fasta"

# Output directories for each file
output_dirs = {
    "main_positive": "/content/ctdt_main_positive_features.csv",
    "main_negative": "/content/ctdt_main_negative_features.csv",
    "validation_positive": "/content/ctdt_validation_positive_features.csv",
    "validation_negative": "/content/ctdt_validation_negative_features.csv"
}

# Function to ensure the directory exists
def ensure_directory_exists(file_path):
    directory = os.path.dirname(file_path)
    if not os.path.exists(directory):
        os.makedirs(directory)

# Main workflow function
def main():
    # File paths for input FASTA files
    datasets = [
        (main_p, output_dirs["main_positive"]),
        (main_n, output_dirs["main_negative"]),
        (validation_p, output_dirs["validation_positive"]),
        (validation_n, output_dirs["validation_negative"])
    ]

    # Extract features and save to CSV
    for fasta_path, output_csv in datasets:
        print(f"Processing {fasta_path}...")

        # Ensure the directory for saving the output file exists
        ensure_directory_exists(output_csv)

        # Extract features and save to CSV
        features_df = extract_ctdt_features(fasta_path)
        features_df.to_csv(output_csv, index=False)
        print(f"CTDT features saved to {output_csv}")

if __name__ == "__main__":
    main()


Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/POSITIVE_main (2) (1).fasta...
CTDT features saved to /content/ctdt_main_positive_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/POSITIVE_main (2) (1).fasta...
CTDT features saved to /content/ctdt_main_negative_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/POSITIVE_validation (2) (1).fasta...
CTDT features saved to /content/ctdt_validation_positive_features.csv
Processing /content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/NEGATIVE_validation (2) (1).fasta...
CTDT features saved to /content/ctdt_validation_negative_features.csv


# ***All Algorithm ***

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_validation_negative_features.csv")

In [None]:
# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values


In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "MLP": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the neural network models
models["Neural Network"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
models["MLP"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Store accuracies
results = []

# Train each model and evaluate
for name, model in models.items():
    print(f"\nTraining {name}...")

    if name in ["Neural Network", "MLP"]:
        # Neural Network training
        model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=0)
        train_pred = (model.predict(X_train) > 0.5).astype("int32")
        val_pred = (model.predict(X_val) > 0.5).astype("int32")
    else:
        # Traditional ML model training
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

    # Calculate train and validation accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)

    results.append({"Model": name, "Train Accuracy": train_accuracy, "Validation Accuracy": val_accuracy})


Training SVM...

Training Decision Tree...

Training Random Forest...

Training Logistic Regression...

Training k-NN...

Training Naive Bayes...

Training Gradient Boosting...

Training XGBoost...


Parameters: { "use_label_encoder" } are not used.




Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001516 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

Training CatBoost...

Training AdaBoost...





Training Neural Network...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 

Training MLP...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step 


In [None]:
# Convert to DataFrame and sort by Validation Accuracy and Train Accuracy in descending order
results_df = pd.DataFrame(results).sort_values(by=["Validation Accuracy", "Train Accuracy"], ascending=False).reset_index(drop=True)

# Display results
print("\nModel Accuracy Table (Descending Order of Validation Accuracy)")
print(results_df)


Model Accuracy Table (Descending Order of Validation Accuracy)
                  Model  Train Accuracy  Validation Accuracy
0         Random Forest        0.995704             0.923333
1                   MLP        0.995704             0.900000
2              CatBoost        0.970790             0.900000
3        Neural Network        0.995704             0.896667
4               XGBoost        0.963918             0.896667
5         Decision Tree        0.995704             0.893333
6              AdaBoost        0.939863             0.890000
7     Gradient Boosting        0.935567             0.880000
8           Naive Bayes        0.959622             0.866667
9                  k-NN        0.928694             0.856667
10             LightGBM        0.887457             0.840000
11  Logistic Regression        0.849656             0.823333
12                  SVM        0.809278             0.773333


# **CROSS VALIDATION**

In [None]:
!pip install catboost



In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
}


In [None]:
# Define Neural Network models
def create_neural_network(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_mlp(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model



In [None]:
# Cross-validation for traditional models
results = []

# For traditional ML models, we use cross_val_score
for name, model in models.items():
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Calculate cross-validation accuracy
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    std_accuracy = np.std(cv_scores)

    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})

# Cross-validation for Neural Networks (manual implementation)
for name, create_model in [("Neural Network", create_neural_network), ("MLP", create_mlp)]:
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Custom function to calculate accuracy for neural networks
    def neural_network_cross_val(model_func, X_train, y_train):
        accuracies = []
        for train_index, val_index in cv.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            model = model_func(X_train.shape[1])  # Create a new model for each fold
            model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, verbose=0)

            y_pred = (model.predict(X_val_fold) > 0.5).astype("int32")
            accuracy = accuracy_score(y_val_fold, y_pred)
            accuracies.append(accuracy)

        return np.mean(accuracies), np.std(accuracies)

    mean_accuracy, std_accuracy = neural_network_cross_val(create_model, X_train, y_train)
    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})



Performing Cross-validation for SVM...

Performing Cross-validation for Decision Tree...

Performing Cross-validation for Random Forest...

Performing Cross-validation for Logistic Regression...

Performing Cross-validation for k-NN...

Performing Cross-validation for Naive Bayes...

Performing Cross-validation for Gradient Boosting...

Performing Cross-validation for XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Performing Cross-validation for LightGBM...
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001052 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 631
[LightGBM] [Info] Number of data points in the train set: 931, number of used features: 81
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499463 -> initscore=-0.002148
[LightGBM] [Info] Start training from score -0.002148
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000971 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 610
[LightGBM] [Info] Number of data points in the train set: 931, number of used 




Performing Cross-validation for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step

Performing Cross-validation for MLP...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step


In [None]:
# Convert to DataFrame and display
cv_results_df = pd.DataFrame(results).sort_values(by="Mean CV Accuracy", ascending=False).reset_index(drop=True)
print("\nCross-Validation Accuracy Table")
print(cv_results_df)


Cross-Validation Accuracy Table
                  Model  Mean CV Accuracy  STD CV Accuracy
0                   MLP          0.929543         0.006546
1         Random Forest          0.928685         0.009711
2        Neural Network          0.924401         0.007462
3              CatBoost          0.898613         0.018209
4           Naive Bayes          0.892648         0.025944
5               XGBoost          0.888320         0.013813
6         Decision Tree          0.873694         0.016350
7     Gradient Boosting          0.871137         0.015565
8              AdaBoost          0.865980         0.013125
9                  k-NN          0.856504         0.022010
10  Logistic Regression          0.804976         0.016045
11             LightGBM          0.784372         0.013287
12                  SVM          0.766320         0.012068


# **Hyperparameter optimization with Optuna**

In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Mak

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_validation_negative_features.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]



# Define models with MLP included
models = {
    "SVM": lambda trial: SVC(
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}


results = []

def optimize_model(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        return accuracy_score(y_val, preds)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the results
    results.append({
        "Model": model_name,
        "Accuracy": study.best_value,
        "Best Params": study.best_params
    })

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model(model_name, model_func)


# Convert results to a DataFrame
results_df = pd.DataFrame(results)


# Display the DataFrame
print(results_df)


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2024-12-28 14:39:16,430] A new study created in memory with name: no-name-a137c802-774f-4c6a-924d-b3ea24ce5d98


Optimizing SVM...


[I 2024-12-28 14:39:28,814] Trial 0 finished with value: 0.86 and parameters: {'C': 6.917740866662788, 'kernel': 'poly'}. Best is trial 0 with value: 0.86.
[I 2024-12-28 14:39:36,716] Trial 1 finished with value: 0.8766666666666667 and parameters: {'C': 4.34503912163165, 'kernel': 'linear'}. Best is trial 1 with value: 0.8766666666666667.
[I 2024-12-28 14:39:41,067] Trial 2 finished with value: 0.8866666666666667 and parameters: {'C': 9.155725309958758, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.8866666666666667.
[I 2024-12-28 14:39:50,832] Trial 3 finished with value: 0.9033333333333333 and parameters: {'C': 4.814321539411033, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9033333333333333.
[I 2024-12-28 14:40:00,170] Trial 4 finished with value: 0.9033333333333333 and parameters: {'C': 8.497728824928087, 'kernel': 'rbf'}. Best is trial 3 with value: 0.9033333333333333.
[I 2024-12-28 14:40:06,946] Trial 5 finished with value: 0.8833333333333333 and parameters: {'C': 9.233764

Optimizing Decision Tree...


[I 2024-12-28 14:43:38,004] Trial 1 finished with value: 0.8433333333333334 and parameters: {'max_depth': 16, 'min_samples_split': 9}. Best is trial 1 with value: 0.8433333333333334.
[I 2024-12-28 14:43:38,287] Trial 2 finished with value: 0.8133333333333334 and parameters: {'max_depth': 13, 'min_samples_split': 4}. Best is trial 1 with value: 0.8433333333333334.
[I 2024-12-28 14:43:38,596] Trial 3 finished with value: 0.84 and parameters: {'max_depth': 15, 'min_samples_split': 10}. Best is trial 1 with value: 0.8433333333333334.
[I 2024-12-28 14:43:38,799] Trial 4 finished with value: 0.76 and parameters: {'max_depth': 6, 'min_samples_split': 8}. Best is trial 1 with value: 0.8433333333333334.
[I 2024-12-28 14:43:38,994] Trial 5 finished with value: 0.7233333333333334 and parameters: {'max_depth': 4, 'min_samples_split': 7}. Best is trial 1 with value: 0.8433333333333334.
[I 2024-12-28 14:43:39,409] Trial 6 finished with value: 0.85 and parameters: {'max_depth': 16, 'min_samples_split

Optimizing Random Forest...


[I 2024-12-28 14:43:50,984] Trial 0 finished with value: 0.86 and parameters: {'n_estimators': 361, 'max_depth': 17, 'min_samples_split': 4}. Best is trial 0 with value: 0.86.
[I 2024-12-28 14:43:51,229] Trial 1 finished with value: 0.8 and parameters: {'n_estimators': 55, 'max_depth': 3, 'min_samples_split': 9}. Best is trial 0 with value: 0.86.
[I 2024-12-28 14:43:52,162] Trial 2 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 164, 'max_depth': 12, 'min_samples_split': 8}. Best is trial 0 with value: 0.86.
[I 2024-12-28 14:43:52,708] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 71, 'max_depth': 16, 'min_samples_split': 6}. Best is trial 3 with value: 0.8633333333333333.
[I 2024-12-28 14:43:54,994] Trial 4 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 241, 'max_depth': 20, 'min_samples_split': 4}. Best is trial 4 with value: 0.8766666666666667.
[I 2024-12-28 14:43:58,755] Trial 5 finished with valu

Optimizing Logistic Regression...


[I 2024-12-28 14:44:49,783] Trial 1 finished with value: 0.8466666666666667 and parameters: {'C': 1.747745938175596, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 14:44:50,198] Trial 2 finished with value: 0.8866666666666667 and parameters: {'C': 9.84640609434229, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8866666666666667.
[I 2024-12-28 14:44:50,620] Trial 3 finished with value: 0.8866666666666667 and parameters: {'C': 7.983438601789271, 'solver': 'lbfgs'}. Best is trial 2 with value: 0.8866666666666667.
[I 2024-12-28 14:44:50,863] Trial 4 finished with value: 0.8866666666666667 and parameters: {'C': 8.5067281061123, 'solver': 'liblinear'}. Best is trial 2 with value: 0.8866666666666667.
[I 2024-12-28 14:44:51,100] Trial 5 finished with value: 0.8766666666666667 and parameters: {'C': 4.395073038897617, 'solver': 'liblinear'}. Best is trial 2 with value: 0.8866666666666667.
[I 2024-12-28 14:44:51,325] Trial 6 finished with value: 0.88666666666

Optimizing k-NN...


[I 2024-12-28 14:45:02,560] Trial 0 finished with value: 0.84 and parameters: {'n_neighbors': 18}. Best is trial 0 with value: 0.84.
[I 2024-12-28 14:45:02,940] Trial 1 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 17}. Best is trial 1 with value: 0.8533333333333334.
[I 2024-12-28 14:45:03,315] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_neighbors': 5}. Best is trial 2 with value: 0.8566666666666667.
[I 2024-12-28 14:45:03,703] Trial 3 finished with value: 0.88 and parameters: {'n_neighbors': 9}. Best is trial 3 with value: 0.88.
[I 2024-12-28 14:45:04,078] Trial 4 finished with value: 0.86 and parameters: {'n_neighbors': 4}. Best is trial 3 with value: 0.88.
[I 2024-12-28 14:45:04,456] Trial 5 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 3 with value: 0.88.
[I 2024-12-28 14:45:04,835] Trial 6 finished with value: 0.8833333333333333 and parameters: {'n_neighbors': 3}. Best is trial 6 with va

Optimizing Naive Bayes...


[I 2024-12-28 14:45:15,867] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 14:45:16,154] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 14:45:16,424] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 14:45:16,698] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 14:45:16,973] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 14:45:17,261] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 14:45:17,531] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2024-12-28 14:46:55,278] Trial 0 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 498, 'learning_rate': 0.4340012232366628, 'max_depth': 9}. Best is trial 0 with value: 0.8733333333333333.
[I 2024-12-28 14:47:54,279] Trial 1 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 199, 'learning_rate': 0.38870321813624475, 'max_depth': 16}. Best is trial 1 with value: 0.8933333333333333.
[I 2024-12-28 14:48:33,405] Trial 2 finished with value: 0.9 and parameters: {'n_estimators': 122, 'learning_rate': 0.15969208207211305, 'max_depth': 18}. Best is trial 2 with value: 0.9.
[I 2024-12-28 14:48:49,871] Trial 3 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 65, 'learning_rate': 0.46569986588310347, 'max_depth': 13}. Best is trial 2 with value: 0.9.
[I 2024-12-28 14:49:09,244] Trial 4 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 57, 'learning_rate': 0.4669915664239524, 'max_depth': 19}. Best is 

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 15:08:49,441] Trial 0 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 66, 'max_depth': 9, 'learning_rate': 0.10204889892531611}. Best is trial 0 with value: 0.8766666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 15:09:14,253] Trial 1 finished with value: 0.9 and parameters: {'n_estimators': 424, 'max_depth': 6, 'learning_rate': 0.17717587591430156}. Best is trial 1 with value: 0.9.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 15:09:39,259] Trial 2 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 396, 'max_depth': 9, 'learning_rate': 0.17093207856582862}. Best is trial 2 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 15:10:06,614] Trial 3 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 337, 'max_depth': 18, 'learning_rate': 0.1154848239408756}. Best is

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001563 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:10,746] Trial 0 finished with value: 0.84 and parameters: {'n_estimators': 385, 'max_depth': 9, 'learning_rate': 0.15062886668488007}. Best is trial 0 with value: 0.84.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001838 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:11,372] Trial 1 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 355, 'max_depth': 17, 'learning_rate': 0.09238490232799047}. Best is trial 1 with value: 0.8433333333333334.
[I 2024-12-28 15:21:11,779] Trial 2 finished with value: 0.85 and parameters: {'n_estimators': 100, 'max_depth': 18, 'learning_rate': 0.4484872143827501}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001718 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001741 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:12,477] Trial 3 finished with value: 0.84 and parameters: {'n_estimators': 345, 'max_depth': 15, 'learning_rate': 0.06961355441193103}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001710 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:13,065] Trial 4 finished with value: 0.85 and parameters: {'n_estimators': 303, 'max_depth': 13, 'learning_rate': 0.3700542816929311}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001679 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:13,782] Trial 5 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 283, 'max_depth': 18, 'learning_rate': 0.4154306633010395}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:14,521] Trial 6 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 437, 'max_depth': 19, 'learning_rate': 0.23851563902175324}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001782 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:15,117] Trial 7 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 372, 'max_depth': 4, 'learning_rate': 0.3857477714073919}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001762 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:15,637] Trial 8 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 274, 'max_depth': 8, 'learning_rate': 0.018325280260657825}. Best is trial 2 with value: 0.85.




[I 2024-12-28 15:21:15,953] Trial 9 finished with value: 0.84 and parameters: {'n_estimators': 138, 'max_depth': 8, 'learning_rate': 0.3243710884676435}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:16,239] Trial 10 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 75, 'max_depth': 13, 'learning_rate': 0.4946522392528603}. Best is trial 2 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:16,589] Trial 11 finished with value: 0.86 and parameters: {'n_estimators': 168, 'max_depth': 13, 'learning_rate': 0.4921221802150917}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001434 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:16,957] Trial 12 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 186, 'max_depth': 20, 'learning_rate': 0.4957428634876815}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:17,326] Trial 13 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 187, 'max_depth': 20, 'learning_rate': 0.48363440424814164}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:17,809] Trial 14 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 197, 'max_depth': 3, 'learning_rate': 0.28164494562111353}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:18,199] Trial 15 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 204, 'max_depth': 11, 'learning_rate': 0.21977139701214793}. Best is trial 11 with value: 0.86.
[I 2024-12-28 15:21:18,546] Trial 16 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 149, 'max_depth': 15, 'learning_rate': 0.3356066432041847}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001469 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001490 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:18,944] Trial 17 finished with value: 0.85 and parameters: {'n_estimators': 228, 'max_depth': 16, 'learning_rate': 0.423604232555119}. Best is trial 11 with value: 0.86.
[I 2024-12-28 15:21:19,216] Trial 18 finished with value: 0.85 and parameters: {'n_estimators': 52, 'max_depth': 6, 'learning_rate': 0.46714057050666646}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001502 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:19,531] Trial 19 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 123, 'max_depth': 11, 'learning_rate': 0.3168090704685437}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001464 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001444 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:19,958] Trial 20 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 240, 'max_depth': 20, 'learning_rate': 0.17459931625534741}. Best is trial 11 with value: 0.86.
[I 2024-12-28 15:21:20,268] Trial 21 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 105, 'max_depth': 18, 'learning_rate': 0.4430072524974912}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001445 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:20,747] Trial 22 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 166, 'max_depth': 13, 'learning_rate': 0.49552060137259}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001758 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:21,060] Trial 23 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 92, 'max_depth': 17, 'learning_rate': 0.44162552720608894}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001707 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001481 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:21,646] Trial 24 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 491, 'max_depth': 15, 'learning_rate': 0.3797320377148194}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001426 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:22,220] Trial 25 finished with value: 0.86 and parameters: {'n_estimators': 476, 'max_depth': 15, 'learning_rate': 0.38699220614902696}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:22,830] Trial 26 finished with value: 0.85 and parameters: {'n_estimators': 482, 'max_depth': 12, 'learning_rate': 0.3926051293467486}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:23,260] Trial 27 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 247, 'max_depth': 14, 'learning_rate': 0.34343485031085813}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001085 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:23,815] Trial 28 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 433, 'max_depth': 10, 'learning_rate': 0.28594082514524227}. Best is trial 11 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 15:21:24,282] Trial 29 finished with value: 0.85 and parameters: {'n_estimators': 322, 'max_depth': 9, 'learning_rate': 0.41500283858064396}. Best is trial 11 with value: 0.86.
[I 2024-12-28 15:21:24,285] A new study created in memory with name: no-name-86239427-9537-41d0-98b4-64f6b181b7d7


Optimizing AdaBoost...


[I 2024-12-28 15:21:57,522] Trial 0 finished with value: 0.6633333333333333 and parameters: {'n_estimators': 446, 'learning_rate': 0.03799874594135725}. Best is trial 0 with value: 0.6633333333333333.
[I 2024-12-28 15:22:31,089] Trial 1 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 450, 'learning_rate': 0.8788216075099975}. Best is trial 1 with value: 0.8633333333333333.
[I 2024-12-28 15:22:35,378] Trial 2 finished with value: 0.7433333333333333 and parameters: {'n_estimators': 64, 'learning_rate': 0.841715726898074}. Best is trial 1 with value: 0.8633333333333333.
[I 2024-12-28 15:23:02,730] Trial 3 finished with value: 0.7933333333333333 and parameters: {'n_estimators': 372, 'learning_rate': 0.5795150660073377}. Best is trial 1 with value: 0.8633333333333333.
[I 2024-12-28 15:23:25,331] Trial 4 finished with value: 0.7933333333333333 and parameters: {'n_estimators': 320, 'learning_rate': 0.6398992638181801}. Best is trial 1 with value: 0.8633333333333333.
[

Optimizing Neural Network...


[I 2024-12-28 15:34:59,878] Trial 0 finished with value: 0.9 and parameters: {'hidden_layer_1': 51, 'hidden_layer_2': 14, 'learning_rate_init': 0.0163122427844449}. Best is trial 0 with value: 0.9.
[I 2024-12-28 15:35:15,504] Trial 1 finished with value: 0.9 and parameters: {'hidden_layer_1': 72, 'hidden_layer_2': 52, 'learning_rate_init': 0.08327924078355399}. Best is trial 0 with value: 0.9.
[I 2024-12-28 15:35:33,594] Trial 2 finished with value: 0.91 and parameters: {'hidden_layer_1': 29, 'hidden_layer_2': 32, 'learning_rate_init': 0.09562872230988455}. Best is trial 2 with value: 0.91.
[I 2024-12-28 15:35:52,704] Trial 3 finished with value: 0.9033333333333333 and parameters: {'hidden_layer_1': 45, 'hidden_layer_2': 90, 'learning_rate_init': 0.015171129065969301}. Best is trial 2 with value: 0.91.
[I 2024-12-28 15:35:59,375] Trial 4 finished with value: 0.9033333333333333 and parameters: {'hidden_layer_1': 13, 'hidden_layer_2': 31, 'learning_rate_init': 0.035926299216057375}. Best

Optimizing MLP...


[I 2024-12-28 15:44:13,507] Trial 0 finished with value: 0.9066666666666666 and parameters: {'layer_1': 85, 'layer_2': 87, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.09637475607675786}. Best is trial 0 with value: 0.9066666666666666.
[I 2024-12-28 15:44:55,570] Trial 1 finished with value: 0.9166666666666666 and parameters: {'layer_1': 115, 'layer_2': 61, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.014057073428258952}. Best is trial 1 with value: 0.9166666666666666.
[I 2024-12-28 15:45:23,946] Trial 2 finished with value: 0.9166666666666666 and parameters: {'layer_1': 98, 'layer_2': 100, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.02681734235581086}. Best is trial 1 with value: 0.9166666666666666.
[I 2024-12-28 15:46:02,353] Trial 3 finished with value: 0.9233333333333333 and parameters: {'layer_1': 121, 'layer_2': 101, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.0459982309745124}. Best is tria

                  Model  Accuracy  \
0                   SVM  0.903333   
1         Decision Tree  0.866667   
2         Random Forest  0.896667   
3   Logistic Regression  0.893333   
4                  k-NN  0.883333   
5           Naive Bayes  0.866667   
6     Gradient Boosting  0.903333   
7               XGBoost  0.910000   
8              LightGBM  0.860000   
9              AdaBoost  0.886667   
10       Neural Network  0.930000   
11                  MLP  0.923333   

                                          Best Params  
0           {'C': 4.814321539411033, 'kernel': 'rbf'}  
1           {'max_depth': 19, 'min_samples_split': 4}  
2   {'n_estimators': 264, 'max_depth': 20, 'min_sa...  
3         {'C': 9.989696614214784, 'solver': 'lbfgs'}  
4                                  {'n_neighbors': 3}  
5                                                  {}  
6   {'n_estimators': 120, 'learning_rate': 0.25107...  
7   {'n_estimators': 439, 'max_depth': 14, 'learni...  
8   {'n_estima

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score, roc_auc_score # Import confusion_matrix and other metrics

# Function to calculate metrics with model name
def calculate_metrics(y_true, y_pred, model_name=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "MCC": mcc,
        "Kappa": kappa,
        "AUC": auc,
    }

# Results storage
results = []

# Optimization function
def optimize_model_with_metrics(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        trial.set_user_attr("preds", preds)
        metrics = calculate_metrics(y_val, preds, model_name=model_name)
        return metrics["Accuracy"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the best trial metrics
    best_trial_preds = study.best_trial.user_attrs["preds"]
    best_metrics = calculate_metrics(y_val, best_trial_preds, model_name=model_name)
    best_metrics["Best Params"] = study.best_params
    results.append(best_metrics)

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model_with_metrics(model_name, model_func)

# Store the final results with selected metrics and best parameters
final_results = []
for result in results:
    final_results.append(
        {
            "Model": result["Model"],
            "Accuracy": result["Accuracy"],
            "Sensitivity": result["Sensitivity"],
            "Specificity": result["Specificity"],
            "MCC": result["MCC"],
            "Kappa": result["Kappa"],
            "AUC": result["AUC"],
            "Best Params": result["Best Params"],
        }
    )

# Convert results to a DataFrame and display it
final_results_df = pd.DataFrame(final_results)
print(final_results_df)


[I 2024-12-28 16:13:58,661] A new study created in memory with name: no-name-902f26ac-b73a-4759-90d3-6b700dada2bb


Optimizing SVM...


[I 2024-12-28 16:14:06,506] Trial 0 finished with value: 0.88 and parameters: {'C': 0.8414117865498301, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.88.
[I 2024-12-28 16:14:16,147] Trial 1 finished with value: 0.86 and parameters: {'C': 8.14252033296222, 'kernel': 'poly'}. Best is trial 0 with value: 0.88.
[I 2024-12-28 16:14:25,587] Trial 2 finished with value: 0.9033333333333333 and parameters: {'C': 7.9917166362691985, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-28 16:14:34,498] Trial 3 finished with value: 0.9033333333333333 and parameters: {'C': 0.7781794408401751, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-28 16:14:39,384] Trial 4 finished with value: 0.8566666666666667 and parameters: {'C': 8.725939366943512, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-28 16:14:46,460] Trial 5 finished with value: 0.9 and parameters: {'C': 6.108677359820001, 'kernel': 'linear'}. Best is

Optimizing Decision Tree...


[I 2024-12-28 16:18:22,709] Trial 0 finished with value: 0.82 and parameters: {'max_depth': 12, 'min_samples_split': 5}. Best is trial 0 with value: 0.82.
[I 2024-12-28 16:18:23,037] Trial 1 finished with value: 0.7466666666666667 and parameters: {'max_depth': 5, 'min_samples_split': 7}. Best is trial 0 with value: 0.82.
[I 2024-12-28 16:18:23,630] Trial 2 finished with value: 0.8633333333333333 and parameters: {'max_depth': 19, 'min_samples_split': 4}. Best is trial 2 with value: 0.8633333333333333.
[I 2024-12-28 16:18:24,108] Trial 3 finished with value: 0.8266666666666667 and parameters: {'max_depth': 13, 'min_samples_split': 2}. Best is trial 2 with value: 0.8633333333333333.
[I 2024-12-28 16:18:24,685] Trial 4 finished with value: 0.8566666666666667 and parameters: {'max_depth': 18, 'min_samples_split': 2}. Best is trial 2 with value: 0.8633333333333333.
[I 2024-12-28 16:18:25,114] Trial 5 finished with value: 0.81 and parameters: {'max_depth': 10, 'min_samples_split': 4}. Best is

Optimizing Random Forest...


[I 2024-12-28 16:18:35,210] Trial 0 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 346, 'max_depth': 7, 'min_samples_split': 7}. Best is trial 0 with value: 0.8233333333333334.
[I 2024-12-28 16:18:35,875] Trial 1 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 97, 'max_depth': 14, 'min_samples_split': 5}. Best is trial 1 with value: 0.8633333333333333.
[I 2024-12-28 16:18:37,154] Trial 2 finished with value: 0.8 and parameters: {'n_estimators': 438, 'max_depth': 3, 'min_samples_split': 4}. Best is trial 1 with value: 0.8633333333333333.
[I 2024-12-28 16:18:39,876] Trial 3 finished with value: 0.88 and parameters: {'n_estimators': 246, 'max_depth': 19, 'min_samples_split': 8}. Best is trial 3 with value: 0.88.
[I 2024-12-28 16:18:41,984] Trial 4 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 414, 'max_depth': 5, 'min_samples_split': 8}. Best is trial 3 with value: 0.88.
[I 2024-12-28 16:18:42,918] Trial 5 finis

Optimizing Logistic Regression...


[I 2024-12-28 16:19:19,880] Trial 1 finished with value: 0.8866666666666667 and parameters: {'C': 9.997418005044306, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-28 16:19:20,319] Trial 2 finished with value: 0.8833333333333333 and parameters: {'C': 6.92370289552392, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-28 16:19:20,922] Trial 3 finished with value: 0.8866666666666667 and parameters: {'C': 9.29342253278588, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-28 16:19:21,285] Trial 4 finished with value: 0.79 and parameters: {'C': 0.6086364512995223, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-28 16:19:21,645] Trial 5 finished with value: 0.8866666666666667 and parameters: {'C': 9.117486283575076, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8866666666666667.
[I 2024-12-28 16:19:21,987] Trial 6 finished with value: 0.8833333333333333 an

Optimizing k-NN...


[I 2024-12-28 16:19:31,056] Trial 0 finished with value: 0.8733333333333333 and parameters: {'n_neighbors': 8}. Best is trial 0 with value: 0.8733333333333333.
[I 2024-12-28 16:19:31,433] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 0 with value: 0.8733333333333333.
[I 2024-12-28 16:19:31,811] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 14}. Best is trial 0 with value: 0.8733333333333333.
[I 2024-12-28 16:19:32,203] Trial 3 finished with value: 0.88 and parameters: {'n_neighbors': 9}. Best is trial 3 with value: 0.88.
[I 2024-12-28 16:19:32,581] Trial 4 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 11}. Best is trial 3 with value: 0.88.
[I 2024-12-28 16:19:32,979] Trial 5 finished with value: 0.8833333333333333 and parameters: {'n_neighbors': 3}. Best is trial 5 with value: 0.8833333333333333.
[I 2024-12-28 16:19:33,356] Trial 6 finished with value: 0.8666666666666667 and

Optimizing Naive Bayes...


[I 2024-12-28 16:19:44,484] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 16:19:44,749] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 16:19:45,017] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 16:19:45,284] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 16:19:45,561] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 16:19:45,822] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 16:19:46,086] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2024-12-28 16:20:45,924] Trial 0 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 231, 'learning_rate': 0.44698735012432356, 'max_depth': 12}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 16:21:06,427] Trial 1 finished with value: 0.88 and parameters: {'n_estimators': 141, 'learning_rate': 0.11987918348152053, 'max_depth': 7}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 16:21:48,824] Trial 2 finished with value: 0.88 and parameters: {'n_estimators': 399, 'learning_rate': 0.32648115109114706, 'max_depth': 5}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 16:23:39,660] Trial 3 finished with value: 0.89 and parameters: {'n_estimators': 323, 'learning_rate': 0.4601545695894388, 'max_depth': 20}. Best is trial 3 with value: 0.89.
[I 2024-12-28 16:25:01,706] Trial 4 finished with value: 0.87 and parameters: {'n_estimators': 382, 'learning_rate': 0.3093717791510879, 'max_depth': 11}. Best is trial 3 with value: 0.89

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 16:48:52,337] Trial 0 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 448, 'max_depth': 20, 'learning_rate': 0.08290360349835339}. Best is trial 0 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 16:49:06,772] Trial 1 finished with value: 0.91 and parameters: {'n_estimators': 136, 'max_depth': 20, 'learning_rate': 0.15210253196917053}. Best is trial 1 with value: 0.91.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 16:49:28,653] Trial 2 finished with value: 0.9 and parameters: {'n_estimators': 354, 'max_depth': 7, 'learning_rate': 0.09192882748347697}. Best is trial 1 with value: 0.91.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 16:49:53,265] Trial 3 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 304, 'max_depth': 17, 'learning_rate': 0.045872612326986084}. Best is trial 1 with value: 0

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001923 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:26,776] Trial 0 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 236, 'max_depth': 16, 'learning_rate': 0.06245296195995516}. Best is trial 0 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001742 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:27,251] Trial 1 finished with value: 0.84 and parameters: {'n_estimators': 148, 'max_depth': 5, 'learning_rate': 0.4393916902581952}. Best is trial 0 with value: 0.8433333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001779 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:27,954] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 434, 'max_depth': 8, 'learning_rate': 0.31194113622711517}. Best is trial 2 with value: 0.8466666666666667.
[I 2024-12-28 16:58:28,311] Trial 3 finished with value: 0.84 and parameters: {'n_estimators': 183, 'max_depth': 10, 'learning_rate': 0.15332988691005942}. Best is trial 2 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001489 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:28,718] Trial 4 finished with value: 0.84 and parameters: {'n_estimators': 68, 'max_depth': 14, 'learning_rate': 0.13765237822426704}. Best is trial 2 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:29,149] Trial 5 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 291, 'max_depth': 11, 'learning_rate': 0.39087343943841624}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:29,682] Trial 6 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 465, 'max_depth': 5, 'learning_rate': 0.468109269356499}. Best is trial 5 with value: 0.8533333333333334.




[I 2024-12-28 16:58:29,978] Trial 7 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 67, 'max_depth': 17, 'learning_rate': 0.10693215216410769}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001514 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:30,373] Trial 8 finished with value: 0.84 and parameters: {'n_estimators': 236, 'max_depth': 19, 'learning_rate': 0.43862353978182994}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001452 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:30,753] Trial 9 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 235, 'max_depth': 5, 'learning_rate': 0.3951457217624125}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001425 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:31,267] Trial 10 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 378, 'max_depth': 13, 'learning_rate': 0.27704943820038114}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001420 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:31,754] Trial 11 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 342, 'max_depth': 9, 'learning_rate': 0.3268475376429738}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:32,349] Trial 12 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 474, 'max_depth': 8, 'learning_rate': 0.3386690341378644}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001402 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:32,986] Trial 13 finished with value: 0.84 and parameters: {'n_estimators': 366, 'max_depth': 11, 'learning_rate': 0.2206021819682529}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001448 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:33,513] Trial 14 finished with value: 0.85 and parameters: {'n_estimators': 414, 'max_depth': 7, 'learning_rate': 0.23417838073145356}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001083 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:33,972] Trial 15 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 305, 'max_depth': 3, 'learning_rate': 0.22327246442694565}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001432 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:34,498] Trial 16 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 407, 'max_depth': 13, 'learning_rate': 0.37720797055621136}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001423 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:34,965] Trial 17 finished with value: 0.85 and parameters: {'n_estimators': 308, 'max_depth': 7, 'learning_rate': 0.19935199430839787}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001380 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:35,611] Trial 18 finished with value: 0.83 and parameters: {'n_estimators': 494, 'max_depth': 11, 'learning_rate': 0.011072114397704835}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002002 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:36,118] Trial 19 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 417, 'max_depth': 3, 'learning_rate': 0.4931249570650241}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:36,589] Trial 20 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 306, 'max_depth': 7, 'learning_rate': 0.271161704191241}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001463 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:37,065] Trial 21 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 298, 'max_depth': 7, 'learning_rate': 0.21110439650916216}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001446 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:37,673] Trial 22 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 342, 'max_depth': 6, 'learning_rate': 0.17907047042733037}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:38,170] Trial 23 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 262, 'max_depth': 9, 'learning_rate': 0.25129474702915033}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001666 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:38,712] Trial 24 finished with value: 0.84 and parameters: {'n_estimators': 194, 'max_depth': 15, 'learning_rate': 0.18272242723191542}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001795 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:39,360] Trial 25 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 334, 'max_depth': 12, 'learning_rate': 0.37145224019703044}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001654 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:40,070] Trial 26 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 385, 'max_depth': 10, 'learning_rate': 0.2959416102999321}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001655 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:40,746] Trial 27 finished with value: 0.85 and parameters: {'n_estimators': 384, 'max_depth': 10, 'learning_rate': 0.4043431284516739}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001717 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:41,557] Trial 28 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 441, 'max_depth': 12, 'learning_rate': 0.29995496477210176}. Best is trial 5 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001664 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 16:58:42,324] Trial 29 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 394, 'max_depth': 16, 'learning_rate': 0.35118420864980926}. Best is trial 29 with value: 0.8566666666666667.
[I 2024-12-28 16:58:42,342] A new study created in memory with name: no-name-2de53ba1-fa43-4bf8-99cc-47634de25624


Optimizing AdaBoost...


[I 2024-12-28 16:59:11,715] Trial 0 finished with value: 0.6933333333333334 and parameters: {'n_estimators': 408, 'learning_rate': 0.07888649038794852}. Best is trial 0 with value: 0.6933333333333334.
[I 2024-12-28 16:59:45,513] Trial 1 finished with value: 0.83 and parameters: {'n_estimators': 472, 'learning_rate': 0.6402560647608481}. Best is trial 1 with value: 0.83.
[I 2024-12-28 16:59:49,500] Trial 2 finished with value: 0.72 and parameters: {'n_estimators': 60, 'learning_rate': 0.5942927432244551}. Best is trial 1 with value: 0.83.
[I 2024-12-28 16:59:54,821] Trial 3 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 72, 'learning_rate': 0.5966409684270139}. Best is trial 1 with value: 0.83.
[I 2024-12-28 17:00:14,229] Trial 4 finished with value: 0.69 and parameters: {'n_estimators': 261, 'learning_rate': 0.1020701956746366}. Best is trial 1 with value: 0.83.
[I 2024-12-28 17:00:20,466] Trial 5 finished with value: 0.7366666666666667 and parameters: {'n_est

Optimizing Neural Network...


[I 2024-12-28 17:12:11,017] Trial 0 finished with value: 0.9266666666666666 and parameters: {'hidden_layer_1': 72, 'hidden_layer_2': 92, 'learning_rate_init': 0.0632015217339856}. Best is trial 0 with value: 0.9266666666666666.
[I 2024-12-28 17:12:45,880] Trial 1 finished with value: 0.9133333333333333 and parameters: {'hidden_layer_1': 97, 'hidden_layer_2': 41, 'learning_rate_init': 0.0394698009799764}. Best is trial 0 with value: 0.9266666666666666.
[I 2024-12-28 17:13:14,217] Trial 2 finished with value: 0.9233333333333333 and parameters: {'hidden_layer_1': 52, 'hidden_layer_2': 34, 'learning_rate_init': 0.07878833327218554}. Best is trial 0 with value: 0.9266666666666666.
[I 2024-12-28 17:13:30,329] Trial 3 finished with value: 0.9066666666666666 and parameters: {'hidden_layer_1': 50, 'hidden_layer_2': 16, 'learning_rate_init': 0.029410904596480998}. Best is trial 0 with value: 0.9266666666666666.
[I 2024-12-28 17:13:57,544] Trial 4 finished with value: 0.8966666666666666 and param

Optimizing MLP...


[I 2024-12-28 17:23:28,131] Trial 0 finished with value: 0.9133333333333333 and parameters: {'layer_1': 69, 'layer_2': 144, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.09291185986733802}. Best is trial 0 with value: 0.9133333333333333.
[I 2024-12-28 17:24:05,216] Trial 1 finished with value: 0.9033333333333333 and parameters: {'layer_1': 100, 'layer_2': 143, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.034879873672968714}. Best is trial 0 with value: 0.9133333333333333.
[I 2024-12-28 17:24:36,104] Trial 2 finished with value: 0.9033333333333333 and parameters: {'layer_1': 115, 'layer_2': 135, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.06208070507133277}. Best is trial 0 with value: 0.9133333333333333.
[I 2024-12-28 17:25:07,395] Trial 3 finished with value: 0.9 and parameters: {'layer_1': 103, 'layer_2': 101, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.09480438721201426}. Best is trial 0 with value: 0.913

                  Model  Accuracy  Sensitivity  Specificity       MCC  \
0                   SVM  0.906667     0.873333     0.940000  0.815147   
1         Decision Tree  0.863333     0.766667     0.960000  0.740640   
2         Random Forest  0.883333     0.813333     0.953333  0.774292   
3   Logistic Regression  0.886667     0.846667     0.926667  0.775820   
4                  k-NN  0.883333     0.866667     0.900000  0.767093   
5           Naive Bayes  0.866667     0.880000     0.853333  0.733594   
6     Gradient Boosting  0.900000     0.866667     0.933333  0.801784   
7               XGBoost  0.913333     0.893333     0.933333  0.827329   
8              LightGBM  0.856667     0.780000     0.933333  0.721870   
9              AdaBoost  0.876667     0.806667     0.946667  0.760826   
10       Neural Network  0.930000     0.960000     0.900000  0.861552   
11                  MLP  0.920000     0.933333     0.906667  0.840299   

       Kappa       AUC                            

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Define models
models = {
    "SVM": lambda trial: SVC(probability=True, C=trial.suggest_float("C", 0.1, 10.0), kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])),
    "Decision Tree": lambda trial: DecisionTreeClassifier(max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Random Forest": lambda trial: RandomForestClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Logistic Regression": lambda trial: LogisticRegression(C=trial.suggest_float("C", 0.1, 10.0), solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])),
    "k-NN": lambda trial: KNeighborsClassifier(n_neighbors=trial.suggest_int("n_neighbors", 3, 20)),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), max_depth=trial.suggest_int("max_depth", 3, 20)),
    "XGBoost": lambda trial: XGBClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": lambda trial: LGBMClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)),
    "AdaBoost": lambda trial: AdaBoostClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)),
    "Neural Network": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("hidden_layer_1", 10, 100), trial.suggest_int("hidden_layer_2", 10, 100)), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200),
    "MLP": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("layer_1", 50, 150), trial.suggest_int("layer_2", 50, 150)), activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]), solver=trial.suggest_categorical("solver", ["adam", "sgd"]), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200, random_state=42)
}

# Prepare a dictionary to store model probabilities horizontally
probabilities = {"Target": y_val}  # Starting with the target column (y_val)

# Run optimization and compute probabilities for each model
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    study = optuna.create_study(direction="maximize")

    # Objective function for Optuna
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        return accuracy_score(y_val, model.predict(X_val))

    study.optimize(objective, n_trials=30)

    # Train the best model using the best hyperparameters
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)

    # Get predicted probabilities for the positive class (class 1)
    probs = best_model.predict_proba(X_val)[:, 1]

    # Add to the probabilities dictionary
    probabilities[model_name] = probs

# Convert the probabilities dictionary to a DataFrame
probability_df = pd.DataFrame(probabilities)

# Save the probability dataset to a CSV file
probability_df.to_csv("N_CTDT_OPTUNA_probability_predictions.csv", index=False)

print("Dataset saved successfully!")


[I 2024-12-28 17:48:01,067] A new study created in memory with name: no-name-3705412d-028c-43b3-bf8a-ae7817892de9


Optimizing SVM...


[I 2024-12-28 17:48:42,537] Trial 0 finished with value: 0.8633333333333333 and parameters: {'C': 0.32921223780532827, 'kernel': 'rbf'}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-28 17:49:17,719] Trial 1 finished with value: 0.83 and parameters: {'C': 3.733142784986342, 'kernel': 'poly'}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-28 17:49:57,363] Trial 2 finished with value: 0.9033333333333333 and parameters: {'C': 8.278626171260056, 'kernel': 'rbf'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-28 17:50:17,085] Trial 3 finished with value: 0.8933333333333333 and parameters: {'C': 6.926065791270991, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-28 17:50:58,348] Trial 4 finished with value: 0.6566666666666666 and parameters: {'C': 0.47861930963292254, 'kernel': 'linear'}. Best is trial 2 with value: 0.9033333333333333.
[I 2024-12-28 17:51:28,012] Trial 5 finished with value: 0.8833333333333333 and paramet

Optimizing Decision Tree...


[I 2024-12-28 18:07:14,217] Trial 0 finished with value: 0.83 and parameters: {'max_depth': 14, 'min_samples_split': 10}. Best is trial 0 with value: 0.83.
[I 2024-12-28 18:07:14,724] Trial 1 finished with value: 0.8366666666666667 and parameters: {'max_depth': 15, 'min_samples_split': 10}. Best is trial 1 with value: 0.8366666666666667.
[I 2024-12-28 18:07:15,130] Trial 2 finished with value: 0.7833333333333333 and parameters: {'max_depth': 8, 'min_samples_split': 7}. Best is trial 1 with value: 0.8366666666666667.
[I 2024-12-28 18:07:15,356] Trial 3 finished with value: 0.71 and parameters: {'max_depth': 3, 'min_samples_split': 9}. Best is trial 1 with value: 0.8366666666666667.
[I 2024-12-28 18:07:15,669] Trial 4 finished with value: 0.8466666666666667 and parameters: {'max_depth': 16, 'min_samples_split': 8}. Best is trial 4 with value: 0.8466666666666667.
[I 2024-12-28 18:07:16,005] Trial 5 finished with value: 0.8566666666666667 and parameters: {'max_depth': 17, 'min_samples_spli

Optimizing Random Forest...


[I 2024-12-28 18:07:25,726] Trial 0 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 453, 'max_depth': 4, 'min_samples_split': 10}. Best is trial 0 with value: 0.8133333333333334.
[I 2024-12-28 18:07:27,432] Trial 1 finished with value: 0.81 and parameters: {'n_estimators': 318, 'max_depth': 5, 'min_samples_split': 7}. Best is trial 0 with value: 0.8133333333333334.
[I 2024-12-28 18:07:28,724] Trial 2 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 274, 'max_depth': 3, 'min_samples_split': 4}. Best is trial 0 with value: 0.8133333333333334.
[I 2024-12-28 18:07:30,522] Trial 3 finished with value: 0.87 and parameters: {'n_estimators': 221, 'max_depth': 17, 'min_samples_split': 5}. Best is trial 3 with value: 0.87.
[I 2024-12-28 18:07:31,804] Trial 4 finished with value: 0.82 and parameters: {'n_estimators': 424, 'max_depth': 4, 'min_samples_split': 3}. Best is trial 3 with value: 0.87.
[I 2024-12-28 18:07:32,765] Trial 5 finished with val

Optimizing Logistic Regression...


[I 2024-12-28 18:08:02,158] Trial 0 finished with value: 0.8833333333333333 and parameters: {'C': 5.151195481197457, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 18:08:02,590] Trial 1 finished with value: 0.8833333333333333 and parameters: {'C': 7.032261593369165, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 18:08:03,076] Trial 2 finished with value: 0.8833333333333333 and parameters: {'C': 7.216935246113768, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 18:08:03,566] Trial 3 finished with value: 0.8366666666666667 and parameters: {'C': 1.4679311928837457, 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 18:08:03,906] Trial 4 finished with value: 0.8733333333333333 and parameters: {'C': 3.9511044446342796, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8833333333333333.
[I 2024-12-28 18:08:04,166] Trial 5 finished with value: 0.8866666666

Optimizing k-NN...


[I 2024-12-28 18:08:12,697] Trial 0 finished with value: 0.8466666666666667 and parameters: {'n_neighbors': 20}. Best is trial 0 with value: 0.8466666666666667.
[I 2024-12-28 18:08:13,334] Trial 1 finished with value: 0.87 and parameters: {'n_neighbors': 13}. Best is trial 1 with value: 0.87.
[I 2024-12-28 18:08:13,784] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 17}. Best is trial 1 with value: 0.87.
[I 2024-12-28 18:08:14,173] Trial 3 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 6}. Best is trial 1 with value: 0.87.
[I 2024-12-28 18:08:14,553] Trial 4 finished with value: 0.84 and parameters: {'n_neighbors': 16}. Best is trial 1 with value: 0.87.
[I 2024-12-28 18:08:14,932] Trial 5 finished with value: 0.84 and parameters: {'n_neighbors': 18}. Best is trial 1 with value: 0.87.
[I 2024-12-28 18:08:15,336] Trial 6 finished with value: 0.8733333333333333 and parameters: {'n_neighbors': 8}. Best is trial 6 with value: 0.87333

Optimizing Naive Bayes...


[I 2024-12-28 18:08:25,555] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 18:08:25,927] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 18:08:26,281] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 18:08:26,649] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 18:08:27,019] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 18:08:27,391] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2024-12-28 18:08:27,771] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Optimizing Gradient Boosting...


[I 2024-12-28 18:09:00,930] Trial 0 finished with value: 0.89 and parameters: {'n_estimators': 213, 'learning_rate': 0.20814647388263222, 'max_depth': 6}. Best is trial 0 with value: 0.89.
[I 2024-12-28 18:11:01,844] Trial 1 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 412, 'learning_rate': 0.0736591022356084, 'max_depth': 16}. Best is trial 1 with value: 0.8933333333333333.
[I 2024-12-28 18:11:32,348] Trial 2 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 93, 'learning_rate': 0.4991981940309843, 'max_depth': 18}. Best is trial 2 with value: 0.8966666666666666.
[I 2024-12-28 18:13:40,478] Trial 3 finished with value: 0.88 and parameters: {'n_estimators': 388, 'learning_rate': 0.4978235008117901, 'max_depth': 20}. Best is trial 2 with value: 0.8966666666666666.
[I 2024-12-28 18:14:26,812] Trial 4 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 379, 'learning_rate': 0.1837467111290758, 'max_depth': 6}. Best is

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 18:28:23,536] Trial 0 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 481, 'max_depth': 7, 'learning_rate': 0.3909385615422454}. Best is trial 0 with value: 0.8966666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 18:28:49,435] Trial 1 finished with value: 0.9 and parameters: {'n_estimators': 405, 'max_depth': 7, 'learning_rate': 0.32637327194698346}. Best is trial 1 with value: 0.9.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 18:29:08,720] Trial 2 finished with value: 0.9066666666666666 and parameters: {'n_estimators': 219, 'max_depth': 20, 'learning_rate': 0.3805839403165768}. Best is trial 2 with value: 0.9066666666666666.
Parameters: { "use_label_encoder" } are not used.

[I 2024-12-28 18:29:19,086] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 103, 'max_depth': 11, 'learning_rate': 0.03666159034251966}. Best i

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001767 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:31,563] Trial 0 finished with value: 0.83 and parameters: {'n_estimators': 376, 'max_depth': 17, 'learning_rate': 0.014369280938890134}. Best is trial 0 with value: 0.83.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001164 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:32,074] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 339, 'max_depth': 4, 'learning_rate': 0.3042973474572408}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001384 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:32,532] Trial 2 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 293, 'max_depth': 20, 'learning_rate': 0.06104489778489752}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001103 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:33,010] Trial 3 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 418, 'max_depth': 3, 'learning_rate': 0.28455025324592176}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001418 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:33,411] Trial 4 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 278, 'max_depth': 3, 'learning_rate': 0.05976208548122782}. Best is trial 1 with value: 0.8466666666666667.
[I 2024-12-28 18:38:33,801] Trial 5 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 65, 'max_depth': 3, 'learning_rate': 0.2182939442762855}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001478 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001494 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:34,330] Trial 6 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 425, 'max_depth': 10, 'learning_rate': 0.3194433397619083}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:34,841] Trial 7 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 370, 'max_depth': 11, 'learning_rate': 0.024193436843791803}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:35,336] Trial 8 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 374, 'max_depth': 20, 'learning_rate': 0.4006295943278874}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001545 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:35,808] Trial 9 finished with value: 0.8133333333333334 and parameters: {'n_estimators': 359, 'max_depth': 3, 'learning_rate': 0.015129078417130149}. Best is trial 1 with value: 0.8466666666666667.




[I 2024-12-28 18:38:36,159] Trial 10 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 165, 'max_depth': 8, 'learning_rate': 0.4652432088369912}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001579 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001382 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:36,739] Trial 11 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 484, 'max_depth': 8, 'learning_rate': 0.301754398699372}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001419 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:37,326] Trial 12 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 488, 'max_depth': 7, 'learning_rate': 0.18992456189218207}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:37,900] Trial 13 finished with value: 0.85 and parameters: {'n_estimators': 496, 'max_depth': 6, 'learning_rate': 0.3523284481255895}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001457 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:38,490] Trial 14 finished with value: 0.85 and parameters: {'n_estimators': 487, 'max_depth': 14, 'learning_rate': 0.3805572123235671}. Best is trial 11 with value: 0.8533333333333334.




[I 2024-12-28 18:38:38,991] Trial 15 finished with value: 0.84 and parameters: {'n_estimators': 197, 'max_depth': 7, 'learning_rate': 0.17112132370926933}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001451 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:39,545] Trial 16 finished with value: 0.84 and parameters: {'n_estimators': 449, 'max_depth': 9, 'learning_rate': 0.3758569660332519}. Best is trial 11 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001447 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:40,196] Trial 17 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 494, 'max_depth': 13, 'learning_rate': 0.4899132184296052}. Best is trial 17 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001770 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:40,580] Trial 18 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 216, 'max_depth': 12, 'learning_rate': 0.45944606225125384}. Best is trial 17 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001429 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:41,158] Trial 19 finished with value: 0.84 and parameters: {'n_estimators': 436, 'max_depth': 14, 'learning_rate': 0.12945514029225863}. Best is trial 17 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001439 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:41,618] Trial 20 finished with value: 0.86 and parameters: {'n_estimators': 306, 'max_depth': 13, 'learning_rate': 0.41727494639982865}. Best is trial 20 with value: 0.86.




[I 2024-12-28 18:38:42,010] Trial 21 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 99, 'max_depth': 13, 'learning_rate': 0.49014268242013825}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001864 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001643 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:42,570] Trial 22 finished with value: 0.85 and parameters: {'n_estimators': 239, 'max_depth': 16, 'learning_rate': 0.4141975007211495}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001706 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:43,348] Trial 23 finished with value: 0.84 and parameters: {'n_estimators': 314, 'max_depth': 17, 'learning_rate': 0.43689699511441266}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:44,067] Trial 24 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 458, 'max_depth': 11, 'learning_rate': 0.49723262768648946}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001693 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:44,789] Trial 25 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 408, 'max_depth': 15, 'learning_rate': 0.26022212263713485}. Best is trial 20 with value: 0.86.
[I 2024-12-28 18:38:45,273] Trial 26 finished with value: 0.84 and parameters: {'n_estimators': 132, 'max_depth': 9, 'learning_rate': 0.34020056489233275}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001726 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001721 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:45,902] Trial 27 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 249, 'max_depth': 13, 'learning_rate': 0.4227452554865047}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001892 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:46,613] Trial 28 finished with value: 0.85 and parameters: {'n_estimators': 465, 'max_depth': 5, 'learning_rate': 0.4474786105616828}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001476 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:47,209] Trial 29 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 392, 'max_depth': 12, 'learning_rate': 0.24475663858051006}. Best is trial 20 with value: 0.86.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001422 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2024-12-28 18:38:47,655] A new study created in memory with name: no-name-2672c338-f451-4dd4-b7cd-2b8119516db9


Optimizing AdaBoost...


[I 2024-12-28 18:39:16,517] Trial 0 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 406, 'learning_rate': 0.9475964808707948}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-28 18:39:30,022] Trial 1 finished with value: 0.79 and parameters: {'n_estimators': 196, 'learning_rate': 0.6788143225571363}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-28 18:40:03,211] Trial 2 finished with value: 0.7933333333333333 and parameters: {'n_estimators': 466, 'learning_rate': 0.5092382260123617}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-28 18:40:34,470] Trial 3 finished with value: 0.7 and parameters: {'n_estimators': 442, 'learning_rate': 0.08987929233335534}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-28 18:41:04,800] Trial 4 finished with value: 0.86 and parameters: {'n_estimators': 429, 'learning_rate': 0.8188791666582512}. Best is trial 0 with value: 0.8633333333333333.
[I 2024-12-28 18:41:20,712] Trial 5 finish

Optimizing Neural Network...


[I 2024-12-28 18:52:56,762] Trial 0 finished with value: 0.9033333333333333 and parameters: {'hidden_layer_1': 20, 'hidden_layer_2': 42, 'learning_rate_init': 0.08547380078919815}. Best is trial 0 with value: 0.9033333333333333.
[I 2024-12-28 18:53:10,596] Trial 1 finished with value: 0.8933333333333333 and parameters: {'hidden_layer_1': 30, 'hidden_layer_2': 59, 'learning_rate_init': 0.027222675755263197}. Best is trial 0 with value: 0.9033333333333333.
[I 2024-12-28 18:53:39,336] Trial 2 finished with value: 0.9166666666666666 and parameters: {'hidden_layer_1': 57, 'hidden_layer_2': 65, 'learning_rate_init': 0.04410803278966192}. Best is trial 2 with value: 0.9166666666666666.
[I 2024-12-28 18:54:07,019] Trial 3 finished with value: 0.91 and parameters: {'hidden_layer_1': 75, 'hidden_layer_2': 77, 'learning_rate_init': 0.04131094091134468}. Best is trial 2 with value: 0.9166666666666666.
[I 2024-12-28 18:54:20,210] Trial 4 finished with value: 0.9133333333333333 and parameters: {'hid

Optimizing MLP...


[I 2024-12-28 19:02:33,766] Trial 0 finished with value: 0.9066666666666666 and parameters: {'layer_1': 122, 'layer_2': 83, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.050903786787646024}. Best is trial 0 with value: 0.9066666666666666.
[I 2024-12-28 19:03:03,670] Trial 1 finished with value: 0.91 and parameters: {'layer_1': 129, 'layer_2': 134, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.027082986802228922}. Best is trial 1 with value: 0.91.
[I 2024-12-28 19:03:13,077] Trial 2 finished with value: 0.5 and parameters: {'layer_1': 132, 'layer_2': 122, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.09061819555249595}. Best is trial 1 with value: 0.91.
[I 2024-12-28 19:03:57,346] Trial 3 finished with value: 0.92 and parameters: {'layer_1': 62, 'layer_2': 95, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.04087038885143787}. Best is trial 3 with value: 0.92.
[I 2024-12-28 19:04:35,753] Trial 4 finished with val

Dataset saved successfully!


Class Feature Vector (CFV)


In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score


# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_validation_negative_features.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]

# Define models with hyperparameter optimization (Optuna)
models = {
    "SVM": lambda trial: SVC(
         probability=True,
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}

# Initialize a list to store the CFV data
cfv_data = []

# Define the optimization and prediction function
def optimize_and_predict(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]  # Get probability for class 1 (positive)
        return accuracy_score(y_val, model.predict(X_val))

    # Perform optimization with Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Return the best model
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)
    preds = best_model.predict_proba(X_val)[:, 1]  # Using the probability for class 1 (positive)

    # Append predictions to CFV list
    return preds

# Train each model and generate predictions for CFV
for model_name, model_func in models.items():
    print(f"Training and predicting with {model_name}...")
    preds = optimize_and_predict(model_name, model_func)
    cfv_data.append(preds)

# Convert the CFV data into a DataFrame
cfv_df = pd.DataFrame(np.array(cfv_data).T, columns=models.keys())

# Optionally, add the true labels column
cfv_df["True_Label"] = y_val.values

# Save the CFV dataset to CSV
cfv_df.to_csv("CFV_CTDT.csv", index=False)
print("CFV dataset created and saved!")


[I 2025-01-15 06:23:49,831] A new study created in memory with name: no-name-ed2f3d97-fa22-483d-9c13-58dc246f79b6


Training and predicting with SVM...


[I 2025-01-15 06:24:10,268] Trial 0 finished with value: 0.88 and parameters: {'C': 5.717977577816307, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.88.
[I 2025-01-15 06:24:50,989] Trial 1 finished with value: 0.9033333333333333 and parameters: {'C': 6.968472826659812, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-15 06:25:09,830] Trial 2 finished with value: 0.8833333333333333 and parameters: {'C': 8.440763527465231, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-15 06:25:37,610] Trial 3 finished with value: 0.8833333333333333 and parameters: {'C': 9.348734108790465, 'kernel': 'linear'}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-15 06:26:17,803] Trial 4 finished with value: 0.9033333333333333 and parameters: {'C': 6.251728441201438, 'kernel': 'rbf'}. Best is trial 1 with value: 0.9033333333333333.
[I 2025-01-15 06:26:37,357] Trial 5 finished with value: 0.89 and parameters: {'C': 6.219898600262071,

Training and predicting with Decision Tree...


[I 2025-01-15 06:42:23,396] Trial 0 finished with value: 0.71 and parameters: {'max_depth': 3, 'min_samples_split': 8}. Best is trial 0 with value: 0.71.
[I 2025-01-15 06:42:23,758] Trial 1 finished with value: 0.71 and parameters: {'max_depth': 3, 'min_samples_split': 3}. Best is trial 0 with value: 0.71.
[I 2025-01-15 06:42:24,413] Trial 2 finished with value: 0.86 and parameters: {'max_depth': 18, 'min_samples_split': 7}. Best is trial 2 with value: 0.86.
[I 2025-01-15 06:42:24,795] Trial 3 finished with value: 0.7233333333333334 and parameters: {'max_depth': 4, 'min_samples_split': 5}. Best is trial 2 with value: 0.86.
[I 2025-01-15 06:42:25,428] Trial 4 finished with value: 0.85 and parameters: {'max_depth': 17, 'min_samples_split': 5}. Best is trial 2 with value: 0.86.
[I 2025-01-15 06:42:26,088] Trial 5 finished with value: 0.86 and parameters: {'max_depth': 19, 'min_samples_split': 6}. Best is trial 2 with value: 0.86.
[I 2025-01-15 06:42:26,749] Trial 6 finished with value: 0.

Training and predicting with Random Forest...


[I 2025-01-15 06:42:38,389] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 264, 'max_depth': 18, 'min_samples_split': 3}. Best is trial 0 with value: 0.88.
[I 2025-01-15 06:42:40,096] Trial 1 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 234, 'max_depth': 8, 'min_samples_split': 6}. Best is trial 0 with value: 0.88.
[I 2025-01-15 06:42:41,651] Trial 2 finished with value: 0.8166666666666667 and parameters: {'n_estimators': 310, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 0 with value: 0.88.
[I 2025-01-15 06:42:43,423] Trial 3 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 210, 'max_depth': 20, 'min_samples_split': 5}. Best is trial 0 with value: 0.88.
[I 2025-01-15 06:42:45,664] Trial 4 finished with value: 0.85 and parameters: {'n_estimators': 457, 'max_depth': 10, 'min_samples_split': 3}. Best is trial 0 with value: 0.88.
[I 2025-01-15 06:42:46,084] Trial 5 finished with value: 0.8333333333333334 and 

Training and predicting with Logistic Regression...


[I 2025-01-15 06:43:43,519] Trial 0 finished with value: 0.8633333333333333 and parameters: {'C': 2.860850880431229, 'solver': 'liblinear'}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-01-15 06:43:43,869] Trial 1 finished with value: 0.8866666666666667 and parameters: {'C': 5.228146841926133, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 06:43:44,217] Trial 2 finished with value: 0.8866666666666667 and parameters: {'C': 9.769408733267197, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 06:43:44,587] Trial 3 finished with value: 0.8833333333333333 and parameters: {'C': 7.271187245430818, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 06:43:44,943] Trial 4 finished with value: 0.8866666666666667 and parameters: {'C': 5.192549800148138, 'solver': 'liblinear'}. Best is trial 1 with value: 0.8866666666666667.
[I 2025-01-15 06:43:45,442] Trial 5 finished with value

Training and predicting with k-NN...


[I 2025-01-15 06:43:57,631] Trial 0 finished with value: 0.8733333333333333 and parameters: {'n_neighbors': 8}. Best is trial 0 with value: 0.8733333333333333.
[I 2025-01-15 06:43:58,309] Trial 1 finished with value: 0.8666666666666667 and parameters: {'n_neighbors': 15}. Best is trial 0 with value: 0.8733333333333333.
[I 2025-01-15 06:43:58,975] Trial 2 finished with value: 0.8733333333333333 and parameters: {'n_neighbors': 8}. Best is trial 0 with value: 0.8733333333333333.
[I 2025-01-15 06:43:59,649] Trial 3 finished with value: 0.86 and parameters: {'n_neighbors': 4}. Best is trial 0 with value: 0.8733333333333333.
[I 2025-01-15 06:44:00,320] Trial 4 finished with value: 0.88 and parameters: {'n_neighbors': 9}. Best is trial 4 with value: 0.88.
[I 2025-01-15 06:44:00,984] Trial 5 finished with value: 0.8533333333333334 and parameters: {'n_neighbors': 14}. Best is trial 4 with value: 0.88.
[I 2025-01-15 06:44:01,665] Trial 6 finished with value: 0.8733333333333333 and parameters: {'

Training and predicting with Naive Bayes...


[I 2025-01-15 06:44:20,432] Trial 0 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 06:44:20,952] Trial 1 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 06:44:21,478] Trial 2 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 06:44:22,019] Trial 3 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 06:44:22,578] Trial 4 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 06:44:23,130] Trial 5 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-15 06:44:23,623] Trial 6 finished with value: 0.8666666666666667 and parameters: {}. Best is trial 0 with value: 0.8666666666

Training and predicting with Gradient Boosting...


[I 2025-01-15 06:45:47,131] Trial 0 finished with value: 0.88 and parameters: {'n_estimators': 247, 'learning_rate': 0.30775984742303036, 'max_depth': 16}. Best is trial 0 with value: 0.88.
[I 2025-01-15 06:46:07,295] Trial 1 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 136, 'learning_rate': 0.027432904049019227, 'max_depth': 7}. Best is trial 1 with value: 0.8833333333333333.
[I 2025-01-15 06:47:51,272] Trial 2 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 329, 'learning_rate': 0.18167984289514286, 'max_depth': 17}. Best is trial 2 with value: 0.8933333333333333.
[I 2025-01-15 06:48:22,311] Trial 3 finished with value: 0.8933333333333333 and parameters: {'n_estimators': 175, 'learning_rate': 0.37598105656849634, 'max_depth': 9}. Best is trial 2 with value: 0.8933333333333333.
[I 2025-01-15 06:50:03,104] Trial 4 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 471, 'learning_rate': 0.24882049415920285, 'max_

Training and predicting with XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 07:21:26,777] Trial 0 finished with value: 0.8866666666666667 and parameters: {'n_estimators': 56, 'max_depth': 8, 'learning_rate': 0.33722822155290183}. Best is trial 0 with value: 0.8866666666666667.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 07:21:40,837] Trial 1 finished with value: 0.9033333333333333 and parameters: {'n_estimators': 107, 'max_depth': 20, 'learning_rate': 0.16726219628088992}. Best is trial 1 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 07:22:00,818] Trial 2 finished with value: 0.8966666666666666 and parameters: {'n_estimators': 210, 'max_depth': 18, 'learning_rate': 0.31042283445911345}. Best is trial 1 with value: 0.9033333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-15 07:22:33,861] Trial 3 finished with value: 0.9 and parameters: {'n_estimators': 443, 'max_depth': 18, 'learning_rate': 0.401431844

Training and predicting with LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001639 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:07,952] Trial 0 finished with value: 0.84 and parameters: {'n_estimators': 235, 'max_depth': 15, 'learning_rate': 0.05587851318082565}. Best is trial 0 with value: 0.84.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001401 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:08,467] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 295, 'max_depth': 19, 'learning_rate': 0.22231695613252703}. Best is trial 1 with value: 0.8466666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001441 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:08,950] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 269, 'max_depth': 5, 'learning_rate': 0.26810934892363053}. Best is trial 1 with value: 0.8466666666666667.
[I 2025-01-15 07:32:09,338] Trial 3 finished with value: 0.85 and parameters: {'n_estimators': 145, 'max_depth': 5, 'learning_rate': 0.4590558268610153}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001442 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:09,823] Trial 4 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 113, 'max_depth': 5, 'learning_rate': 0.30587368501422907}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001387 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001412 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:10,232] Trial 5 finished with value: 0.84 and parameters: {'n_estimators': 167, 'max_depth': 5, 'learning_rate': 0.24997461354958173}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001377 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:10,816] Trial 6 finished with value: 0.84 and parameters: {'n_estimators': 427, 'max_depth': 4, 'learning_rate': 0.42924298923650717}. Best is trial 3 with value: 0.85.




[I 2025-01-15 07:32:11,199] Trial 7 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 140, 'max_depth': 4, 'learning_rate': 0.3138941831943858}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001458 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001460 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:11,781] Trial 8 finished with value: 0.84 and parameters: {'n_estimators': 374, 'max_depth': 7, 'learning_rate': 0.36937678151782444}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001406 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:12,362] Trial 9 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 409, 'max_depth': 5, 'learning_rate': 0.040679637660227766}. Best is trial 3 with value: 0.85.




[I 2025-01-15 07:32:12,703] Trial 10 finished with value: 0.84 and parameters: {'n_estimators': 56, 'max_depth': 9, 'learning_rate': 0.49805239788543054}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001353 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001535 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:13,282] Trial 11 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 329, 'max_depth': 20, 'learning_rate': 0.15219147207452738}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001408 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:13,982] Trial 12 finished with value: 0.85 and parameters: {'n_estimators': 497, 'max_depth': 14, 'learning_rate': 0.190616312532637}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001742 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:14,857] Trial 13 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 471, 'max_depth': 13, 'learning_rate': 0.15228366303583224}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:15,514] Trial 14 finished with value: 0.84 and parameters: {'n_estimators': 189, 'max_depth': 11, 'learning_rate': 0.16553727389132578}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001678 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:16,384] Trial 15 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 490, 'max_depth': 16, 'learning_rate': 0.49879285215817176}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001680 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:17,049] Trial 16 finished with value: 0.85 and parameters: {'n_estimators': 230, 'max_depth': 10, 'learning_rate': 0.3951850054328387}. Best is trial 3 with value: 0.85.
[I 2025-01-15 07:32:17,531] Trial 17 finished with value: 0.83 and parameters: {'n_estimators': 58, 'max_depth': 13, 'learning_rate': 0.10137993342603346}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001842 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001649 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:18,342] Trial 18 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 358, 'max_depth': 17, 'learning_rate': 0.19469310716139865}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001765 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:18,894] Trial 19 finished with value: 0.84 and parameters: {'n_estimators': 113, 'max_depth': 8, 'learning_rate': 0.3319343448867673}. Best is trial 3 with value: 0.85.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001976 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:19,633] Trial 20 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 228, 'max_depth': 13, 'learning_rate': 0.42754325998153897}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001399 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:20,223] Trial 21 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 211, 'max_depth': 13, 'learning_rate': 0.44712112000034443}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001411 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:20,719] Trial 22 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 255, 'max_depth': 15, 'learning_rate': 0.44448133837844783}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001395 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:21,266] Trial 23 finished with value: 0.84 and parameters: {'n_estimators': 305, 'max_depth': 12, 'learning_rate': 0.3672768091268607}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001393 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:21,708] Trial 24 finished with value: 0.84 and parameters: {'n_estimators': 165, 'max_depth': 17, 'learning_rate': 0.10605076460130891}. Best is trial 20 with value: 0.8533333333333334.
[I 2025-01-15 07:32:22,091] Trial 25 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 104, 'max_depth': 14, 'learning_rate': 0.4146261519125358}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001427 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:22,575] Trial 26 finished with value: 0.85 and parameters: {'n_estimators': 199, 'max_depth': 11, 'learning_rate': 0.2756624140839214}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001409 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:23,212] Trial 27 finished with value: 0.85 and parameters: {'n_estimators': 454, 'max_depth': 8, 'learning_rate': 0.4672406577511857}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001468 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:23,644] Trial 28 finished with value: 0.85 and parameters: {'n_estimators': 151, 'max_depth': 18, 'learning_rate': 0.37005811466356364}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001415 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:24,157] Trial 29 finished with value: 0.82 and parameters: {'n_estimators': 232, 'max_depth': 15, 'learning_rate': 0.010019532264773312}. Best is trial 20 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001390 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-15 07:32:24,699] A new study created in memory with name: no-name-6311cfdf-c3c9-4159-8f1e-5d4c9f85ec2d


Training and predicting with AdaBoost...


[I 2025-01-15 07:32:36,201] Trial 0 finished with value: 0.8033333333333333 and parameters: {'n_estimators': 149, 'learning_rate': 0.922903141983489}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-15 07:32:51,340] Trial 1 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 204, 'learning_rate': 0.2992488256787768}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-15 07:33:25,190] Trial 2 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 468, 'learning_rate': 0.3040262814761872}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-15 07:33:36,474] Trial 3 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 146, 'learning_rate': 0.30449198043016834}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-15 07:34:12,309] Trial 4 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 496, 'learning_rate': 0.11422475402286317}. Best is trial 0 with value: 0.8033333333333333.

Training and predicting with Neural Network...


[I 2025-01-15 07:46:19,417] Trial 0 finished with value: 0.91 and parameters: {'hidden_layer_1': 78, 'hidden_layer_2': 44, 'learning_rate_init': 0.037188235982632774}. Best is trial 0 with value: 0.91.
[I 2025-01-15 07:46:53,974] Trial 1 finished with value: 0.92 and parameters: {'hidden_layer_1': 85, 'hidden_layer_2': 80, 'learning_rate_init': 0.09510061312020879}. Best is trial 1 with value: 0.92.
[I 2025-01-15 07:47:08,201] Trial 2 finished with value: 0.9 and parameters: {'hidden_layer_1': 27, 'hidden_layer_2': 34, 'learning_rate_init': 0.015102873872363608}. Best is trial 1 with value: 0.92.
[I 2025-01-15 07:48:10,317] Trial 3 finished with value: 0.9 and parameters: {'hidden_layer_1': 75, 'hidden_layer_2': 100, 'learning_rate_init': 0.001109898117013194}. Best is trial 1 with value: 0.92.
[I 2025-01-15 07:48:27,338] Trial 4 finished with value: 0.9066666666666666 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 47, 'learning_rate_init': 0.04936893390563183}. Best is trial

Training and predicting with MLP...


[I 2025-01-15 07:59:41,089] Trial 0 finished with value: 0.91 and parameters: {'layer_1': 97, 'layer_2': 97, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.035866919768500795}. Best is trial 0 with value: 0.91.
[I 2025-01-15 08:00:24,426] Trial 1 finished with value: 0.8933333333333333 and parameters: {'layer_1': 94, 'layer_2': 121, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.0015212446889584451}. Best is trial 0 with value: 0.91.
[I 2025-01-15 08:01:46,833] Trial 2 finished with value: 0.8533333333333334 and parameters: {'layer_1': 66, 'layer_2': 62, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.0063328585741566175}. Best is trial 0 with value: 0.91.
[I 2025-01-15 08:02:51,435] Trial 3 finished with value: 0.9033333333333333 and parameters: {'layer_1': 140, 'layer_2': 118, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.07420068180406914}. Best is trial 0 with value: 0.91.
[I 2025-01-15 08:03:25,131] Trial 4 finished

CFV dataset created and saved!


CPFV (Combined Probability and Class Feature Vector)

In [None]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/CTDT_validation_negative_features.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)], ignore_index=True)
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)], ignore_index=True)

# Separate features and labels
X_train = main_data.drop(columns=["label"])
y_train = main_data["label"]
X_val = validation_data.drop(columns=["label"])
y_val = validation_data["label"]

# Initialize models with their tuned hyperparameters
trained_models = {
    "SVM": SVC(C=1.0, kernel="rbf", probability=True),  # Example parameters
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5),
    "Logistic Regression": LogisticRegression(C=1.0, solver="lbfgs"),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(n_estimators=100, max_depth=10, learning_rate=0.1),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1),
    "Neural Network (MLPClassifier)": MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200),
    "Multilayer Perceptron (Custom MLP)": MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200)
}

# Train all models on the training dataset
for model_name, model in trained_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

# Function to create CPFV dataset
def create_cpfv(models, X_data, y_data):
    cpfv_data = pd.DataFrame()
    for model_name, model in models.items():
        # Add predicted class labels
        cpfv_data[f"{model_name}_Class"] = model.predict(X_data)
        # Add predicted probabilities or decision scores
        if hasattr(model, "predict_proba"):
            cpfv_data[f"{model_name}_Prob"] = model.predict_proba(X_data)[:, 1]
        elif hasattr(model, "decision_function"):
            cpfv_data[f"{model_name}_Prob"] = model.decision_function(X_data)
        else:
            cpfv_data[f"{model_name}_Prob"] = cpfv_data[f"{model_name}_Class"]
    # Add true labels
    cpfv_data["True_Label"] = y_data.reset_index(drop=True)
    return cpfv_data

# Create CPFV dataset using validation data
cpfv_dataset = create_cpfv(trained_models, X_val, y_val)

# Save CPFV dataset to CSV
cpfv_dataset.to_csv("CPFV_CTDT.csv", index=False)




Training SVM...
Training Decision Tree...
Training Random Forest...
Training Logistic Regression...
Training k-NN...
Training Naive Bayes...
Training Gradient Boosting...
Training XGBoost...


Parameters: { "use_label_encoder" } are not used.



Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001386 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training AdaBoost...
Training Neural Network (MLPClassifier)...
Training Multilayer Perceptron (Custom MLP)...


# **parameter grids for RandomizedSearchCV**

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier # Import path for KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_validation_negative_features.csv")


# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for RandomizedSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply random search
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing RandomizedSearchCV for SVM...
Performing RandomizedSearchCV for Decision Tree...
Performing RandomizedSearchCV for Random Forest...


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

Performing RandomizedSearchCV for Logistic Regression...




Performing RandomizedSearchCV for k-NN...
Performing RandomizedSearchCV for Naive Bayes...
Performing RandomizedSearchCV for Gradient Boosting...
Performing RandomizedSearchCV for XGBoost...
Performing RandomizedSearchCV for LightGBM...




[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001764 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing RandomizedSearchCV for CatBoost...
Performing RandomizedSearchCV for AdaBoost...




Performing RandomizedSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.928681   
1         Decision Tree    0.854791   
2         Random Forest    0.922673   
3   Logistic Regression    0.924401   
4                  k-NN    0.915810   
5           Naive Bayes    0.905531   
6     Gradient Boosting    0.897762   
7               XGBoost    0.892615   
8              LightGBM    0.795549   
9              CatBoost    0.906356   
10             AdaBoost    0.884882   
11       Neural Network    0.937276   

                                      Best Parameters  
0    {'kernel': 'linear', 'gamma': 'scale', 'C': 100}  
1   {'min_samples_split': 2, 'min_samples_leaf': 2...  
2   {'n_estimators': 100, 'min_samples_split': 5, ...  
3       {'solver': 'saga', 'penalty': 'l2', 'C': 100}  
4   {'weights': 'distance', 'n_neighbors': 3, 'met...  
5            {'var_smoothing': 2.782559402207126e-05}  
6   {'n_estimators': 200, 'max_depth': 5, 'learnin...  
7   {'n_estimators': 200, 'max_depth': 5, 'l

# **Hyperparameter grids for GridSearchCV**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_main_positive_features.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_validation_positive_features.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/CTDT/CTDT (Physicochemical Properties)/ctdt_validation_negative_features.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for GridSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing GridSearchCV for SVM...
Performing GridSearchCV for Decision Tree...
Performing GridSearchCV for Random Forest...


135 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
135 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

Performing GridSearchCV for Logistic Regression...




Performing GridSearchCV for k-NN...
Performing GridSearchCV for Naive Bayes...
Performing GridSearchCV for Gradient Boosting...
Performing GridSearchCV for XGBoost...
Performing GridSearchCV for LightGBM...


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001428 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 890
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing GridSearchCV for CatBoost...
Performing GridSearchCV for AdaBoost...




Performing GridSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.928681   
1         Decision Tree    0.880590   
2         Random Forest    0.923535   
3   Logistic Regression    0.924401   
4                  k-NN    0.915810   
5           Naive Bayes    0.905531   
6     Gradient Boosting    0.899478   
7               XGBoost    0.894339   
8              LightGBM    0.795549   
9              CatBoost    0.906356   
10             AdaBoost    0.886592   
11       Neural Network    0.935567   

                                      Best Parameters  
0    {'C': 100, 'gamma': 'scale', 'kernel': 'linear'}  
1   {'max_depth': None, 'min_samples_leaf': 1, 'mi...  
2   {'max_depth': None, 'max_features': 'sqrt', 'm...  
3       {'C': 100, 'penalty': 'l2', 'solver': 'saga'}  
4   {'metric': 'manhattan', 'n_neighbors': 3, 'wei...  
5            {'var_smoothing': 2.782559402207126e-05}  
6   {'learning_rate': 0.2, 'max_depth': 5, 'n_esti...  
7   {'learning_rate': 0.2, 'max_depth': 7, '

PROPOSED MODEL FOR CTDT

In [None]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, matthews_corrcoef,
                           cohen_kappa_score, roc_auc_score)

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_main_positive_features (1).csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_validation_positive_features (2).csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_validation_negative_features.csv")

# Add target labels
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

# Combine and shuffle data
train_data = pd.concat([main_p, main_n]).sample(frac=1, random_state=42)
validation_data = pd.concat([validation_p, validation_n]).sample(frac=1, random_state=42)

# Check class balance
print("Class distribution in training set:")
print(train_data['Target'].value_counts())
print("\nClass distribution in validation set:")
print(validation_data['Target'].value_counts())

# Separate features and labels
X_train = train_data.drop(columns=['Target']).values
y_train = train_data['Target'].values
X_val = validation_data.drop(columns=['Target']).values
y_val = validation_data['Target'].values

# Check for NaN/inf and replace
X_train = np.nan_to_num(X_train)
X_val = np.nan_to_num(X_val)

# Normalize features carefully
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Clip extreme values
X_train = np.clip(X_train, -5, 5)
X_val = np.clip(X_val, -5, 5)

# Build simpler and more stable model
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train.shape[1],),
          kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.4),

    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')
])

# Compile with class weighting if imbalanced
optimizer = Adam(learning_rate=0.0005)
model.compile(optimizer=optimizer,
              loss='binary_crossentropy',
              metrics=['accuracy',
                      tf.keras.metrics.AUC(name='auc'),
                      tf.keras.metrics.Recall(name='recall'),
                      tf.keras.metrics.Precision(name='precision')])

# Model summary
model.summary()

# Early stopping based on validation AUC
early_stop = EarlyStopping(monitor='val_auc',
                          patience=15,
                          mode='max',
                          restore_best_weights=True,
                          verbose=1)

# Train model
history = model.fit(X_train, y_train,
                   validation_data=(X_val, y_val),
                   epochs=10,
                   batch_size=64,
                   callbacks=[early_stop],
                   verbose=1)

# Evaluate model
val_probabilities = model.predict(X_val).flatten()
val_predictions = (val_probabilities > 0.5).astype(int)

# Calculate metrics
accuracy = accuracy_score(y_val, val_predictions)
tn, fp, fn, tp = confusion_matrix(y_val, val_predictions).ravel()
sensitivity = tp / (tp + fn)
specificity = tn / (tn + fp)
mcc = matthews_corrcoef(y_val, val_predictions)
kappa = cohen_kappa_score(y_val, val_predictions)
auc = roc_auc_score(y_val, val_probabilities)

# Print metrics
print("\nValidation Metrics:")
print(f"Accuracy: {accuracy:.4f}")
print(f"Sensitivity (Recall): {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"MCC: {mcc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"AUC: {auc:.4f}")

print("\nClassification Report:")
print(classification_report(y_val, val_predictions))

print("\nConfusion Matrix:")
print(confusion_matrix(y_val, val_predictions))

Class distribution in training set:
Target
0    582
1    582
Name: count, dtype: int64

Class distribution in validation set:
Target
0    150
1    150
Name: count, dtype: int64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 86ms/step - accuracy: 0.5048 - auc: 0.4937 - loss: 1.6740 - precision: 0.5362 - recall: 0.4849 - val_accuracy: 0.5767 - val_auc: 0.5917 - val_loss: 1.1628 - val_precision: 0.6769 - val_recall: 0.2933
Epoch 2/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 71ms/step - accuracy: 0.5103 - auc: 0.4993 - loss: 1.3647 - precision: 0.4972 - recall: 0.5049 - val_accuracy: 0.5367 - val_auc: 0.5776 - val_loss: 1.0701 - val_precision: 0.6897 - val_recall: 0.1333
Epoch 3/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.4734 - auc: 0.5141 - loss: 1.2609 - precision: 0.4625 - recall: 0.4734 - val_accuracy: 0.5200 - val_auc: 0.5628 - val_loss: 1.0552 - val_precision: 0.7500 - val_recall: 0.0600
Epoch 4/10
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.4901 - auc: 0.4717 - loss: 1.2925 - precision: 0.4868 - recall: 0.4831 - val

In [None]:
# cross valiation 5 fold


import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import (accuracy_score, classification_report,
                           confusion_matrix, matthews_corrcoef,
                           cohen_kappa_score, roc_auc_score)

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_main_positive_features (1).csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_main_negative_features.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_validation_positive_features (2).csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/5_CTDT/ctdt_validation_negative_features.csv")

# Add target labels
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

# Combine all data for cross-validation
all_data = pd.concat([main_p, main_n, validation_p, validation_n])
all_data = all_data.sample(frac=1, random_state=42).reset_index(drop=True)

# Check class balance
print("Class distribution:")
print(all_data['Target'].value_counts())

# Check for non-numeric columns
non_numeric_cols = all_data.select_dtypes(exclude=[np.number]).columns.tolist()
if non_numeric_cols:
    print(f"\nDropping non-numeric columns: {non_numeric_cols}")
    all_data = all_data.drop(columns=non_numeric_cols)

# Separate features and labels
X = all_data.drop(columns=['Target']).values
y = all_data['Target'].values

# Initialize 5-fold cross-validation
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
fold_no = 1
results = []

for train_idx, val_idx in kfold.split(X, y):
    print(f'\n{"="*40}')
    print(f'Training fold {fold_no}')
    print(f'{"="*40}')

    # Split data
    X_train, X_val = X[train_idx], X[val_idx]
    y_train, y_val = y[train_idx], y[val_idx]

    # Handle NaN/inf and normalize with clipping
    X_train = np.nan_to_num(X_train)
    X_val = np.nan_to_num(X_val)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_val = scaler.transform(X_val)
    X_train = np.clip(X_train, -5, 5)
    X_val = np.clip(X_val, -5, 5)

    # Build model with regularization
    model = Sequential([
        Dense(256, activation='relu', input_shape=(X_train.shape[1],),
              kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.5),

        Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.4),

        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dropout(0.3),

        Dense(1, activation='sigmoid')
    ])

    # Compile with adjusted learning rate
    optimizer = Adam(learning_rate=0.0005)
    model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

    # Early stopping
    early_stop = EarlyStopping(monitor='val_auc',
                              patience=15,
                              mode='max',
                              restore_best_weights=True,
                              verbose=1)

    # Class weighting if imbalanced
    class_counts = np.bincount(y_train)
    class_weight = {0: 1/class_counts[0], 1: 1/class_counts[1]}

    # Train model
    history = model.fit(X_train, y_train,
                       validation_data=(X_val, y_val),
                       epochs=100,
                       batch_size=64,
                       callbacks=[early_stop],
                       class_weight=class_weight,
                       verbose=1)

    # Evaluate model
    val_probabilities = model.predict(X_val).flatten()
    val_predictions = (val_probabilities > 0.5).astype(int)

    # Calculate metrics
    tn, fp, fn, tp = confusion_matrix(y_val, val_predictions).ravel()
    metrics = {
        'fold': fold_no,
        'accuracy': accuracy_score(y_val, val_predictions),
        'sensitivity': tp / (tp + fn) if (tp + fn) > 0 else 0,
        'specificity': tn / (tn + fp) if (tn + fp) > 0 else 0,
        'mcc': matthews_corrcoef(y_val, val_predictions),
        'kappa': cohen_kappa_score(y_val, val_predictions),
        'auc': roc_auc_score(y_val, val_probabilities)
    }

    results.append(metrics)

    print(f'\nFold {fold_no} Results:')
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"AUC: {metrics['auc']:.4f}")
    print(f"MCC: {metrics['mcc']:.4f}")

    fold_no += 1

# Calculate and display average metrics
print('\n' + '='*50)
print('Cross-Validation Summary')
print('='*50)

avg_metrics = {
    'accuracy': np.mean([r['accuracy'] for r in results]),
    'sensitivity': np.mean([r['sensitivity'] for r in results]),
    'specificity': np.mean([r['specificity'] for r in results]),
    'mcc': np.mean([r['mcc'] for r in results]),
    'kappa': np.mean([r['kappa'] for r in results]),
    'auc': np.mean([r['auc'] for r in results])
}

print(f"\nAverage Metrics Across All Folds:")
print(f"Accuracy: {avg_metrics['accuracy']:.4f}")
print(f"Sensitivity: {avg_metrics['sensitivity']:.4f}")
print(f"Specificity: {avg_metrics['specificity']:.4f}")
print(f"MCC: {avg_metrics['mcc']:.4f}")
print(f"Cohen's Kappa: {avg_metrics['kappa']:.4f}")
print(f"AUC: {avg_metrics['auc']:.4f}")

# Train final model on all data
print("\nTraining final model on all data...")
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled = np.clip(X_scaled, -5, 5)

final_model = Sequential([
    Dense(256, activation='relu', input_shape=(X_scaled.shape[1],),
          kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.5),

    Dense(128, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.4),

    Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(1, activation='sigmoid')
])

optimizer = Adam(learning_rate=0.0005)
final_model.compile(optimizer=optimizer,
                  loss='binary_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.AUC(name='auc')])

# Final class weights
class_counts = np.bincount(y)
final_class_weight = {0: 1/class_counts[0], 1: 1/class_counts[1]}

history = final_model.fit(X_scaled, y,
               epochs=100,
               batch_size=64,
               class_weight=final_class_weight,
               verbose=1)

# Save the final model
final_model.save('ctdt_final_model.h5')
print("Final model training complete and saved!")

Class distribution:
Target
1    732
0    732
Name: count, dtype: int64

Training fold 1


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 76ms/step - accuracy: 0.4724 - auc: 0.4918 - loss: 0.6403 - val_accuracy: 0.5051 - val_auc: 0.4955 - val_loss: 1.0622
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 50ms/step - accuracy: 0.5046 - auc: 0.5014 - loss: 0.3183 - val_accuracy: 0.4915 - val_auc: 0.4867 - val_loss: 0.9056
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.5451 - auc: 0.5547 - loss: 0.1896 - val_accuracy: 0.5017 - val_auc: 0.4652 - val_loss: 0.8445
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 69ms/step - accuracy: 0.5593 - auc: 0.5902 - loss: 0.1373 - val_accuracy: 0.4983 - val_auc: 0.4319 - val_loss: 0.8144
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.5360 - auc: 0.5300 - loss: 0.1078 - val_accuracy: 0.4983 - val_auc: 0.4479 - val_loss: 0.7927
Epoch 6/100
[1m19/19[0m [3

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 80ms/step - accuracy: 0.5026 - auc: 0.4992 - loss: 0.6407 - val_accuracy: 0.4881 - val_auc: 0.4193 - val_loss: 1.0648
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 47ms/step - accuracy: 0.5160 - auc: 0.5326 - loss: 0.3183 - val_accuracy: 0.4983 - val_auc: 0.3919 - val_loss: 0.9093
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.5938 - auc: 0.6030 - loss: 0.1896 - val_accuracy: 0.4983 - val_auc: 0.3905 - val_loss: 0.8504
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 63ms/step - accuracy: 0.5545 - auc: 0.5776 - loss: 0.1374 - val_accuracy: 0.4983 - val_auc: 0.3757 - val_loss: 0.8207
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 68ms/step - accuracy: 0.5661 - auc: 0.6050 - loss: 0.1079 - val_accuracy: 0.4983 - val_auc: 0.3705 - val_loss: 0.7991
Epoch 6/100
[1m19/19[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 72ms/step - accuracy: 0.5075 - auc: 0.5161 - loss: 0.6403 - val_accuracy: 0.4949 - val_auc: 0.4749 - val_loss: 1.0616
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 43ms/step - accuracy: 0.4995 - auc: 0.4948 - loss: 0.3185 - val_accuracy: 0.4744 - val_auc: 0.4229 - val_loss: 0.9045
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.5594 - auc: 0.5663 - loss: 0.1899 - val_accuracy: 0.4812 - val_auc: 0.4206 - val_loss: 0.8433
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.5275 - auc: 0.5570 - loss: 0.1374 - val_accuracy: 0.4642 - val_auc: 0.4043 - val_loss: 0.8114
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 44ms/step - accuracy: 0.5710 - auc: 0.5876 - loss: 0.1078 - val_accuracy: 0.4642 - val_auc: 0.4348 - val_loss: 0.7876
Epoch 6/100
[1m19/19[0m [32

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 79ms/step - accuracy: 0.5494 - auc: 0.5383 - loss: 0.6401 - val_accuracy: 0.4778 - val_auc: 0.4532 - val_loss: 1.0642
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.5110 - auc: 0.5187 - loss: 0.3191 - val_accuracy: 0.4642 - val_auc: 0.4431 - val_loss: 0.9043
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.5382 - auc: 0.5562 - loss: 0.1907 - val_accuracy: 0.4846 - val_auc: 0.4437 - val_loss: 0.8421
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - accuracy: 0.5461 - auc: 0.5767 - loss: 0.1382 - val_accuracy: 0.5051 - val_auc: 0.4629 - val_loss: 0.8091
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 46ms/step - accuracy: 0.5561 - auc: 0.5774 - loss: 0.1084 - val_accuracy: 0.4812 - val_auc: 0.4631 - val_loss: 0.7862
Epoch 6/100
[1m19/19[0m [32m━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 99ms/step - accuracy: 0.5047 - auc: 0.5042 - loss: 0.6401 - val_accuracy: 0.5000 - val_auc: 0.5030 - val_loss: 1.0613
Epoch 2/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 46ms/step - accuracy: 0.5413 - auc: 0.5598 - loss: 0.3188 - val_accuracy: 0.4897 - val_auc: 0.4700 - val_loss: 0.9041
Epoch 3/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.5314 - auc: 0.5458 - loss: 0.1902 - val_accuracy: 0.5000 - val_auc: 0.3903 - val_loss: 0.8439
Epoch 4/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - accuracy: 0.5589 - auc: 0.5878 - loss: 0.1377 - val_accuracy: 0.5000 - val_auc: 0.3778 - val_loss: 0.8132
Epoch 5/100
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - accuracy: 0.5418 - auc: 0.5710 - loss: 0.1080 - val_accuracy: 0.5000 - val_auc: 0.4170 - val_loss: 0.7901
Epoch 6/100
[1m19/19[0m [32m━━━━━━━━━━━

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 53ms/step - accuracy: 0.5114 - auc: 0.4974 - loss: 0.6189
Epoch 2/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 62ms/step - accuracy: 0.5102 - auc: 0.5217 - loss: 0.2718
Epoch 3/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 41ms/step - accuracy: 0.5301 - auc: 0.5380 - loss: 0.1592
Epoch 4/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - accuracy: 0.5072 - auc: 0.5401 - loss: 0.1150
Epoch 5/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.5646 - auc: 0.5733 - loss: 0.0871
Epoch 6/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.5454 - auc: 0.5390 - loss: 0.0663
Epoch 7/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 40ms/step - accuracy: 0.5168 - auc: 0.5425 - loss: 0.0505
Epoch 8/100
[1m23/23[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[



Final model training complete and saved!
