In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.2/3.2 MB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.84


In [None]:
from Bio import SeqIO
import pandas as pd
import numpy as np

# PCP dictionary
pcp_dict = {
    'A': [1.8, 0.62, 0.046],
    'C': [2.5, 0.29, 0.128],
    'D': [-3.5, 1.90, -0.836],
    'E': [-3.5, 1.50, -0.736],
    'F': [2.8, 0.32, 0.257],
    'G': [-0.4, 0.48, 0.000],
    'H': [-3.2, 1.68, 0.434],
    'I': [4.5, 0.15, 0.000],
    'K': [-3.9, 1.81, -0.500],
    'L': [3.8, 0.15, 0.000],
    'M': [1.9, 0.17, 0.000],
    'N': [-3.5, 1.62, -0.259],
    'P': [-1.6, 0.64, 0.000],
    'Q': [-3.5, 1.56, -0.186],
    'R': [-4.5, 1.82, 0.291],
    'S': [-0.8, 0.66, 0.000],
    'T': [-0.7, 0.65, 0.000],
    'V': [4.2, 0.54, 0.000],
    'W': [-0.9, 0.37, 0.324],
    'Y': [-1.3, 0.61, 0.324],
}

# Clean sequence
def clean_sequence(sequence):
    valid_amino_acids = set(pcp_dict.keys())
    return ''.join([aa for aa in sequence if aa in valid_amino_acids])

# Compute PCP features
def compute_pcp_features(sequence):
    # Map amino acids to PCP values
    pcp_values = [pcp_dict[aa] for aa in sequence if aa in pcp_dict]
    if not pcp_values:  # Skip if no valid amino acids
        return [0] * 9  # Return zeros for mean/std

    # Convert to numpy array
    pcp_array = np.array(pcp_values)

    # Compute mean and standard deviation for each PCP dimension
    pcp_means = np.mean(pcp_array, axis=0)
    pcp_stds = np.std(pcp_array, axis=0)

    # Flatten and return features
    return np.concatenate([pcp_means, pcp_stds]).tolist()

# Process FASTA file
def process_fasta(input_path, output_path):
    features = []
    for record in SeqIO.parse(input_path, "fasta"):
        cleaned_sequence = clean_sequence(str(record.seq))
        if cleaned_sequence:  # Skip empty sequences
            pcp_features = compute_pcp_features(cleaned_sequence)
            features.append(pcp_features)

    # Convert to DataFrame and save
    column_names = [
        'Hydrophobicity_mean', 'Polarity_mean', 'Charge_mean',
        'Hydrophobicity_std', 'Polarity_std', 'Charge_std'
    ]
    df = pd.DataFrame(features, columns=column_names)
    df.to_csv(output_path, index=False)

# File paths
main_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/POSITIVE_main (2) (1).fasta"
main_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/NEGATIVE_main (2) (1).fasta"
validation_p = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/POSITIVE_validation (2) (1).fasta"
validation_n = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/NEGATIVE_validation (2) (1).fasta"

# Output paths
output_main_p = "/content/positive_main_pcp.csv"
output_main_n = "/content/negative_main_pcp.csv"
output_validation_p = "/content/positive_validation_pcp.csv"
output_validation_n = "/content/negative_validation_pcp.csv"

# Process datasets
process_fasta(main_p, output_main_p)
process_fasta(main_n, output_main_n)
process_fasta(validation_p, output_validation_p)
process_fasta(validation_n, output_validation_n)


# **All Algorithm **

In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/positive_main_pcp.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/negative_main_pcp.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/positive_validation_pcp.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/9_PCP (Physicochemical Properties)/negative_validation_pcp.csv")

In [None]:
# Label the datasets
main_p['label'] = 1
main_n['label'] = 0
validation_p['label'] = 1
validation_n['label'] = 0

# Combine datasets
train_data = pd.concat([main_p, main_n], ignore_index=True)
val_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Separate features and labels
X_train = train_data.drop(columns=['label']).values
y_train = train_data['label'].values
X_val = val_data.drop(columns=['label']).values
y_val = val_data['label'].values


In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ]),
    "MLP": Sequential([
        Dense(128, activation='relu', input_shape=(X_train.shape[1],)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Compile the neural network models
models["Neural Network"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
models["MLP"].compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])

In [None]:
# Store accuracies
results = []

# Train each model and evaluate
for name, model in models.items():
    print(f"\nTraining {name}...")

    if name in ["Neural Network", "MLP"]:
        # Neural Network training
        model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_val, y_val), verbose=0)
        train_pred = (model.predict(X_train) > 0.5).astype("int32")
        val_pred = (model.predict(X_val) > 0.5).astype("int32")
    else:
        # Traditional ML model training
        model.fit(X_train, y_train)
        train_pred = model.predict(X_train)
        val_pred = model.predict(X_val)

    # Calculate train and validation accuracy
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)

    results.append({"Model": name, "Train Accuracy": train_accuracy, "Validation Accuracy": val_accuracy})


Training SVM...

Training Decision Tree...

Training Random Forest...

Training Logistic Regression...

Training k-NN...

Training Naive Bayes...

Training Gradient Boosting...

Training XGBoost...

Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.




Training CatBoost...

Training AdaBoost...

Training Neural Network...




[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step 
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 

Training MLP...
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step 


In [None]:
# Convert to DataFrame and sort by Validation Accuracy and Train Accuracy in descending order
results_df = pd.DataFrame(results).sort_values(by=["Validation Accuracy", "Train Accuracy"], ascending=False).reset_index(drop=True)

# Display results
print("\nModel Accuracy Table (Descending Order of Validation Accuracy)")
print(results_df)


Model Accuracy Table (Descending Order of Validation Accuracy)
                  Model  Train Accuracy  Validation Accuracy
0         Random Forest        0.995704             0.870000
1               XGBoost        0.995704             0.866667
2     Gradient Boosting        0.906357             0.860000
3              LightGBM        0.995704             0.856667
4              CatBoost        0.954467             0.856667
5                   MLP        0.885739             0.830000
6         Decision Tree        0.995704             0.826667
7        Neural Network        0.819588             0.816667
8              AdaBoost        0.837629             0.786667
9                  k-NN        0.847079             0.783333
10                  SVM        0.729381             0.730000
11  Logistic Regression        0.702749             0.703333
12          Naive Bayes        0.674399             0.700000


# **CROSS VALIDATION**

In [None]:
!pip install catboost



In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.optimizers import Adam
import numpy as np

In [None]:
# Dictionary of models
models = {
    "SVM": SVC(kernel='linear', probability=True),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(n_estimators=100),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
}


In [None]:
# Define Neural Network models
def create_neural_network(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model

def create_mlp(input_shape):
    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_shape,)),
        Dense(64, activation='relu'),
        Dense(32, activation='relu'),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(), loss='binary_crossentropy', metrics=['accuracy'])
    return model



In [None]:
# Cross-validation for traditional models
results = []

# For traditional ML models, we use cross_val_score
for name, model in models.items():
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Calculate cross-validation accuracy
    cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    mean_accuracy = np.mean(cv_scores)
    std_accuracy = np.std(cv_scores)

    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})

# Cross-validation for Neural Networks (manual implementation)
for name, create_model in [("Neural Network", create_neural_network), ("MLP", create_mlp)]:
    print(f"\nPerforming Cross-validation for {name}...")
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    # Custom function to calculate accuracy for neural networks
    def neural_network_cross_val(model_func, X_train, y_train):
        accuracies = []
        for train_index, val_index in cv.split(X_train, y_train):
            X_train_fold, X_val_fold = X_train[train_index], X_train[val_index]
            y_train_fold, y_val_fold = y_train[train_index], y_train[val_index]

            model = model_func(X_train.shape[1])  # Create a new model for each fold
            model.fit(X_train_fold, y_train_fold, epochs=100, batch_size=32, verbose=0)

            y_pred = (model.predict(X_val_fold) > 0.5).astype("int32")
            accuracy = accuracy_score(y_val_fold, y_pred)
            accuracies.append(accuracy)

        return np.mean(accuracies), np.std(accuracies)

    mean_accuracy, std_accuracy = neural_network_cross_val(create_model, X_train, y_train)
    results.append({"Model": name, "Mean CV Accuracy": mean_accuracy, "STD CV Accuracy": std_accuracy})



Performing Cross-validation for SVM...

Performing Cross-validation for Decision Tree...

Performing Cross-validation for Random Forest...

Performing Cross-validation for Logistic Regression...

Performing Cross-validation for k-NN...

Performing Cross-validation for Naive Bayes...

Performing Cross-validation for Gradient Boosting...

Performing Cross-validation for XGBoost...


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.




Performing Cross-validation for LightGBM...
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 931, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499463 -> initscore=-0.002148
[LightGBM] [Info] Start training from score -0.002148
[LightGBM] [Info] Number of positive: 465, number of negative: 466
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1528
[LightGBM] [Info] Number of data points in the train set: 931, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499463 -> initscore=-0.002148
[LightGBM] [Info] Start training fr




Performing Cross-validation for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step

Performing Cross-validation for MLP...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step


In [None]:
# Convert to DataFrame and display
cv_results_df = pd.DataFrame(results).sort_values(by="Mean CV Accuracy", ascending=False).reset_index(drop=True)
print("\nCross-Validation Accuracy Table")
print(cv_results_df)


Cross-Validation Accuracy Table
                  Model  Mean CV Accuracy  STD CV Accuracy
0              LightGBM          0.860833         0.017231
1         Random Forest          0.858266         0.025951
2               XGBoost          0.852246         0.017831
3              CatBoost          0.845360         0.011835
4     Gradient Boosting          0.811836         0.017278
5         Decision Tree          0.799834         0.018866
6              AdaBoost          0.786074         0.012993
7                   MLP          0.781778         0.011167
8                  k-NN          0.780069         0.021609
9        Neural Network          0.755135         0.016813
10                  SVM          0.724186         0.028460
11  Logistic Regression          0.703574         0.024591
12          Naive Bayes          0.668381         0.025422


# **Hyperparameter optimization with Optuna**

In [1]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.1.0-py3-none-any.whl.metadata (16 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.14.0-py3-none-any.whl.metadata (7.4 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading Mako-1.3.8-py3-none-any.whl.metadata (2.9 kB)
Downloading optuna-4.1.0-py3-none-any.whl (364 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m364.4/364.4 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.14.0-py3-none-any.whl (233 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.5/233.5 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Downloading Mako-1.3.8-py3-none-any.whl (78 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.6/78.6 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Ma

In [None]:
!pip install scikeras

Collecting scikeras
  Downloading scikeras-0.13.0-py3-none-any.whl.metadata (3.1 kB)
Downloading scikeras-0.13.0-py3-none-any.whl (26 kB)
Installing collected packages: scikeras
Successfully installed scikeras-0.13.0


In [None]:
main_p = pd.read_csv("/content/positive_main_pcp.csv")
main_n = pd.read_csv("/content/negative_main_pcp.csv")
validation_p = pd.read_csv("/content/positive_validation_pcp.csv")
validation_n = pd.read_csv("/content/negative_validation_pcp.csv")

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/positive_main_pcp.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/negative_main_pcp.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/positive_validation_pcp.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/negative_validation_pcp.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]



# Define models with MLP included
models = {
    "SVM": lambda trial: SVC(
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}


results = []

def optimize_model(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        return accuracy_score(y_val, preds)

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the results
    results.append({
        "Model": model_name,
        "Accuracy": study.best_value,
        "Best Params": study.best_params
    })

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model(model_name, model_func)


# Convert results to a DataFrame
results_df = pd.DataFrame(results)


# Display the DataFrame
print(results_df)


[I 2025-01-02 07:25:38,456] A new study created in memory with name: no-name-d4ee7d09-61d6-497b-9e81-2e124b9c3cf2
[I 2025-01-02 07:25:38,531] Trial 0 finished with value: 0.7333333333333333 and parameters: {'C': 8.375073199644817, 'kernel': 'linear'}. Best is trial 0 with value: 0.7333333333333333.
[I 2025-01-02 07:25:38,624] Trial 1 finished with value: 0.5266666666666666 and parameters: {'C': 8.957116162218938, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.7333333333333333.


Optimizing SVM...


[I 2025-01-02 07:25:38,705] Trial 2 finished with value: 0.7333333333333333 and parameters: {'C': 7.994292026356287, 'kernel': 'linear'}. Best is trial 0 with value: 0.7333333333333333.
[I 2025-01-02 07:25:38,797] Trial 3 finished with value: 0.5266666666666666 and parameters: {'C': 9.236123476456102, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.7333333333333333.
[I 2025-01-02 07:25:38,856] Trial 4 finished with value: 0.7666666666666667 and parameters: {'C': 9.534175407375821, 'kernel': 'poly'}. Best is trial 4 with value: 0.7666666666666667.
[I 2025-01-02 07:25:38,911] Trial 5 finished with value: 0.7466666666666667 and parameters: {'C': 1.8823242256511135, 'kernel': 'rbf'}. Best is trial 4 with value: 0.7666666666666667.
[I 2025-01-02 07:25:38,960] Trial 6 finished with value: 0.7433333333333333 and parameters: {'C': 0.8787460454767659, 'kernel': 'poly'}. Best is trial 4 with value: 0.7666666666666667.
[I 2025-01-02 07:25:39,022] Trial 7 finished with value: 0.74 and paramete

Optimizing Decision Tree...


[I 2025-01-02 07:25:41,505] Trial 10 finished with value: 0.82 and parameters: {'max_depth': 15, 'min_samples_split': 2}. Best is trial 7 with value: 0.8366666666666667.
[I 2025-01-02 07:25:41,540] Trial 11 finished with value: 0.8133333333333334 and parameters: {'max_depth': 10, 'min_samples_split': 10}. Best is trial 7 with value: 0.8366666666666667.
[I 2025-01-02 07:25:41,576] Trial 12 finished with value: 0.8266666666666667 and parameters: {'max_depth': 14, 'min_samples_split': 4}. Best is trial 7 with value: 0.8366666666666667.
[I 2025-01-02 07:25:41,624] Trial 13 finished with value: 0.8166666666666667 and parameters: {'max_depth': 9, 'min_samples_split': 4}. Best is trial 7 with value: 0.8366666666666667.
[I 2025-01-02 07:25:41,662] Trial 14 finished with value: 0.8233333333333334 and parameters: {'max_depth': 16, 'min_samples_split': 7}. Best is trial 7 with value: 0.8366666666666667.
[I 2025-01-02 07:25:41,707] Trial 15 finished with value: 0.7933333333333333 and parameters: {

Optimizing Random Forest...


[I 2025-01-02 07:25:44,327] Trial 0 finished with value: 0.82 and parameters: {'n_estimators': 490, 'max_depth': 5, 'min_samples_split': 2}. Best is trial 0 with value: 0.82.
[I 2025-01-02 07:25:45,494] Trial 1 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 277, 'max_depth': 20, 'min_samples_split': 3}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-02 07:25:46,222] Trial 2 finished with value: 0.7533333333333333 and parameters: {'n_estimators': 299, 'max_depth': 3, 'min_samples_split': 5}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-02 07:25:46,443] Trial 3 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 67, 'max_depth': 7, 'min_samples_split': 10}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-02 07:25:47,784] Trial 4 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 475, 'max_depth': 5, 'min_samples_split': 4}. Best is trial 1 with value: 0.8633333333333333.
[I 2025-01-0

Optimizing Logistic Regression...


[I 2025-01-02 07:26:09,933] Trial 6 finished with value: 0.7166666666666667 and parameters: {'C': 5.614995000558539, 'solver': 'liblinear'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:26:09,956] Trial 7 finished with value: 0.7 and parameters: {'C': 0.5041513970435523, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:26:09,984] Trial 8 finished with value: 0.71 and parameters: {'C': 2.504410178628773, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:26:09,998] Trial 9 finished with value: 0.7166666666666667 and parameters: {'C': 6.923622072691682, 'solver': 'liblinear'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:26:10,029] Trial 10 finished with value: 0.7266666666666667 and parameters: {'C': 9.857176275623301, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:26:10,062] Trial 11 finished with value: 0.7266666666666667 and parameters: {

Optimizing k-NN...


[I 2025-01-02 07:26:11,272] Trial 2 finished with value: 0.77 and parameters: {'n_neighbors': 14}. Best is trial 1 with value: 0.7766666666666666.
[I 2025-01-02 07:26:11,353] Trial 3 finished with value: 0.77 and parameters: {'n_neighbors': 14}. Best is trial 1 with value: 0.7766666666666666.
[I 2025-01-02 07:26:11,434] Trial 4 finished with value: 0.7766666666666666 and parameters: {'n_neighbors': 18}. Best is trial 1 with value: 0.7766666666666666.
[I 2025-01-02 07:26:11,521] Trial 5 finished with value: 0.7833333333333333 and parameters: {'n_neighbors': 4}. Best is trial 5 with value: 0.7833333333333333.
[I 2025-01-02 07:26:11,572] Trial 6 finished with value: 0.7733333333333333 and parameters: {'n_neighbors': 20}. Best is trial 5 with value: 0.7833333333333333.
[I 2025-01-02 07:26:11,635] Trial 7 finished with value: 0.78 and parameters: {'n_neighbors': 11}. Best is trial 5 with value: 0.7833333333333333.
[I 2025-01-02 07:26:11,690] Trial 8 finished with value: 0.8 and parameters: 

Optimizing Naive Bayes...


[I 2025-01-02 07:26:14,645] Trial 11 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,669] Trial 12 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,696] Trial 13 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,721] Trial 14 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,748] Trial 15 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,771] Trial 16 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,793] Trial 17 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,812] Trial 18 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:26:14,841] Trial 19 finished with value: 0.7 and parameters: {}

Optimizing Gradient Boosting...


[I 2025-01-02 07:26:16,600] Trial 0 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 140, 'learning_rate': 0.23896826260091797, 'max_depth': 4}. Best is trial 0 with value: 0.8533333333333334.
[I 2025-01-02 07:26:18,631] Trial 1 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 139, 'learning_rate': 0.4606739980837399, 'max_depth': 19}. Best is trial 0 with value: 0.8533333333333334.
[I 2025-01-02 07:26:21,462] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 432, 'learning_rate': 0.13970984604867698, 'max_depth': 5}. Best is trial 2 with value: 0.8566666666666667.
[I 2025-01-02 07:26:28,248] Trial 3 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 460, 'learning_rate': 0.14301003008095417, 'max_depth': 10}. Best is trial 3 with value: 0.8666666666666667.
[I 2025-01-02 07:26:31,396] Trial 4 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 449, 'learning_rate': 

Optimizing XGBoost...


[I 2025-01-02 07:27:59,645] Trial 0 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 267, 'max_depth': 15, 'learning_rate': 0.09632924737472826}. Best is trial 0 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:27:59,792] Trial 1 finished with value: 0.84 and parameters: {'n_estimators': 307, 'max_depth': 3, 'learning_rate': 0.4786634001089895}. Best is trial 0 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:28:00,040] Trial 2 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 356, 'max_depth': 4, 'learning_rate': 0.1986151769832489}. Best is trial 0 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:28:00,720] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 299, 'max_depth': 18, 'learning_rate': 0.025772786204824455}. Best is trial 0 with value: 0.8633333333

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000329 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:09,482] Trial 0 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 213, 'max_depth': 15, 'learning_rate': 0.2686147820451307}. Best is trial 0 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:09,676] Trial 1 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 128, 'max_depth': 16, 'learning_rate': 0.18665909486446638}. Best is trial 0 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:09,920] Trial 2 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 186, 'max_depth': 16, 'learning_rate': 0.406729731008701}. Best is trial 0 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:10,509] Trial 3 finished with value: 0.86 and parameters: {'n_estimators': 417, 'max_depth': 13, 'learning_rate': 0.022482087874848365}. Best is trial 0 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:28:10,778] Trial 4 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 171, 'max_depth': 8, 'learning_rate': 0.4238113769230834}. Best is trial 0 with value: 0.8666666666666667.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000169 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:10,955] Trial 5 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 197, 'max_depth': 3, 'learning_rate': 0.4983555733721035}. Best is trial 0 with value: 0.8666666666666667.
[I 2025-01-02 07:28:11,085] Trial 6 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 148, 'max_depth': 3, 'learning_rate': 0.24064570976733363}. Best is trial 0 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:11,746] Trial 7 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 494, 'max_depth': 6, 'learning_rate': 0.0910298765214764}. Best is trial 0 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000083 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:28:11,901] Trial 8 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 128, 'max_depth': 5, 'learning_rate': 0.4476894171131791}. Best is trial 0 with value: 0.8666666666666667.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000228 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:28:12,302] Trial 9 finished with value: 0.87 and parameters: {'n_estimators': 346, 'max_depth': 16, 'learning_rate': 0.2160590940522399}. Best is trial 9 with value: 0.87.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:12,705] Trial 10 finished with value: 0.85 and parameters: {'n_estimators': 310, 'max_depth': 20, 'learning_rate': 0.3248234942630731}. Best is trial 9 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:13,166] Trial 11 finished with value: 0.87 and parameters: {'n_estimators': 307, 'max_depth': 13, 'learning_rate': 0.17801759718972887}. Best is trial 9 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:13,674] Trial 12 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 326, 'max_depth': 10, 'learning_rate': 0.14999085232286885}. Best is trial 9 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:14,230] Trial 13 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 364, 'max_depth': 19, 'learning_rate': 0.1478326763166013}. Best is trial 9 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:14,617] Trial 14 finished with value: 0.86 and parameters: {'n_estimators': 400, 'max_depth': 13, 'learning_rate': 0.304548514721934}. Best is trial 9 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:14,699] Trial 15 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 54, 'max_depth': 11, 'learning_rate': 0.22129446068803316}. Best is trial 15 with value: 0.8766666666666667.
[I 2025-01-02 07:28:14,784] Trial 16 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 58, 'max_depth': 10, 'learning_rate': 0.34957250982060084}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:15,026] Trial 17 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 254, 'max_depth': 18, 'learning_rate': 0.23568431843509774}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:15,485] Trial 18 finished with value: 0.86 and parameters: {'n_estimators': 468, 'max_depth': 10, 'learning_rate': 0.09008895883717938}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:15,568] Trial 19 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 57, 'max_depth': 17, 'learning_rate': 0.2169031159670044}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000207 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:15,827] Trial 20 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 250, 'max_depth': 14, 'learning_rate': 0.3688642462876822}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:16,112] Trial 21 finished with value: 0.86 and parameters: {'n_estimators': 299, 'max_depth': 12, 'learning_rate': 0.16262957901813285}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:16,440] Trial 22 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 351, 'max_depth': 11, 'learning_rate': 0.28324926693346025}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:16,857] Trial 23 finished with value: 0.85 and parameters: {'n_estimators': 392, 'max_depth': 8, 'learning_rate': 0.10715370971041764}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000162 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:17,119] Trial 24 finished with value: 0.85 and parameters: {'n_estimators': 270, 'max_depth': 14, 'learning_rate': 0.21689677276530228}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:17,349] Trial 25 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 231, 'max_depth': 12, 'learning_rate': 0.19195976831271583}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:17,796] Trial 26 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 432, 'max_depth': 8, 'learning_rate': 0.012928145246880962}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:18,146] Trial 27 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 353, 'max_depth': 15, 'learning_rate': 0.1217736964117551}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000077 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:18,404] Trial 28 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 287, 'max_depth': 17, 'learning_rate': 0.06531016367963222}. Best is trial 15 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:28:18,818] Trial 29 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 448, 'max_depth': 15, 'learning_rate': 0.26986063802519694}. Best is trial 15 with value: 0.8766666666666667.
[I 2025-01-02 07:28:18,820] A new study created in memory with name: no-name-2979b169-42be-4687-bbd1-d4432413dddf


Optimizing AdaBoost...


[I 2025-01-02 07:28:20,523] Trial 0 finished with value: 0.7566666666666667 and parameters: {'n_estimators': 499, 'learning_rate': 0.046551875529801424}. Best is trial 0 with value: 0.7566666666666667.
[I 2025-01-02 07:28:22,437] Trial 1 finished with value: 0.7833333333333333 and parameters: {'n_estimators': 500, 'learning_rate': 0.5600094185829375}. Best is trial 1 with value: 0.7833333333333333.
[I 2025-01-02 07:28:22,683] Trial 2 finished with value: 0.7633333333333333 and parameters: {'n_estimators': 66, 'learning_rate': 0.6174220750294701}. Best is trial 1 with value: 0.7833333333333333.
[I 2025-01-02 07:28:23,538] Trial 3 finished with value: 0.7633333333333333 and parameters: {'n_estimators': 247, 'learning_rate': 0.1355215792000284}. Best is trial 1 with value: 0.7833333333333333.
[I 2025-01-02 07:28:23,828] Trial 4 finished with value: 0.7466666666666667 and parameters: {'n_estimators': 83, 'learning_rate': 0.20232863589684544}. Best is trial 1 with value: 0.7833333333333333.

Optimizing Neural Network...


[I 2025-01-02 07:28:54,781] Trial 0 finished with value: 0.8033333333333333 and parameters: {'hidden_layer_1': 92, 'hidden_layer_2': 37, 'learning_rate_init': 0.05724108654792482}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-02 07:28:55,766] Trial 1 finished with value: 0.77 and parameters: {'hidden_layer_1': 12, 'hidden_layer_2': 89, 'learning_rate_init': 0.030994169680450757}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-02 07:28:56,453] Trial 2 finished with value: 0.7666666666666667 and parameters: {'hidden_layer_1': 96, 'hidden_layer_2': 25, 'learning_rate_init': 0.06095359160483512}. Best is trial 0 with value: 0.8033333333333333.
[I 2025-01-02 07:28:57,598] Trial 3 finished with value: 0.8133333333333334 and parameters: {'hidden_layer_1': 87, 'hidden_layer_2': 21, 'learning_rate_init': 0.02399126159736829}. Best is trial 3 with value: 0.8133333333333334.
[I 2025-01-02 07:28:58,579] Trial 4 finished with value: 0.84 and parameters: {'hidden_layer_1': 

Optimizing MLP...


[I 2025-01-02 07:29:23,593] Trial 0 finished with value: 0.7966666666666666 and parameters: {'layer_1': 99, 'layer_2': 57, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.021205111634270843}. Best is trial 0 with value: 0.7966666666666666.
[I 2025-01-02 07:29:24,720] Trial 1 finished with value: 0.75 and parameters: {'layer_1': 80, 'layer_2': 85, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.09158635889732844}. Best is trial 0 with value: 0.7966666666666666.
[I 2025-01-02 07:29:30,758] Trial 2 finished with value: 0.8366666666666667 and parameters: {'layer_1': 74, 'layer_2': 87, 'activation': 'tanh', 'solver': 'adam', 'learning_rate_init': 0.02849368541017579}. Best is trial 2 with value: 0.8366666666666667.
[I 2025-01-02 07:29:32,492] Trial 3 finished with value: 0.8 and parameters: {'layer_1': 104, 'layer_2': 82, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.06331707517766152}. Best is trial 2 with value: 0.8366666666666667.

                  Model  Accuracy  \
0                   SVM  0.770000   
1         Decision Tree  0.836667   
2         Random Forest  0.870000   
3   Logistic Regression  0.726667   
4                  k-NN  0.800000   
5           Naive Bayes  0.700000   
6     Gradient Boosting  0.870000   
7               XGBoost  0.873333   
8              LightGBM  0.876667   
9              AdaBoost  0.796667   
10       Neural Network  0.840000   
11                  MLP  0.843333   

                                          Best Params  
0            {'C': 9.9024812180168, 'kernel': 'poly'}  
1           {'max_depth': 16, 'min_samples_split': 3}  
2   {'n_estimators': 140, 'max_depth': 18, 'min_sa...  
3         {'C': 8.926788958223662, 'solver': 'lbfgs'}  
4                                  {'n_neighbors': 3}  
5                                                  {}  
6   {'n_estimators': 346, 'learning_rate': 0.31600...  
7   {'n_estimators': 237, 'max_depth': 8, 'learnin...  
8   {'n_estima

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix, matthews_corrcoef, cohen_kappa_score, roc_auc_score # Import confusion_matrix and other metrics

# Function to calculate metrics with model name
def calculate_metrics(y_true, y_pred, model_name=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0
    accuracy = (tp + tn) / (tp + tn + fp + fn)
    mcc = matthews_corrcoef(y_true, y_pred)
    kappa = cohen_kappa_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_pred)
    return {
        "Model": model_name,
        "Accuracy": accuracy,
        "Sensitivity": sensitivity,
        "Specificity": specificity,
        "MCC": mcc,
        "Kappa": kappa,
        "AUC": auc,
    }

# Results storage
results = []

# Optimization function
def optimize_model_with_metrics(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        trial.set_user_attr("preds", preds)
        metrics = calculate_metrics(y_val, preds, model_name=model_name)
        return metrics["Accuracy"]

    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Store the best trial metrics
    best_trial_preds = study.best_trial.user_attrs["preds"]
    best_metrics = calculate_metrics(y_val, best_trial_preds, model_name=model_name)
    best_metrics["Best Params"] = study.best_params
    results.append(best_metrics)

# Run optimization for all models
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    optimize_model_with_metrics(model_name, model_func)

# Store the final results with selected metrics and best parameters
final_results = []
for result in results:
    final_results.append(
        {
            "Model": result["Model"],
            "Accuracy": result["Accuracy"],
            "Sensitivity": result["Sensitivity"],
            "Specificity": result["Specificity"],
            "MCC": result["MCC"],
            "Kappa": result["Kappa"],
            "AUC": result["AUC"],
            "Best Params": result["Best Params"],
        }
    )

# Convert results to a DataFrame and display it
final_results_df = pd.DataFrame(final_results)
print(final_results_df)


[I 2025-01-02 07:31:21,816] A new study created in memory with name: no-name-cdff402d-0ef4-4908-9007-e9ea64f756d0
[I 2025-01-02 07:31:21,918] Trial 0 finished with value: 0.5133333333333333 and parameters: {'C': 2.45047391777472, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.5133333333333333.
[I 2025-01-02 07:31:21,996] Trial 1 finished with value: 0.7233333333333334 and parameters: {'C': 9.410322265193635, 'kernel': 'linear'}. Best is trial 1 with value: 0.7233333333333334.


Optimizing SVM...


[I 2025-01-02 07:31:22,099] Trial 2 finished with value: 0.51 and parameters: {'C': 5.963472858582593, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.7233333333333334.
[I 2025-01-02 07:31:22,211] Trial 3 finished with value: 0.5166666666666667 and parameters: {'C': 2.2765004882700093, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.7233333333333334.
[I 2025-01-02 07:31:22,309] Trial 4 finished with value: 0.5133333333333333 and parameters: {'C': 3.1764292266617287, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.7233333333333334.
[I 2025-01-02 07:31:22,398] Trial 5 finished with value: 0.75 and parameters: {'C': 6.306257850684314, 'kernel': 'poly'}. Best is trial 5 with value: 0.75.
[I 2025-01-02 07:31:22,526] Trial 6 finished with value: 0.74 and parameters: {'C': 5.845770093311746, 'kernel': 'rbf'}. Best is trial 5 with value: 0.75.
[I 2025-01-02 07:31:22,671] Trial 7 finished with value: 0.7433333333333333 and parameters: {'C': 7.1828785738184955, 'kernel': 'rbf'}. Best

Optimizing Decision Tree...


[I 2025-01-02 07:31:30,201] Trial 2 finished with value: 0.81 and parameters: {'max_depth': 20, 'min_samples_split': 9}. Best is trial 0 with value: 0.8266666666666667.
[I 2025-01-02 07:31:30,276] Trial 3 finished with value: 0.8033333333333333 and parameters: {'max_depth': 19, 'min_samples_split': 4}. Best is trial 0 with value: 0.8266666666666667.
[I 2025-01-02 07:31:30,346] Trial 4 finished with value: 0.8166666666666667 and parameters: {'max_depth': 15, 'min_samples_split': 2}. Best is trial 0 with value: 0.8266666666666667.
[I 2025-01-02 07:31:30,423] Trial 5 finished with value: 0.8 and parameters: {'max_depth': 16, 'min_samples_split': 9}. Best is trial 0 with value: 0.8266666666666667.
[I 2025-01-02 07:31:30,487] Trial 6 finished with value: 0.82 and parameters: {'max_depth': 19, 'min_samples_split': 7}. Best is trial 0 with value: 0.8266666666666667.
[I 2025-01-02 07:31:30,530] Trial 7 finished with value: 0.7333333333333333 and parameters: {'max_depth': 3, 'min_samples_split'

Optimizing Random Forest...


[I 2025-01-02 07:31:33,590] Trial 0 finished with value: 0.86 and parameters: {'n_estimators': 451, 'max_depth': 13, 'min_samples_split': 6}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:31:34,256] Trial 1 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 194, 'max_depth': 9, 'min_samples_split': 9}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:31:35,847] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 463, 'max_depth': 11, 'min_samples_split': 10}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:31:36,051] Trial 3 finished with value: 0.8 and parameters: {'n_estimators': 70, 'max_depth': 4, 'min_samples_split': 9}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:31:36,852] Trial 4 finished with value: 0.86 and parameters: {'n_estimators': 231, 'max_depth': 9, 'min_samples_split': 4}. Best is trial 0 with value: 0.86.
[I 2025-01-02 07:31:38,227] Trial 5 finished with value: 0.8366666666666667 and parameters: {'n_

Optimizing Logistic Regression...


[I 2025-01-02 07:31:56,038] Trial 5 finished with value: 0.72 and parameters: {'C': 9.393774797571659, 'solver': 'liblinear'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:31:56,072] Trial 6 finished with value: 0.7133333333333334 and parameters: {'C': 3.933395703168894, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:31:56,102] Trial 7 finished with value: 0.7033333333333334 and parameters: {'C': 0.9543529566205661, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:31:56,137] Trial 8 finished with value: 0.7233333333333334 and parameters: {'C': 8.188597614079441, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:31:56,169] Trial 9 finished with value: 0.7233333333333334 and parameters: {'C': 7.520109020925042, 'solver': 'lbfgs'}. Best is trial 1 with value: 0.7266666666666667.
[I 2025-01-02 07:31:56,201] Trial 10 finished with value: 0.7166666666666667 and par

Optimizing k-NN...


[I 2025-01-02 07:31:57,136] Trial 4 finished with value: 0.7733333333333333 and parameters: {'n_neighbors': 6}. Best is trial 1 with value: 0.8.
[I 2025-01-02 07:31:57,176] Trial 5 finished with value: 0.78 and parameters: {'n_neighbors': 11}. Best is trial 1 with value: 0.8.
[I 2025-01-02 07:31:57,217] Trial 6 finished with value: 0.7733333333333333 and parameters: {'n_neighbors': 15}. Best is trial 1 with value: 0.8.
[I 2025-01-02 07:31:57,256] Trial 7 finished with value: 0.7666666666666667 and parameters: {'n_neighbors': 12}. Best is trial 1 with value: 0.8.
[I 2025-01-02 07:31:57,299] Trial 8 finished with value: 0.7833333333333333 and parameters: {'n_neighbors': 17}. Best is trial 1 with value: 0.8.
[I 2025-01-02 07:31:57,343] Trial 9 finished with value: 0.7833333333333333 and parameters: {'n_neighbors': 17}. Best is trial 1 with value: 0.8.
[I 2025-01-02 07:31:57,393] Trial 10 finished with value: 0.8 and parameters: {'n_neighbors': 3}. Best is trial 1 with value: 0.8.
[I 2025-

Optimizing Naive Bayes...


[I 2025-01-02 07:31:58,638] Trial 10 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,668] Trial 11 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,698] Trial 12 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,722] Trial 13 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,747] Trial 14 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,771] Trial 15 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,795] Trial 16 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,814] Trial 17 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:31:58,836] Trial 18 finished with value: 0.7 and parameters: {}

Optimizing Gradient Boosting...


[I 2025-01-02 07:32:00,770] Trial 0 finished with value: 0.85 and parameters: {'n_estimators': 293, 'learning_rate': 0.4336423083252079, 'max_depth': 4}. Best is trial 0 with value: 0.85.
[I 2025-01-02 07:32:01,554] Trial 1 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 144, 'learning_rate': 0.373284540999473, 'max_depth': 4}. Best is trial 1 with value: 0.8566666666666667.
[I 2025-01-02 07:32:06,118] Trial 2 finished with value: 0.84 and parameters: {'n_estimators': 377, 'learning_rate': 0.45501962521926753, 'max_depth': 15}. Best is trial 1 with value: 0.8566666666666667.
[I 2025-01-02 07:32:06,871] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 183, 'learning_rate': 0.45174135326201625, 'max_depth': 3}. Best is trial 1 with value: 0.8566666666666667.
[I 2025-01-02 07:32:07,139] Trial 4 finished with value: 0.85 and parameters: {'n_estimators': 61, 'learning_rate': 0.2823394396070678, 'max_depth': 3}. Best is trial 1 with va

Optimizing XGBoost...


[I 2025-01-02 07:34:14,968] Trial 0 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 403, 'max_depth': 13, 'learning_rate': 0.4034211623435752}. Best is trial 0 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:34:15,311] Trial 1 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 177, 'max_depth': 11, 'learning_rate': 0.0725308069342348}. Best is trial 0 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:34:15,657] Trial 2 finished with value: 0.85 and parameters: {'n_estimators': 330, 'max_depth': 15, 'learning_rate': 0.24105844692455752}. Best is trial 0 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:34:15,966] Trial 3 finished with value: 0.85 and parameters: {'n_estimators': 451, 'max_depth': 14, 'learning_rate': 0.48284300804838143}. Best is trial 0 with value: 0.8633333333333333.
Param

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:26,749] Trial 1 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 415, 'max_depth': 5, 'learning_rate': 0.20131285685897188}. Best is trial 0 with value: 0.8533333333333334.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:26,994] Trial 2 finished with value: 0.86 and parameters: {'n_estimators': 278, 'max_depth': 11, 'learning_rate': 0.44248651015913637}. Best is trial 2 with value: 0.86.
[I 2025-01-02 07:34:27,185] Trial 3 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 209, 'max_depth': 12, 'learning_rate': 0.29538169071802284}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000159 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:27,589] Trial 4 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 419, 'max_depth': 11, 'learning_rate': 0.13214662688836362}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:27,868] Trial 5 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 319, 'max_depth': 13, 'learning_rate': 0.4735280773076885}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:28,203] Trial 6 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 410, 'max_depth': 4, 'learning_rate': 0.03193244642602404}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:28,416] Trial 7 finished with value: 0.85 and parameters: {'n_estimators': 221, 'max_depth': 7, 'learning_rate': 0.23017599374187775}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:28,664] Trial 8 finished with value: 0.84 and parameters: {'n_estimators': 267, 'max_depth': 5, 'learning_rate': 0.44584150182084137}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:28,782] Trial 9 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 104, 'max_depth': 20, 'learning_rate': 0.40102196713419697}. Best is trial 3 with value: 0.8666666666666667.
[I 2025-01-02 07:34:28,954] Trial 10 finished with value: 0.85 and parameters: {'n_estimators': 168, 'max_depth': 18, 'learning_rate': 0.34309296258023114}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:29,260] Trial 11 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 325, 'max_depth': 15, 'learning_rate': 0.3163100230259088}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:29,560] Trial 12 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 349, 'max_depth': 14, 'learning_rate': 0.33116466939972056}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:29,992] Trial 13 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 494, 'max_depth': 8, 'learning_rate': 0.4885550182700021}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000167 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:30,190] Trial 14 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 179, 'max_depth': 9, 'learning_rate': 0.13477783035980062}. Best is trial 3 with value: 0.8666666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:30,420] Trial 15 finished with value: 0.87 and parameters: {'n_estimators': 229, 'max_depth': 16, 'learning_rate': 0.29188118833880794}. Best is trial 15 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:30,604] Trial 16 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 172, 'max_depth': 17, 'learning_rate': 0.2873413149643324}. Best is trial 15 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:30,849] Trial 17 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 242, 'max_depth': 17, 'learning_rate': 0.18237507874439585}. Best is trial 15 with value: 0.87.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:30,979] Trial 18 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 120, 'max_depth': 20, 'learning_rate': 0.3698470331522368}. Best is trial 15 with value: 0.87.
[I 2025-01-02 07:34:31,195] Trial 19 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 203, 'max_depth': 12, 'learning_rate': 0.26187993274226906}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:31,334] Trial 20 finished with value: 0.86 and parameters: {'n_estimators': 122, 'max_depth': 16, 'learning_rate': 0.252027237821165}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:31,542] Trial 21 finished with value: 0.86 and parameters: {'n_estimators': 197, 'max_depth': 12, 'learning_rate': 0.2773940454673776}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:31,797] Trial 22 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 233, 'max_depth': 10, 'learning_rate': 0.3797399724595374}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:32,081] Trial 23 finished with value: 0.87 and parameters: {'n_estimators': 264, 'max_depth': 13, 'learning_rate': 0.2935089222083308}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000166 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:32,359] Trial 24 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 288, 'max_depth': 15, 'learning_rate': 0.21962435372694353}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:32,636] Trial 25 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 294, 'max_depth': 18, 'learning_rate': 0.1484762718990715}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000148 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:34:33,003] Trial 26 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 361, 'max_depth': 15, 'learning_rate': 0.06454399270676986}. Best is trial 19 with value: 0.8766666666666667.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:33,155] Trial 27 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 135, 'max_depth': 16, 'learning_rate': 0.2279270626094166}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000144 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:33,440] Trial 28 finished with value: 0.86 and parameters: {'n_estimators': 301, 'max_depth': 14, 'learning_rate': 0.17944822070349928}. Best is trial 19 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000069 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:34:33,801] Trial 29 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 379, 'max_depth': 19, 'learning_rate': 0.09942293426423876}. Best is trial 19 with value: 0.8766666666666667.
[I 2025-01-02 07:34:33,814] A new study created in memory with name: no-name-70780312-9c04-4a8e-97d9-7fa0bb9d0cc1


Optimizing AdaBoost...


[I 2025-01-02 07:34:35,311] Trial 0 finished with value: 0.7666666666666667 and parameters: {'n_estimators': 429, 'learning_rate': 0.46191519988910595}. Best is trial 0 with value: 0.7666666666666667.
[I 2025-01-02 07:34:36,493] Trial 1 finished with value: 0.7233333333333334 and parameters: {'n_estimators': 296, 'learning_rate': 0.04772842647718921}. Best is trial 0 with value: 0.7666666666666667.
[I 2025-01-02 07:34:38,045] Trial 2 finished with value: 0.77 and parameters: {'n_estimators': 312, 'learning_rate': 0.49907650804978076}. Best is trial 2 with value: 0.77.
[I 2025-01-02 07:34:40,301] Trial 3 finished with value: 0.7866666666666666 and parameters: {'n_estimators': 453, 'learning_rate': 0.5514842418513126}. Best is trial 3 with value: 0.7866666666666666.
[I 2025-01-02 07:34:40,994] Trial 4 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 141, 'learning_rate': 0.12078493860433875}. Best is trial 3 with value: 0.7866666666666666.
[I 2025-01-02 07:34:42,1

Optimizing Neural Network...


[I 2025-01-02 07:35:15,283] Trial 0 finished with value: 0.7966666666666666 and parameters: {'hidden_layer_1': 67, 'hidden_layer_2': 27, 'learning_rate_init': 0.05223221867799265}. Best is trial 0 with value: 0.7966666666666666.
[I 2025-01-02 07:35:15,876] Trial 1 finished with value: 0.7766666666666666 and parameters: {'hidden_layer_1': 29, 'hidden_layer_2': 74, 'learning_rate_init': 0.07175575791251039}. Best is trial 0 with value: 0.7966666666666666.
[I 2025-01-02 07:35:16,556] Trial 2 finished with value: 0.8 and parameters: {'hidden_layer_1': 50, 'hidden_layer_2': 98, 'learning_rate_init': 0.07994248708588166}. Best is trial 2 with value: 0.8.
[I 2025-01-02 07:35:17,292] Trial 3 finished with value: 0.8166666666666667 and parameters: {'hidden_layer_1': 66, 'hidden_layer_2': 39, 'learning_rate_init': 0.035531336479265756}. Best is trial 3 with value: 0.8166666666666667.
[I 2025-01-02 07:35:17,879] Trial 4 finished with value: 0.7866666666666666 and parameters: {'hidden_layer_1': 39

Optimizing MLP...


[I 2025-01-02 07:35:51,017] Trial 0 finished with value: 0.75 and parameters: {'layer_1': 78, 'layer_2': 105, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.015617408290169613}. Best is trial 0 with value: 0.75.
[I 2025-01-02 07:35:53,891] Trial 1 finished with value: 0.7566666666666667 and parameters: {'layer_1': 140, 'layer_2': 110, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.06581586643992404}. Best is trial 1 with value: 0.7566666666666667.
[I 2025-01-02 07:35:56,295] Trial 2 finished with value: 0.7366666666666667 and parameters: {'layer_1': 112, 'layer_2': 74, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.034443701961199856}. Best is trial 1 with value: 0.7566666666666667.
[I 2025-01-02 07:35:57,831] Trial 3 finished with value: 0.77 and parameters: {'layer_1': 55, 'layer_2': 59, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.07175137017691838}. Best is trial 3 with value: 0.77.
[I 2025-01-02 07:36:00,561] Trial 

                  Model  Accuracy  Sensitivity  Specificity       MCC  \
0                   SVM  0.770000     0.686667     0.853333  0.547660   
1         Decision Tree  0.826667     0.873333     0.780000  0.656198   
2         Random Forest  0.873333     0.880000     0.866667  0.746733   
3   Logistic Regression  0.726667     0.733333     0.720000  0.453374   
4                  k-NN  0.800000     0.820000     0.780000  0.600481   
5           Naive Bayes  0.700000     0.560000     0.840000  0.416667   
6     Gradient Boosting  0.880000     0.906667     0.853333  0.761083   
7               XGBoost  0.870000     0.900000     0.840000  0.741336   
8              LightGBM  0.876667     0.913333     0.840000  0.755367   
9              AdaBoost  0.796667     0.813333     0.780000  0.593663   
10       Neural Network  0.850000     0.840000     0.860000  0.700140   
11                  MLP  0.836667     0.786667     0.886667  0.676725   

       Kappa       AUC                            

In [None]:
import pandas as pd
import optuna
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Define models
models = {
    "SVM": lambda trial: SVC(probability=True, C=trial.suggest_float("C", 0.1, 10.0), kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])),
    "Decision Tree": lambda trial: DecisionTreeClassifier(max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Random Forest": lambda trial: RandomForestClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), min_samples_split=trial.suggest_int("min_samples_split", 2, 10)),
    "Logistic Regression": lambda trial: LogisticRegression(C=trial.suggest_float("C", 0.1, 10.0), solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])),
    "k-NN": lambda trial: KNeighborsClassifier(n_neighbors=trial.suggest_int("n_neighbors", 3, 20)),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), max_depth=trial.suggest_int("max_depth", 3, 20)),
    "XGBoost": lambda trial: XGBClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5), use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": lambda trial: LGBMClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), max_depth=trial.suggest_int("max_depth", 3, 20), learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)),
    "AdaBoost": lambda trial: AdaBoostClassifier(n_estimators=trial.suggest_int("n_estimators", 50, 500), learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)),
    "Neural Network": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("hidden_layer_1", 10, 100), trial.suggest_int("hidden_layer_2", 10, 100)), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200),
    "MLP": lambda trial: MLPClassifier(hidden_layer_sizes=(trial.suggest_int("layer_1", 50, 150), trial.suggest_int("layer_2", 50, 150)), activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]), solver=trial.suggest_categorical("solver", ["adam", "sgd"]), learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1), max_iter=200, random_state=42)
}

# Prepare a dictionary to store model probabilities horizontally
probabilities = {"Target": y_val}  # Starting with the target column (y_val)

# Run optimization and compute probabilities for each model
for model_name, model_func in models.items():
    print(f"Optimizing {model_name}...")
    study = optuna.create_study(direction="maximize")

    # Objective function for Optuna
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        return accuracy_score(y_val, model.predict(X_val))

    study.optimize(objective, n_trials=30)

    # Train the best model using the best hyperparameters
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)

    # Get predicted probabilities for the positive class (class 1)
    probs = best_model.predict_proba(X_val)[:, 1]

    # Add to the probabilities dictionary
    probabilities[model_name] = probs

# Convert the probabilities dictionary to a DataFrame
probability_df = pd.DataFrame(probabilities)

# Save the probability dataset to a CSV file
probability_df.to_csv("N_PCP_OPTUNA_probability_predictions.csv", index=False)

print("Dataset saved successfully!")


[I 2025-01-02 07:37:03,934] A new study created in memory with name: no-name-441fbc3d-1de6-4e99-b75b-08d19aa52215


Optimizing SVM...


[I 2025-01-02 07:37:04,115] Trial 0 finished with value: 0.73 and parameters: {'C': 0.6568125188328466, 'kernel': 'linear'}. Best is trial 0 with value: 0.73.
[I 2025-01-02 07:37:04,392] Trial 1 finished with value: 0.51 and parameters: {'C': 7.23452806752154, 'kernel': 'sigmoid'}. Best is trial 0 with value: 0.73.
[I 2025-01-02 07:37:04,623] Trial 2 finished with value: 0.7666666666666667 and parameters: {'C': 9.468839233767257, 'kernel': 'poly'}. Best is trial 2 with value: 0.7666666666666667.
[I 2025-01-02 07:37:04,921] Trial 3 finished with value: 0.51 and parameters: {'C': 6.161244313179552, 'kernel': 'sigmoid'}. Best is trial 2 with value: 0.7666666666666667.
[I 2025-01-02 07:37:05,144] Trial 4 finished with value: 0.7466666666666667 and parameters: {'C': 8.390617743142846, 'kernel': 'rbf'}. Best is trial 2 with value: 0.7666666666666667.
[I 2025-01-02 07:37:05,431] Trial 5 finished with value: 0.5133333333333333 and parameters: {'C': 5.193940363987987, 'kernel': 'sigmoid'}. Best

Optimizing Decision Tree...


[I 2025-01-02 07:37:11,413] Trial 10 finished with value: 0.8266666666666667 and parameters: {'max_depth': 20, 'min_samples_split': 5}. Best is trial 6 with value: 0.8333333333333334.
[I 2025-01-02 07:37:11,448] Trial 11 finished with value: 0.8133333333333334 and parameters: {'max_depth': 11, 'min_samples_split': 6}. Best is trial 6 with value: 0.8333333333333334.
[I 2025-01-02 07:37:11,485] Trial 12 finished with value: 0.82 and parameters: {'max_depth': 13, 'min_samples_split': 7}. Best is trial 6 with value: 0.8333333333333334.
[I 2025-01-02 07:37:11,521] Trial 13 finished with value: 0.83 and parameters: {'max_depth': 12, 'min_samples_split': 6}. Best is trial 6 with value: 0.8333333333333334.
[I 2025-01-02 07:37:11,555] Trial 14 finished with value: 0.82 and parameters: {'max_depth': 10, 'min_samples_split': 5}. Best is trial 6 with value: 0.8333333333333334.
[I 2025-01-02 07:37:11,593] Trial 15 finished with value: 0.8166666666666667 and parameters: {'max_depth': 8, 'min_samples

Optimizing Random Forest...


[I 2025-01-02 07:37:12,978] Trial 0 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 155, 'max_depth': 20, 'min_samples_split': 3}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-01-02 07:37:14,205] Trial 1 finished with value: 0.86 and parameters: {'n_estimators': 240, 'max_depth': 17, 'min_samples_split': 6}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-01-02 07:37:15,167] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 181, 'max_depth': 12, 'min_samples_split': 6}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-01-02 07:37:16,187] Trial 3 finished with value: 0.86 and parameters: {'n_estimators': 257, 'max_depth': 18, 'min_samples_split': 4}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-01-02 07:37:17,213] Trial 4 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 291, 'max_depth': 14, 'min_samples_split': 9}. Best is trial 0 with value: 0.8633333333333333.
[I 2025-

Optimizing Logistic Regression...


[I 2025-01-02 07:37:35,699] Trial 12 finished with value: 0.7266666666666667 and parameters: {'C': 9.887205917887366, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7266666666666667.
[I 2025-01-02 07:37:35,721] Trial 13 finished with value: 0.7233333333333334 and parameters: {'C': 8.192642716422473, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7266666666666667.
[I 2025-01-02 07:37:35,745] Trial 14 finished with value: 0.72 and parameters: {'C': 6.308388904713599, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7266666666666667.
[I 2025-01-02 07:37:35,769] Trial 15 finished with value: 0.7266666666666667 and parameters: {'C': 9.872394528475024, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7266666666666667.
[I 2025-01-02 07:37:35,788] Trial 16 finished with value: 0.6766666666666666 and parameters: {'C': 0.11979703116783114, 'solver': 'lbfgs'}. Best is trial 3 with value: 0.7266666666666667.
[I 2025-01-02 07:37:35,810] Trial 17 finished with value: 0.7133333333333334 and p

Optimizing k-NN...


[I 2025-01-02 07:37:36,350] Trial 7 finished with value: 0.77 and parameters: {'n_neighbors': 16}. Best is trial 5 with value: 0.7766666666666666.
[I 2025-01-02 07:37:36,380] Trial 8 finished with value: 0.7733333333333333 and parameters: {'n_neighbors': 15}. Best is trial 5 with value: 0.7766666666666666.
[I 2025-01-02 07:37:36,404] Trial 9 finished with value: 0.7833333333333333 and parameters: {'n_neighbors': 4}. Best is trial 9 with value: 0.7833333333333333.
[I 2025-01-02 07:37:36,434] Trial 10 finished with value: 0.8 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.8.
[I 2025-01-02 07:37:36,464] Trial 11 finished with value: 0.8 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.8.
[I 2025-01-02 07:37:36,495] Trial 12 finished with value: 0.78 and parameters: {'n_neighbors': 9}. Best is trial 10 with value: 0.8.
[I 2025-01-02 07:37:36,525] Trial 13 finished with value: 0.7833333333333333 and parameters: {'n_neighbors': 4}. Best is trial 10 with 

Optimizing Naive Bayes...


[I 2025-01-02 07:37:37,292] Trial 26 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:37:37,301] Trial 27 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:37:37,309] Trial 28 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:37:37,316] Trial 29 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-02 07:37:37,322] A new study created in memory with name: no-name-65c99ce7-2b41-4e5e-b114-6ff27f3cbebd


Optimizing Gradient Boosting...


[I 2025-01-02 07:37:44,396] Trial 0 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 462, 'learning_rate': 0.1435248880081998, 'max_depth': 13}. Best is trial 0 with value: 0.8366666666666667.
[I 2025-01-02 07:37:45,948] Trial 1 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 118, 'learning_rate': 0.30406917679730167, 'max_depth': 17}. Best is trial 0 with value: 0.8366666666666667.
[I 2025-01-02 07:37:49,483] Trial 2 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 420, 'learning_rate': 0.4185646792991666, 'max_depth': 7}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-02 07:37:51,958] Trial 3 finished with value: 0.8333333333333334 and parameters: {'n_estimators': 171, 'learning_rate': 0.15575710770490195, 'max_depth': 16}. Best is trial 2 with value: 0.8533333333333334.
[I 2025-01-02 07:37:53,323] Trial 4 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 114, 'learning_rate': 

Optimizing XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:39:08,078] Trial 0 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 433, 'max_depth': 3, 'learning_rate': 0.2924467453916459}. Best is trial 0 with value: 0.8533333333333334.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:39:10,726] Trial 1 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 470, 'max_depth': 12, 'learning_rate': 0.07135191439276162}. Best is trial 1 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:39:11,350] Trial 2 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 427, 'max_depth': 3, 'learning_rate': 0.15409406086228014}. Best is trial 1 with value: 0.8633333333333333.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-02 07:39:11,750] Trial 3 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 290, 'max_depth': 3, 'learning_rate'

Optimizing LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000155 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:25,004] Trial 1 finished with value: 0.85 and parameters: {'n_estimators': 409, 'max_depth': 5, 'learning_rate': 0.48210000985612833}. Best is trial 0 with value: 0.8633333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:25,587] Trial 2 finished with value: 0.85 and parameters: {'n_estimators': 393, 'max_depth': 13, 'learning_rate': 0.17156454395447965}. Best is trial 0 with value: 0.8633333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:26,172] Trial 3 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 470, 'max_depth': 16, 'learning_rate': 0.2091358739255628}. Best is trial 3 with value: 0.8766666666666667.
[I 2025-01-02 07:39:26,275] Trial 4 finished with value: 0.8233333333333334 and parameters: {'n_estimators': 74, 'max_depth': 19, 'learning_rate': 0.01889761978005868}. Best is trial 3 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:26,400] Trial 5 finished with value: 0.8833333333333333 and parameters: {'n_estimators': 85, 'max_depth': 8, 'learning_rate': 0.08974994653973369}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000149 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:26,672] Trial 6 finished with value: 0.88 and parameters: {'n_estimators': 189, 'max_depth': 17, 'learning_rate': 0.28167723413049167}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000

[I 2025-01-02 07:39:27,013] Trial 7 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 247, 'max_depth': 7, 'learning_rate': 0.22631425900232874}. Best is trial 5 with value: 0.8833333333333333.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:27,356] Trial 8 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 433, 'max_depth': 16, 'learning_rate': 0.46255273980015044}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:27,547] Trial 9 finished with value: 0.87 and parameters: {'n_estimators': 217, 'max_depth': 16, 'learning_rate': 0.08688410266096366}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:27,871] Trial 10 finished with value: 0.85 and parameters: {'n_estimators': 325, 'max_depth': 9, 'learning_rate': 0.11636132462223014}. Best is trial 5 with value: 0.8833333333333333.
[I 2025-01-02 07:39:28,055] Trial 11 finished with value: 0.86 and parameters: {'n_estimators': 175, 'max_depth': 11, 'learning_rate': 0.33606865845892253}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:28,129] Trial 12 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 53, 'max_depth': 3, 'learning_rate': 0.3106559524738872}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:28,305] Trial 13 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 152, 'max_depth': 12, 'learning_rate': 0.39381685888438017}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000182 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:28,599] Trial 14 finished with value: 0.85 and parameters: {'n_estimators': 299, 'max_depth': 8, 'learning_rate': 0.14995229116543063}. Best is trial 5 with value: 0.8833333333333333.
[I 2025-01-02 07:39:28,709] Trial 15 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 96, 'max_depth': 14, 'learning_rate': 0.013603872813955364}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000157 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000152 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:28,916] Trial 16 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 208, 'max_depth': 10, 'learning_rate': 0.26149337651376003}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:29,062] Trial 17 finished with value: 0.86 and parameters: {'n_estimators': 126, 'max_depth': 6, 'learning_rate': 0.26990722918792937}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:29,259] Trial 18 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 176, 'max_depth': 14, 'learning_rate': 0.4036834880189485}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000164 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:29,577] Trial 19 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 333, 'max_depth': 18, 'learning_rate': 0.07768665147963373}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:29,824] Trial 20 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 260, 'max_depth': 4, 'learning_rate': 0.18453945737062072}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:30,154] Trial 21 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 360, 'max_depth': 17, 'learning_rate': 0.21871013654929053}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:30,294] Trial 22 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 110, 'max_depth': 15, 'learning_rate': 0.1389302093360883}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:30,735] Trial 23 finished with value: 0.85 and parameters: {'n_estimators': 466, 'max_depth': 18, 'learning_rate': 0.20704017268547178}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000176 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:31,179] Trial 24 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 484, 'max_depth': 20, 'learning_rate': 0.06530581269838509}. Best is trial 5 with value: 0.8833333333333333.
[I 2025-01-02 07:39:31,406] Trial 25 finished with value: 0.86 and parameters: {'n_estimators': 214, 'max_depth': 11, 'learning_rate': 0.2753178397214857}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000197 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000158 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:31,698] Trial 26 finished with value: 0.87 and parameters: {'n_estimators': 288, 'max_depth': 9, 'learning_rate': 0.3819965765514771}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:31,852] Trial 27 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 157, 'max_depth': 13, 'learning_rate': 0.24116123041840704}. Best is trial 5 with value: 0.8833333333333333.
[I 2025-01-02 07:39:31,948] Trial 28 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 78, 'max_depth': 16, 'learning_rate': 0.3510607054822573}. Best is trial 5 with value: 0.8833333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-02 07:39:32,099] Trial 29 finished with value: 0.86 and parameters: {'n_estimators': 138, 'max_depth': 20, 'learning_rate': 0.30301740007550076}. Best is trial 5 with value: 0.8833333333333333.
[I 2025-01-02 07:39:32,182] A new study created in memory with name: no-name-69aee203-c12c-462e-bce6-4b575ec66a55


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000165 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Optimizing AdaBoost...


[I 2025-01-02 07:39:32,497] Trial 0 finished with value: 0.77 and parameters: {'n_estimators': 85, 'learning_rate': 0.6171811578135125}. Best is trial 0 with value: 0.77.
[I 2025-01-02 07:39:33,857] Trial 1 finished with value: 0.78 and parameters: {'n_estimators': 404, 'learning_rate': 0.7521310279264452}. Best is trial 1 with value: 0.78.
[I 2025-01-02 07:39:34,557] Trial 2 finished with value: 0.76 and parameters: {'n_estimators': 204, 'learning_rate': 0.14681037627080132}. Best is trial 1 with value: 0.78.
[I 2025-01-02 07:39:35,075] Trial 3 finished with value: 0.7766666666666666 and parameters: {'n_estimators': 153, 'learning_rate': 0.7558315731976423}. Best is trial 1 with value: 0.78.
[I 2025-01-02 07:39:35,852] Trial 4 finished with value: 0.7666666666666667 and parameters: {'n_estimators': 224, 'learning_rate': 0.8999858347037253}. Best is trial 1 with value: 0.78.
[I 2025-01-02 07:39:36,291] Trial 5 finished with value: 0.7666666666666667 and parameters: {'n_estimators': 128

Optimizing Neural Network...


[I 2025-01-02 07:40:13,725] Trial 0 finished with value: 0.7633333333333333 and parameters: {'hidden_layer_1': 77, 'hidden_layer_2': 47, 'learning_rate_init': 0.03928855792603735}. Best is trial 0 with value: 0.7633333333333333.
[I 2025-01-02 07:40:14,579] Trial 1 finished with value: 0.8066666666666666 and parameters: {'hidden_layer_1': 24, 'hidden_layer_2': 50, 'learning_rate_init': 0.030262917991924234}. Best is trial 1 with value: 0.8066666666666666.
[I 2025-01-02 07:40:15,097] Trial 2 finished with value: 0.77 and parameters: {'hidden_layer_1': 70, 'hidden_layer_2': 95, 'learning_rate_init': 0.03655673204545458}. Best is trial 1 with value: 0.8066666666666666.
[I 2025-01-02 07:40:15,829] Trial 3 finished with value: 0.7333333333333333 and parameters: {'hidden_layer_1': 87, 'hidden_layer_2': 78, 'learning_rate_init': 0.08188963381092447}. Best is trial 1 with value: 0.8066666666666666.
[I 2025-01-02 07:40:16,449] Trial 4 finished with value: 0.7866666666666666 and parameters: {'hid

Optimizing MLP...


[I 2025-01-02 07:40:48,329] Trial 0 finished with value: 0.75 and parameters: {'layer_1': 105, 'layer_2': 94, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.03995423528060628}. Best is trial 0 with value: 0.75.
[I 2025-01-02 07:40:49,171] Trial 1 finished with value: 0.7533333333333333 and parameters: {'layer_1': 63, 'layer_2': 87, 'activation': 'relu', 'solver': 'sgd', 'learning_rate_init': 0.03979638596283026}. Best is trial 1 with value: 0.7533333333333333.
[I 2025-01-02 07:40:50,999] Trial 2 finished with value: 0.7333333333333333 and parameters: {'layer_1': 74, 'layer_2': 135, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.03473948821252878}. Best is trial 1 with value: 0.7533333333333333.
[I 2025-01-02 07:40:52,971] Trial 3 finished with value: 0.7266666666666667 and parameters: {'layer_1': 83, 'layer_2': 105, 'activation': 'logistic', 'solver': 'sgd', 'learning_rate_init': 0.054156386501434645}. Best is trial 1 with value: 0.7533333333333333.
[I 2

Dataset saved successfully!


Class Feature Vector (CFV)

In [2]:
import optuna
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/positive_main_pcp.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/negative_main_pcp.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/positive_validation_pcp.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/negative_validation_pcp.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)])
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)])

# Split features and labels
X_train = main_data.drop("label", axis=1)
y_train = main_data["label"]
X_val = validation_data.drop("label", axis=1)
y_val = validation_data["label"]

# Define models with hyperparameter optimization (Optuna)
models = {
    "SVM": lambda trial: SVC(
         probability=True,
        C=trial.suggest_float("C", 0.1, 10.0),
        kernel=trial.suggest_categorical("kernel", ["linear", "poly", "rbf", "sigmoid"])
    ),
    "Decision Tree": lambda trial: DecisionTreeClassifier(
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Random Forest": lambda trial: RandomForestClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        min_samples_split=trial.suggest_int("min_samples_split", 2, 10)
    ),
    "Logistic Regression": lambda trial: LogisticRegression(
        C=trial.suggest_float("C", 0.1, 10.0),
        solver=trial.suggest_categorical("solver", ["lbfgs", "liblinear"])
    ),
    "k-NN": lambda trial: KNeighborsClassifier(
        n_neighbors=trial.suggest_int("n_neighbors", 3, 20)
    ),
    "Naive Bayes": lambda trial: GaussianNB(),
    "Gradient Boosting": lambda trial: GradientBoostingClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        max_depth=trial.suggest_int("max_depth", 3, 20)
    ),
    "XGBoost": lambda trial: XGBClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5),
        use_label_encoder=False,
        eval_metric="logloss"
    ),
    "LightGBM": lambda trial: LGBMClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        max_depth=trial.suggest_int("max_depth", 3, 20),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 0.5)
    ),
    "AdaBoost": lambda trial: AdaBoostClassifier(
        n_estimators=trial.suggest_int("n_estimators", 50, 500),
        learning_rate=trial.suggest_float("learning_rate", 0.01, 1.0)
    ),
    "Neural Network": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("hidden_layer_1", 10, 100),
            trial.suggest_int("hidden_layer_2", 10, 100)
        ),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200
    ),
    "MLP": lambda trial: MLPClassifier(
        hidden_layer_sizes=(
            trial.suggest_int("layer_1", 50, 150),
            trial.suggest_int("layer_2", 50, 150)
        ),
        activation=trial.suggest_categorical("activation", ["logistic", "tanh", "relu"]),
        solver=trial.suggest_categorical("solver", ["adam", "sgd"]),
        learning_rate_init=trial.suggest_float("learning_rate_init", 0.001, 0.1),
        max_iter=200,
        random_state=42
    )
}

# Initialize a list to store the CFV data
cfv_data = []

# Define the optimization and prediction function
def optimize_and_predict(model_name, model_func):
    def objective(trial):
        model = model_func(trial)
        model.fit(X_train, y_train)
        preds = model.predict_proba(X_val)[:, 1]  # Get probability for class 1 (positive)
        return accuracy_score(y_val, model.predict(X_val))

    # Perform optimization with Optuna
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=30)

    # Return the best model
    best_model = model_func(study.best_trial)
    best_model.fit(X_train, y_train)
    preds = best_model.predict_proba(X_val)[:, 1]  # Using the probability for class 1 (positive)

    # Append predictions to CFV list
    return preds

# Train each model and generate predictions for CFV
for model_name, model_func in models.items():
    print(f"Training and predicting with {model_name}...")
    preds = optimize_and_predict(model_name, model_func)
    cfv_data.append(preds)

# Convert the CFV data into a DataFrame
cfv_df = pd.DataFrame(np.array(cfv_data).T, columns=models.keys())

# Optionally, add the true labels column
cfv_df["True_Label"] = y_val.values

# Save the CFV dataset to CSV
cfv_df.to_csv("CFV_PCP.csv", index=False)
print("CFV dataset created and saved!")


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

[I 2025-01-16 03:36:51,195] A new study created in memory with name: no-name-af6d98d5-f943-40c6-a5c4-7f52055b1b68
[I 2025-01-16 03:36:51,385] Trial 0 finished with value: 0.73 and parameters: {'C': 0.7035346929081906, 'kernel': 'linear'}. Best is trial 0 with value: 0.73.


Training and predicting with SVM...


[I 2025-01-16 03:36:51,621] Trial 1 finished with value: 0.74 and parameters: {'C': 1.2984578186676417, 'kernel': 'rbf'}. Best is trial 1 with value: 0.74.
[I 2025-01-16 03:36:51,852] Trial 2 finished with value: 0.74 and parameters: {'C': 7.5688258439041185, 'kernel': 'rbf'}. Best is trial 1 with value: 0.74.
[I 2025-01-16 03:36:52,062] Trial 3 finished with value: 0.74 and parameters: {'C': 3.7170030726867753, 'kernel': 'linear'}. Best is trial 1 with value: 0.74.
[I 2025-01-16 03:36:52,237] Trial 4 finished with value: 0.74 and parameters: {'C': 1.1058831515744973, 'kernel': 'poly'}. Best is trial 1 with value: 0.74.
[I 2025-01-16 03:36:52,554] Trial 5 finished with value: 0.51 and parameters: {'C': 7.22130766395129, 'kernel': 'sigmoid'}. Best is trial 1 with value: 0.74.
[I 2025-01-16 03:36:52,773] Trial 6 finished with value: 0.7466666666666667 and parameters: {'C': 2.308562017822125, 'kernel': 'rbf'}. Best is trial 6 with value: 0.7466666666666667.
[I 2025-01-16 03:36:52,990] Tri

Training and predicting with Decision Tree...


[I 2025-01-16 03:36:58,731] Trial 11 finished with value: 0.82 and parameters: {'max_depth': 11, 'min_samples_split': 4}. Best is trial 2 with value: 0.8366666666666667.
[I 2025-01-16 03:36:58,757] Trial 12 finished with value: 0.82 and parameters: {'max_depth': 15, 'min_samples_split': 4}. Best is trial 2 with value: 0.8366666666666667.
[I 2025-01-16 03:36:58,784] Trial 13 finished with value: 0.8233333333333334 and parameters: {'max_depth': 9, 'min_samples_split': 5}. Best is trial 2 with value: 0.8366666666666667.
[I 2025-01-16 03:36:58,815] Trial 14 finished with value: 0.8266666666666667 and parameters: {'max_depth': 13, 'min_samples_split': 2}. Best is trial 2 with value: 0.8366666666666667.
[I 2025-01-16 03:36:58,837] Trial 15 finished with value: 0.7333333333333333 and parameters: {'max_depth': 3, 'min_samples_split': 10}. Best is trial 2 with value: 0.8366666666666667.
[I 2025-01-16 03:36:58,864] Trial 16 finished with value: 0.8233333333333334 and parameters: {'max_depth': 9,

Training and predicting with Random Forest...


[I 2025-01-16 03:36:59,907] Trial 1 finished with value: 0.86 and parameters: {'n_estimators': 130, 'max_depth': 11, 'min_samples_split': 6}. Best is trial 1 with value: 0.86.
[I 2025-01-16 03:37:00,930] Trial 2 finished with value: 0.86 and parameters: {'n_estimators': 279, 'max_depth': 18, 'min_samples_split': 7}. Best is trial 1 with value: 0.86.
[I 2025-01-16 03:37:01,861] Trial 3 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 210, 'max_depth': 15, 'min_samples_split': 8}. Best is trial 3 with value: 0.8666666666666667.
[I 2025-01-16 03:37:03,646] Trial 4 finished with value: 0.85 and parameters: {'n_estimators': 223, 'max_depth': 10, 'min_samples_split': 7}. Best is trial 3 with value: 0.8666666666666667.
[I 2025-01-16 03:37:04,999] Trial 5 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 135, 'max_depth': 19, 'min_samples_split': 10}. Best is trial 3 with value: 0.8666666666666667.
[I 2025-01-16 03:37:06,904] Trial 6 finished with

Training and predicting with Logistic Regression...


[I 2025-01-16 03:37:28,950] Trial 10 finished with value: 0.7266666666666667 and parameters: {'C': 9.967306831233032, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.7266666666666667.
[I 2025-01-16 03:37:28,973] Trial 11 finished with value: 0.7233333333333334 and parameters: {'C': 8.241632208461818, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.7266666666666667.
[I 2025-01-16 03:37:28,996] Trial 12 finished with value: 0.7233333333333334 and parameters: {'C': 7.854397517553263, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.7266666666666667.
[I 2025-01-16 03:37:29,018] Trial 13 finished with value: 0.7133333333333334 and parameters: {'C': 3.460507385059107, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.7266666666666667.
[I 2025-01-16 03:37:29,042] Trial 14 finished with value: 0.7266666666666667 and parameters: {'C': 9.914172272190289, 'solver': 'lbfgs'}. Best is trial 6 with value: 0.7266666666666667.
[I 2025-01-16 03:37:29,067] Trial 15 finished with value: 0.7233333333

Training and predicting with k-NN...


[I 2025-01-16 03:37:29,633] Trial 6 finished with value: 0.7666666666666667 and parameters: {'n_neighbors': 12}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 03:37:29,660] Trial 7 finished with value: 0.7733333333333333 and parameters: {'n_neighbors': 15}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 03:37:29,692] Trial 8 finished with value: 0.7866666666666666 and parameters: {'n_neighbors': 8}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 03:37:29,718] Trial 9 finished with value: 0.78 and parameters: {'n_neighbors': 11}. Best is trial 1 with value: 0.7866666666666666.
[I 2025-01-16 03:37:29,754] Trial 10 finished with value: 0.8 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.8.
[I 2025-01-16 03:37:29,785] Trial 11 finished with value: 0.8 and parameters: {'n_neighbors': 3}. Best is trial 10 with value: 0.8.
[I 2025-01-16 03:37:29,819] Trial 12 finished with value: 0.8 and parameters: {'n_neighbors': 3}. Best is t

Training and predicting with Naive Bayes...


[I 2025-01-16 03:37:30,651] Trial 21 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,659] Trial 22 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,666] Trial 23 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,674] Trial 24 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,682] Trial 25 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,696] Trial 26 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,705] Trial 27 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,714] Trial 28 finished with value: 0.7 and parameters: {}. Best is trial 0 with value: 0.7.
[I 2025-01-16 03:37:30,722] Trial 29 finished with value: 0.7 and parameters: {}

Training and predicting with Gradient Boosting...


[I 2025-01-16 03:37:35,246] Trial 0 finished with value: 0.8366666666666667 and parameters: {'n_estimators': 279, 'learning_rate': 0.47495109786691403, 'max_depth': 17}. Best is trial 0 with value: 0.8366666666666667.
[I 2025-01-16 03:37:39,874] Trial 1 finished with value: 0.87 and parameters: {'n_estimators': 441, 'learning_rate': 0.4035444154816968, 'max_depth': 9}. Best is trial 1 with value: 0.87.
[I 2025-01-16 03:37:41,955] Trial 2 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 158, 'learning_rate': 0.47395374622983216, 'max_depth': 11}. Best is trial 1 with value: 0.87.
[I 2025-01-16 03:37:42,400] Trial 3 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 83, 'learning_rate': 0.47037996582037017, 'max_depth': 4}. Best is trial 1 with value: 0.87.
[I 2025-01-16 03:37:49,519] Trial 4 finished with value: 0.84 and parameters: {'n_estimators': 473, 'learning_rate': 0.10515503282637229, 'max_depth': 20}. Best is trial 1 with value: 0.87

Training and predicting with XGBoost...


Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:39:14,355] Trial 0 finished with value: 0.85 and parameters: {'n_estimators': 446, 'max_depth': 7, 'learning_rate': 0.48360228315317855}. Best is trial 0 with value: 0.85.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:39:14,551] Trial 1 finished with value: 0.85 and parameters: {'n_estimators': 121, 'max_depth': 6, 'learning_rate': 0.0215969022000415}. Best is trial 0 with value: 0.85.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:39:14,969] Trial 2 finished with value: 0.87 and parameters: {'n_estimators': 265, 'max_depth': 14, 'learning_rate': 0.11313204290380863}. Best is trial 2 with value: 0.87.
Parameters: { "use_label_encoder" } are not used.

[I 2025-01-16 03:39:18,857] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 243, 'max_depth': 10, 'learning_rate': 0.1894584653007162}. Best is trial 2 with value: 0.87.
Parameters: { "use_label_e

Training and predicting with LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000508 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:32,877] Trial 0 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 229, 'max_depth': 6, 'learning_rate': 0.024637781800338952}. Best is trial 0 with value: 0.8566666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000163 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:33,261] Trial 1 finished with value: 0.85 and parameters: {'n_estimators': 448, 'max_depth': 9, 'learning_rate': 0.4202288983880984}. Best is trial 0 with value: 0.8566666666666667.
[I 2025-01-16 03:39:33,331] Trial 2 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 63, 'max_depth': 6, 'learning_rate': 0.41377908335351643}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:33,765] Trial 3 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 453, 'max_depth': 7, 'learning_rate': 0.1529547404374066}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:34,035] Trial 4 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 281, 'max_depth': 6, 'learning_rate': 0.26024907750675613}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:34,307] Trial 5 finished with value: 0.8433333333333334 and parameters: {'n_estimators': 326, 'max_depth': 16, 'learning_rate': 0.4737944967680215}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000194 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:34,647] Trial 6 finished with value: 0.86 and parameters: {'n_estimators': 415, 'max_depth': 20, 'learning_rate': 0.22741431285995886}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:34,801] Trial 7 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 168, 'max_depth': 18, 'learning_rate': 0.2885851676860808}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:34,998] Trial 8 finished with value: 0.85 and parameters: {'n_estimators': 215, 'max_depth': 12, 'learning_rate': 0.20786679848742537}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:35,125] Trial 9 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 122, 'max_depth': 11, 'learning_rate': 0.41484966422135927}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:35,198] Trial 10 finished with value: 0.8733333333333333 and parameters: {'n_estimators': 53, 'max_depth': 4, 'learning_rate': 0.34906080628130615}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000225 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:35,277] Trial 11 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 61, 'max_depth': 3, 'learning_rate': 0.3538893838438712}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:35,344] Trial 12 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 51, 'max_depth': 4, 'learning_rate': 0.3445001648959982}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2025-01-16 03:39:35,462] Trial 13 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 123, 'max_depth': 3, 'learning_rate': 0.4794362801237734}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:35,584] Trial 14 finished with value: 0.86 and parameters: {'n_estimators': 102, 'max_depth': 9, 'learning_rate': 0.3493037791964327}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:35,756] Trial 15 finished with value: 0.85 and parameters: {'n_estimators': 174, 'max_depth': 13, 'learning_rate': 0.40565284155508097}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:36,123] Trial 16 finished with value: 0.86 and parameters: {'n_estimators': 355, 'max_depth': 8, 'learning_rate': 0.12288459686684933}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:36,231] Trial 17 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 98, 'max_depth': 5, 'learning_rate': 0.3144947523975783}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:36,413] Trial 18 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 176, 'max_depth': 11, 'learning_rate': 0.390724191979189}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:36,654] Trial 19 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 252, 'max_depth': 5, 'learning_rate': 0.48682263967003037}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:36,742] Trial 20 finished with value: 0.8633333333333333 and parameters: {'n_estimators': 71, 'max_depth': 9, 'learning_rate': 0.2973654189771484}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000142 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:36,820] Trial 21 finished with value: 0.83 and parameters: {'n_estimators': 53, 'max_depth': 3, 'learning_rate': 0.35815489207757767}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:36,970] Trial 22 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 143, 'max_depth': 5, 'learning_rate': 0.4483154421580282}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000150 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:37,069] Trial 23 finished with value: 0.8566666666666667 and parameters: {'n_estimators': 54, 'max_depth': 4, 'learning_rate': 0.3298769117779356}. Best is trial 2 with value: 0.8733333333333333.
[I 2025-01-16 03:39:37,209] Trial 24 finished with value: 0.84 and parameters: {'n_estimators': 99, 'max_depth': 7, 'learning_rate': 0.3836807881444644}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000168 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overh

[I 2025-01-16 03:39:37,628] Trial 25 finished with value: 0.8666666666666667 and parameters: {'n_estimators': 500, 'max_depth': 14, 'learning_rate': 0.44047433308615297}. Best is trial 2 with value: 0.8733333333333333.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000146 seconds.
You can set `force_col_wise=true` to remove the overhead.


[I 2025-01-16 03:39:37,727] Trial 26 finished with value: 0.8766666666666667 and parameters: {'n_estimators': 86, 'max_depth': 4, 'learning_rate': 0.25917744471937093}. Best is trial 26 with value: 0.8766666666666667.


[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000175 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:37,888] Trial 27 finished with value: 0.8466666666666667 and parameters: {'n_estimators': 144, 'max_depth': 7, 'learning_rate': 0.1690401087840606}. Best is trial 26 with value: 0.8766666666666667.


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000147 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


[I 2025-01-16 03:39:38,088] Trial 28 finished with value: 0.8533333333333334 and parameters: {'n_estimators': 204, 'max_depth': 4, 'learning_rate': 0.25854546395372274}. Best is trial 26 with value: 0.8766666666666667.
[I 2025-01-16 03:39:38,207] Trial 29 finished with value: 0.8266666666666667 and parameters: {'n_estimators': 91, 'max_depth': 6, 'learning_rate': 0.013368966275553978}. Best is trial 26 with value: 0.8766666666666667.
[I 2025-01-16 03:39:38,286] A new study created in memory with name: no-name-94c662b7-da99-4cff-8705-c78cd1a8af3c


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000170 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Training and predicting with AdaBoost...


[I 2025-01-16 03:39:39,225] Trial 0 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 281, 'learning_rate': 0.40241880077288245}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-01-16 03:39:40,236] Trial 1 finished with value: 0.7666666666666667 and parameters: {'n_estimators': 306, 'learning_rate': 0.18406230312212815}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-01-16 03:39:40,519] Trial 2 finished with value: 0.7266666666666667 and parameters: {'n_estimators': 85, 'learning_rate': 0.1863334857712532}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-01-16 03:39:41,356] Trial 3 finished with value: 0.7733333333333333 and parameters: {'n_estimators': 250, 'learning_rate': 0.5391607414602158}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-01-16 03:39:41,747] Trial 4 finished with value: 0.76 and parameters: {'n_estimators': 117, 'learning_rate': 0.671055056935818}. Best is trial 0 with value: 0.7733333333333333.
[I 2025-01-16 

Training and predicting with Neural Network...


[I 2025-01-16 03:40:18,183] Trial 0 finished with value: 0.8 and parameters: {'hidden_layer_1': 97, 'hidden_layer_2': 13, 'learning_rate_init': 0.05348823150461099}. Best is trial 0 with value: 0.8.
[I 2025-01-16 03:40:18,758] Trial 1 finished with value: 0.7833333333333333 and parameters: {'hidden_layer_1': 21, 'hidden_layer_2': 62, 'learning_rate_init': 0.042264791203450794}. Best is trial 0 with value: 0.8.
[I 2025-01-16 03:40:19,147] Trial 2 finished with value: 0.8133333333333334 and parameters: {'hidden_layer_1': 26, 'hidden_layer_2': 11, 'learning_rate_init': 0.021984735939652778}. Best is trial 2 with value: 0.8133333333333334.
[I 2025-01-16 03:40:20,091] Trial 3 finished with value: 0.79 and parameters: {'hidden_layer_1': 94, 'hidden_layer_2': 19, 'learning_rate_init': 0.01605275619721131}. Best is trial 2 with value: 0.8133333333333334.
[I 2025-01-16 03:40:20,808] Trial 4 finished with value: 0.8166666666666667 and parameters: {'hidden_layer_1': 17, 'hidden_layer_2': 51, 'lea

Training and predicting with MLP...


[I 2025-01-16 03:40:48,137] Trial 0 finished with value: 0.7466666666666667 and parameters: {'layer_1': 135, 'layer_2': 102, 'activation': 'tanh', 'solver': 'sgd', 'learning_rate_init': 0.08597006469633955}. Best is trial 0 with value: 0.7466666666666667.
[I 2025-01-16 03:40:50,217] Trial 1 finished with value: 0.7966666666666666 and parameters: {'layer_1': 53, 'layer_2': 130, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.085480861892545}. Best is trial 1 with value: 0.7966666666666666.
[I 2025-01-16 03:40:52,940] Trial 2 finished with value: 0.7966666666666666 and parameters: {'layer_1': 120, 'layer_2': 63, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.010574244232975125}. Best is trial 1 with value: 0.7966666666666666.
[I 2025-01-16 03:40:53,346] Trial 3 finished with value: 0.5 and parameters: {'layer_1': 133, 'layer_2': 97, 'activation': 'logistic', 'solver': 'adam', 'learning_rate_init': 0.08607486176289701}. Best is trial 1 with value: 

CFV dataset created and saved!


CPFV (Combined Probability and Class Feature Vector)

In [3]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier

# Load datasets
main_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/positive_main_pcp.csv")
main_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/negative_main_pcp.csv")
validation_p = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/positive_validation_pcp.csv")
validation_n = pd.read_csv("/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_3/9_PCP (Physicochemical Properties)/negative_validation_pcp.csv")

# Combine positive and negative datasets
main_data = pd.concat([main_p.assign(label=1), main_n.assign(label=0)], ignore_index=True)
validation_data = pd.concat([validation_p.assign(label=1), validation_n.assign(label=0)], ignore_index=True)

# Separate features and labels
X_train = main_data.drop(columns=["label"])
y_train = main_data["label"]
X_val = validation_data.drop(columns=["label"])
y_val = validation_data["label"]

# Initialize models with their tuned hyperparameters
trained_models = {
    "SVM": SVC(C=1.0, kernel="rbf", probability=True),  # Example parameters
    "Decision Tree": DecisionTreeClassifier(max_depth=10, min_samples_split=5),
    "Random Forest": RandomForestClassifier(n_estimators=100, max_depth=10, min_samples_split=5),
    "Logistic Regression": LogisticRegression(C=1.0, solver="lbfgs"),
    "k-NN": KNeighborsClassifier(n_neighbors=5),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=10),
    "XGBoost": XGBClassifier(n_estimators=100, max_depth=10, learning_rate=0.1, use_label_encoder=False, eval_metric="logloss"),
    "LightGBM": LGBMClassifier(n_estimators=100, max_depth=10, learning_rate=0.1),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, learning_rate=0.1),
    "Neural Network (MLPClassifier)": MLPClassifier(hidden_layer_sizes=(100, 50), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200),
    "Multilayer Perceptron (Custom MLP)": MLPClassifier(hidden_layer_sizes=(128, 64), activation="relu", solver="adam", learning_rate_init=0.01, max_iter=200)
}

# Train all models on the training dataset
for model_name, model in trained_models.items():
    print(f"Training {model_name}...")
    model.fit(X_train, y_train)

# Function to create CPFV dataset
def create_cpfv(models, X_data, y_data):
    cpfv_data = pd.DataFrame()
    for model_name, model in models.items():
        # Add predicted class labels
        cpfv_data[f"{model_name}_Class"] = model.predict(X_data)
        # Add predicted probabilities or decision scores
        if hasattr(model, "predict_proba"):
            cpfv_data[f"{model_name}_Prob"] = model.predict_proba(X_data)[:, 1]
        elif hasattr(model, "decision_function"):
            cpfv_data[f"{model_name}_Prob"] = model.decision_function(X_data)
        else:
            cpfv_data[f"{model_name}_Prob"] = cpfv_data[f"{model_name}_Class"]
    # Add true labels
    cpfv_data["True_Label"] = y_data.reset_index(drop=True)
    return cpfv_data

# Create CPFV dataset using validation data
cpfv_dataset = create_cpfv(trained_models, X_val, y_val)

# Save CPFV dataset to CSV
cpfv_dataset.to_csv("CPFV_PCP.csv", index=False)




Training SVM...
Training Decision Tree...
Training Random Forest...
Training Logistic Regression...
Training k-NN...
Training Naive Bayes...
Training Gradient Boosting...
Training XGBoost...
Training LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


Parameters: { "use_label_encoder" } are not used.



Training AdaBoost...
Training Neural Network (MLPClassifier)...
Training Multilayer Perceptron (Custom MLP)...


# **Hyperparameter grids for RandomizedSearchCV**

In [None]:
import optuna
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier # Import path for KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/positive_main_pcp.csv")
main_n = pd.read_csv("/content/negative_main_pcp.csv")
validation_p = pd.read_csv("/content/positive_validation_pcp.csv")
validation_n = pd.read_csv("/content/negative_validation_pcp.csv")


# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for RandomizedSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply random search
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model, param_grid, n_iter=10, cv=cv, scoring='accuracy', n_jobs=-1, random_state=42)
    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing RandomizedSearchCV for SVM...
Performing RandomizedSearchCV for Decision Tree...
Performing RandomizedSearchCV for Random Forest...


20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sklea

Performing RandomizedSearchCV for Logistic Regression...




Performing RandomizedSearchCV for k-NN...
Performing RandomizedSearchCV for Naive Bayes...
Performing RandomizedSearchCV for Gradient Boosting...
Performing RandomizedSearchCV for XGBoost...
Performing RandomizedSearchCV for LightGBM...
[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000344 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing RandomizedSearchCV for CatBoost...
Performing RandomizedSearchCV for AdaBoost...




Performing RandomizedSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.750829   
1         Decision Tree    0.807548   
2         Random Forest    0.865991   
3   Logistic Regression    0.707019   
4                  k-NN    0.830775   
5           Naive Bayes    0.680413   
6     Gradient Boosting    0.850511   
7               XGBoost    0.859971   
8              LightGBM    0.861699   
9              CatBoost    0.862543   
10             AdaBoost    0.795527   
11       Neural Network    0.699316   

                                      Best Parameters  
0         {'kernel': 'rbf', 'gamma': 'auto', 'C': 10}  
1   {'min_samples_split': 2, 'min_samples_leaf': 2...  
2   {'n_estimators': 500, 'min_samples_split': 5, ...  
3    {'solver': 'liblinear', 'penalty': 'l2', 'C': 1}  
4   {'weights': 'distance', 'n_neighbors': 5, 'met...  
5                              {'var_smoothing': 0.1}  
6   {'n_estimators': 100, 'max_depth': 7, 'learnin...  
7   {'n_estimators': 200, 'max_depth': 5, 'l

In [None]:
# Storage for predictions and target column
probability_datasets = pd.DataFrame(y_train, columns=['Target'])

# Loop through models, perform random search, and save probabilities
for model_name, model in models.items():
    print(f"Performing RandomizedSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform randomized search
    random_search = RandomizedSearchCV(model,
                                       param_grid,
                                       n_iter=10,
                                       cv=cv,
                                       scoring='accuracy',
                                       n_jobs=-1,
                                       random_state=42)

    random_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = random_search.best_params_
    best_scores.append(random_search.best_score_)



    # Get probability predictions (if supported)
    if hasattr(random_search.best_estimator_, "predict_proba"):
        probabilities = random_search.best_estimator_.predict_proba(X_train)[:, 1]  # Probability for the positive class
        probability_datasets[f"{model_name}_Probabilities"] = probabilities
    else:
        # Fallback if probability prediction isn't supported
        predictions = random_search.best_estimator_.predict(X_train)
        probability_datasets[f"{model_name}_Predictions"] = predictions



# Display final dataset with probabilities
print(probability_datasets.head())

# Save the probability dataset to a CSV file
probability_datasets.to_csv("model_probabilities_with_target GDC in Randomsearch .csv", index=False)
print("Probability dataset saved to 'model_probabilities_with_target.csv'.")


# **Hyperparameter grids for GridSearchCV**

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from scikeras.wrappers import KerasClassifier

# Load datasets
main_p = pd.read_csv("/content/positive_main_pcp.csv")
main_n = pd.read_csv("/content/negative_main_pcp.csv")
validation_p = pd.read_csv("/content/positive_validation_pcp.csv")
validation_n = pd.read_csv("/content/negative_validation_pcp.csv")

# Combine positive and negative samples
X_train = pd.concat([main_p, main_n])
y_train = np.concatenate([np.ones(len(main_p)), np.zeros(len(main_n))])

# Define cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Function to create a neural network model
def create_nn(num_units, dropout_rate, learning_rate, input_shape):
    model = Sequential([
        Dense(num_units, activation='relu', input_shape=input_shape),
        Dropout(dropout_rate),
        Dense(num_units, activation='relu'),
        Dropout(dropout_rate),
        Dense(1, activation='sigmoid')
    ])
    model.compile(optimizer=Adam(learning_rate=learning_rate), loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Model definitions and parameter grids for GridSearchCV
models = {
    "SVM": SVC(),
    "Decision Tree": DecisionTreeClassifier(),
    "Random Forest": RandomForestClassifier(),
    "Logistic Regression": LogisticRegression(),
    "k-NN": KNeighborsClassifier(),
    "Naive Bayes": GaussianNB(),
    "Gradient Boosting": GradientBoostingClassifier(),
    "XGBoost": XGBClassifier(),
    "LightGBM": LGBMClassifier(),
    "CatBoost": CatBoostClassifier(verbose=0),
    "AdaBoost": AdaBoostClassifier(),
    "Neural Network": KerasClassifier(
        model=create_nn,
        num_units=64,
        dropout_rate=0.2,
        learning_rate=0.001,
        input_shape=(X_train.shape[1],),
        epochs=5,
        batch_size=32,
        verbose=0
    )
}

# Parameter grids for each model
param_grids = {
    "SVM": {'C': [0.1, 1, 10, 100], 'kernel': ['linear', 'rbf', 'poly'], 'gamma': ['scale', 'auto']},
    "Decision Tree": {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4]},
    "Random Forest": {'n_estimators': [100, 200, 500], 'max_depth': [10, 20, None], 'max_features': ['auto', 'sqrt'], 'min_samples_split': [2, 5, 10]},
    "Logistic Regression": {'C': [0.1, 1, 10, 100], 'solver': ['liblinear', 'saga'], 'penalty': ['l2']},
    "k-NN": {'n_neighbors': [3, 5, 11, 19], 'weights': ['uniform', 'distance'], 'metric': ['euclidean', 'manhattan']},
    "Naive Bayes": {'var_smoothing': np.logspace(-9, -1, 10)},
    "Gradient Boosting": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "XGBoost": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [3, 5, 7]},
    "LightGBM": {'n_estimators': [100, 200], 'learning_rate': [0.01, 0.1, 0.2], 'max_depth': [10, 20, -1]},
    "CatBoost": {'depth': [6, 8, 10], 'learning_rate': [0.01, 0.1, 0.2], 'iterations': [100, 200]},
    "AdaBoost": {'n_estimators': [50, 100, 200], 'learning_rate': [0.01, 0.1, 1.0]},
    "Neural Network": {
        'model__num_units': [32, 64, 128],
        'model__dropout_rate': [0.1, 0.2, 0.3],
        'model__learning_rate': [0.001, 0.01, 0.1]
    }
}

# Results storage
best_params = {}
best_scores = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

# Display results in a DataFrame
results_df = pd.DataFrame({
    'Model': list(models.keys()),
    'Best Score': best_scores,
    'Best Parameters': [best_params[model] for model in models]
})

print(results_df)


Performing GridSearchCV for SVM...
Performing GridSearchCV for Decision Tree...
Performing GridSearchCV for Random Forest...


135 fits failed out of a total of 270.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
112 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 1466, in wrapper
    estimator._validate_params()
  File "/usr/local/lib/python3.10/dist-packages/sklearn/base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "/usr/local/lib/python3.10/dist-packages/sklearn/utils/_param_validation.py", line 95, in validate_parameter_constraints
    raise InvalidParameterError(
sk

Performing GridSearchCV for Logistic Regression...
Performing GridSearchCV for k-NN...
Performing GridSearchCV for Naive Bayes...
Performing GridSearchCV for Gradient Boosting...
Performing GridSearchCV for XGBoost...
Performing GridSearchCV for LightGBM...


  _data = np.array(data, dtype=dtype, copy=copy,


[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000076 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1529
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 6
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
Performing GridSearchCV for CatBoost...
Performing GridSearchCV for AdaBoost...




Performing GridSearchCV for Neural Network...


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


                  Model  Best Score  \
0                   SVM    0.776595   
1         Decision Tree    0.814437   
2         Random Forest    0.867700   
3   Logistic Regression    0.707019   
4                  k-NN    0.830775   
5           Naive Bayes    0.680413   
6     Gradient Boosting    0.852231   
7               XGBoost    0.859971   
8              LightGBM    0.865980   
9              CatBoost    0.865995   
10             AdaBoost    0.795527   
11       Neural Network    0.705309   

                                      Best Parameters  
0        {'C': 100, 'gamma': 'auto', 'kernel': 'rbf'}  
1   {'max_depth': 30, 'min_samples_leaf': 1, 'min_...  
2   {'max_depth': None, 'max_features': 'sqrt', 'm...  
3    {'C': 1, 'penalty': 'l2', 'solver': 'liblinear'}  
4   {'metric': 'manhattan', 'n_neighbors': 5, 'wei...  
5                              {'var_smoothing': 0.1}  
6   {'learning_rate': 0.2, 'max_depth': 7, 'n_esti...  
7   {'learning_rate': 0.2, 'max_depth': 5, '

In [None]:
# Prepare the final dataset with probabilities and target
all_probabilities = []
all_targets = []

# Loop through models and apply grid search
for model_name, model in models.items():
    print(f"Performing GridSearchCV for {model_name}...")

    # Special handling for SVC: enable probability estimation
    if model_name == "SVM":
        model.probability = True  # Enable probability for SVC

    # Get the parameter grid for the current model
    param_grid = param_grids[model_name]

    # Perform grid search
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # Store best parameters and score
    best_params[model_name] = grid_search.best_params_
    best_scores.append(grid_search.best_score_)

    # Predict probabilities using the best estimator
    best_model = grid_search.best_estimator_
    if hasattr(best_model, "predict_proba"):
        probabilities = best_model.predict_proba(X_train)[:, 1]  # Positive class probabilities
    else:
        # Fallback for models without predict_proba (e.g., SVM with linear kernel)
        probabilities = best_model.decision_function(X_train)
        probabilities = (probabilities - probabilities.min()) / (probabilities.max() - probabilities.min())

    # Append probabilities and targets for this model
    all_probabilities.append(probabilities)
    all_targets.append(y_train)

    # Combine probabilities, features, and target into a DataFrame
    model_data = pd.DataFrame(X_train, columns=main_p.columns)  # Ensure column consistency
    model_data[f"{model_name}_probability"] = probabilities
    model_data['target'] = y_train

    # Save to CSV
    output_path = f"/content/{model_name}_probabilities.csv"
    model_data.to_csv(output_path, index=False)
    print(f"Saved probabilities for {model_name} to {output_path}")

# Combine all model probabilities into a single DataFrame (optional)
final_dataset = pd.DataFrame({'target': y_train})
for idx, model_name in enumerate(models.keys()):
    final_dataset[f"{model_name}_probability"] = all_probabilities[idx]

# Save the combined dataset
final_output_path = "/content/combined_probabilities_GridSearchCV.csv"
final_dataset.to_csv(final_output_path, index=False)
print(f"Saved combined dataset to {final_output_path}")
