In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, roc_curve, auc
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.neural_network import MLPClassifier
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv1D, MaxPooling1D, LSTM, Dense, Dropout, BatchNormalization, Flatten
import matplotlib.pyplot as plt

# Load individual datasets
# All four CTDD feature tables sit in the same Drive folder, so the folder is
# named once instead of being repeated in every path.
DATA_DIR = "/content/drive/MyDrive/Cell penetrating peptide/NEW_WORK/work_2/4_CTDD"
main_p = pd.read_csv(f"{DATA_DIR}/ctdd_main_positive_features (3).csv")
main_n = pd.read_csv(f"{DATA_DIR}/ctdd_main_negative_features.csv")
validation_p = pd.read_csv(f"{DATA_DIR}/ctdd_validation_positive_features.csv")
validation_n = pd.read_csv(f"{DATA_DIR}/ctdd_validation_negative_features.csv")

# Combine datasets
# Rows are labelled before concatenation: positive files -> 1, negative -> 0.
main_p['Target'] = 1
main_n['Target'] = 0
validation_p['Target'] = 1
validation_n['Target'] = 0

train_data = pd.concat([main_p, main_n], ignore_index=True)
validation_data = pd.concat([validation_p, validation_n], ignore_index=True)

# Ensure no NaN or Inf values
# dropna() removes rows with missing values; the explicit check afterwards
# catches what dropna() cannot (+/-Inf). It raises instead of using `assert`
# so the guard is still executed when Python runs with -O.
train_data = train_data.dropna()
validation_data = validation_data.dropna()
for split_name, split_frame in (("Training", train_data), ("Validation", validation_data)):
    if not np.isfinite(split_frame.values).all():
        raise ValueError(f"{split_name} data contains NaN or Inf values!")

# Separate features and labels
# NOTE(review): .values discards column names, so this assumes both splits
# carry the same feature columns in the same order -- confirm against the CSVs.
X_train = train_data.drop(columns=['Target']).values
y_train = train_data['Target'].values
X_val = validation_data.drop(columns=['Target']).values
y_val = validation_data['Target'].values

# Normalize features
# The scaler is fitted on the training split only and then re-used on the
# validation split, so validation statistics never leak into training.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)

# Baseline classifiers. Each one is fitted on the scaled training split and
# scored on the validation split; column 1 of predict_proba is the probability
# of the positive class (Target == 1), which is what the ROC analysis needs.

# Random Forest Model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
rf_probabilities = rf_model.predict_proba(X_val)[:, 1]

# XGBoost Model
# `use_label_encoder` is no longer passed: current XGBoost releases ignore it
# and only emit a "Parameters: { "use_label_encoder" } are not used." warning.
xgb_model = XGBClassifier(eval_metric='logloss', random_state=42)
xgb_model.fit(X_train, y_train)
xgb_probabilities = xgb_model.predict_proba(X_val)[:, 1]

# LightGBM Model
lgbm_model = LGBMClassifier(random_state=42)
lgbm_model.fit(X_train, y_train)
lgbm_probabilities = lgbm_model.predict_proba(X_val)[:, 1]

# Multi-Layer Perceptron Model
# Two hidden layers (128 -> 64 units) with ReLU, capped at 200 training iterations.
mlp_model = MLPClassifier(hidden_layer_sizes=(128, 64), activation='relu', max_iter=200, random_state=42)
mlp_model.fit(X_train, y_train)
mlp_probabilities = mlp_model.predict_proba(X_val)[:, 1]

# CNN Model
# Seed the Python, NumPy and TensorFlow RNGs so the network's initialisation
# and shuffling are repeatable, in line with the random_state=42 used by the
# scikit-learn style models (GPU kernels may still add nondeterminism).
tf.keras.utils.set_random_seed(42)

# Conv1D expects 3-D input, so a trailing channel axis of size 1 is appended.
X_train_cnn = X_train[..., np.newaxis]  # Shape: (samples, features, 1)
X_val_cnn = X_val[..., np.newaxis]      # Shape: (samples, features, 1)

cnn_model = Sequential([
    # An explicit Input layer replaces the `input_shape=` layer argument,
    # which Keras warns against when building Sequential models.
    tf.keras.Input(shape=(X_train.shape[1], 1)),
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Flatten(),
    Dense(1, activation='sigmoid')  # single sigmoid unit -> P(Target == 1)
])
cnn_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
cnn_model.fit(X_train_cnn, y_train, epochs=50, batch_size=32, verbose=1)
# predict() returns shape (samples, 1); flatten to a 1-D score vector.
cnn_probabilities = cnn_model.predict(X_val_cnn).flatten()

# Proposed Model
# Three Conv1D stages (each followed by batch normalisation and dropout, the
# first two also by max pooling) feed an LSTM and a small dense head.
# Seed the Python, NumPy and TensorFlow RNGs so training is repeatable, in
# line with the random_state=42 used by the scikit-learn style models
# (GPU kernels may still add nondeterminism).
tf.keras.utils.set_random_seed(42)

proposed_model = Sequential([
    # An explicit Input layer replaces the `input_shape=` layer argument,
    # which Keras warns against when building Sequential models.
    tf.keras.Input(shape=(X_train.shape[1], 1)),
    Conv1D(filters=64, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Conv1D(filters=128, kernel_size=5, activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling1D(pool_size=2),
    Dropout(0.3),
    Conv1D(filters=256, kernel_size=3, activation='relu', padding='same'),
    BatchNormalization(),
    Dropout(0.3),
    # return_sequences=False: only the final LSTM output is passed to the head.
    LSTM(64, return_sequences=False),
    Dense(128, activation='swish'),
    Dropout(0.3),
    Dense(64, activation='swish'),
    Dropout(0.3),
    Dense(1, activation='sigmoid')  # single sigmoid unit -> P(Target == 1)
])
proposed_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# validation_data is only reported per epoch here; no callbacks are attached,
# so it does not influence the fitted weights.
proposed_model.fit(X_train_cnn, y_train, validation_data=(X_val_cnn, y_val), epochs=100, batch_size=32, verbose=1)
# predict() returns shape (samples, 1); flatten to a 1-D score vector.
proposed_probabilities = proposed_model.predict(X_val_cnn).flatten()

# Compute ROC curve and AUC for a model
def compute_roc_auc(model_name, y_true, y_scores):
    """Return the ROC curve points and the area under that curve.

    Args:
        model_name: Label of the model being scored. It is not used in the
            computation; it is kept so existing call sites stay valid.
        y_true: Ground-truth binary labels.
        y_scores: Predicted scores for the positive class.

    Returns:
        A ``(fpr, tpr, roc_auc)`` tuple: the false-positive rates and
        true-positive rates from ``roc_curve`` and the area computed from
        them by ``auc``.
    """
    # The thresholds returned by roc_curve are not needed by any caller.
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_true, y_scores)
    area = auc(false_pos_rate, true_pos_rate)
    return false_pos_rate, true_pos_rate, area

# Compute ROC and AUC for each model
# Legend label -> positive-class scores on the validation split. Insertion
# order is preserved, so curves are drawn in this order.
model_scores = {
    "RF": rf_probabilities,
    "XGB": xgb_probabilities,
    "LGBM": lgbm_probabilities,
    "MLP": mlp_probabilities,
    "CNN": cnn_probabilities,
    "Deep_CLD": proposed_probabilities,
}
roc_data_4 = {
    label: compute_roc_auc(label, y_val, scores)
    for label, scores in model_scores.items()
}

# Plot ROC curves
plt.figure(figsize=(10, 8))
for model_name, curve in roc_data_4.items():
    fpr, tpr, roc_auc = curve
    plt.plot(fpr, tpr, label=f"{model_name} AUC = {roc_auc:.2f}")

# Dashed diagonal: the reference line TPR == FPR.
plt.plot([0, 1], [0, 1], 'k--')
plt.title("ROC Curve Comparison", fontsize=16)
plt.xlabel("False Positive Rate", fontsize=14)
plt.ylabel("True Positive Rate", fontsize=14)
plt.grid(alpha=0.3)
plt.legend(fontsize=12)
plt.show()


Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

Parameters: { "use_label_encoder" } are not used.



[LightGBM] [Info] Number of positive: 582, number of negative: 582
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.001560 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1006
[LightGBM] [Info] Number of data points in the train set: 1164, number of used features: 116
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Epoch 1/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m16s[0m 374ms/step - accuracy: 0.7381 - loss: 0.4956
Epoch 2/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 397ms/step - accuracy: 0.9699 - loss: 0.1082
Epoch 3/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m21s[0m 407ms/step - accuracy: 0.9850 - loss: 0.0428
Epoch 4/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 347ms/step - accuracy: 0.9855 - loss: 0.0399
Epoch 5/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 342ms/step - accuracy: 0.9923 - loss: 0.0285
Epoch 6/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 376ms/step - accuracy: 0.9920 - loss: 0.0212
Epoch 7/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m14s[0m 381ms/step - accuracy: 0.9940 - loss: 0.0225
Epoch 8/50
[1m37/37[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 376ms/step - accuracy: 0.9903 - loss: 0.0225
Epoch 9/50
[1m37/37[0m [32m━━