# Stacking Model

In [7]:
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from pathlib import Path
import datetime
import joblib

# Define root directory and subdirectories
root = "C:/Users/PayaPc.Com/Downloads/43591_2023_57_MOESM2_ESM/"  # Updated path
dataSplitDir = root + "DataSplit/"
modelDir = root + "DataModels/"

# Load the split datasets
train_data = np.load(dataSplitDir + 'train.npz')
val_data = np.load(dataSplitDir + 'val.npz')
test_data = np.load(dataSplitDir + 'test.npz')

X_train = train_data['x']
y_train = train_data['y']
X_val = val_data['x']
y_val = val_data['y']
X_test = test_data['x']
y_test = test_data['y']

# Replace NaN values with zero
X_train[np.isnan(X_train)] = 0
X_val[np.isnan(X_val)] = 0
X_test[np.isnan(X_test)] = 0

# Calculate class weights based on training data distribution
unique_classes, class_counts = np.unique(y_train, return_counts=True)
total_samples = len(y_train)
class_weights = {cls: total_samples / (len(unique_classes) * count) for cls, count in zip(unique_classes, class_counts)}
print("Class weights:", class_weights)

# Define base models for stacking
base_models = [
    ('mlp', MLPClassifier(  # Without class_weight
        hidden_layer_sizes=(128,),
        activation='relu',
        solver='adam',
        learning_rate_init=0.001,
        max_iter=1500,
        batch_size=100000,
        random_state=42
    )),
    ('sgd', SGDClassifier(
        loss='log_loss',
        learning_rate='optimal',
        max_iter=1500,
        random_state=42,
        n_jobs=-1,
        class_weight=class_weights  # Apply class weights
    )),
    ('rf', RandomForestClassifier(
        n_estimators=100,
        max_depth=10,
        random_state=42,
        n_jobs=-1,
        class_weight=class_weights  # Apply class weights
    )),
    ('svc', SVC(
        kernel='rbf',
        probability=True,
        random_state=42,
        class_weight=class_weights  # Apply class weights
    ))
]

# Define stacking model with LogisticRegression as the meta-model
stacking_model = StackingClassifier(
    estimators=base_models,
    final_estimator=LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight=class_weights  # Apply class weights to meta-model
    ),
    cv=5,  # Use 5-fold cross-validation
    n_jobs=-1  # Use all available CPU cores
)

# Train the stacking model
print("Starting stacking model training:", datetime.datetime.now())
stacking_model.fit(X_train, y_train)

# Evaluate on training set
y_train_pred = stacking_model.predict(X_train)
train_accuracy = accuracy_score(y_train, y_train_pred)
print("Training accuracy of stacking:", train_accuracy)

# Evaluate on validation set
y_val_pred = stacking_model.predict(X_val)
val_accuracy = accuracy_score(y_val, y_val_pred)
print("Validation accuracy of stacking:", val_accuracy)

# Evaluate on test set
y_test_pred = stacking_model.predict(X_test)
test_accuracy = accuracy_score(y_test, y_test_pred)
print("Test accuracy of stacking:", test_accuracy)

# Save the stacking model
Path(modelDir).mkdir(parents=True, exist_ok=True)
joblib.dump(stacking_model, modelDir + 'stacking_model_scikit.pkl')

# Print completion time
print("Training completed:", datetime.datetime.now())

Class weights: {0: 0.18855342018627425, 1: 12.272893772893774, 2: 38.29142857142857, 3: 136.75510204081633, 4: 8.252463054187192, 5: 1.284947267497603, 6: 1.4659811857361629}
Starting stacking model training: 2025-03-05 18:25:55.702007
Training accuracy of stacking: 0.9855245485748396
Validation accuracy of stacking: 0.9509138381201044
Test accuracy of stacking: 0.9519832985386222
Training completed: 2025-03-05 18:31:01.962807


# Deep Learning

In [5]:
import numpy as np
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.metrics import accuracy_score
from pathlib import Path
import datetime

# Define root directory and subdirectories
root = "C:/Users/PayaPc.Com/Downloads/43591_2023_57_MOESM2_ESM/" 
dataSplitDir = root + "DataSplit/"
modelDir = root + "DataModels1/"

# Load the split datasets
train_data = np.load(dataSplitDir + 'train.npz')
val_data = np.load(dataSplitDir + 'val.npz')
test_data = np.load(dataSplitDir + 'test.npz')

X_train = train_data['x']
y_train = train_data['y']
X_val = val_data['x']
y_val = val_data['y']
X_test = test_data['x']
y_test = test_data['y']

# Replace NaN values with zero
X_train[np.isnan(X_train)] = 0
X_val[np.isnan(X_val)] = 0
X_test[np.isnan(X_test)] = 0

# Calculate class weights to handle imbalance
unique_classes, class_counts = np.unique(y_train, return_counts=True)
total_samples = len(y_train)
class_weights = {cls: total_samples / (len(unique_classes) * count) for cls, count in zip(unique_classes, class_counts)}
print("Class weights:", class_weights)

# Convert labels to one-hot encoding for multi-class classification
num_classes = len(unique_classes)
y_train_onehot = keras.utils.to_categorical(y_train, num_classes)
y_val_onehot = keras.utils.to_categorical(y_val, num_classes)
y_test_onehot = keras.utils.to_categorical(y_test, num_classes)

# Define the deep learning model
model = Sequential([
    # Input layer with 256 units and ReLU activation
    Dense(256, activation='relu', input_shape=(X_train.shape[1],), kernel_initializer='he_uniform'),
    # Dropout layer to prevent overfitting
    Dropout(0.5),
    # Hidden layer with 128 units
    Dense(128, activation='relu', kernel_initializer='he_uniform'),
    Dropout(0.5),
    # Hidden layer with 64 units
    Dense(64, activation='relu', kernel_initializer='he_uniform'),
    Dropout(0.5),
    # Output layer with softmax for multi-class classification
    Dense(num_classes, activation='softmax')
])

# Compile the model with categorical crossentropy loss and Adam optimizer
optimizer = Adam(learning_rate=0.001, clipvalue=0.5)
model.compile(
    optimizer=optimizer,
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Print model summary
model.summary()

# Train the model
print("Starting model training:", datetime.datetime.now())
history = model.fit(
    X_train, y_train_onehot,
    validation_data=(X_val, y_val_onehot),
    epochs=1500,
    batch_size=100000,
    class_weight=class_weights,
    verbose=1
)

# Evaluate on validation set
y_val_pred = model.predict(X_val)
y_val_pred_classes = np.argmax(y_val_pred, axis=1)
val_accuracy = accuracy_score(y_val, y_val_pred_classes)
print("Validation accuracy:", val_accuracy)

# Evaluate on test set
y_test_pred = model.predict(X_test)
y_test_pred_classes = np.argmax(y_test_pred, axis=1)
test_accuracy = accuracy_score(y_test, y_test_pred_classes)
print("Test accuracy:", test_accuracy)

# Save the model
Path(modelDir).mkdir(parents=True, exist_ok=True)
model.save(modelDir + 'dnn_model.keras')

print("Training completed:", datetime.datetime.now())

Class weights: {0: 0.18855342018627425, 1: 12.272893772893774, 2: 38.29142857142857, 3: 136.75510204081633, 4: 8.252463054187192, 5: 1.284947267497603, 6: 1.4659811857361629}


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


Starting model training: 2025-03-05 18:12:06.962579
Epoch 1/1500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step - accuracy: 0.2422 - loss: 5.6542 - val_accuracy: 0.4225 - val_loss: 1.5382
Epoch 2/1500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.2318 - loss: 4.5790 - val_accuracy: 0.4240 - val_loss: 1.5408
Epoch 3/1500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 127ms/step - accuracy: 0.2294 - loss: 3.7533 - val_accuracy: 0.4073 - val_loss: 1.6105
Epoch 4/1500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 144ms/step - accuracy: 0.2225 - loss: 3.6079 - val_accuracy: 0.3577 - val_loss: 1.6886
Epoch 5/1500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 226ms/step - accuracy: 0.2103 - loss: 3.3803 - val_accuracy: 0.3248 - val_loss: 1.7553
Epoch 6/1500
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 189ms/step - accuracy: 0.2118 - loss: 3.0022 - val_accuracy: 0.290