# Neural Network

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, accuracy_score
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error, make_scorer, r2_score
from sklearn.metrics import (
    roc_curve, auc, confusion_matrix, ConfusionMatrixDisplay,
    accuracy_score, precision_recall_fscore_support
)
from scipy.stats import pearsonr

2024-12-05 21:43:30.232728: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
problem_types = ['regression','classification'] #regression or classification
input_types = ["random_forest_elimination","correlation_variance_filter","PCA","Factor_Analysis"] #reduced features: , genetic_algorithm, PCA, Factor_Analysis, random_forest
seed = 254

In [3]:
def read_file(problem_type, input_file):
    """
    Read input features and recover target values (IC50 or Potency).

    Parameters:
        problem_type (str): 'regression' or 'classification'.
        input_file (str): Path to the input feature file.

    Returns:
        tuple:
            - X_indexed (pd.DataFrame): Feature DataFrame indexed by 'Molecule ChEMBL ID'.
            - target (pd.Series): Target values indexed by 'Molecule ChEMBL ID'.
    """
    # Validate problem type
    if problem_type not in {'regression', 'classification'}:
        raise ValueError("Invalid problem_type. Must be 'regression' or 'classification'.")

    # Define the target column based on problem type
    target_column = '-logIC50' if problem_type == 'regression' else 'Potency'

    # Load input features and target data
    X = pd.read_csv(input_file)
    target = pd.read_csv('../../3_train_test_split/descriptors_all.csv', usecols=['Molecule ChEMBL ID', target_column])

    # Merge target data with features based on 'Molecule ChEMBL ID'
    target = target[target['Molecule ChEMBL ID'].isin(X['Molecule ChEMBL ID'])]

    # Return original and indexed feature DataFrame, and target values as a Series
    return X.set_index('Molecule ChEMBL ID'), target.set_index('Molecule ChEMBL ID')[target_column]

In [4]:
def run_neural_network(problem_type, input_type, seed):
    """
    Train and evaluate a neural network for regression or classification with 5-fold cross-validation.

    Parameters:
        problem_type (str): 'regression' or 'classification'.
        input_type (str): Feature reduction scheme directory (e.g., 'PCA', 'Factor_Analysis').
        seed (int): Random seed for reproducibility.

    Returns:
        dict: Best model, test predictions, evaluation metrics, and test dataset.
    """
    tf.random.set_seed(seed)
    np.random.seed(seed)

    base_dir = f"../../4_feature_selection/{input_type}/"
    if input_type == "PCA":
        base_dir = os.path.join(base_dir, "PCA_results/PCA_components")
    elif input_type == "Factor_Analysis":
        base_dir = os.path.join(base_dir, "FA_results/selected_features")

    problem = "reg" if problem_type == "regression" else "class"
    test_file = os.path.join(base_dir, f"test_{problem}.csv")

    # Load test dataset
    X_test, y_test = read_file(problem_type, test_file)

    # For classification, encode labels
    label_encoder = None
    if problem_type == "classification":
        label_encoder = LabelEncoder()
        y_test = label_encoder.fit_transform(y_test)

    # Perform 5-fold cross-validation
    print(f"Running 5-fold cross-validation for Neural Network ({problem_type})...")
    fold_metrics = []
    best_models = []

    for fold in range(1, 6):
        train_file = os.path.join(base_dir, f"train_{problem}_{fold}.csv")
        val_file = os.path.join(base_dir, f"val_{problem}_{fold}.csv")

        X_train, y_train = read_file(problem_type, train_file)
        X_val, y_val = read_file(problem_type, val_file)

        # Encode classification labels
        if problem_type == "classification":
            y_train = label_encoder.transform(y_train)
            y_val = label_encoder.transform(y_val)

        # Define the neural network model
        model = Sequential()
        model.add(Dense(128, activation="relu", input_shape=(X_train.shape[1],)))
        model.add(Dropout(0.2))
        model.add(Dense(64, activation="relu"))
        if problem_type == "regression":
            model.add(Dense(1))  
            loss = "mean_squared_error"
            metrics = ["mean_squared_error"]
        else:
            model.add(Dense(len(np.unique(y_train)), activation="softmax"))  # Multiclass classification
            loss = "sparse_categorical_crossentropy"
            metrics = ["accuracy"]

        optimizer = Adam(learning_rate=0.001)
        model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

        # Train the model with early stopping
        early_stopping = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True)
        history = model.fit(
            X_train, y_train,
            validation_data=(X_val, y_val),
            epochs=100,
            batch_size=32,
            callbacks=[early_stopping],
            verbose=1
        )

        # Evaluate the model on validation data
        val_metrics = model.evaluate(X_val, y_val, verbose=0)
        if problem_type == "regression":
            metric = val_metrics[1]  # MSE
        else:
            metric = val_metrics[1]  # Accuracy
        fold_metrics.append(metric)
        best_models.append(model)
        print(f"Fold {fold} - Validation Metric: {metric:.4f}")

    # Average metric across folds
    avg_metric = np.mean(fold_metrics)
    print(f"Average Cross-Validation Metric: {avg_metric:.4f}")

    # Evaluate the best model on the test set
    if problem_type == "regression":
        final_model = best_models[np.argmin(fold_metrics)]  # Best model with lowest MSE
    else:
        final_model = best_models[np.argmax(fold_metrics)]  # Best model with highest accuracy

    # Evaluate the model on the test set and return results
    evaluate_model(final_model,X_test,y_test,problem_type,input_type,"Neural_Network")

    return {
        'best_model': final_model,
        'test_predictions': y_test_pred,
        'test_metric': test_metric,
        'X_test': X_test,
        'y_test': y_test
    }

In [5]:
def evaluate_model(final_model, X_test, y_test, problem_type, input_type, model_type):
    """
    Evaluate the model, create plots, and save performance summary and predictions.

    Parameters:
        final_model: Trained model.
        X_test (pd.DataFrame): Test feature set.
        y_test (pd.Series): True target values for the test set.
        problem_type (str): 'regression' or 'classification'.
        input_type (str): Feature reduction scheme used.
        model_type (str): Model type (default is 'random forest').

    Returns:
        None
    """
    # Generate file prefix
    file_prefix = f"{model_type}_{problem_type}_{input_type}"

    # Predictions
    y_pred = final_model.predict(X_test)

    if problem_type == "regression":
        # Calculate regression metrics
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        pearson_corr, _ = pearsonr(y_test, y_pred)

        # Save performance summary
        performance_summary = pd.DataFrame([{
            "model_type": model_type,
            "problem_type": problem_type,
            "input_type": input_type,
            "mse": mse,
            "r2_score": r2,
            "pearson_corr": pearson_corr
        }])
        performance_summary.to_csv(f"{file_prefix}_performance_summary.csv", index=False)

        # Save predictions
        predictions = pd.DataFrame({
            "Molecule ChEMBL ID": X_test.index,
            "True IC50": y_test,
            "Predicted IC50": y_pred
        })
        predictions.to_csv(f"{file_prefix}_predictions.csv", index=False)

        # Plot Predicted vs Target
        plt.figure(figsize=(6, 6), dpi=150)
        sns.scatterplot(x=y_test, y=y_pred, alpha=0.7)
        plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], color="red", linestyle="--")
        plt.xlabel("True IC50")
        plt.ylabel("Predicted IC50")
        plt.title("Predicted vs True")
        plt.savefig(f"{file_prefix}_predicted_vs_true.png")
        plt.show()

        # Plot Residuals
        residuals = y_test - y_pred
        plt.figure(figsize=(6, 6), dpi=150)
        sns.scatterplot(x=y_test, y=residuals, alpha=0.7)
        plt.axhline(0, color="red", linestyle="--")
        plt.xlabel("True IC50")
        plt.ylabel("Residuals")
        plt.title("Residual Plot")
        plt.savefig(f"{file_prefix}_residuals.png")
        plt.show()

    elif problem_type == "classification":
        # Calculate classification metrics
        accuracy = accuracy_score(y_test, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average="weighted")

        # Save performance summary
        performance_summary = pd.DataFrame([{
            "model_type": model_type,
            "problem_type": problem_type,
            "input_type": input_type,
            "accuracy": accuracy,
            "precision": precision,
            "recall": recall,
            "f1_score": f1
        }])
        performance_summary.to_csv(f"{file_prefix}_performance_summary.csv", index=False)

        # Save predictions
        predictions = pd.DataFrame({
            "Molecule ChEMBL ID": X_test.index,
            "True Potency": y_test,
            "Predicted Potency": y_pred
        })
        predictions.to_csv(f"{file_prefix}_predictions.csv", index=False)

        # Plot Confusion Matrix
        cm = confusion_matrix(y_test, y_pred)
        disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=final_model.classes_)
        disp.plot(cmap="Blues", values_format="d")
        plt.title("Confusion Matrix")
        plt.savefig(f"{file_prefix}_confusion_matrix.png")
        plt.show()

        # Plot ROC Curve
        if hasattr(final_model, "predict_proba"):
            y_proba = final_model.predict_proba(X_test)
            plt.figure(figsize=(8, 6), dpi=150)
            for i, class_label in enumerate(final_model.classes_):
                fpr, tpr, _ = roc_curve((y_test == class_label).astype(int), y_proba[:, i])
                roc_auc = auc(fpr, tpr)
                plt.plot(fpr, tpr, label=f"{class_label} (AUC = {roc_auc:.2f})")
            plt.plot([0, 1], [0, 1], color="red", linestyle="--")
            plt.xlabel("False Positive Rate")
            plt.ylabel("True Positive Rate")
            plt.title("ROC Curve (One-vs-Rest)")
            plt.legend()
            plt.savefig(f"{file_prefix}_roc_curve.png")
            plt.show()

# Train Model

In [6]:
for problem_type in problem_types:
    for input_type in input_types:
        print(f"Training Neural Network model for {problem_type} using {input_type} features...")
        results = run_neural_network(problem_type=problem_type, input_type=input_type, seed=seed)
        
        # Unpack results
        final_model = results["best_model"]
        X_test = results["X_test"]
        y_test = results["y_test"]
        
        # Evaluate the model
        print(f"Evaluating the model...")
        evaluate_model(final_model, X_test, y_test, problem_type, input_type, "Neural_Network")
        
        print("Training and evaluation complete")


Training Neural Network model for regression using random_forest_elimination features...
Running 5-fold cross-validation for Neural Network (regression)...
Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 206.5500 - mean_squared_error: 206.5500 - val_loss: 99.5009 - val_mean_squared_error: 99.5009
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 75.5760 - mean_squared_error: 75.5760 - val_loss: 33.8669 - val_mean_squared_error: 33.8669
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 36.1945 - mean_squared_error: 36.1945 - val_loss: 18.3187 - val_mean_squared_error: 18.3187
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 24.1234 - mean_squared_error: 24.1234 - val_loss: 15.0792 - val_mean_squared_error: 15.0792
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 17.6333 - mean_squared_error: 17.6333 - val_loss: 13.9507 - val_mean_squared_error: 13.9507
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 206.2780 - mean_squared_error: 206.2780 - val_loss: 103.1483 - val_mean_squared_error: 103.1483
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 77.0917 - mean_squared_error: 77.0917 - val_loss: 29.3431 - val_mean_squared_error: 29.3431
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 35.7174 - mean_squared_error: 35.7174 - val_loss: 21.3777 - val_mean_squared_error: 21.3777
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - loss: 22.7122 - mean_squared_error: 22.7122 - val_loss: 19.4213 - val_mean_squared_error: 19.4213
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 19.2749 - mean_squared_error: 19.2749 - val_loss: 14.6723 - val_mean_squared_error: 14.6723
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/st

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 14ms/step - loss: 199.8142 - mean_squared_error: 199.8142 - val_loss: 87.5548 - val_mean_squared_error: 87.5548
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 70.3733 - mean_squared_error: 70.3733 - val_loss: 40.9141 - val_mean_squared_error: 40.9141
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 31.5225 - mean_squared_error: 31.5225 - val_loss: 25.4933 - val_mean_squared_error: 25.4933
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 19.8641 - mean_squared_error: 19.8641 - val_loss: 20.8926 - val_mean_squared_error: 20.8926
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 17.8238 - mean_squared_error: 17.8238 - val_loss: 19.1946 - val_mean_squared_error: 19.1946
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 13ms/step - loss: 206.5846 - mean_squared_error: 206.5846 - val_loss: 112.6478 - val_mean_squared_error: 112.6478
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 80.8011 - mean_squared_error: 80.8011 - val_loss: 33.3225 - val_mean_squared_error: 33.3225
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 25.4686 - mean_squared_error: 25.4686 - val_loss: 26.8632 - val_mean_squared_error: 26.8632
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 22.1606 - mean_squared_error: 22.1606 - val_loss: 20.3934 - val_mean_squared_error: 20.3934
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 16.0224 - mean_squared_error: 16.0224 - val_loss: 18.6847 - val_mean_squared_error: 18.6847
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/st

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 12ms/step - loss: 194.0665 - mean_squared_error: 194.0665 - val_loss: 103.9105 - val_mean_squared_error: 103.9105
Epoch 2/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - loss: 70.1007 - mean_squared_error: 70.1007 - val_loss: 40.8532 - val_mean_squared_error: 40.8532
Epoch 3/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 35.2873 - mean_squared_error: 35.2873 - val_loss: 22.7941 - val_mean_squared_error: 22.7941
Epoch 4/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 19.0539 - mean_squared_error: 19.0539 - val_loss: 19.1505 - val_mean_squared_error: 19.1505
Epoch 5/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 15.7990 - mean_squared_error: 15.7990 - val_loss: 16.5371 - val_mean_squared_error: 16.5371
Epoch 6/100
[1m14/14[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/st

  pearson_corr, _ = pearsonr(y_test, y_pred)


ValueError: Per-column arrays must each be 1-dimensional