In [None]:
# Import essential libraries for data manipulation, visualization, modeling, and evaluation

# Basic Libraries
import numpy as np  # For numerical operations on arrays and matrices
import pandas as pd  # For data manipulation and analysis

# Visualization Libraries
import matplotlib.pyplot as plt  # For creating static plots and figures
import seaborn as sns  # For enhanced data visualization, particularly for statistical plots

# Data Preprocessing and Scaling
from sklearn.preprocessing import MinMaxScaler  # To scale feature values to a specified range, often [0, 1]

# Machine Learning Models and Tools
from sklearn.ensemble import RandomForestRegressor  # Random Forest Regressor model for predictive tasks
import catboost  # CatBoost library for handling categorical data efficiently in machine learning
from catboost import CatBoostRegressor  # Gradient-boosting model specific to CatBoost for regression tasks
import shap  # SHAP (SHapley Additive exPlanations) for interpretability and feature importance

# Hyperparameter Tuning and Model Evaluation
from sklearn.model_selection import GridSearchCV  # Grid Search for hyperparameter tuning across multiple models
from sklearn.metrics import r2_score  # R-squared metric to evaluate the performance of regression models

# Deep Learning Framework (TensorFlow and Keras for Neural Networks)
import tensorflow as tf  # TensorFlow for deep learning framework
from tensorflow import keras  # High-level neural network API for TensorFlow
from tensorflow.keras import models, layers, optimizers  # Core modules for creating and training neural networks

# Import necessary classes and functions from Keras for building and training a neural network model
from keras.models import Sequential  # Sequential API for stacking neural network layers linearly
from keras.layers import Dense, Dropout  # Dense is a fully connected layer; Dropout helps prevent overfitting
from keras.callbacks import EarlyStopping  # Callback that stops training early once validation performance stops improving

# NOTE: a verbatim copy of the ANN build/train code had been pasted into this import cell.
# It ran before `l2` was imported and before any training data existed, so the cell failed
# on a fresh kernel. The ANN is defined and trained in its own cell later in the notebook.

from keras.regularizers import l2  # L2 regularization adds a penalty to the model to reduce overfitting by penalizing large weights

# Train-Test Split for Model Validation
from sklearn.model_selection import train_test_split  # To split the dataset into training and testing sets

# Oversampling for Imbalanced Data Handling
from imblearn.over_sampling import SMOTE  # SMOTE (Synthetic Minority Oversampling Technique) to address data imbalance

# Classification Model from CatBoost
from catboost import CatBoostClassifier  # CatBoost Classifier for handling categorical and imbalanced data in classification tasks

# Metrics for Model Evaluation
from sklearn import metrics  # General metrics for model evaluation (e.g., accuracy, precision, recall)
from sklearn.metrics import confusion_matrix  # Confusion matrix to evaluate classification model performance

# Visualization of Model Architecture
import graphviz  # For rendering graphical representations of decision trees

# Keras Deep Learning Models and Layers (used with Sequential API)
from keras.models import Sequential  # Sequential model type for stackable layers
from keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout  # Layers for CNN architecture (Conv1D, MaxPooling1D, etc.)
from keras.callbacks import EarlyStopping  # Early stopping to prevent overfitting during training
from keras.regularizers import l2  # Regularization technique to avoid overfitting by adding penalty terms to the model
from keras.utils import plot_model  # Utility for plotting the model architecture visually

# Machine Learning Classifiers
from sklearn.ensemble import RandomForestClassifier  # Random Forest Classifier model for classification tasks
from sklearn import svm  # Support Vector Machine (SVM) classifier
from sklearn.ensemble import GradientBoostingRegressor  # Gradient Boosting Regressor model for regression tasks

# Duplicate import: GridSearchCV is already imported earlier in this cell (re-importing is harmless but redundant)
from sklearn.model_selection import GridSearchCV  # Already imported above but retained for clarity


# Classification Analysis

In [None]:
# Load the ECFP6 dataset and derive a binary activity label from pXC50.
# The CSV is read from the working directory; adjust the path if the file lives elsewhere.
# The code below relies on the file providing a 'Smiles' column, a 'pXC50' column, and the
# feature columns (presumably the ECFP6 fingerprint bits - confirm against the file).
Data_06 = pd.read_csv(r"data_CHEMBL203-ECFP6.csv")  # Load CSV data into a DataFrame named Data_06

# Classification task: threshold pXC50 at 6 (inclusive).
# pXC50 >= 6 -> class 1 (active); pXC50 < 6 -> class 0 (inactive).
# A missing pXC50 compares as False and is therefore labelled 0.
Class = Data_06['pXC50'] >= 6  # Boolean Series: True where the compound counts as active

# Convert the boolean mask to 0/1 in one vectorised step instead of a Python loop;
# `Classes` stays a plain list of ints, exactly as before.
Classes = Class.astype('int64').tolist()

# Add the binary class labels as a new column 'Class' in the DataFrame
Data_06['Class'] = Classes

# Feature matrix (X06) and target (Y06).
# 'Smiles', 'pXC50' and 'Class' are excluded from the features: the first is the structure
# string, the other two are the activity value and the label derived from it.
X06 = Data_06.drop(['Smiles', 'pXC50', 'Class'], axis=1)  # Feature matrix for model input
Y06 = Data_06['Class'].values.reshape(-1, 1)  # Target as an (n_samples, 1) column vector


In [None]:
# Balance the classes with SMOTE (Synthetic Minority Oversampling Technique).
# NOTE(review): oversampling is applied to the full dataset here, before the
# train/validation/test split in the next cell, so synthetic samples can end up in the
# evaluation sets. Consider resampling only the training split.

# 'minority' oversamples only the minority class; random_state=42 makes the result reproducible.
smote = SMOTE(sampling_strategy='minority', random_state=42)

# Resample the feature matrix and labels built in the data-loading cell (X06 / Y06).
# Y06 is stored as an (n, 1) column vector; ravel() hands the sampler a 1-D label array.
X_sm, y_sm = smote.fit_resample(X06, Y06.ravel())

# Verify the class balance directly on the resampled labels.
# (The train/validation/test subsets do not exist yet at this point in the notebook.)
y_s = list(y_sm)  # All resampled labels as a flat list

# Count occurrences of each class label (0 or 1) after resampling
pd.Series(y_s).value_counts()


In [None]:
# Partition the resampled data (X_sm, y_sm) into training, validation and test subsets.
# This is done with two consecutive random splits, both seeded with random_state=42.

# Split 1: hold out 20% of the samples (X_tes / Y_tes); the remaining 80% is the training set.
X_train, X_tes, Y_train, Y_tes = train_test_split(
    X_sm,
    y_sm,
    test_size=0.2,
    random_state=42,
)

# Split 2: cut the held-out 20% in half, giving the validation set and the final test set.
X_val, X_test, Y_val, Y_test = train_test_split(
    X_tes,
    Y_tes,
    test_size=0.5,
    random_state=42,
)

# Resulting subsets (fractions of the resampled data):
# - X_train, Y_train : training set,   ~80%
# - X_val,   Y_val   : validation set, ~10%
# - X_test,  Y_test  : test set,       ~10%


In [None]:
# Build a CatBoost gradient-boosting classifier and fit it on the training split,
# tracking accuracy on the validation split while it trains.

# Hyperparameters, collected in one place so they can be unpacked into the constructor
params = dict(
    iterations=3000,         # upper bound on the number of boosting iterations
    learning_rate=0.01,      # step size of each boosting update
    depth=7,                 # tree depth
    eval_metric='Accuracy',  # metric reported on the evaluation set
    verbose=200,             # log a progress line every 200 iterations
    od_type="Iter",          # overfitting detector type
    od_wait=1000,            # overfitting detector: iterations to wait after the best one
    random_seed=2,           # fixed seed for reproducibility
)

# Instantiate the classifier from the parameter dictionary
cat_model = CatBoostClassifier(**params)

# Fit on the training data.
# - eval_set: the validation split, evaluated during training
# - use_best_model=False: the fitted model is not cut back to the best validation iteration
# - plot=True: draw CatBoost's training/validation chart
cat_model.fit(
    X_train,
    Y_train,
    eval_set=(X_val, Y_val),
    use_best_model=False,
    plot=True,
)


In [None]:
# Score the fitted CatBoost classifier on the held-out test split.
# The report lists, for each class:
# - precision: share of predictions for that class that are correct
# - recall:    share of true members of that class that were found
# - f1-score:  harmonic mean of precision and recall
# - support:   number of true instances of that class in Y_test

# Predicted class labels for the test features
cat_test_pred = cat_model.predict(X_test)

# Build the text report from true vs. predicted labels, then print it
cat_report = metrics.classification_report(Y_test, cat_test_pred)
print(cat_report)


In [None]:
# Confusion matrix and SHAP feature-importance analysis for the CatBoost classifier.
# The confusion-matrix figure is completed (saved and shown) before any SHAP plotting starts,
# so every savefig call writes the figure it is named after.

# ---------------------------------------------------------------------------
# Part 1: confusion matrix on the test set
# ---------------------------------------------------------------------------

# Predict once; the same predictions are reused in the SHAP section below
all_preds = cat_model.predict(X_test)

# Set up the figure and its text styling
plt.figure(figsize=(10, 5))  # Define figure size for the plot
plt.xticks(size=17, weight='bold')  # x-axis tick label font size and weight
plt.yticks(size=17, weight='bold')  # y-axis tick label font size and weight
plt.title('Confusion Matrix for Catboost Algorithm', size=20, weight='bold')

# Draw the matrix of true labels (Y_test) vs. predicted labels as an annotated heatmap
sns.heatmap(
    confusion_matrix(Y_test, all_preds),
    annot=True,  # Write the count into each cell
    fmt='g',  # General number format (no scientific notation for the counts)
    annot_kws={"size": 20, 'weight': 'bold'}  # Annotation font size and weight
)

# Save the confusion matrix
# - dpi=300 sets the resolution for the saved image
# - bbox_inches='tight' removes extra whitespace around the plot
plt.savefig('Catboost CM 06', dpi=300, bbox_inches='tight')
plt.show()

# ---------------------------------------------------------------------------
# Part 2: SHAP (SHapley Additive exPlanations) analysis of the CatBoost model
# ---------------------------------------------------------------------------

# Step 1: Prepare data for SHAP analysis
X_df = pd.DataFrame(X_test)  # Test features as a DataFrame
x_df = X_df.copy(deep=True)  # Working copy used for the SHAP computation

# Second copy carrying the model predictions in an extra column ('1st')
x_df_1st = x_df.copy(deep=True)
x_df_1st['1st'] = all_preds

# Replace the index inherited from the split with a clean 0..n-1 index
x_df = x_df.reset_index(drop=True)
x_df_1st = x_df_1st.reset_index(drop=True)

# Step 2: SHAP values from TreeExplainer
shap_values = shap.TreeExplainer(cat_model).shap_values(x_df)

# Step 3: SHAP summary (dot) plot limited to the 10 highest-ranked features
shap.summary_plot(shap_values, x_df, plot_size=(10, 10), show=False, plot_type='dot', max_display=10)
plt.title('SHAP for CatBoost', weight='bold', size=20)
plt.xticks(size=20, weight='bold')
plt.yticks(size=20, weight='bold')
plt.savefig('SHAP 06 Classify', dpi=300, bbox_inches='tight')
plt.show()

# Step 4: Mean absolute SHAP value per feature (averaged over the first axis of shap_values)
feature_imp = np.mean(np.abs(shap_values), axis=0)

# Step 5: Positions of the 10 largest importance scores, largest first
ind = feature_imp.argsort()[-10:][::-1]

# Show the top features with their scores
# (bare expressions in the middle of a cell are never displayed, so print explicitly)
top_names = [str(col) for col in np.array(x_df.columns)[ind]]
top_features = pd.Series(feature_imp[ind], index=top_names, name='mean_abs_shap')
print(top_features)

# Step 6: Bar chart of the top 10 features.
# `order` must list the category labels shown on the x-axis; passing the integer positions
# (`ind`) instead would not match the feature names being plotted.
plt.figure(figsize=(10, 8))
plot = sns.barplot(x=top_names, y=feature_imp[ind], color=(0.1, 0.2, 0.1), order=top_names)
plt.yticks(size=15, weight='bold')
plt.xticks(size=20, rotation=20, weight='bold', horizontalalignment='center')
plt.ylabel('Shap feature absolute importance', size=15, weight='bold')
plt.savefig('SHAP Feature Importance CB 06', dpi=100, bbox_inches='tight')
plt.show()


In [None]:
# Build, summarise and train a 1-D convolutional network (CNN) for the binary classification task.
# L2 weight penalties and a dropout layer are used as regularisation.

# Initialize a Sequential model
CNN = Sequential()

# Conv block: two Conv1D layers (64 filters, kernel size 3, ReLU, L2 penalty 0.001).
# The first layer declares the per-sample input shape (1024, 1).
# NOTE(review): X_train comes from the split of X_sm, a 2-D table; confirm it has 1024
# feature columns and is accepted for (or reshaped to) this input shape.
CNN.add(Conv1D(filters=64, kernel_size=3, activation='relu', input_shape=(1024, 1), kernel_regularizer=l2(0.001)))
CNN.add(Conv1D(filters=64, kernel_size=3, activation='relu', kernel_regularizer=l2(0.001)))

# Down-sample along the sequence axis with a pooling window of 2
CNN.add(MaxPooling1D(pool_size=2))

# Flatten the convolutional output into one vector per sample for the Dense layers
CNN.add(Flatten())

# Fully connected layer (64 units, ReLU, L2 penalty), then 50% dropout
CNN.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))
CNN.add(Dropout(0.5))

# Output layer: one sigmoid unit, i.e. a score in (0, 1) for the positive class
CNN.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once val_loss has not improved for 5 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

# Save the model summary to a text file.
# An explicit UTF-8 encoding keeps the write from failing if the summary contains
# non-ASCII table characters and the platform default encoding cannot represent them.
with open('CNN_Classification.txt', 'w', encoding='utf-8') as f:
    CNN.summary(print_fn=lambda x: f.write(x + '\n'))

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
CNN.summary()

# Save a diagram of the architecture as 'my_first_model.png' (300 DPI).
# Uses the plot_model imported from keras.utils, the same package the layers come from.
plot_model(CNN, "my_first_model.png", dpi=300)

# Train for at most 100 epochs; the validation split (X_val, Y_val) drives early stopping
history = CNN.fit(X_train, Y_train, epochs=100, validation_data=(X_val, Y_val), callbacks=[early_stopping])


In [None]:
# Training curves for the CNN: loss and accuracy per epoch, training vs. validation.
# matplotlib.pyplot is already imported as `plt` in the import cell at the top of the notebook.


def _plot_history_metric(fit_history, metric, title, ylabel, legend_loc, out_name, dpi):
    """Plot one training metric against its validation counterpart, save the figure, and show it.

    Parameters:
        fit_history: object returned by `fit`; its `.history` dict is read.
        metric: key of the training series (e.g. 'loss'); the validation series is read
            from 'val_' + metric.
        title: figure title.
        ylabel: y-axis label.
        legend_loc: matplotlib legend location string.
        out_name: file name passed to `plt.savefig`.
        dpi: resolution of the saved image.
    """
    plt.figure(figsize=(8, 5))

    # Training series first, validation second: the legend labels below rely on this order
    plt.plot(fit_history.history[metric], linewidth=5, linestyle='-')
    plt.plot(fit_history.history['val_' + metric], linewidth=5, linestyle='-')

    # Title, axis labels and tick styling
    plt.title(title, size=20, weight='bold')
    plt.xlabel('Epoch', size=20, weight='bold')
    plt.ylabel(ylabel, size=20, weight='bold')
    plt.xticks(size=15, weight='bold')
    plt.yticks(size=15, weight='bold')

    plt.legend(['Train', 'Validation'], loc=legend_loc, prop={"size": 15, 'weight': 'bold'})

    # Save with a tight bounding box, then display
    plt.savefig(out_name, dpi=dpi, bbox_inches='tight')
    plt.show()


# Plot 1: training and validation loss over epochs (saved as 'CNN Loss 06')
_plot_history_metric(history, 'loss', 'CNN Model Loss', 'Loss', 'upper right', 'CNN Loss 06', 100)

# Plot 2: training and validation accuracy over epochs (saved as 'CNN Accuracy 06')
_plot_history_metric(history, 'accuracy', 'CNN Model Accuracy', 'Accuracy', 'lower right', 'CNN Accuracy 06', 300)


In [None]:
# Evaluate the CNN on the held-out test split.
# Flow: model scores -> hard 0/1 labels -> per-class classification report.

# Raw outputs of the network's sigmoid output unit for every test instance
preds = CNN.predict(X_test)

# Hard labels: 1 where the score is at least 0.5, otherwise 0
y_pred_binary = np.where(preds >= 0.5, 1, 0)

# Precision, recall, F1-score and support per class: true labels (Y_test) vs. thresholded predictions
cnn_report = metrics.classification_report(Y_test, y_pred_binary)
print(cnn_report)


In [None]:
# Confusion-matrix heatmap for the CNN's thresholded test predictions.

# Counts of true vs. predicted labels
cnn_cm = confusion_matrix(Y_test, y_pred_binary)

# Heatmap appearance:
# - annot / fmt='g': write each count into its cell in general number format
# - cmap='Blues': blue colour scale
# - annot_kws: font size and weight of the cell annotations
heatmap_style = dict(
    annot=True,
    fmt='g',
    cmap='Blues',
    annot_kws={"size": 20, 'weight': 'bold'},
)

# Figure, tick styling and title are set up before the heatmap is drawn onto the current axes
plt.figure(figsize=(10, 5))
plt.xticks(size=17, weight='bold')
plt.yticks(size=17, weight='bold')
plt.title('Confusion Matrix for CNN', size=20, weight='bold')

sns.heatmap(cnn_cm, **heatmap_style)

# Save as 'CNN CM 06' with a tight bounding box, then display
plt.savefig('CNN CM 06', dpi=100, bbox_inches='tight')
plt.show()


In [None]:
# Build, summarise, train and evaluate a fully connected network (ANN) for the binary
# classification task, regularised with L2 weight penalties and dropout.

# Initialize a Sequential model for a linear stack of layers
ANN = Sequential()

# Hidden layer 1: 128 units, ReLU, L2 penalty 0.001; declares 1024 input features
# (assumes the feature matrix has 1024 columns - confirm against the dataset)
ANN.add(Dense(128, input_dim=1024, activation='relu', kernel_regularizer=l2(0.001)))

# 50% dropout after the first hidden layer
ANN.add(Dropout(0.5))

# Hidden layer 2: 64 units, ReLU, L2 penalty 0.001
ANN.add(Dense(64, activation='relu', kernel_regularizer=l2(0.001)))

# 50% dropout after the second hidden layer
ANN.add(Dropout(0.5))

# Output layer: one sigmoid unit, i.e. a score in (0, 1) for the positive class
ANN.add(Dense(1, activation='sigmoid'))

# Binary cross-entropy loss, Adam optimizer, accuracy as the reported metric
ANN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Stop training once val_loss has not improved for 5 consecutive epochs
early_stopping = EarlyStopping(monitor='val_loss', patience=5, verbose=1, mode='min')

# Save the model summary to a text file.
# An explicit UTF-8 encoding keeps the write from failing if the summary contains
# non-ASCII table characters and the platform default encoding cannot represent them.
with open('ANN_Classification.txt', 'w', encoding='utf-8') as f:
    ANN.summary(print_fn=lambda x: f.write(x + '\n'))

# summary() prints the table itself and returns None, so it is called directly;
# wrapping it in print() would add a stray "None" line.
ANN.summary()

# Train for at most 100 epochs with batch_size=32.
# Early stopping is driven by the validation split (X_val, Y_val), matching the CNN cell.
# The test split is kept out of training so the evaluation below is not influenced by
# the stopping decision.
history2 = ANN.fit(X_train, Y_train, epochs=100, batch_size=32, verbose=1, validation_data=(X_val, Y_val), callbacks=[early_stopping])

# Final evaluation on the untouched test split: evaluate() returns the loss and the accuracy
loss, accuracy = ANN.evaluate(X_test, Y_test, verbose=1)
print("Accuracy:", accuracy)  # Display the test accuracy


In [None]:
# Plotting library used for the training-curve figures below
import matplotlib.pyplot as plt

# Training curves of the ANN classifier recorded in history2.
# Each entry describes one figure:
#   (training key, validation key, title, y-axis label, legend position, output file)
curve_specs = [
    ('loss', 'val_loss', 'ANN Model Loss', 'Loss', 'upper right', 'ANN Loss 06'),
    ('accuracy', 'val_accuracy', 'ANN Model Accuracy', 'Accuracy', 'lower right', 'ANN Accuracy 06'),
]

for train_key, val_key, fig_title, y_label, legend_loc, out_file in curve_specs:
    plt.figure(figsize=(8, 5))

    # Training curve first, validation curve second, so the legend order
    # ('Train', 'Validation') matches the plotted lines
    plt.plot(history2.history[train_key], linewidth=5, linestyle='-')
    plt.plot(history2.history[val_key], linewidth=5, linestyle='-')

    # Title and axis labels
    plt.title(fig_title, size=20, weight='bold')
    plt.xlabel('Epoch', size=20, weight='bold')
    plt.ylabel(y_label, size=20, weight='bold')

    # Tick label styling
    plt.xticks(size=15, weight='bold')
    plt.yticks(size=15, weight='bold')

    plt.legend(['Train', 'Validation'], loc=legend_loc, prop={"size": 15, 'weight': 'bold'})

    # Write the figure to disk (no extension given, so matplotlib's default
    # output format is used) and then render it in the notebook
    plt.savefig(out_file, dpi=100, bbox_inches='tight')
    plt.show()


In [None]:
# ANN classifier: test-set evaluation
# Predict on the test set, threshold the sigmoid outputs into class labels,
# print a classification report, and draw the confusion matrix.

# Probability scores from the sigmoid output layer, one per test instance
preds = ANN.predict(X_test)

# Threshold at 0.5: scores >= 0.5 become class 1, everything else class 0
y_pred_binary = (preds >= 0.5).astype(int)

# Per-class precision, recall, F1-score and support: true labels vs. thresholded predictions
print(metrics.classification_report(Y_test, y_pred_binary))

# Confusion matrix of true vs. predicted labels, computed once up front
ann_conf_matrix = confusion_matrix(Y_test, y_pred_binary)
ann_cell_font = {"size": 20, 'weight': 'bold'}  # font of the numbers drawn inside the cells

plt.figure(figsize=(10, 5))
plt.title('Confusion Matrix for ANN', size=20, weight='bold')

# Tick styling is applied before the heatmap is drawn, as in the original cell
plt.xticks(size=17, weight='bold')
plt.yticks(size=17, weight='bold')

# Heatmap of the counts: annot=True writes each count into its cell using the
# general number format ('g'); 'Blues' shades cells by magnitude
sns.heatmap(
    ann_conf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    annot_kws=ann_cell_font,
)

# Save the figure to disk, then render it in the notebook
plt.savefig('ANN CM 06', dpi=100, bbox_inches='tight')
plt.show()


In [None]:
# Random Forest classifier: confusion matrix on the test set
# Shows how many test samples of each true class were assigned to each predicted class.

# Predict once and tabulate true vs. predicted labels
rf_test_labels = RF.predict(X_test)
rf_conf_matrix = confusion_matrix(Y_test, rf_test_labels)
rf_cell_font = {"size": 20, 'weight': 'bold'}  # font of the numbers drawn inside the cells

plt.figure(figsize=(10, 5))
plt.title('Confusion Matrix for Random Forest', size=20, weight='bold')

# Tick styling is applied before the heatmap is drawn, as in the original cell
plt.xticks(size=17, weight='bold')
plt.yticks(size=17, weight='bold')

# Heatmap of the counts: annot=True writes each count into its cell using the
# general number format ('g'); 'Blues' shades cells by magnitude
sns.heatmap(
    rf_conf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    annot_kws=rf_cell_font,
)

# Save the figure to disk, then render it in the notebook
plt.savefig('RF CM 06', dpi=100, bbox_inches='tight')
plt.show()


In [None]:
# SVM classifier: train, evaluate, and plot the confusion matrix
# Relies on X_train, Y_train, X_test and Y_test already existing in the notebook;
# nothing is loaded in this cell.

# Build and fit the model in one step (fit() returns the fitted estimator).
# - kernel='rbf': radial basis function kernel for non-linear decision boundaries
# - class_weight='balanced': weights classes inversely to their frequency
# - probability=True: enables probability estimates on the fitted model
SVC = svm.SVC(
    kernel='rbf',
    class_weight='balanced',
    probability=True,
).fit(X_train, Y_train)

# Class predictions for every test instance
y_pred = SVC.predict(X_test)

# Per-class precision, recall, F1-score and support: true labels vs. predictions
print(metrics.classification_report(Y_test, y_pred))

# Confusion matrix of true vs. predicted labels
svm_conf_matrix = confusion_matrix(Y_test, y_pred)
svm_cell_font = {"size": 20, 'weight': 'bold'}  # font of the numbers drawn inside the cells

plt.figure(figsize=(10, 5))
plt.title('Confusion Matrix for SVM', size=20, weight='bold')

# Tick styling is applied before the heatmap is drawn, as in the original cell
plt.xticks(size=17, weight='bold')
plt.yticks(size=17, weight='bold')

# Heatmap of the counts, each annotated with its value
sns.heatmap(
    svm_conf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    annot_kws=svm_cell_font,
)

# Save the figure to disk, then render it in the notebook
plt.savefig('SVM CM 06', dpi=100, bbox_inches='tight')
plt.show()


In [None]:
# SVC classifier: confusion matrix on the test set
# Shows how many test samples of each true class were assigned to each predicted class.

# Predict once and tabulate true vs. predicted labels
svc_test_labels = SVC.predict(X_test)
svc_conf_matrix = confusion_matrix(Y_test, svc_test_labels)
svc_cell_font = {"size": 20, 'weight': 'bold'}  # font of the numbers drawn inside the cells

plt.figure(figsize=(10, 5))
plt.title('Confusion Matrix for SVC', size=20, weight='bold')

# Tick styling is applied before the heatmap is drawn, as in the original cell
plt.xticks(size=17, weight='bold')
plt.yticks(size=17, weight='bold')

# Heatmap of the counts: annot=True writes each count into its cell using the
# general number format ('g'); 'Blues' shades cells by magnitude
sns.heatmap(
    svc_conf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    annot_kws=svc_cell_font,
)

# Save the figure to disk, then render it in the notebook
plt.savefig('SVC CM 06', dpi=100, bbox_inches='tight')
plt.show()


# Regression Analysis

In [None]:
# Set up data for the regression task

# Step 1: Select the samples with pXC50 >= 6 from Data_06.
# This must happen BEFORE the features and target are derived below: the
# original cell built X_Pos / Y_Pos first and only then assigned Data_Positive,
# which fails on a fresh kernel (or silently uses a stale frame left over from
# an earlier run).
Data_Positive = Data_06[Data_06['pXC50'] >= 6]

# Step 2: Feature matrix — every column except 'Smiles', 'Class' and the
# target 'pXC50'
X_Pos = Data_Positive.drop(['Smiles', 'Class', 'pXC50'], axis=1)

# Replace the column names with consecutive integers 0..n_features-1
new_clm_heads = list(range(len(X_Pos.columns)))
X_Pos.columns = new_clm_heads

# Step 3: Target — the 'pXC50' values as a 1-D pandas Series with a fresh
# 0..n-1 index (equivalent to the original reshape round-trip through a 2-D array)
Y_Pos = pd.Series(Data_Positive['pXC50'].to_numpy())

# Display Data_Positive to verify the filtered data
Data_Positive


In [None]:
# Train / validation / test split for the regression task (90% / 5% / 5%)
# Both splits are stratified on the second column of the label frame and use
# the same fixed random_state so the partition is reproducible.
# NOTE(review): `df` is defined outside this cell; later cells read its first
# column (.iloc[:, 0]) as the regression target — confirm it is row-aligned with X_Pos.

# Split 1: hold out 10% of the samples; the remaining 90% is the training set
X_train, X_tes, Y_train, Y_tes = train_test_split(
    X_Pos,
    df,
    test_size=0.1,
    random_state=42,
    stratify=df.iloc[:, 1],
)

# Split 2: divide the held-out 10% in half -> validation set and final test set
X_val, X_test, Y_val, Y_test = train_test_split(
    X_tes,
    Y_tes,
    test_size=0.5,
    random_state=42,
    stratify=Y_tes.iloc[:, 1],
)

# Resulting subsets:
# - X_train, Y_train: training set (90% of the data)
# - X_val,   Y_val:   validation set (5% of the data)
# - X_test,  Y_test:  test set (5% of the data)


In [None]:
# CatBoost regression model: configure, train with a validation set, and
# report RMSE and R² on the training, validation and test splits

# Model configuration
params = dict(
    iterations=5000,       # maximum number of boosting iterations
    learning_rate=0.01,    # step size of each boosting update
    depth=8,               # depth of each tree
    eval_metric='R2',      # metric tracked on the evaluation set
    verbose=200,           # log progress every 200 iterations
    od_type="Iter",        # overfitting detector based on iterations without improvement
    od_wait=1000,          # iterations to keep going after the best metric value before stopping
    random_seed=8,         # fixed seed for reproducibility
)

cat_model = CatBoostRegressor(**params)

# Train on the training split; the validation split drives the overfitting
# detector. The first column of the label frames is the regression target.
# - use_best_model=True keeps only the trees up to the iteration with the
#   best validation score (stopping itself is governed by od_type / od_wait)
# - plot=True draws the training / validation curves interactively
cat_model.fit(
    X_train,
    Y_train.iloc[:, 0],
    eval_set=(X_val, Y_val.iloc[:, 0]),
    use_best_model=True,
    plot=True,
)

# Predictions for each split
pred_train = cat_model.predict(X_train)
pred_val = cat_model.predict(X_val)
pred_test = cat_model.predict(X_test)

# (actual, predicted) pairs in reporting order: training, validation, test
cat_eval_pairs = [
    (Y_train.iloc[:, 0], pred_train),
    (Y_val.iloc[:, 0], pred_val),
    (Y_test.iloc[:, 0], pred_test),
]

# RMSE per split: square root of the mean squared residual
for cat_actual, cat_predicted in cat_eval_pairs:
    print(np.sqrt(np.mean((cat_actual - cat_predicted) ** 2)))

# R² per split: proportion of target variance explained (higher is better)
print('\n')
for cat_actual, cat_predicted in cat_eval_pairs:
    print(r2_score(cat_actual, cat_predicted))


In [None]:
# SHAP analysis of the CatBoost regressor: per-feature contributions on the
# test set, a summary (dot) plot, and a bar chart of the ten most important features

# Step 1: Predictions on the test set
all_preds = cat_model.predict(X_test)

# Step 2: Prepare the data for SHAP
# Work on copies so X_test itself is never modified
X_df = pd.DataFrame(X_test)
x_df = X_df.copy(deep=True)
x_df_1st = x_df.copy(deep=True)

# Keep the predictions next to the features for reference
x_df_1st['1st'] = all_preds

# Replace the inherited row labels with a clean 0..n-1 index
x_df = x_df.reset_index(drop=True)
x_df_1st = x_df_1st.reset_index(drop=True)

# Step 3: SHAP values from TreeExplainer for the fitted CatBoost model
shap_values = shap.TreeExplainer(cat_model).shap_values(x_df)

# Step 4: SHAP summary plot (dot style) limited to the 10 highest-impact features
# show=False keeps the figure open so it can be titled, restyled and saved
shap.summary_plot(shap_values, x_df, plot_size=(10, 10), show=False, plot_type='dot', max_display=10)
plt.title('SHAP for CatBoost', weight='bold', size=20)
plt.xticks(size=20, weight='bold')
plt.yticks(size=20, weight='bold')
plt.savefig('Reg SHAP 06', dpi=100, bbox_inches='tight')

# Step 5: Overall importance = mean absolute SHAP value per feature (averaged over samples)
feature_imp = np.mean(np.abs(shap_values), axis=0)
# One importance score per feature column is required for the ranking below
assert feature_imp.shape == (x_df.shape[1],), feature_imp.shape

# Step 6: Indices of the 10 most important features, most important first
ind = feature_imp.argsort()[-10:][::-1]

# Show the top-10 feature names and their scores. They are printed explicitly
# because a bare expression in the middle of a cell produces no output.
top_feature_names = np.array(x_df.columns)[ind]
print(top_feature_names)
print(feature_imp[ind])

# Step 7: Bar chart of the top features by mean absolute SHAP value
plt.figure(figsize=(10, 8))
plot = sns.barplot(x=top_feature_names, y=feature_imp[ind], color=[0.1, 0.2, 0.1])

# Centre the x tick labels under their bars
plot.set_xticklabels(plot.get_xticklabels(), horizontalalignment='center', size=12)

# Tick and axis-label styling
plt.yticks(size=15, weight='bold')
plt.xticks(size=20, rotation=20, weight='bold')
plt.ylabel('SHAP Absolute Feature Importance', size=15, weight='bold')

# Save the figure to disk, then render it in the notebook
plt.savefig('Reg SHAP Feature Importance CB 06', dpi=100, bbox_inches='tight')
plt.show()


In [None]:
# Sequential ANN for regression: five L2-regularised ReLU layers followed by a
# single linear output unit, trained with early stopping on the validation split

# Step 1: Initialize the Sequential model
ANN_reg = models.Sequential()

# Step 2: Hidden layers (ReLU, L2 penalty 0.01 on the kernel weights)
# The input width is taken from the training data instead of being hard-coded
# (the original assumed exactly 1024 features), so the cell keeps working if
# the feature set changes.
n_input_features = X_train.shape[1]
ANN_reg.add(layers.Dense(100, input_dim=n_input_features, activation='relu', kernel_regularizer=l2(0.01)))

# Remaining hidden layers: 200 -> 300 -> 100 -> 10 units
for hidden_units in (200, 300, 100, 10):
    ANN_reg.add(layers.Dense(hidden_units, activation='relu', kernel_regularizer=l2(0.01)))

# Output layer: one unit with no activation, i.e. an unbounded regression output
ANN_reg.add(layers.Dense(1))

# Step 3: Compile with Adam and mean squared error (also reported as a metric)
ANN_reg.compile(optimizer='adam', loss='mse', metrics=['mse'])

# Step 4: Early stopping on the validation loss
# - patience=25: stop after 25 consecutive epochs without improvement
# - restore_best_weights=True: roll back to the weights of the best epoch;
#   otherwise the model would keep the weights from 25 epochs past its best
#   validation loss, and the predictions below would come from that worse state
early_stopping = EarlyStopping(monitor='val_loss', patience=25, verbose=1, mode='min', restore_best_weights=True)
callbacks = [early_stopping]

# Step 5: Train for at most 500 epochs with batches of 50 samples.
# The first column of the label frames is the regression target.
# NOTE(review): this rebinds `history2` and `early_stopping`, which the
# classification section above also uses — re-run that section before
# re-plotting its training curves.
history2 = ANN_reg.fit(X_train, Y_train.iloc[:, 0], epochs=500, batch_size=50, validation_data=(X_val, Y_val.iloc[:, 0]), callbacks=callbacks)

# Step 6: Predictions on the training, validation and test splits
# NOTE(review): Keras returns these as column vectors of shape (n, 1), unlike
# the 1-D arrays produced by the tree models — flatten them before subtracting
# from a pandas Series.
pred_train = ANN_reg.predict(X_train)  # Predictions on training data
pred_val = ANN_reg.predict(X_val)      # Predictions on validation data
pred_test = ANN_reg.predict(X_test)    # Predictions on test data


In [None]:
# Random Forest regression model: train, score, and interpret with SHAP

# Step 1: Define and train the Random Forest
# - n_estimators=3000: number of trees in the forest
# - max_depth=12: cap on the depth of each tree
# - random_state=42: fixed seed so the forest (and the scores and SHAP values
#   below) is reproducible across runs, like the other models in this section
# - n_jobs=-1: build the trees on all available cores; does not affect the result
rf_default = RandomForestRegressor(n_estimators=3000, max_depth=12, random_state=42, n_jobs=-1)
rf_default.fit(X_train, Y_train.iloc[:, 0])  # first label column is the regression target

# Step 2: R² (coefficient of determination) on each split
print(rf_default.score(X_train, Y_train.iloc[:, 0]))  # R² score for training set
print(rf_default.score(X_val, Y_val.iloc[:, 0]))      # R² score for validation set
print(rf_default.score(X_test, Y_test.iloc[:, 0]))    # R² score for test set

# Step 3: Predictions for each split
pred_train = rf_default.predict(X_train)
pred_val = rf_default.predict(X_val)
pred_test = rf_default.predict(X_test)

# Step 4: Prepare the test data for SHAP
all_preds = rf_default.predict(X_test)
X_df = pd.DataFrame(X_test)

# Work on copies so X_test itself is never modified
x_df = X_df.copy(deep=True)
x_df_1st = x_df.copy(deep=True)
x_df_1st['1st'] = all_preds  # keep the predictions next to the features for reference

# Replace the inherited row labels with a clean 0..n-1 index
x_df = x_df.reset_index(drop=True)
x_df_1st = x_df_1st.reset_index(drop=True)

# Step 5: SHAP values from TreeExplainer for the fitted Random Forest
shap_values = shap.TreeExplainer(rf_default).shap_values(x_df)

# Step 6: SHAP summary plot (dot style) limited to the 10 highest-impact features
# show=False keeps the figure open so it can be titled, restyled and saved
shap.summary_plot(shap_values, x_df, plot_size=(10, 10), show=False, plot_type='dot', max_display=10)
plt.title('SHAP for Random Forest', weight='bold', size=20)
plt.xticks(size=20, weight='bold')
plt.yticks(size=20, weight='bold')
plt.savefig('RF Reg SHAP 06', dpi=100, bbox_inches='tight')

# Step 7: Overall importance = mean absolute SHAP value per feature (averaged over samples)
feature_imp = np.mean(np.abs(shap_values), axis=0)
# One importance score per feature column is required for the ranking below
assert feature_imp.shape == (x_df.shape[1],), feature_imp.shape

# Step 8: Indices of the 10 most important features, most important first
ind = feature_imp.argsort()[-10:][::-1]

# Show the top-10 feature names and their scores. They are printed explicitly
# because a bare expression in the middle of a cell produces no output.
top_feature_names = np.array(x_df.columns)[ind]
print(top_feature_names)
print(feature_imp[ind])

# Step 9: Bar chart of the top features by mean absolute SHAP value
plt.figure(figsize=(10, 8))
plot = sns.barplot(x=top_feature_names, y=feature_imp[ind], color=[0.1, 0.2, 0.1])

# Centre the x tick labels under their bars
plot.set_xticklabels(plot.get_xticklabels(), horizontalalignment='center', size=12)

# Tick and axis-label styling
plt.yticks(size=15, weight='bold')
plt.xticks(size=20, rotation=20, weight='bold')
plt.ylabel('SHAP Absolute Feature Importance', size=15, weight='bold')

# Save the figure to disk, then render it in the notebook.
# NOTE(review): the "CB" in this file name looks carried over from the CatBoost
# cell; it is left unchanged in case other material refers to the file.
plt.savefig('RF Reg SHAP Feature Importance CB 06', dpi=100, bbox_inches='tight')
plt.show()


In [None]:
# Gradient Boosting Regressor (GBT) model development and tuning

# Step 1: Select the regression target WITHOUT overwriting Y_train / Y_test.
# The original cell rebound Y_train to its 'Values' column, which
#   - made the cell fail when run a second time,
#   - broke every later `Y_train.iloc[:, 0]` (including the one in Step 9), and
#   - scored against the whole Y_test frame instead of the target column.
# NOTE(review): the other regression cells select the target with .iloc[:, 0];
# 'Values' is presumably that same first column — confirm.
y_train_values = Y_train['Values']
y_test_values = Y_test['Values']

# Step 2: Baseline Gradient Boosting Regressor with default hyperparameters
Ada = GradientBoostingRegressor(random_state=42)
Ada.fit(X_train, y_train_values)

# Step 3: R² of the baseline model on the test set.
# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print('Baseline GBT test R2:', Ada.score(X_test, y_test_values))

# Step 4: Hyperparameter grid
# - max_depth: depth of each tree
# - n_estimators: number of boosting stages
# - learning_rate: shrinkage applied to each tree's contribution
param_grid = {
    'max_depth': [4, 6, 10],
    'n_estimators': [30, 100, 1000, 3000],
    'learning_rate': [0.1, 0.01, 0.001]
}

# Step 5: Fresh estimator for the grid search
ada = GradientBoostingRegressor(random_state=42)

# Step 6: Exhaustive grid search with 10-fold cross-validation, optimising R²
# and using all CPU cores. This fits 36 parameter combinations x 10 folds, so
# expect a long run time.
grid_search = GridSearchCV(estimator=ada, param_grid=param_grid, cv=10, n_jobs=-1, verbose=2, scoring='r2')

# Step 7: Run the search on the training data
grid_search.fit(X_train, y_train_values)

# Best parameter combination and the estimator refitted with it
print('Best GBT parameters:', grid_search.best_params_)
best_grid_GBT = grid_search.best_estimator_
print(best_grid_GBT)

# Step 8: R² of the tuned model on the test set
print('Tuned GBT test R2:', best_grid_GBT.score(X_test, y_test_values))

# Step 9: Final GBT model with manually chosen hyperparameters
# (n_estimators=3000, max_depth=12); this replaces the baseline bound to `Ada`
Ada = GradientBoostingRegressor(n_estimators=3000, max_depth=12, random_state=42)
Ada.fit(X_train, Y_train.iloc[:, 0])

# Step 10: R² of the final model on the test set
print('Final GBT test R2:', Ada.score(X_test, Y_test.iloc[:, 0]))

# Step 11: Predictions on the training, validation and test splits
pred_train = Ada.predict(X_train)
pred_val = Ada.predict(X_val)
pred_test = Ada.predict(X_test)


In [None]:
# Support Vector Regression (SVR) model: train and predict on all three splits

# Build and fit the model in one step (fit() returns the fitted estimator).
# - kernel='rbf': radial basis function kernel for non-linear relationships
# - degree=3: only used by polynomial kernels; left at its default value here
# - C=3: regularization parameter (larger values fit the training data more closely)
# The first column of Y_train is the regression target.
SVR = svm.SVR(
    kernel='rbf',
    degree=3,
    C=3,
).fit(X_train, Y_train.iloc[:, 0])

# Predictions on the training, validation and test splits, in that order
pred_train, pred_val, pred_test = (
    SVR.predict(feature_split) for feature_split in (X_train, X_val, X_test)
)
