# Chapter 11. Deep Learning in Chemistry

## 11.3. Application of Deep Learning in Chemistry

MLP networks can be used to model a wide variety of chemical systems. In the following sections, we will apply MLPs to predict molecular properties and reaction outcomes.

### 11.3.1. Prediction of Molecular Properties

#### 11.3.1.1. Regression

In this section, we will build an MLP network to predict the solubility of molecules:

**a. Import modules**

In [None]:
# Import modules
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from rdkit import Chem
from rdkit.Chem import Descriptors
import matplotlib.pyplot as plt
from tqdm import tqdm

**b. Load data**

In [None]:
# Read the solubility dataset from disk and preview its first rows
data_file_path = './datasets/Solubility.csv'
df = pd.read_csv(filepath_or_buffer=data_file_path)
df.head()

In [None]:
# Pull the SMILES strings (model inputs) and the solubility values
# (regression target) out of the DataFrame as NumPy arrays
smiles_arr, y = (df[column].to_numpy() for column in ('smiles', 'solubility'))

**c. Feature extraction**

In [None]:
# Define a function to extract a list of features from a molecule
def extract_features(mol):
    """Return a list of nine RDKit descriptor values computed for ``mol``.

    The values are returned in the order in which the descriptor functions
    are listed below.
    """
    descriptor_fns = (
        Descriptors.MolWt,           # molecular weight
        Descriptors.NumHeteroatoms,  # number of heteroatoms
        Descriptors.RingCount,       # number of rings
        Descriptors.NumHAcceptors,   # number of hydrogen bond acceptors
        Descriptors.NumHDonors,      # number of hydrogen bond donors
        Descriptors.FractionCSP3,    # fraction of SP3-hybridized carbons
        Descriptors.TPSA,            # topological polar surface area
        Descriptors.MolLogP,         # partition coefficient
        Descriptors.MolMR,           # molar refractivity
    )
    return [descriptor(mol) for descriptor in descriptor_fns]

In [None]:
# Build the feature matrix, one row of descriptors per molecule
x = []

# Iterate over the SMILES strings with a progress bar
n_molecules = len(smiles_arr)
pbar = tqdm(smiles_arr)
for i, smiles in enumerate(pbar):
    # Parse the SMILES string into a molecule object
    mol = Chem.MolFromSmiles(smiles)

    # Compute the descriptors and store them as a new row
    features = extract_features(mol)
    x.append(features)

    # Report progress
    pbar.set_description('{}/{} molecules processed |'.format(i + 1, n_molecules))

# Stack the rows into a 2-D NumPy array
x = np.array(x)

**d. Data processing**

In [None]:
# Seed NumPy and PyTorch (CPU and CUDA) with the same value so that
# runs are repeatable
random_seed = 0
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(random_seed)

In [None]:
# Create separate min-max scalers for the inputs and the output, both
# mapping values into the [0, 1] range.
# NOTE(review): both scalers are fitted on the full dataset before it is
# split into train/validation/test sets, so held-out data influences the
# scaling; consider fitting on the training split only.
input_scaler, output_scaler = (MinMaxScaler(feature_range=(0, 1)) for _ in range(2))

# Scale the inputs; the 1-D target is reshaped to a single column for the
# scaler and flattened back to 1-D afterwards
x_scaled = input_scaler.fit_transform(x)
y_column = y.reshape(-1, 1)
y_scaled = output_scaler.fit_transform(y_column).reshape(-1)

In [None]:
# Hold out 20% of the data as the test set
x_train_scaled, x_test_scaled, y_train_scaled, y_test_scaled = train_test_split(
    x_scaled,
    y_scaled,
    test_size=0.2,
    random_state=random_seed,
)

# Carve the validation set out of the remaining data: 25% of the remaining
# 80% is 20% of the full dataset, giving a 60/20/20 split overall
x_train_scaled, x_val_scaled, y_train_scaled, y_val_scaled = train_test_split(
    x_train_scaled,
    y_train_scaled,
    test_size=0.25,
    random_state=random_seed,
)

**e. Create model**

In [None]:
# Define the regression model class
class RegressionModel(nn.Module):
    """Fully connected network (MLP) with ReLU hidden layers and a linear output.

    Args:
        n_inputs: Number of input features.
        n_layers: Number of hidden layers. At least one hidden layer is
            always created, even when a smaller value is given.
        n_hiddens: Number of units in every hidden layer.
        n_outputs: Number of output values.
    """

    def __init__(self, n_inputs, n_layers, n_hiddens, n_outputs):
        super().__init__()
        # Layer widths from the input through the last hidden layer
        widths = [n_inputs] + [n_hiddens] * max(n_layers, 1)
        self.hiddens = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )
        self.output = nn.Linear(n_hiddens, n_outputs)

    def forward(self, x):
        """Apply every hidden layer followed by ReLU, then the output layer."""
        activations = x
        for layer in self.hiddens:
            activations = torch.relu(layer(activations))
        # No activation on the output: the raw linear value is returned
        return self.output(activations)

In [None]:
# Network hyperparameters: the input width follows the number of feature columns
n_inputs = x_train_scaled.shape[1]
n_layers, n_hiddens, n_outputs = 2, 32, 1

# Build the model
model = RegressionModel(
    n_inputs=n_inputs,
    n_layers=n_layers,
    n_hiddens=n_hiddens,
    n_outputs=n_outputs,
)

In [None]:
# Train on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the chosen device as float32
model.float().to(device)

# Convert the arrays to float32 tensors on the same device
# (y_test_scaled is left as a NumPy array because it is not fed to the model)
x_train_scaled, y_train_scaled, x_val_scaled, y_val_scaled, x_test_scaled = (
    torch.tensor(array, device=device, dtype=torch.float)
    for array in (x_train_scaled, y_train_scaled, x_val_scaled, y_val_scaled, x_test_scaled)
)

**f. Training**

In [None]:
# Mean squared error as the training objective, minimized with AdamW
criterion = nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3)

In [None]:
# Define the train function
def train(x, y):
    """Run one optimization step on ``(x, y)`` and return the loss as a float.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.
    """
    model.train()

    # Clear the gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    batch_loss = criterion(model(x), y)

    # Backpropagate and update the weights
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [None]:
# Define the validation function
def validation(x, y):
    """Return the loss on ``(x, y)`` as a float, without tracking gradients.

    Uses the module-level ``model`` and ``criterion``; no weights are updated.
    """
    model.eval()
    with torch.no_grad():
        return criterion(model(x), y).item()

In [None]:
# Per-epoch loss history, kept for plotting
train_losses, val_losses = [], []

# Targets as column vectors, matching the single-column model output
y_train_column = y_train_scaled.unsqueeze(1)
y_val_column = y_val_scaled.unsqueeze(1)

# Each epoch is one full-batch update followed by a validation pass
num_epochs = 1000
progress_bar = tqdm(range(num_epochs))
for epoch in progress_bar:
    train_loss = train(x_train_scaled, y_train_column)
    val_loss = validation(x_val_scaled, y_val_column)

    # Record both losses for the learning-curve plot
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Show the current losses next to the progress bar
    progress_bar.set_description(f'Epoch [{epoch+1}/{num_epochs}], Train loss: {train_loss:.4f}, Validation loss: {val_loss:.4f}')

In [None]:
# Visualize the training and validation MSE loss over the epochs
plt.plot(train_losses, label='train')
plt.plot(val_losses, label='validation')
plt.xlabel('epoch')
plt.ylabel('MSE loss')
plt.legend()  # without labels the two curves cannot be told apart
plt.show()

**g. Evaluation**

In [None]:
# Set the model to evaluation mode
model.eval()

# Forward the test set; gradients are not needed for inference, so the
# autograd graph is not built
with torch.no_grad():
    y_pred_scaled = model(x_test_scaled)

# Scale output back to original range
y_pred = output_scaler.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).reshape(-1)

# Transform the test set back to original scale
y_test = output_scaler.inverse_transform(y_test_scaled.reshape(-1, 1)).reshape(-1)

In [None]:
# Compute the regression metrics on the test set (original scale)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print every metric with four decimal places
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R^2 Score", r2),
):
    print(f"{label}: {value:.4f}")

In [None]:
# Plot predicted against actual values
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', edgecolor='k', alpha=0.7, s=40)
ax.plot(y_test, y_test, color='red', linewidth=2)  # perfect predictions fall on this line
ax.set_xlabel('Actual Solubility')
ax.set_ylabel('Predicted Solubility')
ax.set_title('Predictions vs Actual')
ax.grid(True)
plt.show()

**h. Save and load model**

In [None]:
# Save the model weights (state dict) to a checkpoint file named after the
# model and the number of training epochs
model_name = 'mlp_model'
file_name = f'{model_name}_{num_epochs}.ckpt'
state_dict = model.state_dict()
torch.save(obj=state_dict, f=file_name)

In [None]:
# Load model: rebuild the architecture, then restore the saved weights
file_name = f'{model_name}_1000.ckpt'
loaded_model = RegressionModel(n_inputs, n_layers, n_hiddens, n_outputs)
# map_location='cpu' lets a checkpoint saved from a GPU model be loaded on a
# CPU-only machine; the freshly built model lives on the CPU anyway
loaded_model.load_state_dict(torch.load(file_name, map_location='cpu'))

#### 11.3.1.2. Classification

In this section, we will build an MLP network to predict whether a molecule can penetrate the blood-brain barrier.

**a. Import modules**

In [None]:
# Import modules
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, ConfusionMatrixDisplay
from rdkit import Chem
from rdkit.Avalon import pyAvalonTools
from rdkit.Chem import Descriptors
import matplotlib.pyplot as plt
from tqdm import tqdm

**b. Load data**

In [None]:
# Read the blood-brain barrier penetration dataset and preview its first rows
data_file_path = './datasets/BBBP.csv'
df = pd.read_csv(filepath_or_buffer=data_file_path)
df.head()

In [None]:
# Pull the SMILES strings (model inputs) and the 'p_np' column
# (classification target) out of the DataFrame as NumPy arrays
smiles_arr, output_arr = (df[column].to_numpy() for column in ('smiles', 'p_np'))

**c. Feature extraction**

In [None]:
# Define a function to extract a feature vector from a molecule
def extract_features(mol):
    """Return the 512-bit Avalon fingerprint of ``mol`` as a NumPy array."""
    return np.array(pyAvalonTools.GetAvalonFP(mol, nBits=512))

In [None]:
# Create the list of features for molecules, together with the matching labels
x = []
y = []
n_skipped = 0

# Loop through the SMILES list
pbar = tqdm(range(len(smiles_arr)))
for i in pbar:
    # Update the progress text first so that skipped molecules are counted too
    pbar.set_description('{}/{} molecules processed |'.format(i + 1, len(smiles_arr)))

    # Get the SMILES for each molecule
    smiles = smiles_arr[i]

    # Create a molecule object from the SMILES; RDKit returns None when the
    # string cannot be parsed, so skip those molecules explicitly
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        n_skipped += 1
        continue

    # Get the fingerprint (skip the molecule if it can't be computed).
    # Exception is caught instead of using a bare except so that
    # KeyboardInterrupt can still stop the loop.
    try:
        features = extract_features(mol)
    except Exception:
        n_skipped += 1
        continue

    # Append input and label together so that x and y stay aligned
    x.append(features)
    y.append(output_arr[i])

# Convert list to numpy array
x = np.array(x)
y = np.array(y)

# Report how many molecules were left out instead of dropping them silently
print(f'{n_skipped} molecules skipped')

**d. Data processing**

In [None]:
# Seed NumPy and PyTorch (CPU and CUDA) with the same value so that
# runs are repeatable
random_seed = 0
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(random_seed)

In [None]:
# Apply a variance threshold (0.01) to remove constant and near-constant
# input columns.
# NOTE(review): the variance filter and the PCA are fitted on the full dataset
# before the train/validation/test split, so held-out data influences the
# preprocessing; consider fitting them on the training split only.
var_thresholder = VarianceThreshold(threshold=0.01)
x_var_thresh = var_thresholder.fit_transform(x)

# Apply PCA to reduce the filtered features to 32 components
pca = PCA(n_components=32)
x_pca = pca.fit_transform(x_var_thresh)

In [None]:
# Hold out 20% of the data as the test set
x_train, x_test, y_train, y_test = train_test_split(
    x_pca,
    y,
    test_size=0.2,
    random_state=random_seed,
)

# Carve the validation set out of the remaining data: 25% of the remaining
# 80% is 20% of the full dataset, giving a 60/20/20 split overall
x_train, x_val, y_train, y_val = train_test_split(
    x_train,
    y_train,
    test_size=0.25,
    random_state=random_seed,
)

**e. Create model**

In [None]:
# Define the classification model class
class ClassificationModel(nn.Module):
    """Fully connected network (MLP) with sigmoid hidden layers and a linear output.

    The output layer has no activation, so ``forward`` returns raw logits.

    Args:
        n_inputs: Number of input features.
        n_layers: Number of hidden layers. At least one hidden layer is
            always created, even when a smaller value is given.
        n_hiddens: Number of units in every hidden layer.
        n_outputs: Number of output values.
    """

    def __init__(self, n_inputs, n_layers, n_hiddens, n_outputs):
        super().__init__()
        # Layer widths from the input through the last hidden layer
        widths = [n_inputs] + [n_hiddens] * max(n_layers, 1)
        self.hiddens = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )
        self.output = nn.Linear(n_hiddens, n_outputs)

    def forward(self, x):
        """Apply every hidden layer followed by a sigmoid, then the output layer."""
        activations = x
        for layer in self.hiddens:
            activations = torch.sigmoid(layer(activations))
        return self.output(activations)

In [None]:
# Network hyperparameters: the input width follows the number of feature columns
n_inputs = x_train.shape[1]
n_layers, n_hiddens, n_outputs = 3, 64, 1

# Build the model
model = ClassificationModel(
    n_inputs=n_inputs,
    n_layers=n_layers,
    n_hiddens=n_hiddens,
    n_outputs=n_outputs,
)

In [None]:
# Train on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the chosen device as float32
model.float().to(device)

# Convert the arrays to float32 tensors on the same device
# (y_test is left as a NumPy array because it is not fed to the model)
x_train, y_train, x_val, y_val, x_test = (
    torch.tensor(array, device=device, dtype=torch.float)
    for array in (x_train, y_train, x_val, y_val, x_test)
)

**f. Training**

In [None]:
# Binary cross-entropy computed directly on the raw logits, minimized with AdamW
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.AdamW(params=model.parameters(), lr=5e-2)

In [None]:
# Define the train function
def train(x, y):
    """Run one optimization step on ``(x, y)`` and return the loss as a float.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.
    """
    model.train()

    # Clear the gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    batch_loss = criterion(model(x), y)

    # Backpropagate and update the weights
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [None]:
# Define the validation function
@torch.no_grad()  # validation never backpropagates, so skip building the autograd graph
def validation(x, y):
    """Return the loss on ``(x, y)`` as a float, without updating any weights.

    Uses the module-level ``model`` and ``criterion``.
    """
    # Set the model to evaluation mode
    model.eval()

    # Forward pass
    outputs = model(x)
    loss = criterion(outputs, y)

    return loss.item()

In [None]:
# Per-epoch loss history, kept for plotting
train_losses, val_losses = [], []

# Targets as column vectors, matching the single-column model output
y_train_column = y_train.unsqueeze(1)
y_val_column = y_val.unsqueeze(1)

# Each epoch is one full-batch update followed by a validation pass
num_epochs = 100
progress_bar = tqdm(range(num_epochs))
for epoch in progress_bar:
    train_loss = train(x_train, y_train_column)
    val_loss = validation(x_val, y_val_column)

    # Record both losses for the learning-curve plot
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Show the current losses next to the progress bar
    progress_bar.set_description(f'Epoch [{epoch+1}/{num_epochs}], Train loss: {train_loss:.4f}, Validation loss: {val_loss:.4f}')

In [None]:
# Visualize the training and validation BCE-with-logits loss over the epochs
plt.plot(train_losses, label='train')
plt.plot(val_losses, label='validation')
plt.xlabel('epoch')
plt.ylabel('BCE loss with logits')
plt.legend()  # without labels the two curves cannot be told apart
plt.show()

**g. Evaluation**

In [None]:
# Set the model to evaluation mode
model.eval()

# Forward the test set; gradients are not needed for inference, so the
# autograd graph is not built
with torch.no_grad():
    logits = model(x_test)
    # The model outputs logits; the sigmoid turns them into probabilities
    probabilities = torch.sigmoid(logits)

# Round the probabilities to hard 0/1 class labels
y_pred = torch.round(probabilities).cpu().numpy().reshape(-1)

In [None]:
# Compute the confusion matrix and the classification metrics on the test set
cm = confusion_matrix(y_test, y_pred)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Print every metric with four decimal places
for label, value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1 Score", f1),
):
    print(f"{label}: {value:.4f}")

# Draw the confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot()

**h. Make prediction**

In [None]:
# Define a new molecule
mol = Chem.MolFromSmiles('CCO') # ethanol

# Extract features
features = extract_features(mol)
features = np.array(features)

# Apply the already-fitted variance threshold; the transformer expects a
# 2-D array, so the single fingerprint is reshaped to one row
features_var_thresh = var_thresholder.transform(features.reshape(1, -1))

# Apply the already-fitted PCA
features_pca = pca.transform(features_var_thresh)

# Convert to tensor
x_pred = torch.tensor(features_pca, device=device, dtype=torch.float)

# Run model forward in evaluation mode and without gradient tracking, so the
# cell does not depend on an earlier cell having called model.eval()
model.eval()
with torch.no_grad():
    logits = model(x_pred)
    probabilities = torch.sigmoid(logits)
y_pred = torch.round(probabilities).int().cpu().numpy().reshape(-1)[0]

# Show the prediction
print(y_pred)

**i. Save and load model**

In [None]:
# Save the model weights (state dict) to a checkpoint file named after the
# model and the number of training epochs
model_name = 'mlp_model'
file_name = f'./{model_name}_{num_epochs}.ckpt'
state_dict = model.state_dict()
torch.save(obj=state_dict, f=file_name)

In [None]:
# Load model: rebuild the architecture, then restore the saved weights
file_name = f'./{model_name}_100.ckpt'
loaded_model = ClassificationModel(n_inputs, n_layers, n_hiddens, n_outputs)
# map_location='cpu' lets a checkpoint saved from a GPU model be loaded on a
# CPU-only machine; the freshly built model lives on the CPU anyway
loaded_model.load_state_dict(torch.load(file_name, map_location='cpu'))

### 11.3.2. Prediction of Reaction Outcomes

In this section, we will build an MLP network to predict the yield of C-N coupling reactions (Buchwald-Hartwig reaction):

**a. Import modules**

In [None]:
# Import modules
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
import matplotlib.pyplot as plt
from tqdm import tqdm

**b. Load data**

In [None]:
# Read the Buchwald-Hartwig reaction yield dataset and preview its first rows
data_file_path = './datasets/BuchwaldHartwigReactionYield.csv'
df = pd.read_csv(filepath_or_buffer=data_file_path)
df.head()

In [None]:
# Pull the SMILES columns of the four reaction components out of the
# DataFrame as NumPy arrays
ligand_smiles_arr, additive_smiles_arr, base_smiles_arr, aryl_halide_smiles_arr = (
    df[column].to_numpy() for column in ('Ligand', 'Additive', 'Base', 'Aryl halide')
)

# The reaction yield is the regression target
y = df['Yield'].to_numpy()

**c. Feature extraction**

In [None]:
# Define a function to compute the fingerprint of a molecule
def extract_features(mol):
    """Return the 512-bit hashed atom-pair fingerprint of ``mol`` as a bit vector."""
    return rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol, nBits=512)

In [None]:
# Collect one concatenated fingerprint per reaction
x = []

# The four reaction components, in the order their fingerprints are concatenated
component_smiles = (ligand_smiles_arr, additive_smiles_arr, base_smiles_arr, aryl_halide_smiles_arr)

# Loop through the reactions with a progress bar
n_reactions = len(df)
pbar = tqdm(range(n_reactions))
for i in pbar:
    # Create molecule objects from their SMILES
    ligand, additive, base, aryl_halide = [
        Chem.MolFromSmiles(smiles_column[i]) for smiles_column in component_smiles
    ]

    # Get the atom-pairs fingerprint of each compound
    ligand_features, additive_features, base_features, aryl_halide_features = [
        extract_features(component) for component in (ligand, additive, base, aryl_halide)
    ]

    # Concatenate the fingerprints then add them to the list
    reaction_fingerprint = ligand_features + additive_features + base_features + aryl_halide_features
    x.append(reaction_fingerprint)

    # Report progress
    pbar.set_description('{}/{} reactions processed |'.format(i + 1, n_reactions))

# Convert list to numpy array
x = np.array(x)

**d. Data processing**

In [None]:
# Seed NumPy and PyTorch (CPU and CUDA) with the same value so that
# runs are repeatable
random_seed = 0
for seed_fn in (np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(random_seed)

In [None]:
# Apply a variance threshold (0.01) to remove constant and near-constant
# input columns
var_thresholder = VarianceThreshold(threshold=0.01)
x_var_thresh = var_thresholder.fit_transform(x)

# Define output scaler and scale output data; the 1-D target is reshaped to a
# single column for the scaler and flattened back afterwards
output_scaler = MinMaxScaler(feature_range=(0, 1))
y_scaled = output_scaler.fit_transform(y.reshape(-1, 1)).reshape(-1)

# Apply PCA to reduce the dimensionality of the variance-filtered features.
# PCA must be fitted on x_var_thresh, not on the raw x: fitting it on x would
# leave the variance filter above without any effect on the model inputs.
pca = PCA(n_components=64)
x_pca = pca.fit_transform(x_var_thresh)

In [None]:
# Hold out 20% of the data as the test set
x_train, x_test, y_train_scaled, y_test_scaled = train_test_split(
    x_pca,
    y_scaled,
    test_size=0.2,
    random_state=random_seed,
)

# Carve the validation set out of the remaining data: 25% of the remaining
# 80% is 20% of the full dataset, giving a 60/20/20 split overall
x_train, x_val, y_train_scaled, y_val_scaled = train_test_split(
    x_train,
    y_train_scaled,
    test_size=0.25,
    random_state=random_seed,
)

**e. Create model**

In [None]:
# Define the regression model class
class RegressionModel(nn.Module):
    """Fully connected network (MLP) with ReLU hidden layers and a linear output.

    Args:
        n_inputs: Number of input features.
        n_layers: Number of hidden layers. At least one hidden layer is
            always created, even when a smaller value is given.
        n_hiddens: Number of units in every hidden layer.
        n_outputs: Number of output values.
    """

    def __init__(self, n_inputs, n_layers, n_hiddens, n_outputs):
        super().__init__()
        # Layer widths from the input through the last hidden layer
        widths = [n_inputs] + [n_hiddens] * max(n_layers, 1)
        self.hiddens = nn.ModuleList(
            [nn.Linear(fan_in, fan_out) for fan_in, fan_out in zip(widths[:-1], widths[1:])]
        )
        self.output = nn.Linear(n_hiddens, n_outputs)

    def forward(self, x):
        """Apply every hidden layer followed by ReLU, then the output layer."""
        activations = x
        for layer in self.hiddens:
            activations = torch.relu(layer(activations))
        # No activation on the output: the raw linear value is returned
        return self.output(activations)

In [None]:
# Network hyperparameters: the input width follows the number of feature columns
n_inputs = x_train.shape[1]
n_layers, n_hiddens, n_outputs = 3, 256, 1

# Build the model
model = RegressionModel(
    n_inputs=n_inputs,
    n_layers=n_layers,
    n_hiddens=n_hiddens,
    n_outputs=n_outputs,
)

In [None]:
# Train on the GPU when one is available, otherwise on the CPU
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Move the model to the chosen device as float32
model.float().to(device)

# Convert the arrays to float32 tensors on the same device
# (y_test_scaled is left as a NumPy array because it is not fed to the model)
x_train, y_train_scaled, x_val, y_val_scaled, x_test = (
    torch.tensor(array, device=device, dtype=torch.float)
    for array in (x_train, y_train_scaled, x_val, y_val_scaled, x_test)
)

**f. Training**

In [None]:
# Mean squared error as the training objective, minimized with AdamW
criterion = nn.MSELoss()
optimizer = optim.AdamW(params=model.parameters(), lr=1e-3)

In [None]:
# Define the train function
def train(x, y):
    """Run one optimization step on ``(x, y)`` and return the loss as a float.

    Uses the module-level ``model``, ``criterion`` and ``optimizer``.
    """
    model.train()

    # Clear the gradients left over from the previous step
    optimizer.zero_grad()

    # Forward pass and loss
    batch_loss = criterion(model(x), y)

    # Backpropagate and update the weights
    batch_loss.backward()
    optimizer.step()

    return batch_loss.item()

In [None]:
# Define the validation function
def validation(x, y):
    """Return the loss on ``(x, y)`` as a float, without tracking gradients.

    Uses the module-level ``model`` and ``criterion``; no weights are updated.
    """
    model.eval()
    with torch.no_grad():
        return criterion(model(x), y).item()

In [None]:
# Per-epoch loss history, kept for plotting
train_losses, val_losses = [], []

# Targets as column vectors, matching the single-column model output
y_train_column = y_train_scaled.unsqueeze(1)
y_val_column = y_val_scaled.unsqueeze(1)

# Each epoch is one full-batch update followed by a validation pass
num_epochs = 1000
progress_bar = tqdm(range(num_epochs))
for epoch in progress_bar:
    train_loss = train(x_train, y_train_column)
    val_loss = validation(x_val, y_val_column)

    # Record both losses for the learning-curve plot
    train_losses.append(train_loss)
    val_losses.append(val_loss)

    # Show the current losses next to the progress bar
    progress_bar.set_description(f'Epoch [{epoch+1}/{num_epochs}], Train loss: {train_loss:.4f}, Validation loss: {val_loss:.4f}')

In [None]:
# Visualize the training and validation MSE loss over the epochs
plt.plot(train_losses, label='train')
plt.plot(val_losses, label='validation')
plt.xlabel('epoch')
plt.ylabel('MSE loss')
plt.legend()  # without labels the two curves cannot be told apart
plt.show()

**g. Evaluation**

In [None]:
# Set the model to evaluation mode
model.eval()

# Forward the test set; gradients are not needed for inference, so the
# autograd graph is not built
with torch.no_grad():
    y_pred_scaled = model(x_test)

# Scale output back to original range
y_pred = output_scaler.inverse_transform(y_pred_scaled.cpu().numpy().reshape(-1, 1)).reshape(-1)

# Transform the test set back to original scale
y_test = output_scaler.inverse_transform(y_test_scaled.reshape(-1, 1)).reshape(-1)

In [None]:
# Compute the regression metrics on the test set (original scale)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
rmse = math.sqrt(mse)
r2 = r2_score(y_test, y_pred)

# Print every metric with four decimal places
for label, value in (
    ("Mean Absolute Error (MAE)", mae),
    ("Mean Squared Error (MSE)", mse),
    ("Root Mean Squared Error (RMSE)", rmse),
    ("R^2 Score", r2),
):
    print(f"{label}: {value:.4f}")

In [None]:
# Plot predicted against actual values
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y_test, y_pred, color='blue', edgecolor='k', alpha=0.7, s=40)
ax.plot(y_test, y_test, color='red', linewidth=2)  # perfect predictions fall on this line
ax.set_xlabel('Actual Yield')
ax.set_ylabel('Predicted Yield')
ax.set_title('Predictions vs Actual')
ax.grid(True)
plt.show()

**h. Save and load model**

In [None]:
# Save the model weights (state dict) to a checkpoint file named after the
# model and the number of training epochs
model_name = 'mlp_rxn_model'
file_name = f'./{model_name}_{num_epochs}.ckpt'
state_dict = model.state_dict()
torch.save(obj=state_dict, f=file_name)

In [None]:
# Load model: rebuild the architecture, then restore the saved weights
file_name = f'./{model_name}_1000.ckpt'
loaded_model = RegressionModel(n_inputs, n_layers, n_hiddens, n_outputs)
# map_location='cpu' lets a checkpoint saved from a GPU model be loaded on a
# CPU-only machine; the freshly built model lives on the CPU anyway
loaded_model.load_state_dict(torch.load(file_name, map_location='cpu'))