# Serotonin 3D GNN Project


This project builds upon research done by Łapińska et al. (2024): https://doi.org/10.3390/pharmaceutics16030349

Data used: https://ftp.ebi.ac.uk/pub/databases/chembl/ChEMBLdb/releases/chembl_35/

Move the unpacked chembl_35_sqlite.tar.gz file into the data/ dir.

The research linked above presents two Quantitative Structure-Activity Relationship (QSAR) models to predict serotonergic binding affinity and selectivity, respectively, using Mordred molecular 2D descriptors. Specifically, one model classifies compounds binarily as "active" or "inactive", with a cutoff of pKi = 7. Another model does multiclass classification to predict the serotonergic selectivity of compounds previously classified as "active".

I am following a similar approach, but using 3D molecular graph representations instead of 2D molecular descriptors as input modality and using only the ChEMBL database, not ZINC.


## Google Colab Setup


### Configuration


In [1]:
from pathlib import Path

IN_COLAB = False

PATH_NOTEBOOK = (
    Path("/content/drive/MyDrive/Colab Notebooks/serotonin-3d-gnn.ipynb")
    if IN_COLAB
    else Path(
        "/Users/paul/Library/CloudStorage/GoogleDrive-unoutsch@gmail.com/My Drive/Colab Notebooks/serotonin-3d-gnn.ipynb"
    )
)
PATH_REPO = (
    Path("/content/drive/MyDrive/Repositories/serotonin-3d-gnn")
    if IN_COLAB
    else Path.cwd()
)
PATH_DATA = PATH_REPO / "data"

### Syncing Google Drive with Google Colab Content


In [2]:
if IN_COLAB:
    from google.colab import drive

    drive.mount("/content/drive")

### Installing Requirements


In [5]:
%pip install -r "$PATH_REPO/requirements.txt"

Note: you may need to restart the kernel to use updated packages.


## Imports


In [38]:
import os
import pandas as pd
from rdkit import Chem
from rdkit.Chem import AllChem
import shutil
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

## Utils


### Syncing this file between Colab and local Git repo


Make sure the paths exist.


In [7]:
if IN_COLAB:
    shutil.copyfile(PATH_NOTEBOOK, PATH_REPO / "serotonin-3d-gnn.ipynb")
else:
    shutil.copyfile(PATH_REPO / "serotonin-3d-gnn.ipynb", PATH_NOTEBOOK)

## Data


### Note on Data Aquisition from chembl_35.db

In order to collect the desired data from the ChEMBL SQL database and transform it into a .csv file, I undertook the steps detailed in `data/README.md`.


### Loading the Data


#### If dataframe exists: Load pickled dataframe


In [None]:
pickle_file_path = PATH_DATA / "serotonin_binding_df_3d_mol.pkl"

if os.path.exists(pickle_file_path):
    df = pd.read_pickle(pickle_file_path)
    print("DataFrame loaded from pickle.")
else:
    print("Pickle file not found. Please generate the DataFrame and save it first.")

/Users/paul/My Drive/Repositories/serotonin-3d-gnn/data/serotonin_binding_df_3d_mol.pkl
DataFrame loaded from pickle.


#### Otherwise: Create dataframe from .csv file


In [None]:
if df is None:
    df = pd.read_csv(PATH_DATA / "serotonin_binding_summary.csv")
# remove columns that have less than 1000 non-NaN values
df = df.dropna(axis=1, thresh=1000)
df.describe()

Unnamed: 0,molecule_id,Serotonin 1a (5-HT1a) receptor,Serotonin 1b (5-HT1b) receptor,Serotonin 1d (5-HT1d) receptor,Serotonin 2 (5-HT2) receptor,Serotonin 2a (5-HT2a) receptor,Serotonin 2b (5-HT2b) receptor,Serotonin 2c (5-HT2c) receptor,Serotonin 3a (5-HT3a) receptor,Serotonin 4 (5-HT4) receptor,Serotonin 6 (5-HT6) receptor,Serotonin 7 (5-HT7) receptor
count,23456.0,9462.0,1492.0,1472.0,1469.0,7378.0,2337.0,4343.0,1040.0,1009.0,4221.0,3100.0
mean,1003325.0,7.258523,6.952528,7.554968,7.053201,6.995423,6.603829,6.81021,7.04752,7.645809,7.311171,6.977487
std,898658.3,1.152004,1.226482,1.36588,1.159567,1.138558,0.981462,1.032874,1.535413,1.179482,1.143388,1.016128
min,97.0,4.0,4.0,4.0,4.03,4.0,4.19,4.0,4.0,5.0,4.12,4.0
25%,229157.0,6.48,6.05,6.47,6.24,6.16,5.9,6.05,5.7,6.81,6.47,6.285
50%,575761.5,7.28,6.85,7.64,6.92,6.94,6.523333,6.74,7.185,7.64,7.36,6.99
75%,1965967.0,8.06,7.85,8.7,8.0,7.8,7.21,7.5125,8.41,8.4,8.11,7.7
max,2881244.0,11.0,10.0,10.7,10.3,11.0,10.1,10.7,10.4,10.8,10.4,10.0


Appending the 3D Graphical Representations to the Dataframe using RDKit.


In [None]:
def mol_to_graph(smiles: str) -> Data:
    print(f"Converting SMILES '{smiles}'")

    # getting RDKit molecule object
    mol = Chem.MolFromSmiles(smiles)

    if mol is None:
        return None

    # add explicit hydrogen atoms to the molecule (are not included in the SMILES string) so that its 3D structure is complete
    mol = Chem.AddHs(mol)

    # EmbedMolecule positions atoms of mol in 3D space stochastically; if it fails (returning -1) return None
    if AllChem.EmbedMolecule(mol, randomSeed=42) == -1:
        return None

    # optimize the 3D structure using Universal Force Field (UFF) to lower mol's energy
    AllChem.UFFOptimizeMolecule(mol)

    # conformer contains 3D coordinates for mol's atoms
    conformer = mol.GetConformer()

    # atom-level features and 3D positions
    atom_features, positions = [], []
    for atom in mol.GetAtoms():
        features = [
            atom.GetAtomicNum(),  # atomic number: uid of element (e.g., 6 for carbon, 8 for oxygen)
            atom.GetDegree(),  # degree: number of bonds connecting the atom
            atom.GetFormalCharge(),  # formal charge: atom's electrical charge
            int(
                atom.GetHybridization()
            ),  # hybridization: type of atom's orbital hybridization (e.g., sp, sp2) as int
        ]
        atom_features.append(features)

        # 3D coordinates of atom from conformer
        pos = conformer.GetAtomPosition(atom.GetIdx())
        positions.append([pos.x, pos.y, pos.z])

    # transform to PyTorch tensors
    x = torch.tensor(atom_features, dtype=torch.float)
    pos = torch.tensor(positions, dtype=torch.float)

    # bonds between atoms – indices of connected atoms as well as types and conjugation
    edge_index, edge_attr = [], []
    for bond in mol.GetBonds():
        # indices of bonded atoms
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()

        bond_feat = [
            bond.GetBondTypeAsDouble(),  # bond type as float (e.g., 1.0 for single, 2.0 for double bonds)
            (
                1.0 if bond.GetIsConjugated() else 0.0
            ),  # conjugation flag indicating whether the bond is conjugated (1.0 if true, else 0.0) (conjugated means that electrons are delocalized, moving freely between multiple atoms)
        ]
        # for undirected graph, add bond in both directions
        edge_index += [[i, j], [j, i]]
        edge_attr += [bond_feat, bond_feat]

    # transform to PyTorch tensors
    # edge_index tensor is transposed to fit PyTorch Geometric's expected shape (2, number_of_edges).
    edge_index = torch.tensor(edge_index, dtype=torch.long).t().contiguous()
    edge_attr = torch.tensor(edge_attr, dtype=torch.float)

    # graph as PyTorch Geometric Data object
    # x: atom features, [atomic number, degree, formal charge, hybridization]
    # pos: 3D positions of atoms, [x, y, z]
    # edge_index: connectivity indices between atoms, [[i, j], [j, i]]
    # edge_attr: features per bond, [[bond type, conjugation], [bond type, conjugation]]
    return Data(x=x, pos=pos, edge_index=edge_index, edge_attr=edge_attr)

In [None]:
df["3d_mol"] = df["canonical_smiles"].apply(mol_to_graph)

Save the dataframe as a pickle file.


In [30]:
if not os.path.exists(pickle_file_path):
    df.to_pickle(PATH_DATA / "serotonin_binding_df_3d_mol.pkl")
    print("DataFrame loaded from pickle.")

Create training and test sets.


In [69]:
data_graph = df["3d_mol"].to_list()

df_targets = df.drop(columns=["molecule_id", "canonical_smiles", "3d_mol"])
print(f"Number of targets oper sample: {len(df_targets.columns)}")

targets = [df_targets.iloc[i].values for i in range(len(df_targets))]
print(f"Number of samples: {len(targets)}")
print(f"Example target vector: {targets[0]}")

for data, target in zip(data_graph, targets):
    data.y = torch.tensor(target)

split_idx = int(0.8 * len(data_graph))

data_graph_train = DataLoader(data_graph[:split_idx], batch_size=32, shuffle=True)
data_graph_test = DataLoader(data_graph[split_idx:], batch_size=32, shuffle=False)

print(
    f"# training graphs: {len(data_graph_train)}\n# test graphs: {len(data_graph_test)}"
)

Number of targets oper sample: 11
Number of samples: 23456
Example target vector: [5.44333333        nan 9.48       4.38       5.82       5.2825
        nan        nan        nan        nan        nan]


AttributeError: 'NoneType' object has no attribute 'y'

## Models


### Model 1: PyTorch Implementation of a 3D GCN

In this section, a 3D graph convolutional network is created using PyTorch. The model takes as input a 3D molecular graph and outputs predictions of the serotonergic binding affinity of the molecule.

Information about the graph input the model will receive and process:

-   The feature matrix H contains the node (atom) features. Each row corresponds to a node, and each column corresponds to a feature.
-   The adjacency matrix A is built from the edge_index tensor, which contains the indices of the edges in the graph. The matrix A is built under the hood of the GCNConv class.


#### Model Architecture


In [47]:
from torch_geometric.nn import GCNConv, global_mean_pool
from torch.nn import Linear
import torch.nn.functional as F


class SeroGCN(torch.nn.Module):
    def __init__(self, n_in, n_hidden, n_out):
        super(SeroGCN, self).__init__()

        self.conv1 = GCNConv(n_in, n_hidden)
        self.conv2 = GCNConv(n_hidden, n_hidden)

        self.fc = Linear(n_hidden, n_out)

    def forward(self, mol: Data):
        x, pos, edge_index, edge_attr = mol.x, mol.pos, mol.edge_index, mol.edge_attr

        x = self.conv1(x, edge_index)  # TODO: include pos and edge_attr in computation
        x = F.relu(x)

        x = self.conv2(x, edge_index)
        x = F.relu(x)

        x = global_mean_pool(
            x, mol.batch
        )  # global mean pooling aggregates node features, returning a single graph-level vectorial representation

        x = self.fc(x)

        return x

### Model 2: Pretrained 3D GNN (...)


## Training


In [56]:
epochs = 10
n_in = data_graph_train.dataset[0].num_features
n_hidden = 32
n_out = len(df.columns) - 3  # excluding molecule_id, canonical_smiles, and 3d_mol
print(f"Node features: {n_in}, number of outputs: {n_out}")

Node features: 4, number of outputs: 21


In [48]:
if torch.cuda.is_available():
    print("Using CUDA")
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    print("Using MPS")
    device = torch.device("mps")
else:
    print("Using CPU")
    device = torch.device("cpu")

Using MPS


In [58]:
def train(model: torch.nn.Module, data_loader: DataLoader, optimizer, criterion):
    model.train()

    for epoch in range(epochs):
        for data in data_loader:
            data = data.to(device)
            optimizer.zero_grad()
            out = model(data)
            loss = criterion(out, data.y)
            loss.backward()
            optimizer.step()

        print(f"Epoch {epoch}: Loss = {loss.item()}")

In [59]:
sero_gcn = SeroGCN(n_in=n_in, n_hidden=n_hidden, n_out=n_out).to(device)
sero_gcn_optimizer = torch.optim.Adam(sero_gcn.parameters(), lr=0.01)
sero_gcn_criterion = torch.nn.MSELoss()

In [60]:
train(sero_gcn, data_graph_train, sero_gcn_optimizer, sero_gcn_criterion)

AttributeError: 'NoneType' object has no attribute 'size'