Model taken from here
https://colab.research.google.com/drive/1fLJbFPz0yMCQg81DdCP5I8jXw9LoggKO?usp=sharing#scrollTo=F1op-CbyLuN4

In [None]:
import pkasolver as ps
from pkasolver import util
from pkasolver import analysis
import pandas as pd
import numpy as np
from torch_geometric.data import Data

In [None]:
data_folder_Bal = "../data/Baltruschat/"
SDFfile1 = data_folder_Bal + "combined_training_datasets_unique.sdf"
SDFfile2 = data_folder_Bal + "novartis_cleaned_mono_unique_notraindata.sdf"
SDFfile3 = data_folder_Bal + "AvLiLuMoVe_cleaned_mono_unique_notraindata.sdf"

df1 = ps.util.import_sdf(SDFfile1)
df2 = ps.util.import_sdf(SDFfile2)
df3 = ps.util.import_sdf(SDFfile3)

#Data corrections:
df1.marvin_atom[90] = "3"

df1 = util.conjugates_to_DataFrame(df1)
df1 = util.sort_conjugates(df1)
df1 = util.pka_to_ka(df1)
df1.head()

In [211]:
def mol_to_pyg(prot):
    """Take protonated molecules and return a Pytorch Geometric Data object."""
    i = 0
    num_atoms = prot.GetNumAtoms()
    nodes = []
    edges = []
    edges_attr = []

    for mol in [prot]:

        # ComputeGasteigerCharges(mol)

        for atom in mol.GetAtoms():
            nodes.append(
                list(
                    (
                        #atom.GetIdx() + num_atoms * i,
                        atom.GetSymbol() == "C",
                        atom.GetSymbol() == "O",
                        atom.GetSymbol() == "N",
                        atom.GetSymbol() == "P",
                        atom.GetSymbol() == "F",
                        atom.GetSymbol() == "Cl",
                        atom.GetSymbol() == "I",
                        atom.GetFormalCharge(),
                        atom.GetChiralTag(),
                        atom.GetHybridization(),
                        atom.GetNumExplicitHs(),
                        atom.GetIsAromatic(),
                        atom.GetTotalValence(),
                        atom.GetTotalDegree(),
                        # atom.GetProp("_GasteigerCharge")
                    )
                )
            )

        for bond in mol.GetBonds():
            edges.append(
                np.array(
                    [
                        [bond.GetBeginAtomIdx() + num_atoms * i],
                        [bond.GetEndAtomIdx() + num_atoms * i],
                    ]
                )
            )
            edges.append(
                np.array(
                    [
                        [bond.GetEndAtomIdx() + num_atoms * i],
                        [bond.GetBeginAtomIdx() + num_atoms * i],
                    ]
                )
            )
            bond_type = bond.GetBondTypeAsDouble()
            edges_attr.append(bond_type)
            edges_attr.append(bond_type)

        i += 1

    X = torch.tensor(np.array([np.array(xi) for xi in nodes]), dtype=torch.float)
    edge_index = torch.tensor(np.hstack(np.array(edges)), dtype=torch.long)
    edge_attr = torch.tensor(np.array(edges_attr).reshape(-1, 1), dtype=torch.float)

    return Data(x=X, edge_index=edge_index, edge_attr=edge_attr)

In [212]:
#create pyG Dataset
dataset = []
for i in range(len(df1.index)):
    dataset.append(mol_to_pyg(df1.protonated[i]))
    dataset[i].y = torch.tensor([float(df1.pKa[i])], dtype=torch.float32)

print(dataset[0], '\n\n' ,dataset[0].x,'\n\n', dataset[0].edge_index)

Data(edge_attr=[48, 1], edge_index=[2, 48], x=[21, 14], y=[1]) 

 tensor([[1., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 4.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 4.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 4.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 4.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 3., 0., 0., 3., 3.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 4.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 4.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 1., 4., 3.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 1., 4., 3.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 1., 4., 3.],
        [0., 0., 1., 0., 0., 0., 0., 1., 0., 3., 1., 1., 4., 3.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 3., 0., 1., 4., 3.],
        [0., 0., 1., 0., 0., 0., 0., 0., 0., 3., 0., 1., 3., 2.],
        [1., 0., 0., 0., 0., 0., 0., 0., 0., 4., 0., 0., 4., 4.],
        [1

In [214]:
#set Hyperparameters
train_test_split = 0.8
#hidden_channels = 32
#learning_rate = 0.01
batch_size = 10
#num_epochs = 200

In [215]:
#split train and test set
import random

#random.shuffle(dataset)

split_length=int(len(dataset)*train_test_split)
train_dataset = dataset[:split_length]
test_dataset = dataset[split_length:]

In [216]:
#create Dataloader objects that contain batches 
from torch_geometric.data import DataLoader

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

# for step, data in enumerate(train_loader):
#     print(f'Step {step + 1}:')
#     print('=======')
#     print(f'Number of graphs in the current batch: {data.num_graphs}')
#     print(data)
#     print()

In [234]:
import torch
import torch.nn.functional as F
from torch.nn import Linear
criterion = torch.nn.MSELoss()

from torch_geometric.nn import global_add_pool, GraphConv, global_max_pool

class Net(torch.nn.Module):
    def __init__(self, dim):
        super(Net, self).__init__()

        num_features = dataset[0].num_features
        self.dim = dim

        self.conv1 = GraphConv(num_features, dim)
        self.conv2 = GraphConv(dim, dim)
        self.conv3 = GraphConv(dim, dim)
        self.conv4 = GraphConv(dim, dim)
        self.conv5 = GraphConv(dim, dim)
        self.conv6 = GraphConv(dim, dim)

        self.fc1 = Linear(dim, dim)
        self.fc2 = Linear(dim, 1)

    def forward(self, x, edge_index, batch, edge_weight=None):
        x = F.relu(self.conv1(x, edge_index, edge_weight))
        x = F.relu(self.conv2(x, edge_index, edge_weight))
        x = F.relu(self.conv3(x, edge_index, edge_weight))
        x = F.relu(self.conv4(x, edge_index, edge_weight))
        x = F.relu(self.conv5(x, edge_index, edge_weight))
        x = F.relu(self.conv6(x, edge_index, edge_weight))
        x = global_max_pool(x, batch)
        x = F.relu(self.fc1(x))
        #x = F.dropout(x, p=0.5, training=self.training)
        x = self.fc2(x)
        return x

In [235]:
#device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch.device('cpu') # Because if torch.device=cuda --> RuntimeError: CUDA error: out of memory

model = Net(dim=96).to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

Net(
  (conv1): GraphConv(14, 96)
  (conv2): GraphConv(96, 96)
  (conv3): GraphConv(96, 96)
  (conv4): GraphConv(96, 96)
  (conv5): GraphConv(96, 96)
  (conv6): GraphConv(96, 96)
  (fc1): Linear(in_features=96, out_features=96, bias=True)
  (fc2): Linear(in_features=96, out_features=1, bias=True)
)


#### Overfitting on one Batch

In [236]:
print(data.x)

tensor([[1., 0., 0.,  ..., 0., 4., 4.],
        [1., 0., 0.,  ..., 0., 4., 4.],
        [1., 0., 0.,  ..., 0., 4., 4.],
        ...,
        [1., 0., 0.,  ..., 1., 4., 3.],
        [1., 0., 0.,  ..., 0., 4., 4.],
        [1., 0., 0.,  ..., 0., 4., 4.]])


In [237]:
data = next(iter(train_loader))

def train(data):
    model.train()

#    if epoch == 51:
#        for param_group in optimizer.param_groups:
#            param_group['lr'] = 0.5 * param_group['lr']
    
    data = data.to(device)
    optimizer.zero_grad()
    output = model(data.x, data.edge_index, data.batch, edge_weight = data.edge_attr)
    #print(output.shape,data.y.shape)
    loss = criterion(output, data.y)
    loss.backward()
    optimizer.step()
    return loss

def test(data):
    model.eval()
    
    data = data.to(device)
    out = model(data.x, data.edge_index, data.batch, edge_weight = data.edge_attr)
    loss = criterion(out, data.y)
    return loss

In [238]:
for epoch in range(1, 2001):
    train_loss = train(data)
    test_loss = test(data)
    print(f'Epoch: {epoch:03d}, Train MSE: {train_loss:.4f}, Test MSE: {test_loss:.4f}')

Epoch: 001, Train MSE: 51.3964, Test MSE: 157.6923
Epoch: 002, Train MSE: 157.6923, Test MSE: 21.8035
Epoch: 003, Train MSE: 21.8035, Test MSE: 17.1343
Epoch: 004, Train MSE: 17.1343, Test MSE: 29.3300
Epoch: 005, Train MSE: 29.3300, Test MSE: 18.8513
Epoch: 006, Train MSE: 18.8513, Test MSE: 11.5246
Epoch: 007, Train MSE: 11.5246, Test MSE: 17.6306
Epoch: 008, Train MSE: 17.6306, Test MSE: 13.7887
Epoch: 009, Train MSE: 13.7887, Test MSE: 10.8812
Epoch: 010, Train MSE: 10.8812, Test MSE: 12.1407
Epoch: 011, Train MSE: 12.1407, Test MSE: 11.8220
Epoch: 012, Train MSE: 11.8220, Test MSE: 10.4087
Epoch: 013, Train MSE: 10.4087, Test MSE: 10.7956
Epoch: 014, Train MSE: 10.7956, Test MSE: 10.7914
Epoch: 015, Train MSE: 10.7914, Test MSE: 9.4871
Epoch: 016, Train MSE: 9.4871, Test MSE: 9.9670
Epoch: 017, Train MSE: 9.9670, Test MSE: 9.8673
Epoch: 018, Train MSE: 9.8673, Test MSE: 8.6000
Epoch: 019, Train MSE: 8.6000, Test MSE: 9.7154
Epoch: 020, Train MSE: 9.7154, Test MSE: 8.1607
Epoch: 02

#### Fitting the whole Training and Test set

In [None]:
def train(epoch):
    model.train()

#    if epoch == 51:
#        for param_group in optimizer.param_groups:
#            param_group['lr'] = 0.5 * param_group['lr']

    for data in train_loader:
        data = data.to(device)
        optimizer.zero_grad()
        output = model(data.x, data.edge_index, data.batch)
        #print(output.shape,data.y.shape)
        loss = criterion(output, data.y)
        loss.backward()
        optimizer.step()
    return loss

def test(loader):
    model.eval()
    for data in loader:
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch) 
        loss = criterion(out, data.y)
    return loss

In [None]:
for epoch in range(1, 101):
    train_loss = train(epoch)
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    print(f'Epoch: {epoch:03d}, Train MSE: {train_acc:.4f}, Test MSE: {test_acc:.4f}')