In [1]:
import pkasolver as ps
from pkasolver import util
from pkasolver import analysis
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data
from torch_geometric.data import DataLoader
import random

# specify device
#device = 'cpu'
device = 'cuda'

#define PairData Class
class PairData(Data):
    def __init__(self, edge_index, x, edge_index2, x2):
        super(PairData, self).__init__()
        self.edge_index = edge_index
        self.x = x
        self.edge_index2 = edge_index2
        self.x2 = x2

    def __inc__(self, key, value):
        if key == 'edge_index':
            return self.x.size(0)
        if key == 'edge_index2':
            return self.x2.size(0)
        else:
            return super().__inc__(key, value)
        
# data = PairData(edge_index_p, x_p, edge_index_d, x_d)
# data_list = [data, data]
# loader = DataLoader(data_list, batch_size=2, follow_batch=['x_p', 'x_d'] )
# batch = next(iter(loader))


def make_nodes(mol, marvin_atom):
    x = []
    i = 0
    for atom in mol.GetAtoms():
        x.append(
            np.array(
                [
                    #atom.GetIdx() + num_atoms * i,
                    #float(atom.GetProp("_GasteigerCharge"))
                    atom.GetSymbol() == "C",
                    atom.GetSymbol() == "O",
                    atom.GetSymbol() == "N",
                    atom.GetSymbol() == "S",
                    atom.GetSymbol() == "P",
                    atom.GetSymbol() == "F",
                    atom.GetSymbol() == "Cl",
                    atom.GetSymbol() == "Br",
                    atom.GetSymbol() == "I",
                    atom.GetSymbol() == "As",
                    atom.GetSymbol() == "H",
                    atom.GetFormalCharge(),
                    atom.GetChiralTag(),
                    atom.GetHybridization(),
                    atom.GetNumExplicitHs(),
                    atom.GetIsAromatic(),
                    atom.GetTotalValence(),
                    atom.GetTotalDegree(),
                    i == int(marvin_atom)
                ]
            )
        )
        i += 1
    return torch.tensor(np.array([np.array(xi) for xi in x]), dtype=torch.float)

def make_edges_and_attr(mol):
    edges = []
    edge_attr = []
    for bond in mol.GetBonds():
        edges.append(
            np.array(
                [
                    [bond.GetBeginAtomIdx()],
                    [bond.GetEndAtomIdx()],
                ]
            )
        )
        edge_attr.append(
            [bond.GetBondTypeAsDouble(), bond.GetIsConjugated()]
        )
    edge_index = torch.tensor(np.hstack(np.array(edges)), dtype=torch.long)
    edge_attr = torch.tensor(np.array(edge_attr), dtype=torch.float)
    return edge_index, edge_attr

def Mol_to_PairData(prot, deprot, marvin_atom):
    x_p = make_nodes(prot, marvin_atom)
    edge_index_p, edge_attr_p = make_edges_and_attr(prot)
    
    x_d = make_nodes(deprot, marvin_atom)
    edge_index_d, edge_attr_d = make_edges_and_attr(deprot)
    
    data = PairData(edge_index_p, x_p, edge_index_d, x_d)
    data.edge_attr = edge_attr_p
    data.edge_attr2 = edge_attr_d
    return data.to(device=device)

In [2]:
data_folder_Bal = "../data/Baltruschat/"
SDFfile1 = data_folder_Bal + "combined_training_datasets_unique.sdf"
SDFfile2 = data_folder_Bal + "novartis_cleaned_mono_unique_notraindata.sdf"
SDFfile3 = data_folder_Bal + "AvLiLuMoVe_cleaned_mono_unique_notraindata.sdf"



df1 = ps.util.import_sdf(SDFfile1)
df2 = ps.util.import_sdf(SDFfile2)
df3 = ps.util.import_sdf(SDFfile3)

#Data corrections:
df1.marvin_atom[90] = "3"

df1 = util.conjugates_to_DataFrame(df1)
df1 = util.sort_conjugates(df1)
df1 = util.pka_to_ka(df1)
df1.head(10)

Unnamed: 0,pKa,marvin_pKa,marvin_atom,marvin_pKa_type,original_dataset,ID,smiles,protonated,deprotonated,ka
0,6.21,6.09,10,basic,['chembl25'],1702768,Brc1c(NC2CC2)nc(C2CC2)nc1N1CCCCCC1,,,6.16595e-07
1,7.46,8.2,9,basic,['chembl25'],273537,Brc1cc(Br)c(NC2=[NH+]CCN2)c(Br)c1,,,3.467369e-08
2,4.2,3.94,9,basic,['datawarrior'],7175,Brc1cc2cccnc2c2ncccc12,,,6.309573e-05
3,3.73,5.91,8,acidic,['datawarrior'],998,Brc1ccc(-c2nn[n-]n2)cc1,,,0.0001862087
4,11.0,8.94,13,basic,['chembl25'],560562,Brc1ccc(Br)c(N(CC2CC2)C2=[NH+]CCN2)c1,,,1e-11
5,5.05,7.57,7,basic,['datawarrior'],1138,Brc1ccc(C2[NH2+]CCS2)cc1,,,8.912509e-06
6,7.0,6.36,13,basic,['chembl25'],14699136,Brc1ccc(Cc2c3ccccc3nc3ccccc23)cc1,,,1e-07
7,4.62,5.73,9,acidic,['chembl25'],6372403,Brc1ccc(Cc2nn[n-]n2)cc1,,,2.398833e-05
8,11.2,8.96,14,basic,['chembl25'],575043,Brc1ccc(N(CC2CC2)C2=[NH+]CCN2)c(Br)c1,,,6.309573e-12
9,3.0,4.6,15,basic,['chembl25'],871891,Brc1ccc(N2CCc3ccccc3C2)nc1,,,0.001


In [3]:
n=2
print(dataset[n], '\n\n' ,dataset[n].x,'\n\n', dataset[n].edge_index, dataset[n].y)

NameError: name 'dataset' is not defined

In [4]:
#set Hyperparameters
train_test_split = 0.8
hidden_channels = 64
learning_rate = 0.001
batch_size = 64
num_epochs = 10000



In [5]:
#create pyG Dataset

dataset = []
for i in range(len(df1.index)):
    dataset.append(Mol_to_PairData(df1.protonated[i],df1.deprotonated[i], df1.marvin_atom[i]))
    dataset[i].y = torch.tensor([float(df1.pKa[i])], dtype=torch.float32, device=device)


print(dataset[0], '\n\n' ,dataset[0].x,'\n\n', dataset[0].edge_index, dataset[0].y)

#split train and test set

#random.shuffle(dataset)

split_length=int(len(dataset)*train_test_split)
train_dataset = dataset[:split_length]
test_dataset = dataset[split_length:]
#create Dataloader objects that contain batches 

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, follow_batch=['x', 'x2'])
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, follow_batch=['x', 'x2'])

PairData(edge_attr=[24, 2], edge_attr2=[24, 2], edge_index=[2, 24], edge_index2=[2, 24], x=[21, 19], x2=[21, 19], y=[1]) 

 

RuntimeError: CUDA error: no kernel image is available for execution on the device

In [None]:
def atomcheck(dataset):
    i=0
    for data in dataset:
        for row in data.x:
            if torch.sum(row[0:11]) != 1:
                print(i)
        i+=1
    print('done')
atomcheck(dataset)    

In [None]:
df1.loc[[1387,2866,5733]]

In [None]:
import torch
from torch.nn import Linear
import torch.nn.functional as F
from torch.nn import Sequential as Seq, Linear as Lin, ReLU
from torch_geometric.nn import GCNConv
from torch_geometric.nn import NNConv
from torch_geometric.nn import GraphConv
from torch_geometric.nn import global_mean_pool
from torch_geometric.nn import global_max_pool
from torch_geometric.nn import global_add_pool
from torch import optim

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(1)
        
        num_features = dataset[0].num_features
        num_edge_features = dataset[0].num_edge_features
        
        
        nn = Seq(Lin(num_edge_features, 16), ReLU(), Lin(16, dataset[0].num_node_features* 96))
        self.conv1 = NNConv(dataset[0].num_node_features, 96, nn=nn)
        nn = Seq(Lin(num_edge_features, 16), ReLU(), Lin(16, 96* hidden_channels))
        self.conv2 = NNConv(96, hidden_channels, nn=nn)
        nn = Seq(Lin(num_edge_features, 16), ReLU(), Lin(16, hidden_channels* hidden_channels))
        self.conv3 = NNConv(hidden_channels, hidden_channels, nn=nn)
        self.conv4 = NNConv(hidden_channels, hidden_channels, nn=nn)
        self.lin = Linear(hidden_channels, 1)

    def forward(self, x, edge_index, batch, edge_attr):
        # 1. Obtain node embeddings 
        x = self.conv1(x, edge_index, edge_attr)
        x = x.relu()
        x = self.conv2(x, edge_index, edge_attr)
        x = x.relu()
        x = self.conv3(x, edge_index, edge_attr)
        x = x.relu()
        x = self.conv4(x, edge_index, edge_attr)
        x = x.relu()
        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]

        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        x = x.relu() + 0.000001
        
        return x

model_p = GCN(hidden_channels=hidden_channels).to(device=device)
model_d = GCN(hidden_channels=hidden_channels).to(device=device)
print(model_p, model_d)

In [None]:
params = list(model_p.parameters()) + list(model_d.parameters())
optimizer = torch.optim.Adam(params, lr=learning_rate)
criterion = torch.nn.MSELoss()
criterion_v = torch.nn.L1Loss() # that's the MAE Loss
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, patience=5, verbose=True)

def train(loader):
    model_p.train()
    model_d.train()
    for data in loader:  # Iterate in batches over the training dataset. 
        prot_out = model_p(data.x, data.edge_index, data.x_batch,  data.edge_attr)  # Perform a single forward pass.
        #print(data.x, data.edge_index, data.x_batch,  data.edge_attr) 
        deprot_out = model_d(data.x2, data.edge_index2, data.x2_batch,  data.edge_attr2)
        #out = prot_out 
        out = torch.log10(torch.div(deprot_out, prot_out))
        #print('prot_out',out)
        loss = criterion(out.flatten(), data.y)  # Compute the loss.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad() # Clear gradients.
        
def test(loader):
    model_p.eval()
    model_d.eval()
    loss = torch.Tensor([0]).to(device=device)
    for data in loader:  # Iterate in batches over the training dataset.
        #data.to(device="cpu")
        prot_out = model_p(data.x, data.edge_index, data.x_batch,  data.edge_attr)  # Perform a single forward pass.
        deprot_out = model_d(data.x2, data.edge_index2, data.x2_batch,  data.edge_attr2)
        out = torch.log10(torch.div(deprot_out, prot_out))
        loss += criterion_v(out.flatten(), data.y)
    return loss/len(loader) # MAE loss of batches can be summed and divided by the number of batches
     
for epoch in range(0, num_epochs):
    if epoch != 0: 
        train(train_loader)
    train_acc = test(train_loader)
    test_acc = test(test_loader)
    if epoch % 10 == 0:
        print(f'Epoch: {epoch:03d}, Train MAE: {train_acc.item():.4f}, Test MAE: {test_acc.item():.4f}')

In [4]:
#!echo $HOSTNAME

a7pc98.pch.univie.ac.at
