In [1]:
import numpy as np
import pandas as pd
import torch
from rdkit import Chem
from sklearn import model_selection as sk_modelselection
from torch.nn import functional as F
from torch.utils.data import DataLoader, Dataset

from graphmodels import constants, datasets

# Pick the compute device once: CUDA when PyTorch can see a GPU, the CPU otherwise.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("GPU is available" if device == "cuda" else "Running on CPU")

GPU is available


# Load data

In [2]:
# Processed Delaney solubility table; the path is relative to the notebook's working directory.
DELANEY_CSV_PATH = "../datasets/chemistry/delaney-processed.csv"
sol_df = pd.read_csv(DELANEY_CSV_PATH)

In [3]:
# Parse every SMILES string into an RDKit molecule.
sol_df["mol"] = sol_df["smiles"].apply(Chem.MolFromSmiles)

# MolFromSmiles returns None for a SMILES it cannot parse. Fail here with the
# offending strings rather than with an opaque AttributeError from the
# GetNumAtoms() call below.
unparsed_smiles = sol_df.loc[sol_df["mol"].isna(), "smiles"]
if not unparsed_smiles.empty:
    raise ValueError(
        f"RDKit could not parse {len(unparsed_smiles)} SMILES string(s), "
        f"e.g. {unparsed_smiles.head().tolist()}"
    )

sol_df["num_atoms"] = sol_df["mol"].apply(lambda mol: mol.GetNumAtoms())

# Largest molecule in the dataset; used later as the padding size when batching.
# Cast to a plain Python int so downstream code does not receive a NumPy scalar.
MAX_NUM_ATOMS = int(sol_df["num_atoms"].max())
print(f"Max num atoms = {MAX_NUM_ATOMS}")

Max num atoms = 55


In [4]:
# Split data: hold out 20% of the rows for testing, then take 15% of the
# remaining rows as a validation set. Both splits shuffle with the same fixed seed.
train_valid_df, test_df = sk_modelselection.train_test_split(
    sol_df,
    test_size=0.2,
    random_state=42,
    shuffle=True,
)
train_df, valid_df = sk_modelselection.train_test_split(
    train_valid_df,
    test_size=0.15,
    random_state=42,
    shuffle=True,
)

In [5]:
# Show the (rows, columns) shape of each split, in train / test / valid order.
tuple(split_df.shape for split_df in (train_df, test_df, valid_df))

((766, 12), (226, 12), (136, 12))

In [6]:
# Preview the training split; pandas elides the middle rows of a long frame in the rendered table.
train_df

Unnamed: 0,Compound ID,ESOL predicted log solubility in mols per litre,Minimum Degree,Molecular Weight,Number of H-Bond Donors,Number of Rings,Number of Rotatable Bonds,Polar Surface Area,measured log solubility in mols per litre,smiles,mol,num_atoms
456,Etomidate,-3.359,1,244.294,0,2,4,44.12,-4.735,CCOC(=O)c1cncn1C(C)c2ccccc2,<rdkit.Chem.rdchem.Mol object at 0x7f1811e6d5b0>,18
938,isoguanine,-1.740,1,151.129,3,2,0,100.71,-3.401,Nc1nc(O)nc2nc[nH]c12,<rdkit.Chem.rdchem.Mol object at 0x7f1811e7aa40>,11
925,RTI 6,-3.335,1,313.361,2,3,4,81.59,-3.360,CCN2c1nc(N(C)(CCO))ccc1NC(=O)c3cccnc23,<rdkit.Chem.rdchem.Mol object at 0x7f1811e7a490>,23
810,Tetrahydropyran,-0.978,2,86.134,0,1,0,9.23,-0.030,C1CCOCC1,<rdkit.Chem.rdchem.Mol object at 0x7f1811e771b0>,6
495,Minoxidil,-1.809,1,209.253,2,2,1,95.11,-1.989,Nc1cc(nc(N)n1=O)N2CCCCC2,<rdkit.Chem.rdchem.Mol object at 0x7f1811e6e6c0>,15
...,...,...,...,...,...,...,...,...,...,...,...,...
837,Metoxuron,-2.683,1,228.679,1,1,2,41.57,-2.564,COc1ccc(NC(=O)N(C)C)cc1Cl,<rdkit.Chem.rdchem.Mol object at 0x7f1811e77d80>,15
587,benodanil,-4.245,1,323.133,1,2,2,29.10,-4.210,c1c(NC(=O)c2ccccc2(I))cccc1,<rdkit.Chem.rdchem.Mol object at 0x7f1811e70f90>,16
776,1-Methylcyclohexene,-2.574,1,96.173,0,1,0,0.00,-3.270,CC1=CCCCC1,<rdkit.Chem.rdchem.Mol object at 0x7f1811e762d0>,7
153,1-Chloropentane,-2.294,1,106.596,0,0,3,0.00,-2.730,CCCCCCl,<rdkit.Chem.rdchem.Mol object at 0x7f1811e65150>,6


# Create datasets

In [8]:
# Build one NeuralFingerprintDataset per split from the SMILES strings and the
# measured log-solubility column (construction order: train, valid, test).
TARGET_COLUMN = "measured log solubility in mols per litre"

train_dset, valid_dset, test_dset = (
    datasets.NeuralFingerprintDataset(
        smiles=tuple(split_df["smiles"]),
        targets=tuple(split_df[TARGET_COLUMN]),
    )
    for split_df in (train_df, valid_df, test_df)
)

In [9]:
def neuralgraph_collate(batch, max_num_atoms: int):
    """Zero-pad every molecule in ``batch`` to ``max_num_atoms`` atoms and stack them.

    Args:
        batch: Iterable of ``((atom_feats, bond_feats, adj_matrix), target)`` items.
            The padding specs below assume ``atom_feats`` is
            ``(num_atoms, n_atom_feats)``, ``bond_feats`` is
            ``(num_atoms, num_atoms, n_bond_feats)`` and ``adj_matrix`` is
            ``(num_atoms, num_atoms)``. ``target`` must be a tensor because the
            targets are combined with ``torch.stack``.
        max_num_atoms: Number of atoms every molecule is padded up to.

    Returns:
        A 4-tuple ``(atom_features, bond_features, adj_matrices, targets)`` where
        each element is the per-molecule tensors stacked along a new leading
        batch dimension.

    Raises:
        ValueError: If a molecule has more atoms than ``max_num_atoms``. A
            negative pad width would otherwise make ``F.pad`` crop the tensors
            and silently drop atoms.
    """
    # Accept NumPy integer scalars (e.g. the result of ``Series.max()``) as well.
    max_num_atoms = int(max_num_atoms)

    all_atom_features = []
    all_bond_features = []
    all_adj_matrices = []
    all_targets = []

    # Pad each molecule's tensors along the atom dimension(s) up to max_num_atoms.
    for (atom_feats, bond_feats, adj_matrix), target in batch:
        num_atoms = atom_feats.shape[0]
        num_to_pad = max_num_atoms - num_atoms
        if num_to_pad < 0:
            raise ValueError(
                f"Molecule with {num_atoms} atoms exceeds max_num_atoms={max_num_atoms}"
            )

        # F.pad takes (left, right) pairs starting from the LAST dimension, so the
        # feature dimension is left untouched and only the atom axes grow.
        atom_feats_padded = F.pad(atom_feats, pad=(0, 0, 0, num_to_pad), value=0)
        bond_feats_padded = F.pad(
            bond_feats, pad=(0, 0, 0, num_to_pad, 0, num_to_pad), value=0
        )
        adj_matrix_padded = F.pad(
            adj_matrix, pad=(0, num_to_pad, 0, num_to_pad), value=0
        )

        all_atom_features.append(atom_feats_padded)
        all_bond_features.append(bond_feats_padded)
        all_adj_matrices.append(adj_matrix_padded)
        all_targets.append(target)

    return tuple(
        map(
            torch.stack,
            [all_atom_features, all_bond_features, all_adj_matrices, all_targets],
        )
    )

tuple[tuple[torch.Tensor, torch.Tensor, torch.Tensor], torch.Tensor]

In [34]:
from functools import partial

In [35]:
# Bind the padding size so the collate function can be called with just a batch.
ff = partial(neuralgraph_collate, max_num_atoms=MAX_NUM_ATOMS)

In [10]:
# NOTE(review): `first_elem` is not defined in any cell shown in this notebook, and the
# recorded output of this cell is a NameError. It presumably came from a since-deleted
# cell (perhaps a single dataset item) - TODO restore that definition or delete this cell
# so the notebook survives Restart & Run All.
len(first_elem)

NameError: name 'first_elem' is not defined

In [37]:
# Run the collate function on one batch and unpack the four stacked tensors.
# NOTE(review): `test_batch` is not defined in the cells shown here - TODO confirm where
# it is created; this cell fails on a fresh kernel otherwise.
atom_features, bond_features, adj_matrices, targets = ff(test_batch)

In [39]:
# Sanity check: the stacked targets have one entry per item in the batch.
targets.shape

torch.Size([64])