In [1]:
from rdkit.Chem import rdFingerprintGenerator
from rdkit.Chem.Draw import IPythonConsole
from rdkit import Chem
import numpy as np
from collections.abc import Generator
import pandas as pd
from sklearn import cluster as sk_clustering
from sklearn import datasets as sk_datasets
from sklearn import model_selection as sk_model_selection
import matplotlib.pyplot as plt
import seaborn as sns
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils import data as torch_data
# Report up front whether PyTorch can see a usable CUDA device.
if torch.cuda.is_available():
    print("CUDA AVAILABLE")
from neuralfingerprint import featurizer

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
# Pick the compute device. `torch.accelerator` is only present in recent
# PyTorch releases, so look it up defensively and fall back to the classic
# CUDA check (and finally the CPU) when it is missing.
_accelerator = getattr(torch, "accelerator", None)
if _accelerator is not None and _accelerator.is_available():
    device = _accelerator.current_accelerator().type
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using {device} device")


Using cpu device


In [3]:
# Echo the selected device name as the cell output.
device

'cpu'

In [182]:
# Load the dataset from a CSV in the working directory; later cells read its
# "smiles" column and a solubility column from this frame.
df = pd.read_csv("delaney-processed.csv")

In [765]:
# Parse every SMILES string once and keep the resulting RDKit molecule on the frame.
df["mol"] = df["smiles"].map(Chem.MolFromSmiles)
# Per-molecule atom count as reported by RDKit.
df["num_atoms"] = df["mol"].map(lambda parsed: parsed.GetNumAtoms())
# Keep only molecules with at least 6 atoms and renumber the rows from 0.
df_filtered = df.loc[df["num_atoms"] >= 6].reset_index(drop=True)

In [767]:
# Row counts after and before the atom-count filter.
len(df_filtered),len(df)

(1022, 1128)

In [None]:
# Split plan: 100% of the data -> roughly 70% train, 20% test, 10% validation.

In [768]:
# Two-stage split with a fixed seed: first hold out 20% of the rows as the test
# set, then carve 15% of what is left out as the validation set.
train_valid_df, test_df = sk_model_selection.train_test_split(
    df_filtered,
    test_size=0.2,
    random_state=32,
)
train_df, valid_df = sk_model_selection.train_test_split(
    train_valid_df,
    test_size=0.15,
    random_state=32,
)

In [769]:
# Shapes of the three splits next to the unfiltered frame, as a quick size check.
train_df.shape, valid_df.shape, test_df.shape, df.shape

((694, 12), (123, 12), (205, 12), (1128, 12))

In [770]:
class MolDataset(torch_data.Dataset):
    """Dataset of (SMILES, target) pairs that featurizes molecules on access.

    Indexing returns ``(atom_features, bond_features, target)``. The two
    feature tensors come from ``featurizer.featurize_atoms`` and
    ``featurizer.featurize_bonds`` and are cast to ``torch.float32``; the
    target is returned exactly as stored.

    Args:
        smiles: SMILES strings, one per molecule.
        targets: Regression targets aligned index-by-index with ``smiles``.

    Raises:
        ValueError: If ``smiles`` and ``targets`` differ in length.
    """

    def __init__(self, smiles: tuple[str, ...], targets: tuple[float, ...]):
        # Fail at construction time rather than with an IndexError in the
        # middle of an epoch.
        if len(smiles) != len(targets):
            raise ValueError(
                f"smiles and targets must have the same length, "
                f"got {len(smiles)} and {len(targets)}"
            )
        self.smiles = smiles
        self.targets = targets

    def __len__(self):
        """Return the number of molecules in the dataset."""
        return len(self.smiles)

    def transform(self, smiles: str):
        """Featurize one SMILES string into float32 atom and bond tensors.

        Raises:
            ValueError: If RDKit cannot parse ``smiles``.
        """
        mol = Chem.MolFromSmiles(smiles)
        # RDKit signals a parse failure by returning None instead of raising.
        if mol is None:
            raise ValueError(f"Could not parse SMILES string: {smiles!r}")

        atom_features = featurizer.featurize_atoms(mol).to(torch.float32)
        bond_features = featurizer.featurize_bonds(mol).to(torch.float32)

        return atom_features, bond_features

    def __getitem__(self, idx):
        """Return ``(atom_features, bond_features, target)`` for item ``idx``."""
        atom_features, bond_features = self.transform(self.smiles[idx])
        target = self.targets[idx]
        return atom_features, bond_features, target

In [771]:
# Sanity check: print every SMILES whose parsed molecule has no atoms or no
# bonds. Printing nothing means each kept molecule yields non-empty inputs.
for smi in df_filtered["smiles"]:
    mol = Chem.MolFromSmiles(smi)
    num_atoms, num_bonds = mol.GetNumAtoms(), mol.GetNumBonds()
    if not (num_atoms and num_bonds):
        print(smi)

# Convolutional Networks on Graphs for Learning Molecular Fingerprints

In [772]:
# Example molecule used by the next cells to inspect the featurizer output.
molecule = Chem.MolFromSmiles("O=C1OC(CN1c1ccc(cc1)N1CCOCC1=O)CNC(=O)c1ccc(s1)Cl")
# NOTE(review): presumably the fingerprint radius from the paper named in the
# heading; it is not used by any cell visible here - confirm.
R = 3
# 1 x 100 tensor of zeros; not used by any cell visible here - TODO confirm its role.
hidden_weights = torch.zeros(size=(1,100))

In [773]:
# Featurize the example molecule; both results are cast to float32, the same
# way MolDataset.transform does it.
atom_features = featurizer.featurize_atoms(molecule).to(torch.float32)
bond_features = featurizer.featurize_bonds(molecule).to(torch.float32)

In [774]:
# Length of one row of each feature tensor (the first atom and the first bond).
atom_features[0].shape, bond_features[0].shape

(torch.Size([5]), torch.Size([3]))

In [775]:
# Full shape of the atom feature tensor for the example molecule.
atom_features.size()

torch.Size([29, 5])

In [776]:
# Size of the first dimension, i.e. the number of rows in the atom feature tensor.
atom_features.shape[0]

29

In [777]:
# Padding experiment: the pad tuple reads (last-dim left, last-dim right,
# first-dim left, first-dim right), so the feature dimension is untouched and
# zero rows are added in FRONT of the existing rows until there are 50.
# Note that molecule_collate_fn pads at the END of the first dimension instead.
F.pad(atom_features, pad=(0,0,50-atom_features.shape[0], 0), value=0).shape

torch.Size([50, 5])

In [781]:
def molecule_collate_fn(batch):
    """Collate ``(atom_features, bond_features, label)`` samples into one batch.

    Molecules differ in their number of atoms and bonds, so every feature
    tensor is extended with zero rows at the end of its first dimension until
    it matches the largest one in the batch, and the results are stacked.

    Args:
        batch: Sequence of ``(atom_features, bond_features, label)`` triples.

    Returns:
        Tuple ``(atoms, bonds, labels)``: the stacked zero-padded atom tensors,
        the stacked zero-padded bond tensors, and a tensor built from the labels.
    """
    atom_tensors, bond_tensors, targets = zip(*batch)

    def pad_and_stack(tensors):
        # Append zero rows (never prepend) so that real rows keep their index.
        longest = max(tensor.shape[0] for tensor in tensors)
        return torch.stack(
            [
                F.pad(tensor, pad=(0, 0, 0, longest - tensor.shape[0]), value=0)
                for tensor in tensors
            ]
        )

    return pad_and_stack(atom_tensors), pad_and_stack(bond_tensors), torch.tensor(targets)

In [795]:
# Regression target: the measured solubility. The column used before,
# "ESOL predicted log solubility in mols per litre", holds the output of the
# ESOL baseline model, so training on it fits another model's predictions
# instead of the measurements.
# NOTE(review): assumes the CSV carries the usual Delaney column name below -
# confirm against df.columns before running.
TARGET_COLUMN = "measured log solubility in mols per litre"
BATCH_SIZE = 64


def _build_moldataset(frame):
    """Wrap the "smiles" and target columns of ``frame`` in a MolDataset."""
    return MolDataset(
        smiles=tuple(frame["smiles"]),
        targets=tuple(frame[TARGET_COLUMN]),
    )


train_moldataset = _build_moldataset(train_df)

test_moldataset = _build_moldataset(test_df)

valid_moldataset = _build_moldataset(valid_df)

# Only the training loader shuffles; the evaluation loaders keep a fixed order.
train_dataloader = torch_data.DataLoader(train_moldataset, batch_size=BATCH_SIZE, shuffle=True, collate_fn=molecule_collate_fn)
test_dataloader = torch_data.DataLoader(test_moldataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=molecule_collate_fn)
valid_dataloader = torch_data.DataLoader(valid_moldataset, batch_size=BATCH_SIZE, shuffle=False, collate_fn=molecule_collate_fn)

In [754]:
# Pull a single batch from the training loader to inspect the collated tensors.
first_batch = next(iter(train_dataloader))

In [755]:
# Unpack in the order molecule_collate_fn returns: atom features, bond features, labels.
axs,bxs,ys = first_batch

In [756]:
# Shapes of the labels, the padded atom features and the padded bond features.
ys.shape, axs.shape, bxs.shape

(torch.Size([64]), torch.Size([64, 54, 5]), torch.Size([64, 61, 3]))

In [834]:
class VanillaNet(nn.Module):
    """Baseline regressor that mean-pools linearly embedded atom and bond features.

    Atom and bond features are each projected to ``n_out_features`` by their own
    linear layer and averaged over dimension 1. The two pooled vectors are
    concatenated, passed through a ReLU and mapped to one value per sample.

    NOTE(review): the average runs over every row along dimension 1, so for
    batches built by ``molecule_collate_fn`` it includes the zero-padded rows.

    Args:
        n_atom_features: Size of the last dimension of the atom features.
        n_bond_features: Size of the last dimension of the bond features.
        n_out_features: Width of each of the two embeddings.
    """

    def __init__(self, n_atom_features: int, n_bond_features: int, n_out_features: int=100):
        super().__init__()
        # Layers are created in the order atom, bond, output.
        self.atom_layer = nn.Linear(in_features=n_atom_features, out_features=n_out_features)
        self.bond_layer = nn.Linear(in_features=n_bond_features, out_features=n_out_features)
        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(in_features=2 * n_out_features, out_features=1)

    def forward(self, x):
        """Map an ``(atom_features, bond_features)`` pair to one prediction per sample."""
        atoms, bonds = x
        pooled = [
            self.atom_layer(atoms).mean(dim=1),
            self.bond_layer(bonds).mean(dim=1),
        ]
        hidden = self.activation(torch.cat(pooled, dim=-1))
        # Drop the trailing size-1 dimension left by the output layer.
        return self.output_layer(hidden).squeeze(-1)

# Model definition
# Use CUDA only when PyTorch reports it as usable. A hard-coded "cuda" breaks
# every later `.to(device)` call on a CPU-only machine.
device = "cuda" if torch.cuda.is_available() else "cpu"
class VanillaNetCUDA(nn.Module):
    """Variant of the mean-pooling baseline that moves its inputs to the model's device.

    The layers and the computation match ``VanillaNet``. The only addition is
    that ``forward`` first transfers both input tensors to whichever device
    holds the module's parameters, so callers need not do it themselves.

    Args:
        n_atom_features: Size of the last dimension of the atom features.
        n_bond_features: Size of the last dimension of the bond features.
        n_out_features: Width of each of the two embeddings.
    """

    def __init__(self, n_atom_features: int, n_bond_features: int, n_out_features: int = 100):
        super().__init__()
        # Layers are created in the order atom, bond, output.
        self.atom_layer = nn.Linear(in_features=n_atom_features, out_features=n_out_features)
        self.bond_layer = nn.Linear(in_features=n_bond_features, out_features=n_out_features)
        self.activation = nn.ReLU()
        self.output_layer = nn.Linear(in_features=2 * n_out_features, out_features=1)

    def forward(self, x):
        """Map an ``(atom_features, bond_features)`` pair to one prediction per sample."""
        # The first parameter's device is taken as the device of the whole module.
        target_device = next(self.parameters()).device
        atoms, bonds = (tensor.to(target_device) for tensor in x)

        pooled = [
            self.atom_layer(atoms).mean(dim=1),
            self.bond_layer(bonds).mean(dim=1),
        ]
        hidden = self.activation(torch.cat(pooled, dim=-1))
        # Drop the trailing size-1 dimension left by the output layer.
        return self.output_layer(hidden).squeeze(-1)


In [827]:
# Instantiate the baseline; 5 and 3 match the per-atom and per-bond feature
# lengths printed for the example molecule earlier in the notebook.
model = VanillaNet(n_atom_features=5, n_bond_features=3)

torch.Size([64, 35, 5]) torch.Size([64, 39, 3]) torch.Size([64])
✅ Forward pass OK: torch.Size([64])
✅ Loss OK: 15.64881706237793


In [830]:
def _to_device(batch, device):
    """Unpack one ``(atoms, bonds, labels)`` batch and move it to ``device``.

    The labels are additionally cast to float32.
    """
    atom_batch, bond_batch, labels = batch
    return atom_batch.to(device), bond_batch.to(device), labels.to(device).float()


def _evaluate(model, loader, criterion, device):
    """Return the per-sample mean loss of ``model`` over ``loader`` (NaN if it is empty)."""
    model.eval()
    total_loss = 0.0
    n_samples = 0
    with torch.no_grad():
        for batch in loader:
            atom_batch, bond_batch, labels = _to_device(batch, device)
            preds = model((atom_batch, bond_batch))

            if preds.shape != labels.shape:
                preds = preds.view_as(labels)

            total_loss += criterion(preds, labels).item() * len(labels)
            n_samples += len(labels)
    return total_loss / n_samples if n_samples else float("nan")


def train_model(model, train_loader, val_loader, epochs=20, lr=1e-3, device='cpu',
                checkpoint_path="best_model.pt"):
    """Train ``model`` with Adam on an MSE loss, validating after every epoch.

    Training is best-effort per batch: a batch that raises ``RuntimeError`` or
    yields non-finite predictions is reported and skipped. Reported losses are
    averaged over the samples that were actually used, so skipped batches do
    not deflate them. If an epoch has no usable batch because every attempt
    raised, the last error is re-raised instead of reporting a loss of zero.

    This function no longer sets ``CUDA_LAUNCH_BLOCKING``. To debug CUDA
    launches, set that variable in the environment before starting the kernel.

    Args:
        model: Module called as ``model((atom_batch, bond_batch))``.
        train_loader: Iterable of ``(atom_batch, bond_batch, labels)`` batches.
        val_loader: Iterable of batches in the same format, used for validation.
        epochs: Number of passes over ``train_loader``.
        lr: Learning rate for Adam.
        device: Device that the model and every batch are moved to.
        checkpoint_path: Where the ``state_dict`` is saved whenever the
            validation loss improves; ``None`` disables checkpointing.

    Returns:
        Dict with the per-epoch losses under ``"train_loss"`` and ``"val_loss"``.

    Raises:
        RuntimeError: If every training batch of an epoch raised ``RuntimeError``.
    """
    model = model.to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()
    best_val_loss = float('inf')
    history = {"train_loss": [], "val_loss": []}

    for epoch in range(epochs):
        model.train()
        total_train_loss = 0.0
        train_samples = 0
        last_error = None

        for batch_idx, batch in enumerate(train_loader):
            try:
                atom_batch, bond_batch, labels = _to_device(batch, device)

                optimizer.zero_grad()
                preds = model((atom_batch, bond_batch))

                # Debug shape
                if preds.shape != labels.shape:
                    print(f"🚨 Shape mismatch at batch {batch_idx}")
                    print(f"Preds shape: {preds.shape}, Labels shape: {labels.shape}")
                    preds = preds.view_as(labels)  # try to fix shape if needed

                # Skip the update rather than back-propagate NaN/Inf into the weights.
                if not torch.isfinite(preds).all():
                    print(f"⚠️ NaN or INF detected in predictions at batch {batch_idx}")
                    continue

                loss = criterion(preds, labels)
                loss.backward()
                optimizer.step()

                total_train_loss += loss.item() * len(labels)
                train_samples += len(labels)

            except RuntimeError as e:
                print(f"🔥 RuntimeError at batch {batch_idx}: {e}")
                last_error = e
                continue  # skip batch

        # Nothing was trained this epoch and at least one batch crashed: the
        # run is broken, so surface the failure instead of logging a loss.
        if train_samples == 0 and last_error is not None:
            raise RuntimeError(
                f"Every training batch failed in epoch {epoch + 1}; last error: {last_error}"
            ) from last_error

        avg_train_loss = total_train_loss / train_samples if train_samples else float("nan")

        # Validation
        avg_val_loss = _evaluate(model, val_loader, criterion, device)

        history["train_loss"].append(avg_train_loss)
        history["val_loss"].append(avg_val_loss)

        print(f"Epoch {epoch+1}/{epochs} | Train Loss: {avg_train_loss:.4f} | Val Loss: {avg_val_loss:.4f}")

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            if checkpoint_path is not None:
                torch.save(model.state_dict(), checkpoint_path)
                print("📦 Saved best model.")

    print("✅ Training complete.")
    return history

In [831]:
# Train the baseline on the CPU for 30 epochs, validating after every epoch.
# The trailing semicolon keeps the cell output limited to the printed log.
train_model(
    model,
    train_dataloader,
    valid_dataloader,
    epochs=30,
    device="cpu",
);

🔥 RuntimeError at batch 0: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

🔥 RuntimeError at batch 1: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

🔥 RuntimeError at batch 2: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.

🔥 RuntimeError at batch 3: CUDA error: unspecified launch failure
CUDA ke