In [1]:
# import de nodige packages
import os
import sys
import re
import math
from collections import defaultdict

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import train_test_split

import networkx as nx
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.rdmolfiles import MolFromXYZFile
from functions.data_loader import data_loader
from classes.smiles_to_graph import MolecularGraphFromSMILES
from classes.MPNN import MPNN
from functions.compute_loss import compute_loss
from functions.evaluations import evaluate_yield
from functions.evaluations import evaluate_borylation_site
from functions.evaluations import evaluate_reactivity
from functions.evaluations import evaluate_model
from functions.train import train_MPNN_model

# Load the data and couple the SMILES to the yields and remove nan's

In [2]:
yields_path = "data/compounds_yield.csv"
smiles_path = "data/compounds_smiles.csv"

df_merged = data_loader(yields_path, smiles_path)


#print("Merged DataFrame:")
#print(df_merged)


Convert the SMILES to Graphs

## Zet de SMILES om naar graphs

In [3]:
from rdkit import Chem

graphs = []
for _, row in tqdm(df_merged.iterrows(), total=len(df_merged), desc="Converting SMILES to graphs"):
    try:
        mol_graph = MolecularGraphFromSMILES(row['smiles_raw'])
        mol = Chem.MolFromSmiles(row['smiles_raw'])  # extra RDKit mol object
        num_atoms = mol.GetNumAtoms() if mol is not None else -1

        borylation_index = row['borylation_site']

        # Debug print vóór de fout
        if not (0 <= borylation_index < num_atoms):
            raise IndexError(f"index {borylation_index} is out of bounds for molecule with {num_atoms} atoms")

        graph = mol_graph.to_pyg_data(
            borylation_index=borylation_index,
            yield_value=row['yield']
        )
        graphs.append(graph)

    except Exception as e:
        print(f"\n🚨 Fout bij SMILES: {row['smiles_raw']}")
        print(f"  - borylation_site: {row['borylation_site']}")
        mol = Chem.MolFromSmiles(row['smiles_raw'])
        if mol:
            print(f"  - aantal atomen in RDKit mol: {mol.GetNumAtoms()}")
        else:
            print("  - RDKit kon mol niet parsen!")
        print(f"  - foutmelding: {e}")

# Verdeel de data in train, validatie en test sets
from sklearn.model_selection import train_test_split

# Eerste splitsing: 85% train+val, 15% test
train_val_graphs, test_graphs = train_test_split(
    graphs, test_size=0.15, random_state=42
)

# Tweede splitsing: 70/15 = 70/85 ≈ 0.8235 voor train
train_graphs, val_graphs = train_test_split(
    train_val_graphs, test_size=0.1765, random_state=42
)


Converting SMILES to graphs: 100%|██████████| 83/83 [00:00<00:00, 462.38it/s]


## Zet de graphs in een dataloader zodat het de GNN in kan

In [None]:
import torch
from torch_geometric.loader import DataLoader

# Instellingen
batch_size = 16
num_epochs = 1000
#learning_rate = 0.0001
learning_rate = 10E-5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Aantal kenmerken per node en edge
node_in_feats = train_graphs[0].x.shape[1]
edge_in_feats = train_graphs[0].edge_attr.shape[1]

# Grootte van de verborgen laag
hidden_feats = 64
#hidden_feats = 128 

# DataLoaders
train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_graphs, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_graphs, batch_size=batch_size, shuffle=False)

# Initialiseer model
model = MPNN(node_in_feats=node_in_feats, edge_in_feats=edge_in_feats, hidden_feats=hidden_feats)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
#optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# Training loop met validatie
for epoch in range(num_epochs):
    train_losses = train_MPNN_model(model, train_loader, optimizer, device)
    val_metrics = evaluate_model(model, val_loader, device)

    print(f"[Epoch {epoch+1}] Train loss: {train_losses['total']:.4f} | "
        f"Site: {train_losses['site']:.4f}, "
        f"Reactivity: {train_losses['react']:.4f}, "
        f"Yield: {train_losses['yield']:.4f}")


# Evaluatie op testset na training
print("\n✅ Evaluatie op testset na training:")
test_metrics = evaluate_model(model, test_loader, device)

print("📊 Testresultaten:")
print(f"🔹 Borylation site prediction:")
print(f"   - Accuracy      : {test_metrics['site_Accuracy']:.3f}")
print(f"   - Precision     : {test_metrics['site_Precision']:.3f}")
print(f"   - Recall        : {test_metrics['site_Recall']:.3f}")
print(f"   - F1-score      : {test_metrics['site_F1']:.3f}")
print(f"   - ROC AUC       : {test_metrics['site_AUC']:.3f}")

print(f"\n🔹 Reactivity prediction:")
print(f"   - MSE           : {test_metrics['react_MSE']:.3f}")
print(f"   - Pearson R     : {test_metrics['react_Pearson']:.3f}")
print(f"   - Spearman Rho  : {test_metrics['react_Spearman']:.3f}")

print(f"\n🔹 Yield prediction:")
print(f"   - MSE           : {test_metrics['yield_MSE']:.3f}")
print(f"   - MAE           : {test_metrics['yield_MAE']:.3f}")
print(f"   - R²            : {test_metrics['yield_R2']:.3f}")



[Epoch 1] Train loss: 416.1893 | Site: 1.7033, Reactivity: 0.2754, Yield: 4142.1068
[Epoch 2] Train loss: 409.8299 | Site: 1.6461, Reactivity: 0.0247, Yield: 4081.5910
[Epoch 3] Train loss: 417.2753 | Site: 1.5337, Reactivity: 0.0236, Yield: 4157.1806
[Epoch 4] Train loss: 415.2088 | Site: 1.4819, Reactivity: 0.0299, Yield: 4136.9704
[Epoch 5] Train loss: 387.6134 | Site: 1.4992, Reactivity: 0.0387, Yield: 3860.7561
[Epoch 6] Train loss: 394.0830 | Site: 1.4720, Reactivity: 0.0462, Yield: 3925.6482
[Epoch 7] Train loss: 386.7329 | Site: 1.4876, Reactivity: 0.0569, Yield: 3851.8833
[Epoch 8] Train loss: 353.4750 | Site: 1.4625, Reactivity: 0.0698, Yield: 3519.4266
[Epoch 9] Train loss: 346.6138 | Site: 1.4719, Reactivity: 0.0776, Yield: 3450.6427
[Epoch 10] Train loss: 322.8033 | Site: 1.4883, Reactivity: 0.0799, Yield: 3212.3508
[Epoch 11] Train loss: 297.0529 | Site: 1.4691, Reactivity: 0.0770, Yield: 2955.0674
[Epoch 12] Train loss: 243.3054 | Site: 1.4710, Reactivity: 0.0680, Yield:

In [5]:
import torch
from torch_geometric.loader import DataLoader
from sklearn.model_selection import KFold
import numpy as np

# Hyperparameters
batch_size = 32
num_epochs = 20
learning_rate = 0.001
k_folds = 5
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Zorg dat je een lijst hebt met alle graphs
all_graphs = graphs  # <-- Zorg dat dit bestaat

# Node/edge feature sizes
node_in_feats = all_graphs[0].x.shape[1]
edge_in_feats = all_graphs[0].edge_attr.shape[1]
hidden_feats = 64

# Init cross-validator
kfold = KFold(n_splits=k_folds, shuffle=True, random_state=42)

# Resultaten opslaan
all_test_metrics = []

for fold, (train_idx, test_idx) in enumerate(kfold.split(all_graphs)):
    print(f"\n📂 Fold {fold + 1}/{k_folds}")
    
    # Split data
    train_subset = [all_graphs[i] for i in train_idx]
    test_subset = [all_graphs[i] for i in test_idx]

    # Je kunt hier ook nog een validation split maken vanuit train_subset indien gewenst
    val_split = int(0.1 * len(train_subset))
    val_subset = train_subset[:val_split]
    train_subset = train_subset[val_split:]

    # DataLoaders
    train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_subset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_subset, batch_size=batch_size, shuffle=False)

    # Init model en optimizer
    model = MPNN(node_in_feats=node_in_feats, edge_in_feats=edge_in_feats, hidden_feats=hidden_feats).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training
    for epoch in range(num_epochs):
        train_losses = train_MPNN_model(model, train_loader, optimizer, device)
        val_metrics = evaluate_model(model, val_loader, device)

        print(f"[Epoch {epoch+1}] Train loss: {train_losses['total']:.4f} | "
              f"Site: {train_losses['site']:.4f}, "
              f"Reactivity: {train_losses['react']:.4f}, "
              f"Yield: {train_losses['yield']:.4f}")

    # Evaluatie op testset
    print(f"✅ Evaluatie op testset (Fold {fold+1}):")
    test_metrics = evaluate_model(model, test_loader, device)
    all_test_metrics.append(test_metrics)

# Gemiddelde resultaten
print("\n📊 Gemiddelde testresultaten over alle folds:")
avg_metrics = {}
for key in all_test_metrics[0]:
    avg_metrics[key] = np.mean([m[key] for m in all_test_metrics])
    print(f"{key:20s}: {avg_metrics[key]:.4f}")


📂 Fold 1/5
[Epoch 1] Train loss: 407.9492 | Site: 1.5373, Reactivity: 0.0349, Yield: 4063.7698
[Epoch 2] Train loss: 393.9246 | Site: 1.4828, Reactivity: 0.0332, Yield: 3924.0861
[Epoch 3] Train loss: 372.7148 | Site: 1.4813, Reactivity: 0.0163, Yield: 3712.1713
[Epoch 4] Train loss: 338.5950 | Site: 1.4781, Reactivity: 0.0172, Yield: 3370.9978
[Epoch 5] Train loss: 281.4046 | Site: 1.4919, Reactivity: 0.0151, Yield: 2798.9769
[Epoch 6] Train loss: 206.8408 | Site: 1.5087, Reactivity: 0.0118, Yield: 2053.2023
[Epoch 7] Train loss: 129.4067 | Site: 1.5116, Reactivity: 0.0095, Yield: 1278.8554
[Epoch 8] Train loss: 87.3621 | Site: 1.5131, Reactivity: 0.0087, Yield: 858.4025
[Epoch 9] Train loss: 107.7482 | Site: 1.5131, Reactivity: 0.0084, Yield: 1062.2668
[Epoch 10] Train loss: 123.2232 | Site: 1.5031, Reactivity: 0.0089, Yield: 1217.1132
[Epoch 11] Train loss: 100.6415 | Site: 1.4875, Reactivity: 0.0113, Yield: 991.4271
[Epoch 12] Train loss: 80.6829 | Site: 1.4726, Reactivity: 0.0241

New version of code used above