In [1]:
# import de nodige packages
import os
import sys
import re
import math
from collections import defaultdict

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import train_test_split

import networkx as nx
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.rdmolfiles import MolFromXYZFile
from functions.data_loader import data_loader
from classes.smiles_to_graph import MolecularGraphFromSMILES
from classes.MPNN import MPNN
from functions.compute_loss import compute_loss
from functions.evaluations import evaluate_yield
from functions.evaluations import evaluate_borylation_site
from functions.evaluations import evaluate_model
from functions.train import train_MPNN_model

# Load the data and couple the SMILES to the yields and remove nan's

In [2]:
yields_path = "data/compounds_yield.csv"
smiles_path = "data/compounds_smiles.csv"

df_merged = data_loader(yields_path, smiles_path)


#print("Merged DataFrame:")
#print(df_merged)


Convert the SMILES to Graphs

## Zet de SMILES om naar graphs

In [3]:
from rdkit import Chem

graphs = []
for _, row in tqdm(df_merged.iterrows(), total=len(df_merged), desc="Converting SMILES to graphs"):
    try:
        mol_graph = MolecularGraphFromSMILES(row['smiles_raw'])
        mol = Chem.MolFromSmiles(row['smiles_raw'])  # extra RDKit mol object
        num_atoms = mol.GetNumAtoms() if mol is not None else -1

        borylation_index = row['borylation_site']

        # Debug print vóór de fout
        if not (0 <= borylation_index < num_atoms):
            raise IndexError(f"index {borylation_index} is out of bounds for molecule with {num_atoms} atoms")

        graph = mol_graph.to_pyg_data(
            borylation_index=borylation_index,
            yield_value=row['yield']
        )
        graphs.append(graph)

    except Exception as e:
        print(f"\n🚨 Fout bij SMILES: {row['smiles_raw']}")
        print(f"  - borylation_site: {row['borylation_site']}")
        mol = Chem.MolFromSmiles(row['smiles_raw'])
        if mol:
            print(f"  - aantal atomen in RDKit mol: {mol.GetNumAtoms()}")
        else:
            print("  - RDKit kon mol niet parsen!")
        print(f"  - foutmelding: {e}")

# Verdeel de data in train, validatie en test sets
from sklearn.model_selection import train_test_split

# Eerste splitsing: 85% train+val, 15% test
train_val_graphs, test_graphs = train_test_split(
    graphs, test_size=0.15, random_state=42
)

# Tweede splitsing: 70/15 = 70/85 ≈ 0.8235 voor train
train_graphs, val_graphs = train_test_split(
    train_val_graphs, test_size=0.1765, random_state=42
)


Converting SMILES to graphs:   0%|          | 0/83 [00:00<?, ?it/s]

Converting SMILES to graphs: 100%|██████████| 83/83 [00:00<00:00, 526.73it/s]


## Zet de graphs in een dataloader zodat het de GNN in kan

In [None]:
import torch
from torch_geometric.loader import DataLoader

#dus reactivity fixen
# hebben we dingen genormaliseerd?->zeker yield enzo wel handig denk ik

# Instellingen
batch_size = 16
num_epochs = 500
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Architecture MPNN
node_in_feats=train_graphs[0].x.shape[1]                # Aantal input features per node
edge_in_feats=train_graphs[0].edge_attr.shape[1]        # Aantal input features per edge
hidden_feats=128                                         # Aantal verborgen features
num_step_message_passing=3                              # Aantal stappen voor message passing
num_step_set2set=3                                      # Aantal stappen voor Set2Set
num_layer_set2set=1                                     # Aantal lagen voor Set2Set
readout_feats=1024                                      # Aantal features voor readout
activation='leaky_relu'                                # Activatiefunctie
dropout=0.2                                             # Dropout percentage

# DataLoaders
train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_graphs, batch_size=batch_size, shuffle=False)
test_loader = DataLoader(test_graphs, batch_size=batch_size, shuffle=False)

# Initialiseer model
model = MPNN(
    node_in_feats=node_in_feats,
    edge_in_feats=edge_in_feats,
    hidden_feats=hidden_feats,
    num_step_message_passing=num_step_message_passing,
    num_step_set2set=num_step_set2set,
    num_layer_set2set=num_layer_set2set,
    readout_feats=readout_feats,
    activation=activation,
    dropout=dropout
)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop met validatie
for epoch in range(num_epochs):
    train_losses = train_MPNN_model(model, train_loader, optimizer, device)
    val_metrics = evaluate_model(model, val_loader, device)

    print(f"[Epoch {epoch+1}] Train loss: {train_losses['total']:.4f} | "
        f"Site: {train_losses['site']:.4f}, "
        f"Yield: {train_losses['yield']:.4f}")


# Evaluatie op testset na training
print("\n✅ Evaluatie op testset na training:")
test_metrics = evaluate_model(model, test_loader, device)

print("📊 Testresultaten:")
print(f"🔹 Borylation site prediction:")
print(f"   - Accuracy      : {test_metrics['site_Accuracy']:.3f}")
print(f"   - Precision     : {test_metrics['site_Precision']:.3f}")
print(f"   - Recall        : {test_metrics['site_Recall']:.3f}")
print(f"   - F1-score      : {test_metrics['site_F1']:.3f}")
print(f"   - ROC AUC       : {test_metrics['site_AUC']:.3f}")

print(f"\n🔹 Yield prediction:")
print(f"   - MSE           : {test_metrics['yield_MSE']:.3f}")
print(f"   - MAE           : {test_metrics['yield_MAE']:.3f}")
print(f"   - R²            : {test_metrics['yield_R2']:.3f}")



[Epoch 1] Train loss: 412.3953 | Site: 1.5166, Yield: 4098.7866
[Epoch 2] Train loss: 324.9587 | Site: 1.4845, Yield: 3224.7413
[Epoch 3] Train loss: 121.7636 | Site: 1.5094, Yield: 1192.5419
[Epoch 4] Train loss: 119.2185 | Site: 1.5454, Yield: 1166.7308
[Epoch 5] Train loss: 82.2481 | Site: 1.5508, Yield: 796.9731
[Epoch 6] Train loss: 87.4296 | Site: 1.5429, Yield: 848.8676
[Epoch 7] Train loss: 77.9235 | Site: 1.5608, Yield: 753.6267
[Epoch 8] Train loss: 76.9761 | Site: 1.5175, Yield: 744.5856
[Epoch 9] Train loss: 76.8026 | Site: 1.5364, Yield: 742.6620
[Epoch 10] Train loss: 69.8646 | Site: 1.5157, Yield: 673.4894
[Epoch 11] Train loss: 67.3415 | Site: 1.5330, Yield: 648.0845
[Epoch 12] Train loss: 72.0481 | Site: 1.4907, Yield: 695.5743
[Epoch 13] Train loss: 68.5908 | Site: 1.4805, Yield: 661.1026
[Epoch 14] Train loss: 67.4278 | Site: 1.4951, Yield: 649.3269
[Epoch 15] Train loss: 71.6655 | Site: 1.4870, Yield: 691.7848
[Epoch 16] Train loss: 69.0867 | Site: 1.4512, Yield: 66

In [9]:
import torch
from torch_geometric.loader import DataLoader
from sklearn.model_selection import KFold

def k_fold_training(graphs, model_class, k=5, num_epochs=500, batch_size=16, learning_rate=0.001, device=None):
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # K-fold splits maken
    kf = KFold(n_splits=k, shuffle=True, random_state=42)
    fold_results = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(graphs)):
        print(f"\n🔁 Fold {fold + 1}/{k}")

        train_graphs = [graphs[i] for i in train_idx]
        val_graphs = [graphs[i] for i in val_idx]

        # Dataloaders
        train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_graphs, batch_size=batch_size, shuffle=False)

        # Architectuurinstellingen
        node_in_feats = train_graphs[0].x.shape[1]
        edge_in_feats = train_graphs[0].edge_attr.shape[1]

        model = model_class(
            node_in_feats=node_in_feats,
            edge_in_feats=edge_in_feats,
            hidden_feats=128,
            num_step_message_passing=3,
            num_step_set2set=3,
            num_layer_set2set=1,
            readout_feats=1024,
            activation='leaky_relu',
            dropout=0.2
        ).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        for epoch in range(num_epochs):
            train_losses = train_MPNN_model(model, train_loader, optimizer, device)
            val_metrics = evaluate_model(model, val_loader, device)

            print(f"[Fold {fold+1} | Epoch {epoch+1}] Train loss: {train_losses['total']:.4f} | "
                  f"Site: {train_losses['site']:.4f}, Yield: {train_losses['yield']:.4f}")

        # Opslaan van metrics van deze fold
        fold_results.append(evaluate_model(model, val_loader, device))

    return fold_results


# === Gebruik de functie ===
# Bijvoorbeeld: k=5 fold cross-validatie over alle data

all_graphs = train_graphs + val_graphs + test_graphs  # Voeg alles samen
results = k_fold_training(all_graphs, model_class=MPNN, k=5)

# === Resultaten samenvatten ===
print("\n✅ Gemiddelde resultaten over folds:")
from collections import defaultdict
import numpy as np

avg_metrics = defaultdict(list)
for result in results:
    for key, val in result.items():
        avg_metrics[key].append(val)

for key, values in avg_metrics.items():
    mean_val = np.mean(values)
    std_val = np.std(values)
    print(f"{key:20s}: {mean_val:.4f} ± {std_val:.4f}")


🔁 Fold 1/5
[Fold 1 | Epoch 1] Train loss: 431.4134 | Site: 1.5700, Yield: 4288.4347
[Fold 1 | Epoch 2] Train loss: 246.6707 | Site: 1.5769, Yield: 2440.9380
[Fold 1 | Epoch 3] Train loss: 158.1240 | Site: 1.5811, Yield: 1555.4292
[Fold 1 | Epoch 4] Train loss: 98.0734 | Site: 1.5617, Yield: 955.1164
[Fold 1 | Epoch 5] Train loss: 89.9714 | Site: 1.5442, Yield: 874.2722
[Fold 1 | Epoch 6] Train loss: 109.3277 | Site: 1.5376, Yield: 1067.9002
[Fold 1 | Epoch 7] Train loss: 129.1999 | Site: 1.5506, Yield: 1266.4937
[Fold 1 | Epoch 8] Train loss: 75.2092 | Site: 1.5418, Yield: 726.6739
[Fold 1 | Epoch 9] Train loss: 104.0017 | Site: 1.5215, Yield: 1014.8017
[Fold 1 | Epoch 10] Train loss: 90.3259 | Site: 1.5268, Yield: 877.9913
[Fold 1 | Epoch 11] Train loss: 116.8068 | Site: 1.4789, Yield: 1143.2786
[Fold 1 | Epoch 12] Train loss: 117.5929 | Site: 1.5149, Yield: 1150.7802
[Fold 1 | Epoch 13] Train loss: 98.7599 | Site: 1.4578, Yield: 963.0213
[Fold 1 | Epoch 14] Train loss: 86.3879 | Sit

New version of code used above