In [1]:
# import de nodige packages
import os
import sys
import re
import math
from collections import defaultdict

import pandas as pd
import numpy as np
from tqdm import tqdm

import torch
from torch_geometric.data import Data, DataLoader
from sklearn.model_selection import train_test_split

import networkx as nx
import matplotlib.pyplot as plt

from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.rdmolfiles import MolFromXYZFile
from functions.data_loader import data_loader
from classes.smiles_to_graph import MolecularGraphFromSMILES
from classes.MPNN import MPNN
from functions.compute_loss import compute_loss
from functions.evaluations import evaluate_yield
from functions.evaluations import evaluate_borylation_site
from functions.evaluations import evaluate_reactivity
from functions.evaluations import evaluate_model
from functions.train import train_MPNN_model

# Load the data, couple the SMILES to the yields, and remove NaNs

In [2]:

# Input CSV files, given relative to the notebook's working directory.
yields_path = "data/compounds_yield.csv"
smiles_path = "data/compounds_smiles.csv"

# data_loader is a project helper (functions/data_loader.py). The conversion cell further down
# iterates over its result with .iterrows() and reads the columns 'smiles_raw',
# 'borylation_site' and 'yield' — presumably a pandas DataFrame; TODO confirm.
df_merged = data_loader(yields_path, smiles_path)


#print("Merged DataFrame:")
#print(df_merged)


Convert the SMILES to Graphs

## Zet de SMILES om naar graphs

In [3]:
from rdkit import Chem

# Convert every row of df_merged into a PyG graph. Rows that fail are reported and skipped,
# so `graphs` only contains molecules that were converted successfully.
graphs = []
for _, row in tqdm(df_merged.iterrows(), total=len(df_merged), desc="Converting SMILES to graphs"):
    smiles = row['smiles_raw']
    # The RDKit molecule is parsed once per row and shared with the error report below, so the
    # handler never has to call RDKit again (a second parse could itself raise and abort the loop).
    mol = None
    try:
        mol = Chem.MolFromSmiles(smiles)  # extra RDKit mol object, used for the atom count
        if mol is None:
            # Fail with the real cause instead of an "out of bounds ... -1 atoms" message.
            raise ValueError("RDKit could not parse the SMILES string")

        mol_graph = MolecularGraphFromSMILES(smiles)
        num_atoms = mol.GetNumAtoms()
        borylation_index = row['borylation_site']

        # Validate the label before building the graph so the error names the actual problem.
        if not (0 <= borylation_index < num_atoms):
            raise IndexError(f"index {borylation_index} is out of bounds for molecule with {num_atoms} atoms")

        graph = mol_graph.to_pyg_data(
            borylation_index=borylation_index,
            yield_value=row['yield']
        )
        graphs.append(graph)

    except Exception as e:
        # Deliberate best-effort: report the offending row and continue with the next one.
        print(f"\n🚨 Fout bij SMILES: {smiles}")
        print(f"  - borylation_site: {row['borylation_site']}")
        if mol is not None:
            print(f"  - aantal atomen in RDKit mol: {mol.GetNumAtoms()}")
        else:
            print("  - RDKit kon mol niet parsen!")
        print(f"  - foutmelding: {e}")
# Verdeel de data in train, validatie en test sets
from sklearn.model_selection import train_test_split

# First split: hold out 15% of all graphs as the test set, keep 85% for train + validation.
train_val_graphs, test_graphs = train_test_split(graphs, random_state=42, test_size=0.15)

# Second split: validation should be 15% of the full data set, i.e. 15/85 ≈ 0.1765 of the
# remaining graphs; the other 70/85 ≈ 0.8235 (70% of the full data set) is used for training.
train_graphs, val_graphs = train_test_split(train_val_graphs, random_state=42, test_size=0.1765)


Converting SMILES to graphs: 100%|██████████| 83/83 [00:00<00:00, 153.72it/s]


## Zet de graphs in een dataloader zodat het de GNN in kan

In [4]:
import torch
from torch_geometric.loader import DataLoader


# Settings
batch_size = 32
num_epochs = 20
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Number of features per node and per edge, read from the first training graph
node_in_feats = train_graphs[0].x.shape[1]
edge_in_feats = train_graphs[0].edge_attr.shape[1]

# Size of the hidden layer
hidden_feats = 64

# DataLoaders (test_loader and val_loader are built here but not used further in this cell)
train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_graphs, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_graphs, batch_size=batch_size, shuffle=False)

# Initialise the model on the selected device. Without .to(device) the parameters stay on the
# CPU, which fails as soon as batches are moved to a GPU.
model = MPNN(node_in_feats=node_in_feats, edge_in_feats=edge_in_feats, hidden_feats=hidden_feats).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Training loop
for epoch in range(num_epochs):
    # NOTE(review): open question from the original author — there is also a separate
    # compute_loss helper, while train_MPNN_model appears to compute the loss itself;
    # confirm that the loss is not being handled twice.
    loss = train_MPNN_model(model, train_loader, optimizer, device)
    print(f"[Epoch {epoch+1}] Training loss: {loss:.4f}")

# Evaluation after training (computed on the training data, hence "Train metrics")
metrics = evaluate_model(model, train_loader, device)
print("Train metrics:", metrics)




[Epoch 1] Training loss: 427.7886
[Epoch 2] Training loss: 408.7964
[Epoch 3] Training loss: 393.6932
[Epoch 4] Training loss: 358.3435
[Epoch 5] Training loss: 307.5023
[Epoch 6] Training loss: 238.3885
[Epoch 7] Training loss: 157.9392
[Epoch 8] Training loss: 93.4265
[Epoch 9] Training loss: 84.3032
[Epoch 10] Training loss: 118.3759
[Epoch 11] Training loss: 121.0190
[Epoch 12] Training loss: 87.8077
[Epoch 13] Training loss: 70.7335
[Epoch 14] Training loss: 75.5974
[Epoch 15] Training loss: 81.4142
[Epoch 16] Training loss: 84.2907
[Epoch 17] Training loss: 82.8874
[Epoch 18] Training loss: 74.6385
[Epoch 19] Training loss: 72.3350
[Epoch 20] Training loss: 70.3302
Train metrics: {'yield_MSE': 713.937, 'yield_MAE': 21.340105, 'yield_R2': -0.014420280016468556, 'site_Accuracy': 0.9027303754266212, 'site_Precision': 0.0, 'site_Recall': 0.0, 'site_F1': 0.0, 'site_AUC': 0.3997114714953736, 'react_MSE': 0.005867216, 'react_Spearman': nan, 'react_Pearson': nan}




In [None]:
from sklearn.model_selection import KFold
from torch_geometric.loader import DataLoader
import torch

# Settings
k_folds = 5
batch_size = 16
num_epochs = 20
learning_rate = 0.001
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
all_metrics = []  # one metrics dict per fold

for fold, (train_idx, val_idx) in enumerate(kf.split(graphs)):
    print(f"\n🟦 Fold {fold+1}/{k_folds}")
    
    # Split data
    train_graphs = [graphs[i] for i in train_idx]
    val_graphs = [graphs[i] for i in val_idx]

    train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_graphs, batch_size=batch_size, shuffle=False)

    # Fresh model and optimizer for every fold. The input sizes are read from the data
    # instead of being hard-coded, so they always match the graphs that are fed in.
    model = MPNN(
        node_in_feats=graphs[0].x.size(-1),
        edge_in_feats=graphs[0].edge_attr.size(-1),
        hidden_feats=64
    ).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Train with the imported project trainer; a bare `train` is not defined at this point
    # of the notebook, so calling it fails on a fresh kernel.
    for epoch in range(num_epochs):
        loss = train_MPNN_model(model, train_loader, optimizer, device)
        print(f"  [Epoch {epoch+1}] Loss: {loss:.4f}")

    # Evaluate on the held-out fold
    fold_metrics = evaluate_model(model, val_loader, device)
    print(f"  🔎 Metrics Fold {fold+1}:", fold_metrics)
    all_metrics.append(fold_metrics)

# Average performance over all folds
print("\n📊 Gemiddelde metrics over alle folds:")
from collections import defaultdict
import numpy as np

# Regroup the per-fold dicts into {metric name: [value of that metric in each fold]}.
avg_metrics = defaultdict(list)
for fold_result in all_metrics:
    for name in fold_result:
        avg_metrics[name].append(fold_result[name])

# Report mean ± standard deviation per metric.
for name, per_fold in avg_metrics.items():
    mean_value = np.mean(per_fold)
    spread = np.std(per_fold)
    print(f"{name}: {mean_value:.4f} ± {spread:.4f}")

New version of code used above

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.loader import DataLoader
from sklearn.model_selection import KFold
from collections import defaultdict
import numpy as np

# --- Training Function ---
def train(model, loader, optimizer, device):
    """Run one training epoch and return the mean loss per batch.

    For every batch the model is called as
    ``model(data.x, data.edge_index, data.edge_attr, data.batch)`` and must return three
    tensors: borylation-site logits, reactivity scores and predicted yields. The batch loss
    is the unweighted sum of

    * binary cross-entropy with logits against ``data.p_borylation``,
    * MSE against ``data.reactivity_score``,
    * MSE against ``data.y``.

    Args:
        model: Module to optimise; it is switched to training mode.
        loader: Iterable of batches. Each batch must support ``.to(device)``. Any iterable
            works, including ones without ``len()``.
        optimizer: Optimizer that updates ``model``'s parameters.
        device: Device the batches are moved to.

    Returns:
        float: Mean of the per-batch losses, or NaN when ``loader`` yields no batches
        (instead of raising ZeroDivisionError).
    """
    model.train()
    total_loss = 0.0
    num_batches = 0  # counted while iterating, so an empty loader cannot divide by zero
    for data in loader:
        data = data.to(device)

        optimizer.zero_grad()
        p_borylation, reactivity_score, predicted_yield = model(
            data.x, data.edge_index, data.edge_attr, data.batch
        )

        # Losses: the first model output is treated as raw logits, not probabilities.
        loss_bce = F.binary_cross_entropy_with_logits(p_borylation, data.p_borylation.float())
        loss_mse_node = F.mse_loss(reactivity_score, data.reactivity_score)
        loss_mse_graph = F.mse_loss(predicted_yield, data.y)

        loss = loss_bce + loss_mse_node + loss_mse_graph
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        num_batches += 1

    if num_batches == 0:
        # Nothing was trained on; NaN makes that visible instead of crashing or reporting 0.
        return float("nan")
    return total_loss / num_batches

# --- Evaluation Function ---
@torch.no_grad()
def evaluate_model(model, loader, device):
    """Compute the three loss terms on ``loader`` without updating the model.

    The model is switched to evaluation mode and called per batch as
    ``model(batch.x, batch.edge_index, batch.edge_attr, batch.batch)``. For each batch the
    BCE-with-logits loss (vs. ``p_borylation``), the node-level MSE (vs. ``reactivity_score``)
    and the graph-level MSE (vs. ``y``) are recorded.

    Returns:
        dict: ``{"BCE_loss", "Node_MSE", "Graph_MSE"}``, each the unweighted mean of the
        per-batch values.
    """
    model.eval()
    per_batch = {"BCE_loss": [], "Node_MSE": [], "Graph_MSE": []}

    for batch in loader:
        batch = batch.to(device)
        site_logits, node_scores, graph_yield = model(
            batch.x, batch.edge_index, batch.edge_attr, batch.batch
        )

        site_target = batch.p_borylation.float()
        per_batch["BCE_loss"].append(
            F.binary_cross_entropy_with_logits(site_logits, site_target).item()
        )
        per_batch["Node_MSE"].append(
            F.mse_loss(node_scores, batch.reactivity_score).item()
        )
        per_batch["Graph_MSE"].append(
            F.mse_loss(graph_yield, batch.y).item()
        )

    # Average over batches (not over samples), one entry per loss term.
    return {name: np.mean(values) for name, values in per_batch.items()}

# --- Main K-Fold Loop ---
def run_k_fold(graphs, k_folds=5, batch_size=16, num_epochs=20, learning_rate=1e-3, hidden_feats=64):
    """Train and evaluate a fresh MPNN on every fold of a shuffled K-fold split.

    For each fold a new model and Adam optimizer are created, the model is trained for
    ``num_epochs`` epochs with ``train`` and then scored on the held-out fold with
    ``evaluate_model``. Progress, per-fold metrics and the mean ± std of every metric over
    all folds are printed.

    Args:
        graphs: Indexable sequence of graph objects. The node/edge feature sizes are read
            from ``graphs[0].x`` and ``graphs[0].edge_attr``.
        k_folds: Number of folds.
        batch_size: Batch size for both the training and the validation loader.
        num_epochs: Training epochs per fold.
        learning_rate: Learning rate for Adam.
        hidden_feats: Hidden size passed to ``MPNN`` (default 64, the previously fixed value).

    Returns:
        list[dict]: The metrics dict returned by ``evaluate_model`` for each fold, in fold
        order, so callers can keep working with the results instead of only reading the log.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # Fixed random_state: the fold assignment is the same on every run.
    kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)
    all_metrics = []

    for fold, (train_idx, val_idx) in enumerate(kf.split(graphs)):
        print(f"\n🟦 Fold {fold+1}/{k_folds}")

        train_graphs = [graphs[i] for i in train_idx]
        val_graphs = [graphs[i] for i in val_idx]

        train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
        val_loader = DataLoader(val_graphs, batch_size=batch_size, shuffle=False)

        # A new model per fold, so no weights leak from one fold into the next.
        model = MPNN(
            node_in_feats=graphs[0].x.size(-1),
            edge_in_feats=graphs[0].edge_attr.size(-1),
            hidden_feats=hidden_feats
        ).to(device)

        optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

        for epoch in range(num_epochs):
            loss = train(model, train_loader, optimizer, device)
            print(f"  [Epoch {epoch+1}] Loss: {loss:.4f}")

        fold_metrics = evaluate_model(model, val_loader, device)
        print(f"  🔎 Metrics Fold {fold+1}:", fold_metrics)
        all_metrics.append(fold_metrics)

    print("\n📊 Gemiddelde metrics over alle folds:")
    # Regroup into {metric name: [value per fold]} to report mean ± std per metric.
    avg_metrics = defaultdict(list)
    for metric_dict in all_metrics:
        for k, v in metric_dict.items():
            avg_metrics[k].append(v)

    for k, v_list in avg_metrics.items():
        print(f"{k}: {np.mean(v_list):.4f} ± {np.std(v_list):.4f}")

    return all_metrics

# --- Example usage (assuming your dataset is ready) ---
# run_k_fold(graphs, k_folds=5, batch_size=16, num_epochs=20)


In [None]:
# TODO: decide which parts of this cell are still needed once k-fold cross-validation is
# introduced — possibly none of it.
# Split the list of graphs: 80% for training, 20% for testing.
train_graphs, test_graphs = train_test_split(graphs, random_state=42, test_size=0.2)

# Build the DataLoaders for training (shuffled) and evaluation (fixed order).
train_loader = DataLoader(train_graphs, shuffle=True, batch_size=32)
test_loader = DataLoader(test_graphs, shuffle=False, batch_size=32)

In [None]:
# This may become the k-fold cross-validation implementation; if so, the previous cell
# would become redundant.
# `graphs` holds our SMILES (converted to graphs) in a list.

from sklearn.model_selection import KFold
from torch_geometric.loader import DataLoader

# These numbers are still arbitrary; they are the settings we can tune.
# K-fold initialisation
k_folds = 5
batch_size = 16
kf = KFold(n_splits=k_folds, shuffle=True, random_state=42)


# Number of features per node and per edge, read from the data so they always match the graphs
node_in_feats = graphs[0].x.size(-1)
edge_in_feats = graphs[0].edge_attr.size(-1)
# Size of the hidden layer
hidden_feats = 64


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

for fold, (train_idx, test_idx) in enumerate(kf.split(graphs)):
    print(f'Fold {fold + 1}')
    
    train_graphs = [graphs[i] for i in train_idx]
    test_graphs = [graphs[i] for i in test_idx]

    train_loader = DataLoader(train_graphs, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_graphs, batch_size=batch_size, shuffle=False)

    # A fresh model and optimizer per fold: re-using one model would carry weights that were
    # trained on another fold's training data (which includes this fold's test graphs).
    model = MPNN(node_in_feats=node_in_feats, edge_in_feats=edge_in_feats, hidden_feats=hidden_feats).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # train() returns the mean loss of one epoch, not the model, so it must not be assigned
    # back to `model`. It already iterates over every batch in train_loader (forward pass,
    # loss, backpropagation, optimizer step), so no manual batch loop is needed here.
    loss = train(model, train_loader, optimizer, device)

    