In [1]:
import pandas as pd
import numpy as np
import re 
import os
import sys

# These imports are also present in rdkithelperdinges
# noinspection PyPackageRequirements
import rdkit

# noinspection PyPackageRequirements
import rdkit.Chem.AllChem

# noinspection PyPackageRequirements
import rdkit.Chem.Draw

# noinspection PyPackageRequirements
import rdkit.Chem.rdmolfiles

from rdkit.Chem.rdmolfiles import  MolFromXYZFile
from rdkit.Chem import AllChem

# Load the data, couple the SMILES to the yields, and remove NaNs

In [2]:
# --- 1. Locations of the raw text files ---
yields_path = "data/compounds_yield.csv"
smiles_path = "data/compounds_smiles.csv"

# --- 2. Yield parser: keep only the highest percentage found on each line ---
yield_data = []
with open(yields_path, "r") as f:
    for raw_line in f:
        # A usable line is "<compound_id> <free-text yield info>"; blank lines
        # and lines without a space-separated remainder are skipped.
        fields = raw_line.strip().split(" ", 1)
        if len(fields) != 2:
            continue

        compound_id, yield_info = fields
        found = [int(pct) for pct in re.findall(r'(\d+)%', yield_info)]
        if not found:
            continue

        # The maximum is stored as a string, not as an integer.
        yield_data.append((compound_id, str(max(found))))

df_yields_clean = pd.DataFrame(yield_data, columns=["compound_id", "yield"])

# --- 3. SMILES parser ---
smiles_data = []
with open(smiles_path, "r") as f:
    for raw_line in f:
        fields = [field.strip() for field in raw_line.strip().split(",")]
        if len(fields) != 4:
            continue

        # Keep the first three columns; the fourth (smiles_normalized) is ignored.
        smiles_data.append(tuple(fields[:3]))

df_smiles_clean = pd.DataFrame(
    smiles_data,
    columns=["compound_id", "smiles_raw", "some_number"],
)

# --- 4. Inner merge on compound_id ---
df_merged = df_smiles_clean.merge(df_yields_clean, on="compound_id", how="inner")

print("Merged DataFrame:")
print(df_merged)


Merged DataFrame:
   compound_id                                        smiles_raw some_number  \
0        comp1                            C1C2=C(N=C(O2)C)C=CC=1           1   
1        comp2                        C1C2=C(N=C(O2)C)C=C(Br)C=1          11   
2        comp3                            C1C2=C(N=C(N2)C)C=CC=1           2   
3        comp4                          C1(Cl)=CC=NC2NC(C)=CC1=2          10   
4        comp5                        C1C2=C(N=C(O2)C)C=C(OC)C=1           1   
..         ...                                               ...         ...   
78      comp88  C1C=CC=C2N(C(=O)OC(C)(C)C)C=C([Si]([H])(C)C)C=12           1   
79      comp91                                    C1=CC(F)=NC=C1           1   
80      comp92                                    C1=CC(F)=NC=C1           2   
81      comp93                                    C1=CC(F)=NC=C1           7   
82      comp97                           N1=CC=C(C(F)(F)F)C=C1Cl           2   

   yield  
0     68  

Convert the SMILES to Graphs

In [3]:
def smiles_to_rdkit_molecule(smiles_str):
    """Build an RDKit molecule with explicit hydrogens and an embedded 3D conformer.

    Parameters
    ----------
    smiles_str : str
        SMILES string describing the molecule.

    Returns
    -------
    rdkit.Chem.rdchem.Mol
        The parsed molecule with hydrogens added and a conformer generated
        by ETKDGv3 embedding.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smiles_str``, or if no 3D conformer could be
        embedded (even after retrying from random starting coordinates).
    """
    # MolFromSmiles signals a parse failure by returning None rather than raising.
    mol = rdkit.Chem.MolFromSmiles(smiles_str)
    if mol is None:
        raise ValueError(f"RDKit could not parse SMILES: {smiles_str!r}")

    # Explicit hydrogens are added before the 3D embedding.
    mol = rdkit.Chem.AddHs(mol)

    # EmbedMolecule returns the new conformer id, or -1 when embedding fails.
    if AllChem.EmbedMolecule(mol, AllChem.ETKDGv3()) == -1:
        # Retry once from random starting coordinates before giving up.
        retry_params = AllChem.ETKDGv3()
        retry_params.useRandomCoords = True
        if AllChem.EmbedMolecule(mol, retry_params) == -1:
            raise ValueError(
                f"Could not embed a 3D conformer for SMILES: {smiles_str!r}"
            )

    return mol

In [4]:
def rdkit_molecule_to_xyz_data(molecule):
    """Return the XYZ-format text block RDKit produces for ``molecule``."""
    return rdkit.Chem.MolToXYZBlock(molecule)

In [5]:
# Quick check of the SMILES -> RDKit molecule -> XYZ pipeline on one compound.
smiles_test = "C1C2=C(N=C(O2)C)C=C(Br)C=1"

# Printing the molecule only shows the Mol object's repr.
test1 = smiles_to_rdkit_molecule(smiles_test)
print(test1)

# The XYZ block starts with the atom count, followed by one coordinate line per atom.
test1_xyz = rdkit_molecule_to_xyz_data(test1)
print(test1_xyz)


<rdkit.Chem.rdchem.Mol object at 0x108237840>
17

C     -1.372177   -0.952395   -1.082560
C     -0.176280   -0.454749   -0.564692
C     -0.208677    0.394170    0.523757
N      1.089626    0.711154    0.800539
C      1.878803    0.062712   -0.106496
O      1.088560   -0.630564   -0.914587
C      3.364574    0.139175   -0.155743
C     -1.382347    0.785248    1.139820
C     -2.565536    0.292564    0.627042
Br    -4.202656    0.817841    1.460523
C     -2.545752   -0.564451   -0.470510
H     -1.350857   -1.614712   -1.931367
H      3.776957   -0.759321    0.346060
H      3.738839    0.196224   -1.216588
H      3.685350    1.058735    0.407003
H     -1.341482    1.450747    1.986537
H     -3.476944   -0.932379   -0.848739

