# GEOM-Drugs processing

In [None]:
import os
import sys

In [None]:
# Resolve the notebook's working directory to an absolute path and append it
# to the module search path (presumably so the project-local `semlaflow`
# imports resolve -- confirm against the repository layout).
project_dir = os.path.abspath("")
sys.path.append(project_dir)

In [None]:
from functools import partial
from pathlib import Path

import pandas as pd
import rdkit
from rdkit import Chem, rdBase
from rdkit.Chem import rdDepictor
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem.rdmolfiles import SDMolSupplier, SDWriter
from rdkit.Chem.rdmolops import RemoveHs, SanitizeMol, AddHs
from tqdm import tqdm

import semlaflow.scriptutil as util
from semlaflow.data.datasets import GeometricDataset


In [None]:
# Turn off RDKit's "rdApp.warning" log channel.
rdBase.DisableLog("rdApp.warning")  # disable warning on MorganGenerator
# Ask the depictor to prefer CoordGen when generating 2D coordinates.
rdDepictor.SetPreferCoordGen(True)
# Print the RDKit version so it is recorded in the notebook output.
print(rdkit.__version__)

## Load dataset from Semla files

In [None]:
# Load the GEOM-Drugs test split from its .smol file, attaching the molecule
# transform configured with the vocabulary, bond-type count and coordinate
# standard deviation.
dataset_path = f"{project_dir}/data/geom-drugs/smol/test.smol"

n_molecules = 100
n_bond_types = 5
vocab = util.build_vocab()
coord_std = util.GEOM_COORDS_STD_DEV

transform = partial(
    util.mol_transform,
    vocab=vocab,
    n_bonds=n_bond_types,
    coord_std=coord_std,
)
dataset = GeometricDataset.load(dataset_path, transform=transform)

## Convert one molecule to RDKit

In [None]:
# Convert the first stored molecule to its RDKit form using the vocabulary;
# as the last expression in the cell, the result is shown as the cell output.
dataset._data[0].to_rdkit(vocab)

## Convert smol files to SDF files

In [None]:
# Export the test split to "test.sdf": every stored molecule is converted to
# RDKit and written as one SDF record.
dataset_path = project_dir + "/data/geom-drugs/smol/test.smol"

n_bond_types = 5
vocab = util.build_vocab()
coord_std = util.GEOM_COORDS_STD_DEV
transform = partial(
    util.mol_transform, vocab=vocab, n_bonds=n_bond_types, coord_std=coord_std
)
dataset = GeometricDataset.load(dataset_path, transform=transform)

file = "test.sdf"
writer = SDWriter(file)
try:
    for data in tqdm(dataset._data):
        mol = data.to_rdkit(vocab)
        writer.write(mol)
finally:
    # Close explicitly so buffered records are flushed to disk and the file
    # handle is released, even if a conversion or write fails part-way.
    writer.close()

In [None]:
# Export the validation split to "val.sdf": every stored molecule is converted
# to RDKit and written as one SDF record.
dataset_path = project_dir + "/data/geom-drugs/smol/val.smol"

n_bond_types = 5
vocab = util.build_vocab()
coord_std = util.GEOM_COORDS_STD_DEV
transform = partial(
    util.mol_transform, vocab=vocab, n_bonds=n_bond_types, coord_std=coord_std
)
dataset = GeometricDataset.load(dataset_path, transform=transform)

file = "val.sdf"
writer = SDWriter(file)
try:
    for data in tqdm(dataset._data):
        mol = data.to_rdkit(vocab)
        writer.write(mol)
finally:
    # Close explicitly so buffered records are flushed to disk and the file
    # handle is released, even if a conversion or write fails part-way.
    writer.close()

In [None]:
# Export the training split to "train.sdf": every stored molecule is converted
# to RDKit and written as one SDF record.
dataset_path = project_dir + "/data/geom-drugs/smol/train.smol"

n_bond_types = 5
vocab = util.build_vocab()
coord_std = util.GEOM_COORDS_STD_DEV
transform = partial(
    util.mol_transform, vocab=vocab, n_bonds=n_bond_types, coord_std=coord_std
)
dataset = GeometricDataset.load(dataset_path, transform=transform)

file = "train.sdf"
writer = SDWriter(file)
try:
    for data in tqdm(dataset._data):
        mol = data.to_rdkit(vocab)
        writer.write(mol)
finally:
    # Close explicitly so buffered records are flushed to disk and the file
    # handle is released, even if a conversion or write fails part-way.
    writer.close()

## Convert smol files to SMILES

In [None]:
# Load the GEOM-Drugs training split from its .smol file, attaching the
# molecule transform configured with the vocabulary, bond-type count and
# coordinate standard deviation.
dataset_path = f"{project_dir}/data/geom-drugs/smol/train.smol"

n_bond_types = 5
vocab = util.build_vocab()
coord_std = util.GEOM_COORDS_STD_DEV

transform = partial(
    util.mol_transform,
    vocab=vocab,
    n_bonds=n_bond_types,
    coord_std=coord_std,
)
dataset = GeometricDataset.load(dataset_path, transform=transform)


In [None]:
def smol_to_smiles(smol):
    """Return the canonical SMILES of *smol* with hydrogens removed.

    The molecule is converted to RDKit with the module-level ``vocab``,
    stripped of hydrogens and serialised with ``Chem.MolToSmiles``.

    Args:
        smol: Object exposing ``to_rdkit(vocab)``.

    Returns:
        The canonical SMILES string, or ``None`` when any conversion step
        raises an ordinary exception.
    """
    try:
        mol = smol.to_rdkit(vocab)
        mol = RemoveHs(mol)
        return Chem.MolToSmiles(mol, canonical=True)
    except Exception:
        # Best-effort conversion: a molecule that cannot be converted is
        # reported as None. Only ``Exception`` is caught so that
        # KeyboardInterrupt/SystemExit still stop a long-running loop.
        return None

In [None]:
# Convert every molecule in the dataset and keep the distinct SMILES strings,
# discarding failed conversions (None) and empty strings.
smiles = {
    smi
    for smi in map(smol_to_smiles, tqdm(dataset._data))
    if smi is not None and smi != ""
}
len(smiles)

In [None]:
# Write the unique SMILES to the reference file, one per line.
out_path = "evaluation/truth/train.smiles"
# Create the target directory up front so the write does not fail on a fresh
# checkout where "evaluation/truth" does not exist yet.
os.makedirs(os.path.dirname(out_path), exist_ok=True)
with open(out_path, "w", encoding="utf-8") as f:
    # `smiles` is a set, whose iteration order can differ between runs;
    # sorting makes the output file reproducible.
    f.writelines(s + "\n" for s in sorted(smiles))