In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw

# chemistry libraries
from openbabel import openbabel as ob
from openbabel import pybel

import numpy as np
import pandas as pd
import multiprocessing as mp

# file reading / saving
import pickle

from molecule_functions import bernoulli_generator, sdf_to_smi
from dg_functions import mols_dg_errors, chemspace_paths, chemspace_root, chemspace_files

Defining locations of saved, enumerated chemical spaces. 

Note that RDKit does not deal with pentavalent nitrogens, and these will not be read in.

Converting some SDF files to SMILES format, which is more compact. 

Sampling the database:
$\mathrm{CHNOPS}$, $w \leq 125$, originally $\approx 33{,}000{,}000$ entries, sampled at rates of $\frac{1}{100}$, $\frac{1}{300}$ and $\frac{1}{600}$ to get $\approx 338{,}000$, $\approx 113{,}000$ and $\approx 56{,}000$ molecules respectively,

and converting the PubChem and ChEMBL molecules directly to SMILES.

In [3]:
# PubChem and ChEMBL conversion: SDF -> SMILES, called with sample=1.0.
# Each input is <dataset>.sdf under chemspace_root; the output filename is
# looked up in chemspace_files and written under the same root.
for dataset in ('pubchem_90k', 'chembl_100k'):
    sdf_path = f'{chemspace_root}{dataset}.sdf'
    smi_path = f"{chemspace_root}{chemspace_files[dataset]}"
    sdf_to_smi(sdf_path, smi_path, sample=1.0)

Converted 86106 molecules successfully and written to: ./molecules/pubchem_90k.smi
Converted 93350 molecules successfully and written to: ./molecules/chembl_100k.smi


In [5]:
# Converting the <= 125 chemical space. The source SDF holds ~33 million entries,
# so all three sub-samples are drawn in ONE pass over the file instead of calling
# sdf_to_smi once per sample.
# NOTE(review): the sampling is not seeded in this cell — confirm whether
# bernoulli_generator can take a seed if reproducible samples are required.

# (sample name used as key into chemspace_files, sampling rate given to bernoulli_generator)
SAMPLE_RATES = (
    ('125_338k', 1/100),
    ('125_113k', 1/300),
    ('125_56k', 1/600),
)
# Molecules between progress updates; printing on every one of ~33M iterations
# is slow and floods the cell output.
PROGRESS_EVERY = 100_000

samples_data = []
try:
    for name, rate in SAMPLE_RATES:
        # Written under chemspace_root, like the other conversions in this notebook;
        # the bare filename would land in the current working directory instead.
        path = f"{chemspace_root}{chemspace_files[name]}"
        samples_data.append({
            'name': name,
            'path': path,
            'writer': pybel.Outputfile('smi', path, overwrite=True),
            'sampler': bernoulli_generator(rate),
            'size': 0
        })

    for i, molecule in enumerate(pybel.readfile('sdf', f'{chemspace_root}chemspace_125.sdf')):
        if i % PROGRESS_EVERY == 0:
            print(f'Progress: {i}', end='\r')
        # Each sample draws independently, so one molecule may go to several files.
        for data in samples_data:
            if next(data['sampler']):
                data['size'] += 1
                data['writer'].write(molecule)
finally:
    # Always close the writers (even if the pass fails part-way) so buffered
    # output is flushed to disk and the file handles are released.
    for data in samples_data:
        data['writer'].close()

for data in samples_data:
    print(f"Converted {data['size']} molecules successfully and written to: {data['path']}")

Converted 337956 molecules successfully and written to: chemspace_125_338k.smi
Converted 113221 molecules successfully and written to: chemspace_125_113k.smi
Converted 56356 molecules successfully and written to: chemspace_125_56k.smi


In [2]:
# C6H6 and C5H6 isomers: SDF -> SMILES, called with sample=1.0.
# The C6H6 conversion is currently disabled; only C5H6 is converted here.
#sdf_to_smi(f'{chemspace_root}C6H6.sdf', 
#           f"{chemspace_root}{chemspace_files['c6h6']}", sample=1.0)
c5h6_sdf = f'{chemspace_root}C5H6.sdf'
c5h6_smi = f"{chemspace_root}{chemspace_files['c5h6']}"
sdf_to_smi(c5h6_sdf, c5h6_smi, sample=1.0)

Converted 40 molecules successfully and written to: ./molecules/C5H6.smi
