In [2]:
import ast

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

import matplotlib.pyplot as plt
# Figure typography: 10 pt text, sans-serif family resolved to Arial.
plt.rcParams.update({
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
    'font.size': 10,
})

#### COCONUT DB: Extraction

SQL query to extract the required data from the COCONUT database.

    \COPY (
        SELECT
            m.identifier AS identifier,
            m.canonical_smiles AS smiles,
            STRING_AGG(c.doi, ', ') AS dois,
            p.chemical_class,
            p.chemical_sub_class,
            p.chemical_super_class
        FROM
            citations c
        JOIN
            citables ct ON c.id = ct.citation_id
        JOIN
            molecules m ON m.id = ct.citable_id
        JOIN
            properties p ON p.molecule_id = m.id
        WHERE
            ct.citable_type = 'App\Models\Molecule'
            AND c.doi IS NOT NULL
        GROUP BY
            m.identifier, m.canonical_smiles, p.chemical_class, p.chemical_sub_class, p.chemical_super_class
    ) TO 'output.csv' WITH CSV HEADER;

#### COCONUT DB: Formatting

Import extracted dataset

In [2]:
# Load the raw CSV extract into a DataFrame.
df = pd.read_csv(filepath_or_buffer='data/coconut/coconut_raw.csv')

Add absolute SMILES

In [3]:
# Re-write every SMILES through RDKit with isomericSmiles=False and store the
# result in a new 'absolute_smiles' column.
df['absolute_smiles'] = df['smiles'].apply(
    lambda smi: Chem.MolToSmiles(
        Chem.MolFromSmiles(smi),
        isomericSmiles=False,
    )
)

Remove compounds with incomplete stereochemistry

In [4]:
def get_stereocenters(mol):
    """List the potential stereo elements of an RDKit molecule.

    Every entry reported by ``Chem.FindPotentialStereo(mol)`` is turned into a
    3-tuple of strings: ``(centeredOn, type, specified)``.

    Returns:
        list[tuple[str, str, str]]: one tuple per reported stereo element, in
        the order RDKit returned them (empty list when none are reported).
    """
    return [
        (f'{element.centeredOn}', f'{element.type}', f'{element.specified}')
        for element in Chem.FindPotentialStereo(mol)
    ]

def has_complete_stereochemistry(smiles):
    """Tell whether no stereo element of ``smiles`` is marked 'Unspecified'.

    The SMILES is parsed with RDKit and its stereo elements are obtained from
    ``get_stereocenters``; the third field of each tuple is the specification
    status as a string.

    Returns:
        bool: ``False`` as soon as one element reads ``'Unspecified'``,
        ``True`` otherwise (including when there are no stereo elements).
    """
    mol = Chem.MolFromSmiles(smiles)
    return all(
        status != 'Unspecified'
        for _, _, status in get_stereocenters(mol)
    )

# Keep only rows whose SMILES passes has_complete_stereochemistry, then
# renumber the index from zero.
df = (
    df.loc[df['smiles'].apply(has_complete_stereochemistry)]
    .reset_index(drop=True)
)

Group by absolute SMILES

In [5]:
# One row per 'absolute_smiles': collect the per-row SMILES, identifiers and
# DOIs into lists, and keep the first chemical classification of each group.
grouped_df = (
    df.groupby('absolute_smiles')
    .agg({
        'smiles': list,
        'identifier': list,
        'chemical_class': 'first',
        'chemical_sub_class': 'first',
        'chemical_super_class': 'first',
        'dois': list,
    })
    .reset_index()
)

Calculate number of stereocenters; determine number of stereoisomers

In [6]:
# Parse each 'absolute_smiles' into an RDKit molecule.
grouped_df['mol'] = grouped_df['absolute_smiles'].map(Chem.MolFromSmiles)
# Number of stereo elements get_stereocenters reports for that molecule.
grouped_df['num_stereocenters'] = grouped_df['mol'].map(get_stereocenters).map(len)
# Number of SMILES collected in the group's 'smiles' list.
grouped_df['num_stereoisomers'] = grouped_df['smiles'].map(len)

Export dataset

In [7]:
# The 'mol' column holds RDKit Mol objects; written to CSV they become opaque
# "<rdkit.Chem.rdchem.Mol object at 0x...>" strings whose addresses change on
# every run. Leave that column out of the export (errors='ignore' keeps this
# cell working even if 'mol' was never created).
grouped_df.drop(columns=['mol'], errors='ignore').to_csv(
    'data/coconut/coconut_clean.csv', index=False
)

#### COCONUT DB: Analysis

Import clean dataset

In [8]:
# Reload the cleaned, grouped dataset from disk for the analysis below.
df = pd.read_csv(filepath_or_buffer='data/coconut/coconut_clean.csv')

Show stereocenter counts

In [9]:
# Frequency of each 'num_stereocenters' value, most common first.
df.loc[:, 'num_stereocenters'].value_counts()

num_stereocenters
0      12095
5       5770
4       5490
2       4972
3       4693
       ...  
72         1
55         1
61         1
58         1
105        1
Name: count, Length: 67, dtype: int64

Show stereoisomer counts

In [10]:
# Frequency of each 'num_stereoisomers' value, most common first.
df.loc[:, 'num_stereoisomers'].value_counts()

num_stereoisomers
1     56648
2      6107
3       800
4       303
5        67
6        22
7        14
8         4
10        3
9         2
11        1
Name: count, dtype: int64

#### COCONUT DB: Data Splits

Import clean dataset

In [3]:
# Reload the cleaned, grouped dataset from disk.
df = pd.read_csv('data/coconut/coconut_clean.csv')

# The list-valued columns come back from CSV as the text of Python list
# literals. Parse them with ast.literal_eval, which only accepts literals,
# rather than eval, which would execute any code embedded in the file.
df['smiles'] = df['smiles'].apply(ast.literal_eval)
df['identifier'] = df['identifier'].apply(ast.literal_eval)

Shuffle dataset

In [4]:
# Random permutation of all rows (fixed seed), with a fresh 0..n-1 index.
df_shuffled = (
    df
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

Train-validation-test split on the level of absolute SMILES

In [5]:
# Seed NumPy's global RNG, then draw one split label per row with
# probabilities 0.8 / 0.1 / 0.1 for train / test / validation.
np.random.seed(42)
df_shuffled['split'] = np.random.choice(
    a=['train', 'test', 'validation'],
    size=len(df_shuffled),
    p=[0.8, 0.1, 0.1],
)

Assign selected compounds to the test set

In [6]:
# Named compounds that are forced into the test split. Each value is matched
# by exact string equality against the 'absolute_smiles' column.
test_structures = {
    'Colchicine': 'COc1cc2c(c(OC)c1OC)-c1ccc(OC)c(=O)cc1C(NC(C)=O)CC2',
    'Epothilone B': 'CC(=Cc1csc(C)n1)C1CC2OC2(C)CCCC(C)C(O)C(C)C(=O)C(C)(C)C(O)CC(=O)O1',
    'Taxol': 'CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(=O)c2ccccc2)C2(O)CC(OC(=O)C(O)C(NC(=O)c3ccccc3)c3ccccc3)C(C)=C1C2(C)C',
    'Griseofulvin': 'COC1=CC(=O)CC(C)C12Oc1c(Cl)c(OC)cc(OC)c1C2=O',
    'Monomethyl Auristatin E': 'CCC(C)C(C(CC(=O)N1CCCC1C(OC)C(C)C(=O)NC(C)C(O)c1ccccc1)OC)N(C)C(=O)C(NC(=O)C(NC)C(C)C)C(C)C'
}

for structure, smiles in test_structures.items():
    idx = df_shuffled.index[df_shuffled['absolute_smiles'] == smiles]
    if idx.empty:
        # Without this check a string mismatch is a silent no-op and the
        # compound keeps whatever random split it was given.
        print(f"Warning: no row matches '{structure}'; it was NOT assigned to the test set")
    df_shuffled.loc[idx, 'split'] = 'test'

In [7]:
# Persist the shuffled, split-annotated table (row index not written).
df_shuffled.to_csv(path_or_buf='data/coconut/coconut_clean_splits.csv', index=False)

Explode the dataset to the level of stereoisomers

In [15]:
# 'smiles' and 'identifier' are parallel lists built from the same rows of each
# group, so they must be exploded together. Exploding them one after the other
# forms every smiles x identifier combination, and de-duplicating on 'smiles'
# then gives every stereoisomer the first identifier of its group.
df_exploded = (
    df_shuffled
    .explode(['smiles', 'identifier'])
    # A SMILES that occurs more than once keeps its first row only.
    .drop_duplicates(subset=['smiles'])
    .reset_index(drop=True)
)

Export dataset

In [16]:
# Persist the per-stereoisomer table (row index not written).
df_exploded.to_csv(path_or_buf='data/coconut/coconut_split.csv', index=False)