In [None]:
import ast

import numpy as np
import pandas as pd

from rdkit import Chem
from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')

import matplotlib.pyplot as plt
# Default matplotlib text settings: Arial as the sans-serif face, sans-serif as
# the active family, 10 pt base size.
plt.rcParams.update({
    'font.sans-serif': "Arial",
    'font.family': "sans-serif",
    'font.size': 10,
})

#### COCONUT DB: Extraction

SQL query to extract the required data from the COCONUT database.

    \COPY (
        SELECT
            m.identifier AS identifier,
            m.canonical_smiles AS smiles,
            STRING_AGG(c.doi, ', ') AS dois,
            p.chemical_class,
            p.chemical_sub_class,
            p.chemical_super_class
        FROM
            citations c
        JOIN
            citables ct ON c.id = ct.citation_id
        JOIN
            molecules m ON m.id = ct.citable_id
        JOIN
            properties p ON p.molecule_id = m.id
        WHERE
            ct.citable_type = 'App\Models\Molecule'
            AND c.doi IS NOT NULL
        GROUP BY
            m.identifier, m.canonical_smiles, p.chemical_class, p.chemical_sub_class, p.chemical_super_class
    ) TO 'output.csv' WITH CSV HEADER;

#### COCONUT DB: Formatting

Import extracted dataset

In [None]:
# Load the raw COCONUT export. The cells below read its 'identifier', 'smiles',
# 'dois' and chemical class columns.
# NOTE(review): the SQL query above writes to 'output.csv'; presumably that file
# was moved/renamed to this path -- confirm.
df = pd.read_csv('data/coconut/coconut-raw.csv')

Add absolute SMILES (non-isomeric SMILES, i.e. with the stereochemical information removed)

In [None]:
def _to_absolute_smiles(smiles):
    """Re-write `smiles` through RDKit as a non-isomeric, non-kekulized SMILES."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol, isomericSmiles=False, kekuleSmiles=False)


df['absolute_smiles'] = df['smiles'].apply(_to_absolute_smiles)

Remove compounds with incomplete stereochemistry

In [None]:
def get_stereocenters(mol):
    """List the potential stereo elements of `mol` as string triples.

    Each entry is ``(centeredOn, type, specified)`` taken from the objects
    returned by ``Chem.FindPotentialStereo``, with every field rendered
    as a string.
    """
    return [
        (f'{element.centeredOn}', f'{element.type}', f'{element.specified}')
        for element in Chem.FindPotentialStereo(mol)
    ]

def has_complete_stereochemistry(smiles):
    """Tell whether every potential stereo element of `smiles` is assigned.

    Parameters
    ----------
    smiles : str
        SMILES string to inspect.

    Returns
    -------
    bool
        False as soon as one element reported by ``Chem.FindPotentialStereo``
        is flagged ``Chem.StereoSpecified.Unspecified``; True otherwise
        (including molecules without any potential stereo element).

    Raises
    ------
    ValueError
        If RDKit cannot parse `smiles` (``Chem.MolFromSmiles`` returned None).
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f'Could not parse SMILES: {smiles!r}')

    # Compare against the enum member itself (as count_unassigned_stereocenters
    # does) instead of its string rendering, so the check does not depend on
    # how the enum happens to be formatted.
    unspecified = Chem.StereoSpecified.Unspecified
    return not any(
        info.specified == unspecified
        for info in Chem.FindPotentialStereo(mol)
    )

# Flag each record, then partition the table on that flag. The helper column is
# removed from both partitions (it remains on `df` itself).
df['is_complete'] = df['smiles'].apply(has_complete_stereochemistry)

fully_assigned = df['is_complete'].eq(True)
df_complete = df[fully_assigned].drop(columns=['is_complete'])
df_incomplete = df[~fully_assigned].drop(columns=['is_complete'])

print(f'Complete (isomeric): {len(df_complete)}')
print(f'Incomplete (isomeric): {len(df_incomplete)}')

Export dataset with incomplete assignments for later analysis

In [None]:
def count_total_stereocenters(smiles):
    """Count the potential stereo elements RDKit reports for `smiles`."""
    return len(Chem.FindPotentialStereo(Chem.MolFromSmiles(smiles)))

def count_unassigned_stereocenters(smiles):
    """Count the potential stereo elements of `smiles` that are left unspecified.

    An element counts when its ``specified`` field equals
    ``Chem.StereoSpecified.Unspecified``.
    """
    mol = Chem.MolFromSmiles(smiles)
    unspecified = Chem.StereoSpecified.Unspecified
    return sum(
        1
        for element in Chem.FindPotentialStereo(mol)
        if element.specified == unspecified
    )

# Annotate the incomplete records with their total and unassigned stereo element
# counts, then write the columns needed for the later analysis.
df_incomplete['num_stereocenters'] = df_incomplete['smiles'].apply(count_total_stereocenters)
df_incomplete['num_unassigned'] = df_incomplete['smiles'].apply(count_unassigned_stereocenters)

incomplete_report_columns = ['identifier', 'smiles', 'dois', 'num_stereocenters', 'num_unassigned']
df_incomplete[incomplete_report_columns].to_csv(
    'data/coconut/coconut-incomplete.csv',
    index=False,
)

Group by absolute SMILES

In [None]:
def _group_by_absolute_smiles(frame):
    """Collapse `frame` to one row per absolute SMILES.

    'smiles', 'identifier' and 'dois' are gathered into lists; the three
    chemical class columns keep the group's 'first' value.
    """
    aggregation = {
        'smiles': list,
        'identifier': list,
        'chemical_class': 'first',
        'chemical_sub_class': 'first',
        'chemical_super_class': 'first',
        'dois': list,
    }
    return frame.groupby('absolute_smiles').agg(aggregation).reset_index()


grouped_df_complete = _group_by_absolute_smiles(df_complete)
grouped_df_incomplete = _group_by_absolute_smiles(df_incomplete)

print(f'Complete (absolute): {len(grouped_df_complete)}')
print(f'Incomplete (absolute): {len(grouped_df_incomplete)}')

Calculate number of stereocenters; determine number of stereoisomers

In [None]:
# Same two derived columns for both grouped tables:
#   num_stereocenters - potential stereo elements found on the absolute SMILES
#   num_stereoisomers - number of isomeric SMILES collected in the group
for grouped in (grouped_df_complete, grouped_df_incomplete):
    grouped['num_stereocenters'] = grouped['absolute_smiles'].apply(count_total_stereocenters)
    grouped['num_stereoisomers'] = grouped['smiles'].apply(len)

Export dataset

In [None]:
# Persist both grouped tables (index omitted). The list-valued columns are
# written to CSV as text, which is why the "Data Splits" section parses
# 'smiles' and 'identifier' back into lists after reloading.
grouped_df_complete.to_csv('data/coconut/coconut-complete-clean.csv', index=False)
grouped_df_incomplete.to_csv('data/coconut/coconut-incomplete-clean.csv', index=False)

#### COCONUT DB: Data Splits

Import clean dataset

In [None]:
# Reload the grouped, stereo-complete dataset written by the formatting section.
df = pd.read_csv('data/coconut/coconut-complete-clean.csv')

# The list columns come back from CSV as the text form of a Python list.
# ast.literal_eval only accepts Python literals, so a malformed or malicious
# cell raises instead of being executed as code (which eval would do).
for list_column in ('smiles', 'identifier'):
    df[list_column] = df[list_column].apply(ast.literal_eval)

Shuffle dataset

In [None]:
# Shuffle all rows (frac=1) with a fixed random_state so the order is repeatable,
# then renumber the index from 0.
df_shuffled = df.sample(frac=1, random_state=42).reset_index(drop=True)

Train-validation-test split on the level of absolute SMILES

In [None]:
def generate_split_dataset(df, seed=42, output_dir='data/coconut'):
    """Assign a train/test/validation split per absolute SMILES and export it.

    Every row of `df` (one absolute SMILES with parallel lists of isomeric
    SMILES and identifiers) is drawn into 'train', 'test' or 'validation' with
    probabilities 0.8 / 0.1 / 0.1. A fixed set of named structures is then
    forced into 'test'. The table is finally expanded to one row per isomeric
    SMILES, so all stereoisomers of a structure share the same split.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns 'absolute_smiles', 'smiles' and 'identifier', the
        latter two holding equally long lists per row. It is not modified.
    seed : int, default 42
        Seed for the split assignment; also part of the output file name.
    output_dir : str, default 'data/coconut'
        Directory receiving ``coconut-split-{seed}.csv``.

    Returns
    -------
    pandas.DataFrame
        One row per unique isomeric SMILES, with its own identifier and the
        'split' label of its parent structure.
    """
    # Work on a copy so that repeated calls (one per seed) never alter the
    # caller's frame.
    df = df.copy()

    # A private RandomState yields the same draws as np.random.seed(seed)
    # followed by np.random.choice, without reseeding NumPy's global generator.
    rng = np.random.RandomState(seed)
    df['split'] = rng.choice(
        ['train', 'test', 'validation'],
        size=df.shape[0],
        p=[0.8, 0.1, 0.1]
    )

    # Structures that must always end up in the test set, keyed by name for
    # readability; only the absolute SMILES values are used for matching.
    test_structures = {
        'Colchicine': 'COc1cc2c(c(OC)c1OC)-c1ccc(OC)c(=O)cc1C(NC(C)=O)CC2',
        'Epothilone B': 'CC(=Cc1csc(C)n1)C1CC2OC2(C)CCCC(C)C(O)C(C)C(=O)C(C)(C)C(O)CC(=O)O1',
        'Taxol': 'CC(=O)OC1C(=O)C2(C)C(O)CC3OCC3(OC(C)=O)C2C(OC(=O)c2ccccc2)C2(O)CC(OC(=O)C(O)C(NC(=O)c3ccccc3)c3ccccc3)C(C)=C1C2(C)C',
        'Griseofulvin': 'COC1=CC(=O)CC(C)C12Oc1c(Cl)c(OC)cc(OC)c1C2=O',
        'Monomethyl Auristatin E': 'CCC(C)C(C(CC(=O)N1CCCC1C(OC)C(C)C(=O)NC(C)C(O)c1ccccc1)OC)N(C)C(=O)C(NC(=O)C(NC)C(C)C)C(C)C'
    }
    held_out = df['absolute_smiles'].isin(list(test_structures.values()))
    df.loc[held_out, 'split'] = 'test'

    # Explode both list columns together so each SMILES stays paired with its
    # own identifier. Exploding them one after the other builds the cross
    # product, and de-duplicating on 'smiles' then labels every stereoisomer
    # with the first identifier of its group.
    df_exploded = (
        df.explode(['smiles', 'identifier'])
        .drop_duplicates(subset=['smiles'])
        .reset_index(drop=True)
    )
    df_exploded.to_csv(f'{output_dir}/coconut-split-{seed}.csv', index=False)

    return df_exploded

# Write one split file per seed from the same shuffled table.
split_seeds = (0, 1, 42)
for i in split_seeds:
    generate_split_dataset(df_shuffled, seed=i)