# Prepare Data Sets

Prepare the necessary reference data sets.

1. Natural products from ChEMBL (ChEMBL NPs)
2. Approved and experimental drugs from DrugBank (DrugBank)

In [1]:
%reload_ext autoreload
%autoreload 2
# Alternative (disabled): replace `warnings.warn` with a no-op function
# to silence scikit-learn warnings.
# def warn(*args, **kwargs):
#     pass

import warnings
# Suppress all Python warnings for the remainder of the kernel session.
warnings.filterwarnings('ignore')
# warnings.warn = warn

# Global Imports
# from collections import Counter
# import glob
from pathlib import Path
import sys

import pandas as pd
import numpy as np
# from pandarallel import pandarallel
# pandarallel.initialize()

# from scipy.stats import median_absolute_deviation as mad

from rdkit import DataStructs
from rdkit.Chem import AllChem as Chem
# from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole

from Contrib.NP_Score import npscorer

import matplotlib.pyplot as plt
import seaborn as sns

from IPython.core.interactiveshell import InteractiveShell
# Display the result of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Project-local imports.
# PROJECT_DIR is the second entry in the parent chain of the absolute form of
# "..", i.e. the directory that contains the current working directory.
# It is put on sys.path so that the modules below can be imported.
PROJECT_DIR = Path("..").absolute().parents[1]
sys.path.append(str(PROJECT_DIR))
import plt_style
import utils as u
from utils import lp

# Directory for intermediate files; no error if it already exists.
Path("../tmp").mkdir(exist_ok=True)

## ChEMBL NP

TODO: Describe the process of obtaining the records from ChEMBL.

* 40525 cpds.
* 35026 cpds. after deglycosylation, standardization and deduplication

### Loading

Load the result of the SQL query, the ChEMBL IDs that have "J. Nat. Prod." or "J Nat Prod" listed as journal:

In [2]:
# Collect the "Chembl_Id" column of the SQL export into a set of IDs.
chembl_np_ids = set(
    pd.read_csv(
        "../Input Data/chembl_26_natprot_ids_only.tsv",
        sep="\t",
    )["Chembl_Id"].values
)
lp(chembl_np_ids)

Length                                            :      40547


Load the downloaded ChEMBL 26 SD file (gzipped) and keep only the natural product IDs:

In [4]:
# Takes 10min
# Read the gzipped ChEMBL 26 SD file; `merge_prop` / `merge_list` presumably
# restrict the result to records whose chembl_id is in `chembl_np_ids`.
# NOTE(review): absolute, machine-specific path -- the notebook cannot be
# re-run on another machine without editing it.
chembl_np = u.read_sdf(
    "/home/pahl/comas/notebooks/sdf/cpd_libs_2003/chembl_26.sdf.gz",
    merge_prop="chembl_id",
    merge_list=chembl_np_ids,
)

RDKit ERROR: [14:56:56] Unexpected error hit on line 39551099


SystemError: <Boost.Python.function object at 0x55f99114b270> returned a result with an error set

In [16]:
# Shape summary of the loaded records, followed by the first five rows.
lp(chembl_np, "ChEMBL NP")
chembl_np.head(n=5)

Shape ChEMBL NP                                   :      40525 /    3  [ Smiles, Name, chembl_id ]  


Unnamed: 0,Smiles,Name,chembl_id
0,CC(=O)O[C@H]1/C(C)=C\[C@@H]2OC(=O)[C@]3(C)O[C@...,CHEMBL503865,CHEMBL503865
1,CO[C@H]1C[C@H](COC[C@H]2[C@@H](OC)C[C@H](O[C@H...,CHEMBL503873,CHEMBL503873
2,COc1ccc2c(c1OC)C(CC1(C)C=Cc3c(c4cccc(OC)c4n(C)...,CHEMBL504077,CHEMBL504077
3,CC(C)=C[C@H]1C[C@](C)(O)[C@@H]2[C@H]3CC[C@@H]4...,CHEMBL501665,CHEMBL501665
4,CCCCC[C@H]1CCCCCCCCCC(=O)O[C@@H]2[C@@H](O[C@@H...,CHEMBL501672,CHEMBL501672


### Initial Standardization

Standardization implemented in MolVS:
* standardization
* remove stereo information
* choose largest fragment

In [17]:
# Takes 3min
# Standardize every structure (stereo information removed) into a temporary
# "Smiles_Std" column, then let that column take over the "Smiles" name.
chembl_np = (
    u.apply_to_smiles(
        chembl_np,
        "Smiles",
        {"Smiles_Std": lambda entry: u.standardize_mol(entry, remove_stereo=True)},
    )
    .drop(columns="Smiles")
    .rename(columns={"Smiles_Std": "Smiles"})
)

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40525.0), HTML(value='')))

RDKit ERROR: [17:31:36] ERROR: Explicit valence for atom # 0 N, 4, is greater than permitted
RDKit ERROR: [18:19:29] Explicit valence for atom # 26 B, 5, is greater than permitted





In [18]:
# Print the shape summary again; the summary line also flags columns that
# contain NaN values (presumably structures that failed standardization).
lp(chembl_np, "ChEMBL NP")

Shape ChEMBL NP                                   :      40525 /    3  [ Name, chembl_id, Smiles ]  ( NAN values in 1 col(s) )


### Filters (MedChem, Duplicates)

The data set is filtered for Isotopes, MedChemAtoms, MinHeavyAtoms and Duplicates.  
The deduplication is performed here as well to remove as many compounds as possible at early stages of the processing.  
The filter for MaxHeavyAtoms is deferred until after the deglycosylation.

In [19]:
# Takes 2min
# Apply the listed filters to the "Smiles" column, then show the summary
# and the first ten remaining rows.
chembl_np = u.filter_mols(
    chembl_np,
    "Smiles",
    ["Isotopes", "MedChemAtoms", "MinHeavyAtoms", "Duplicates"],
)
lp(chembl_np, "ChEMBL NP (MedChem filters)")

chembl_np.head(n=10)

Applying filters (4)...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40525.0), HTML(value='')))


Applied filter Isotopes: 40518


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40518.0), HTML(value='')))


Applied filter MedChemAtoms: 40481


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40481.0), HTML(value='')))


Applied filter MinHeavyAtoms: 40478


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=40478.0), HTML(value='')))


Applied filter Duplicates: 36489
Shape ChEMBL NP (MedChem filters)                 :      36489 /    3  [ Name, chembl_id, Smiles ]  


Unnamed: 0,Name,chembl_id,Smiles
0,CHEMBL503865,CHEMBL503865,CC(=O)OC1C(C)=CC2OC(=O)C3(C)OC23C(OC(C)=O)C2C(...
1,CHEMBL503873,CHEMBL503873,COC1CC(COCC2C(C)OC(OC3CCC4(C)C(=CCC5(O)C4CC(OC...
2,CHEMBL504077,CHEMBL504077,COc1ccc2c(c1OC)C(CC1(C)C=Cc3c(c4cccc(OC)c4n(C)...
3,CHEMBL501665,CHEMBL501665,CC(C)=CC1CC(C)(O)C2C3CCC4C5(C)CCC(OC6OCC(O)C(O...
4,CHEMBL501672,CHEMBL501672,CCCCCC1CCCCCCCCCC(=O)OC2C(OC3OC(C)C(OC(=O)C(C)...
5,CHEMBL501674,CHEMBL501674,COC1C(OCC2C(C)OC(=O)C=CC(C)C(OC3OC(C)CC(O)C3O)...
6,CHEMBL501943,CHEMBL501943,COc1cc(C(O)C(COC(=O)C=Cc2ccc(O)cc2)Oc2c(OC)cc(...
7,CHEMBL501944,CHEMBL501944,COc1cc(O)c(-c2cc(-c3cc(-c4c(O)cc(O)c5c4CC(C)NC...
8,CHEMBL501918,CHEMBL501918,CC1COC2(CC1O)OC1CC3C4CC=C5CC(O)CC(OC6OCC(O)C(O...
9,CHEMBL501922,CHEMBL501922,COC(=O)CC1C(C)(C)C(OC(C)=O)C2C(=O)C1(C)C1CCC3(...


In [20]:
# Write the filtered set as a tab-separated file without the index column
# (presumably the input of the external deglycosylation step).
chembl_np.to_csv(
    "../tmp/chembl_np.tsv",
    sep="\t",
    index=False,
)

### Deglycosylation

Deglycosylation is performed using the KNIME Sugar Remover node (CDK). KNIME v 4.2.3.  
Here, the result of the deglycosylation is loaded again.

In [None]:
# Load the tab-separated result of the deglycosylation step and summarize it.
chembl_np = pd.read_csv(
    "../tmp/chembl_np_deglyco.tsv",
    sep="\t",
)
lp(chembl_np, "ChEMBL NP (Deglyco)")

### Standardization

Cleanup after deglycosylation:

* standardization
* choose largest fragment


In [21]:
# Takes 1:20min
# Standardize the structures into a temporary "Smiles_Std" column and let it
# replace the original "Smiles" column in the new frame `chembl_np_std`.
# The column swap is done without `inplace=True`: the frame returned by
# `u.apply_to_smiles` is never modified in place, so `chembl_np` keeps its
# "Smiles" column even if the helper should hand back its input object
# rather than a copy, and the cell can be re-run safely.
chembl_np_std = (
    u.apply_to_smiles(chembl_np, "Smiles", {"Smiles_Std": u.standardize_mol})
    .drop(columns="Smiles")
    .rename(columns={"Smiles_Std": "Smiles"})
)
lp(chembl_np_std, "ChEMBL NP (Deglyco)")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=36489.0), HTML(value='')))


Shape ChEMBL NP (Deglyco)                         :      36489 /    3  [ Name, chembl_id, Smiles ]  


In [22]:
# Size filters (the maximum heavy atom filter was deferred until now) plus
# another deduplication pass on the cleaned-up structures.
chembl_np_std = u.filter_mols(
    chembl_np_std,
    "Smiles",
    ["MinHeavyAtoms", "MaxHeavyAtoms", "Duplicates"],
)
lp(chembl_np_std, "ChEMBL NP (Filters)")

Applying filters (3)...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=36489.0), HTML(value='')))


Applied filter MinHeavyAtoms: 36489
Applied filter MaxHeavyAtoms: 35172


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=35172.0), HTML(value='')))


Applied filter Duplicates: 35168
Shape ChEMBL NP (Filters)                         :      35168 /    3  [ Name, chembl_id, Smiles ]  


### Canonical Tautomer

In [23]:
# Takes 28min
# Compute the canonical tautomer of every structure into a "Smiles_Std"
# column; `parallel=True` runs the helper in its parallel mode.
chembl_np_std_tauto = u.apply_to_smiles(
    chembl_np_std,
    "Smiles",
    {"Smiles_Std": lambda entry: u.standardize_mol(entry, canonicalize_tautomer=True)},
    parallel=True,
)

INFO: Pandarallel will run on 6 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=5862), Label(value='0 / 5862'))), …

In [24]:
# Replace the pre-tautomer "Smiles" column with the canonical-tautomer
# "Smiles_Std" column.
# The swap only happens while "Smiles_Std" is still present. Without this
# guard, running the cell a second time would drop the already-renamed
# "Smiles" column, and the following rename would silently do nothing
# (missing labels are ignored by default), leaving the frame without any
# structure column.
# The swap is also done without `inplace=True`, so the frame returned by
# `u.apply_to_smiles` is never modified in place.
if "Smiles_Std" in chembl_np_std_tauto.columns:
    chembl_np_std_tauto = (
        chembl_np_std_tauto
        .drop(columns="Smiles")
        .rename(columns={"Smiles_Std": "Smiles"})
    )

### Filter Duplicates

Remove duplicates that might have been formed by canonicalization.

In [25]:
# Deduplicate once more on the canonical-tautomer Smiles, then show the
# summary and the first five rows.
chembl_np_std_tauto = u.filter_mols(
    chembl_np_std_tauto,
    "Smiles",
    "Duplicates",
)
lp(chembl_np_std_tauto, "ChEMBL NP (Filter duplicates)")

chembl_np_std_tauto.head(n=5)

Applying filters (1)...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=35168.0), HTML(value='')))


Applied filter Duplicates: 35026
Shape ChEMBL NP (Filter duplicates)               :      35026 /    3  [ Name, chembl_id, Smiles ]  


Unnamed: 0,Name,chembl_id,Smiles
0,CHEMBL503865,CHEMBL503865,CC(=O)OC1C(C)=CC2OC(=O)C3(C)OC23C(OC(C)=O)C2C(...
2,CHEMBL504077,CHEMBL504077,COc1ccc2c(c1OC)C(CC1(C)C=Cc3c(c4cccc(OC)c4n(C)...
3,CHEMBL501665,CHEMBL501665,CC(C)=CC1CC(C)(O)C2C3CCC4C5(C)CCC(OC6OCC(O)C(O...
4,CHEMBL501672,CHEMBL501672,CCCCCC1CCCCCCCCCC(=O)OC2C(OC3OC(C)C(OC(=O)C(C)...
5,CHEMBL501674,CHEMBL501674,COC1C(OCC2C(C)OC(=O)C=CC(C)C(OC3OC(C)CC(O)C3O)...


### Write Data Set

In [26]:
# Write the final ChEMBL NP set as a tab-separated file without the index column.
chembl_np_std_tauto.to_csv(
    "../tmp/chembl_np_std.tsv",
    sep="\t",
    index=False,
)

### Create Subset

In [12]:
# Re-load the written data set and draw a reproducible random sample of
# 5000 rows (fixed seed), which is written next to the full set.
chembl_np = pd.read_csv(
    "../tmp/chembl_np_std.tsv",
    sep="\t",
)
chembl_np_subset = chembl_np.sample(
    n=5000,
    random_state=0xc0ffee,
)
chembl_np_subset.to_csv(
    "../tmp/chembl_np_std_subset.tsv",
    sep="\t",
    index=False,
)
lp(chembl_np_subset, "ChEMBL NP subset")

Shape ChEMBL NP subset                            :       5000 /    3  [ Name, chembl_id, Smiles ]  


## DrugBank

Approved and experimental drugs, downloaded from DrugBank on 25-Mar-2020, v 5.1.5.

* 8479 cpds.
* 7472 cpds. after standardization and deduplication

### Loading

Load both SD files of approved and experimental drugs.

In [3]:
# Read the SD files of approved and experimental drugs in one call and keep
# only the DrugBank ID and the Smiles column.
# NOTE(review): absolute, machine-specific paths -- the notebook cannot be
# re-run on another machine without editing them.
drugbank = u.read_sdf(
    [
        "/home/pahl/comas/notebooks/sdf/drugbank/drugbank_5.1.5_appr.sdf",
        "/home/pahl/comas/notebooks/sdf/drugbank/drugbank_5.1.5_exp.sdf",
    ]
)
drugbank = drugbank[["DATABASE_ID", "Smiles"]]
lp(drugbank, "DrugBank")

drugbank.head(n=5)

RDKit ERROR: [21:16:13] ERROR: Explicit valence for atom # 6 K, 2, is greater than permitted
RDKit ERROR: [21:17:24] Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [21:17:24] ERROR: Could not sanitize molecule ending on line 19217
RDKit ERROR: [21:17:24] ERROR: Explicit valence for atom # 7 N, 4, is greater than permitted
RDKit ERROR: [21:17:25] Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [21:17:25] ERROR: Could not sanitize molecule ending on line 318021
RDKit ERROR: [21:17:25] ERROR: Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [21:17:26] Explicit valence for atom # 14 Cl, 2, is greater than permitted
RDKit ERROR: [21:17:26] ERROR: Could not sanitize molecule ending on line 405070
RDKit ERROR: [21:17:26] ERROR: Explicit valence for atom # 14 Cl, 2, is greater than permitted
RDKit ERROR: [21:17:26] Explicit valence for atom # 13 Cl, 5, is greater than permitted
RDKit ERROR: [21:17:26] ERROR: Cou

{'In': 8479, 'Out': 8462, 'Fail_NoMol': 17}
Shape DrugBank                                    :       8462 /    2  [ DATABASE_ID, Smiles ]  


RDKit ERROR: [21:17:29] ERROR: Explicit valence for atom # 10 N, 4, is greater than permitted
RDKit ERROR: [21:17:30] Explicit valence for atom # 14 Cl, 2, is greater than permitted
RDKit ERROR: [21:17:30] ERROR: Could not sanitize molecule ending on line 925357
RDKit ERROR: [21:17:30] ERROR: Explicit valence for atom # 14 Cl, 2, is greater than permitted
RDKit ERROR: [21:17:30] Explicit valence for atom # 6 K, 2, is greater than permitted
RDKit ERROR: [21:17:30] ERROR: Could not sanitize molecule ending on line 930170


Unnamed: 0,DATABASE_ID,Smiles
0,DB00006,CC[C@H](C)[C@H](NC(=O)[C@H](CCC(=O)O)NC(=O)[C@...
1,DB00007,CCNC(=O)[C@@H]1CCCN1C(=O)[C@H](CCCNC(=N)N)NC(=...
2,DB00014,CC(C)C[C@H](NC(=O)[C@@H](COC(C)(C)C)NC(=O)[C@H...
3,DB00027,CC(C)C[C@@H](NC(=O)CNC(=O)[C@@H](NC=O)C(C)C)C(...
4,DB00035,N=C(N)NCCC[C@@H](NC(=O)[C@@H]1CCCN1C(=O)[C@@H]...


### Standardization

Standardization implemented in MolVS:
* standardization
* remove stereo information
* choose largest fragment
* canonicalize tautomer

In [4]:
# Standardize every structure (stereo information removed, canonical
# tautomer) into a temporary "Smiles_Std" column, then let that column take
# over the "Smiles" name.
drugbank = (
    u.apply_to_smiles(
        drugbank,
        "Smiles",
        {
            "Smiles_Std": lambda entry: u.standardize_mol(
                entry, remove_stereo=True, canonicalize_tautomer=True
            )
        },
    )
    .drop(columns="Smiles")
    .rename(columns={"Smiles_Std": "Smiles"})
)
lp(drugbank, "DrugBank")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8462.0), HTML(value='')))

RDKit ERROR: [21:17:30] ERROR: Explicit valence for atom # 6 K, 2, is greater than permitted
RDKit ERROR: [21:26:09] Can't kekulize mol.  Unkekulized atoms: 3 10
RDKit ERROR: 
RDKit ERROR: [21:26:09] Can't kekulize mol.  Unkekulized atoms: 3 10
RDKit ERROR: 
RDKit ERROR: [21:28:07] Explicit valence for atom # 6 B, 5, is greater than permitted
RDKit ERROR: [21:28:56] Explicit valence for atom # 13 B, 5, is greater than permitted
RDKit ERROR: [21:29:07] Explicit valence for atom # 16 B, 5, is greater than permitted
RDKit ERROR: [21:30:01] Explicit valence for atom # 16 B, 5, is greater than permitted
RDKit ERROR: [21:30:27] Explicit valence for atom # 13 B, 5, is greater than permitted
RDKit ERROR: [21:30:39] Explicit valence for atom # 6 B, 5, is greater than permitted
RDKit ERROR: [21:31:56] Explicit valence for atom # 6 B, 5, is greater than permitted
RDKit ERROR: [21:34:05] Explicit valence for atom # 16 B, 5, is greater than permitted
RDKit ERROR: [21:34:05] Explicit valence for ato


Shape DrugBank                                    :       8462 /    2  [ DATABASE_ID, Smiles ]  ( NAN values in 1 col(s) )


### Filters (MedChem, Duplicates)

The data set is filtered for Isotopes, MedChemAtoms, MinHeavyAtoms, MaxHeavyAtoms and Duplicates.  

In [5]:
# Apply the listed filters to the "Smiles" column, then show the summary
# and the first ten remaining rows.
drugbank = u.filter_mols(
    drugbank,
    "Smiles",
    ["Isotopes", "MedChemAtoms", "MinHeavyAtoms", "MaxHeavyAtoms", "Duplicates"],
)
lp(drugbank, "DrugBank (MedChem filters)")

drugbank.head(n=10)

Applying filters (5)...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8462.0), HTML(value='')))


Applied filter Isotopes: 8404


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8404.0), HTML(value='')))


Applied filter MedChemAtoms: 8187


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8187.0), HTML(value='')))


Applied filter MinHeavyAtoms: 8142
Applied filter MaxHeavyAtoms: 8058


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=8058.0), HTML(value='')))


Applied filter Duplicates: 7472
Shape DrugBank (MedChem filters)                  :       7472 /    2  [ DATABASE_ID, Smiles ]  


Unnamed: 0,DATABASE_ID,Smiles
4,DB00035,N=C(N)NCCCC(NC(=O)C1CCCN1C(=O)C1CSSCCC(=O)NC(C...
8,DB00104,CC(O)C(CO)NC(=O)C1CSSCC(NC(=O)C(N)Cc2ccccc2)C(...
10,DB00114,Cc1ncc(CO[PH](=O)(=O)O)c(C=O)c1O
12,DB00118,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(=N)[nH]cnc...
13,DB00119,CC(=O)C(=O)O
14,DB00120,NC(Cc1ccccc1)C(=O)O
15,DB00121,O=C(O)CCCCC1SCC2NC(=O)NC21
16,DB00122,C[N+](C)(C)CCO
17,DB00123,NCCCCC(N)C(=O)O
18,DB00126,OCC(O)c1oc(O)c(O)c1O


### Write Data Set

In [8]:
# Write the final DrugBank set as a tab-separated file without the index
# column, then show the summary and the first five rows.
drugbank.to_csv(
    "../tmp/drugbank_std.tsv",
    sep="\t",
    index=False,
)
lp(drugbank, "DrugBank (MedChem filters)")

drugbank.head(n=5)

Shape DrugBank (MedChem filters)                  :       7472 /    2  [ DATABASE_ID, Smiles ]  


Unnamed: 0,DATABASE_ID,Smiles
4,DB00035,N=C(N)NCCCC(NC(=O)C1CCCN1C(=O)C1CSSCCC(=O)NC(C...
8,DB00104,CC(O)C(CO)NC(=O)C1CSSCC(NC(=O)C(N)Cc2ccccc2)C(...
10,DB00114,Cc1ncc(CO[PH](=O)(=O)O)c(C=O)c1O
12,DB00118,C[S+](CCC(N)C(=O)[O-])CC1OC(n2cnc3c(=N)[nH]cnc...
13,DB00119,CC(=O)C(=O)O


### Create Subset

In [13]:
# Re-load the written data set and draw a reproducible random sample of
# 5000 rows (fixed seed), which is written next to the full set.
drugbank = pd.read_csv(
    "../tmp/drugbank_std.tsv",
    sep="\t",
)
drugbank_subset = drugbank.sample(
    n=5000,
    random_state=0xc0ffee,
)
drugbank_subset.to_csv(
    "../tmp/drugbank_std_subset.tsv",
    sep="\t",
    index=False,
)
lp(drugbank_subset, "Drugbank subset")

Shape Drugbank subset                             :       5000 /    2  [ DATABASE_ID, Smiles ]  


## Internal Compounds

Internal compound collection, provided as a tab-separated list with compound IDs, compound classes and Smiles (`../Input Data/internal_cpds.tsv`).

* 244 individual cpds.
* 168 cpds. after standardization and deduplication, based on InChIKeys after removal of stereo centers
    (for consistency with the preparation of the other data sets)

### Loading

Load the list of 244 compounds with their Smiles.

In [6]:
# Read the tab-separated compound list, then show the summary and the
# first five rows.
cpds = pd.read_csv(
    "../Input Data/internal_cpds.tsv",
    sep="\t",
)
lp(cpds, "Internal compounds")

cpds.head(n=5)

Shape Internal compounds                          :        244 /    3  [ Compound_Id, CpdClass, Smiles ]  


Unnamed: 0,Compound_Id,CpdClass,Smiles
0,408580,QN-C-S,COc1ccc2nccc([C@@H](O)[C@@H]3C[C@@H]4CCN3C[C@]...
1,408614,QN-C-S,COc1ccc2nccc([C@@H](O)[C@@H]3C[C@@H]4CCN3C[C@]...
2,409679,QN-C-S,COc1ccc2nccc([C@@H](O)[C@@H]3C[C@@H]4CCN3C[C@]...
3,408605,QN-C-S,COc1ccc2nccc([C@@H](O)[C@@H]3C[C@@H]4CCN3C[C@]...
4,408610,QN-C-S,COc1ccc2c(c1)O[C@]1(CC2=O)CN2CC[C@H]1C[C@H]2[C...


### Standardization

Standardization implemented in MolVS:
* standardization
* remove stereo information
* choose largest fragment
* canonicalize tautomer

In [7]:
# Standardize every structure (stereo information removed, canonical
# tautomer) into a temporary "Smiles_Std" column, then let that column take
# over the "Smiles" name.
cpds = (
    u.apply_to_smiles(
        cpds,
        "Smiles",
        {
            "Smiles_Std": lambda entry: u.standardize_mol(
                entry, remove_stereo=True, canonicalize_tautomer=True
            )
        },
    )
    .drop(columns="Smiles")
    .rename(columns={"Smiles_Std": "Smiles"})
)
lp(cpds, "Internal compounds")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=244.0), HTML(value='')))


Shape Internal compounds                          :        244 /    3  [ Compound_Id, CpdClass, Smiles ]  


### Filters (MedChem, Duplicates)

The data set is filtered for Isotopes, MedChemAtoms, MinHeavyAtoms, MaxHeavyAtoms and Duplicates.  

In [8]:
# Apply the listed filters to the "Smiles" column, then show the summary
# and the first ten remaining rows.
cpds = u.filter_mols(
    cpds,
    "Smiles",
    ["Isotopes", "MedChemAtoms", "MinHeavyAtoms", "MaxHeavyAtoms", "Duplicates"],
)
lp(cpds, "Internal compounds (MedChem filters)")

cpds.head(n=10)

Applying filters (5)...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=244.0), HTML(value='')))


Applied filter Isotopes: 244


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=244.0), HTML(value='')))


Applied filter MedChemAtoms: 244


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=244.0), HTML(value='')))


Applied filter MinHeavyAtoms: 244
Applied filter MaxHeavyAtoms: 244


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=244.0), HTML(value='')))


Applied filter Duplicates: 168
Shape Internal compounds (MedChem filters)        :        168 /    3  [ Compound_Id, CpdClass, Smiles ]  


Unnamed: 0,Compound_Id,CpdClass,Smiles
0,408580,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4ccccc4O3...
1,408614,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4cc(Br)cc...
2,409679,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4cc(Cl)cc...
3,408605,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4cc(F)ccc...
4,408610,QN-C-S,COc1ccc2c(c1)OC1(CC2=O)CN2CCC1CC2C(O)c1ccnc2cc...
5,408643,QN-C-S,COc1cc(OC)c2c(c1)OC1(CC2=O)CN2CCC1CC2C(O)c1ccn...
6,408636,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4ccc(OC)c...
7,408584,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4c(O)cccc...
8,408586,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4ccc(O)cc...
9,408612,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4cc(C)ccc...


In [9]:
# Write the final internal compound set as a tab-separated file without the
# index column, then show the summary and the first five rows.
cpds.to_csv(
    "../tmp/internal_cpds_std.tsv",
    sep="\t",
    index=False,
)
lp(cpds, "Internal compounds (MedChem filters)")

cpds.head(n=5)

Shape Internal compounds (MedChem filters)        :        168 /    3  [ Compound_Id, CpdClass, Smiles ]  


Unnamed: 0,Compound_Id,CpdClass,Smiles
0,408580,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4ccccc4O3...
1,408614,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4cc(Br)cc...
2,409679,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4cc(Cl)cc...
3,408605,QN-C-S,COc1ccc2nccc(C(O)C3CC4CCN3CC43CC(=O)c4cc(F)ccc...
4,408610,QN-C-S,COc1ccc2c(c1)OC1(CC2=O)CN2CCC1CC2C(O)c1ccnc2cc...
