#### AIM: represent HeCaToS molecules in a chemical spce to see if it possible to split them 

In [56]:
from flask import Flask, render_template
app = Flask( __name__ )

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem.Draw import MolDraw2DSVG
from rdkit.Chem.Draw import rdMolDraw2D
from rdkit.Chem import rdDepictor, Descriptors, AllChem, DataStructs
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
import pickle

from sqlalchemy import create_engine

PandasTools.RenderImagesInAllDataFrames()

from rdkit.Chem.Draw import IPythonConsole

import matplotlib as plt

In [2]:
def calc_fp_arr( mols ):
    fplist = []
    for mol in mols:
        arr = np.zeros( (1,) )
        fp = AllChem.GetMorganFingerprintAsBitVect( mol, 2 )
        DataStructs.ConvertToNumpyArray( fp, arr )
        fplist.append( arr )
    return np.asarray( fplist )

### Database connexion

In [4]:
# Personal login details in correct format for sqlalchemy

with open('/Users/nbosc/notebooks/alchemy_nbosc_login.txt', 'r') as f:
    engine = create_engine(f.read())

### Get HeCaToS molecules

In [5]:
df_cpd = pd.read_excel('/Users/nbosc/Documents/HeCaToS/170223_create_ChEMBL22_data_packages/170223_data_packages/data_tables/hecatos_all_versions_22.xlsx')

In [6]:
df_cpd

Unnamed: 0,PARENT_PREF_NAME,PARENT_MOLREGNO,PARENT_CHEMBL_ID,VERSION_MOLREGNO,VERSION_CHEMBL_ID,ACTIVE_FORM,HECATOS_CATEGORY
0,ACETAMINOPHEN,16450,CHEMBL112,16450,CHEMBL112,0,HEPATOTOXIC
1,AMIODARONE,27185,CHEMBL633,27185,CHEMBL633,0,CARDIOTOXIC
2,AMIODARONE,27185,CHEMBL633,630439,CHEMBL1083993,0,CARDIOTOXIC
3,AZATHIOPRINE,431141,CHEMBL1542,431141,CHEMBL1542,0,HEPATOTOXIC
4,AZATHIOPRINE,431141,CHEMBL1542,674351,CHEMBL1200400,0,HEPATOTOXIC
5,AZATHIOPRINE,431141,CHEMBL1542,675315,CHEMBL1201364,1,HEPATOTOXIC
6,CELECOXIB,18694,CHEMBL118,389859,CHEMBL395268,0,CARDIOTOXIC
7,CELECOXIB,18694,CHEMBL118,18694,CHEMBL118,0,CARDIOTOXIC
8,CELECOXIB,18694,CHEMBL118,1162670,CHEMBL1801594,0,CARDIOTOXIC
9,CLAVULANIC ACID,44876,CHEMBL777,3741,CHEMBL8290,0,HEPATOTOXIC


In [58]:
df_cpd.drop_duplicates(subset=['PARENT_PREF_NAME'])

(42, 7)

In [7]:
df_cpd.PARENT_MOLREGNO.nunique()

42

In [46]:
query = '''
SELECT
    a.molregno
    , canonical_smiles
    , mw_freebase
FROM
    chembl_23.compound_structures a JOIN chembl_23.compound_properties b ON a.molregno = b.molregno
WHERE
    a.molregno IN {}
'''
df_smiles = pd.read_sql(query.format(tuple(df_cpd.PARENT_MOLREGNO.unique())), engine)

#### Infliximab -> antibody = no structure

In [47]:
df_smiles.shape

(41, 3)

In [48]:
PandasTools.AddMoleculeColumnToFrame(df_smiles, smilesCol='canonical_smiles', molCol='mol')

In [49]:
df_smiles

Unnamed: 0,molregno,canonical_smiles,mw_freebase,mol
0,921,OC1=NC(=O)C(N1)(c2ccccc2)c3ccccc3,252.27,
1,1114,COc1cccc2C(=O)c3c(O)c4C[C@](O)(C[C@H](O[C@H]5C[C@H](N)[C@@H](O)[C@H](C)O5)c4c(O)c3C(=O)c12)C(=O)CO,543.52,
2,1280,CC(=O)Oc1ccccc1C(=O)O,180.16,
3,1615,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O,334.39,
4,2217,CCC1(C(=O)NC(=O)NC1=O)c2ccccc2,232.24,
5,8062,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](O)C[C@H]3OC[C@@]3(OC(=O)C)[C@H]2[C@H](OC(=O)c4ccccc4)[C@]5(O)C[C@H](OC(=O)[C@H](O)[C@@H](NC(=O)c6ccccc6)c7ccccc7)C(=C1C5(C)C)C,853.91,
6,8417,ClCCN(CCCl)P1(=O)NCCCO1,261.09,
7,11968,CN1C(=C(O)c2ccccc2S1(=O)=O)C(=O)Nc3ccccn3,331.35,
8,12506,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@H](C)[C@@H](O[C@@H]3O[C@H](C)C[C@@H]([C@H]3O)N(C)C)[C@](C)(O)C[C@@H](C)C(=O)[C@H](C)[C@@H](O)[C@]1(C)O,733.93,
9,15217,CCCC(CCC)C(=O)O,144.21,


#### So I want to run this function getsvgtext which does not do anything special, but it fails for some molecules with ValueError: Bad Conformer Id

In [None]:
def getsvgtext( mol ):
    d2d = rdMolDraw2D.MolDraw2DSVG(200,200)
    d2d.DrawMolecule( mol )
    d2d.FinishDrawing()
    svg = d2d.GetDrawingText()
    return svg.replace( "svg:","" )

In [75]:
[getsvgtext(m) for m in df_smiles['mol']]

ValueError: Bad Conformer Id

#### I can identifify the molecules using EmdedMolecule 

In [72]:
i=0
for m in df_smiles.mol:
    print(i, AllChem.EmbedMolecule(m))
    try:
        getsvgtext(m)
    except:
        print('error')
    i+=1

0 0
1 0
2 0
3 0
4 0
5 -1
error
6 0
7 0
8 -1
error
9 0
10 0
11 0
12 0
13 0
14 0
15 -1
error
16 0
17 0
18 0
19 -1
error
20 0
21 0
22 0
23 -1
error
24 0
25 0
26 0
27 0
28 0
29 -1
error
30 0
31 -1
error
32 0
33 0
34 0
35 0
36 0
37 -1
error
38 0
39 0
40 0


In [50]:
drugs = [mol for mol in df_smiles.mol]

In [51]:
drugfparr = calc_fp_arr( drugs )

In [52]:
#do PCA
pca = PCA( n_components=2 )
pca.fit( drugfparr ) 

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)

In [53]:
drugsX = pca.transform( drugfparr )

In [54]:
df_smiles['PCA1'] = [x[0] for x in drugsX]
df_smiles['PCA2'] = [x[1] for x in drugsX]


In [55]:
df_smiles

Unnamed: 0,molregno,canonical_smiles,mw_freebase,mol,PCA1,PCA2
0,921,OC1=NC(=O)C(N1)(c2ccccc2)c3ccccc3,252.27,,-0.739164,-0.43651
1,1114,COc1cccc2C(=O)c3c(O)c4C[C@](O)(C[C@H](O[C@H]5C[C@H](N)[C@@H](O)[C@H](C)O5)c4c(O)c3C(=O)c12)C(=O)CO,543.52,,6.314948,-1.58873
2,1280,CC(=O)Oc1ccccc1C(=O)O,180.16,,-0.598761,0.108931
3,1615,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O,334.39,,-0.592182,0.103713
4,2217,CCC1(C(=O)NC(=O)NC1=O)c2ccccc2,232.24,,-0.965809,-0.372099
5,8062,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](O)C[C@H]3OC[C@@]3(OC(=O)C)[C@H]2[C@H](OC(=O)c4ccccc4)[C@]5(O)C[C@H](OC(=O)[C@H](O)[C@@H](NC(=O)c6ccccc6)c7ccccc7)C(=C1C5(C)C)C,853.91,,1.369567,7.344075
6,8417,ClCCN(CCCl)P1(=O)NCCCO1,261.09,,-1.245524,-0.878174
7,11968,CN1C(=C(O)c2ccccc2S1(=O)=O)C(=O)Nc3ccccn3,331.35,,-0.895635,-0.36725
8,12506,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@H](C)[C@@H](O[C@@H]3O[C@H](C)C[C@@H]([C@H]3O)N(C)C)[C@](C)(O)C[C@@H](C)C(=O)[C@H](C)[C@@H](O)[C@]1(C)O,733.93,,1.587377,-0.152065
9,15217,CCCC(CCC)C(=O)O,144.21,,-1.29764,-0.460263


In [62]:
df_cpd.drop_duplicates(subset=['PARENT_PREF_NAME']).head()

Unnamed: 0,PARENT_PREF_NAME,PARENT_MOLREGNO,PARENT_CHEMBL_ID,VERSION_MOLREGNO,VERSION_CHEMBL_ID,ACTIVE_FORM,HECATOS_CATEGORY
0,ACETAMINOPHEN,16450,CHEMBL112,16450,CHEMBL112,0,HEPATOTOXIC
1,AMIODARONE,27185,CHEMBL633,27185,CHEMBL633,0,CARDIOTOXIC
3,AZATHIOPRINE,431141,CHEMBL1542,431141,CHEMBL1542,0,HEPATOTOXIC
6,CELECOXIB,18694,CHEMBL118,389859,CHEMBL395268,0,CARDIOTOXIC
9,CLAVULANIC ACID,44876,CHEMBL777,3741,CHEMBL8290,0,HEPATOTOXIC


In [64]:
df = df_smiles.merge(df_cpd.drop_duplicates(subset=['PARENT_PREF_NAME'])[['PARENT_PREF_NAME','PARENT_MOLREGNO','PARENT_CHEMBL_ID', 'HECATOS_CATEGORY']], left_on='molregno', right_on='PARENT_MOLREGNO')

In [65]:
df

Unnamed: 0,molregno,canonical_smiles,mw_freebase,mol,PCA1,PCA2,PARENT_PREF_NAME,PARENT_MOLREGNO,PARENT_CHEMBL_ID,HECATOS_CATEGORY
0,921,OC1=NC(=O)C(N1)(c2ccccc2)c3ccccc3,252.27,,-0.739164,-0.43651,PHENYTOIN,921,CHEMBL16,HEPATOTOXIC
1,1114,COc1cccc2C(=O)c3c(O)c4C[C@](O)(C[C@H](O[C@H]5C[C@H](N)[C@@H](O)[C@H](C)O5)c4c(O)c3C(=O)c12)C(=O)CO,543.52,,6.314948,-1.58873,EPIRUBICIN,1114,CHEMBL417,CARDIOTOXIC
2,1280,CC(=O)Oc1ccccc1C(=O)O,180.16,,-0.598761,0.108931,ASPIRIN,1280,CHEMBL25,NON-TOXIC
3,1615,CC1(C)S[C@@H]2[C@H](NC(=O)Cc3ccccc3)C(=O)N2[C@H]1C(=O)O,334.39,,-0.592182,0.103713,BENZYLPENICILLIN,1615,CHEMBL29,NON-TOXIC
4,2217,CCC1(C(=O)NC(=O)NC1=O)c2ccccc2,232.24,,-0.965809,-0.372099,PHENOBARBITAL,2217,CHEMBL40,HEPATOTOXIC
5,8062,CC(=O)O[C@H]1C(=O)[C@]2(C)[C@@H](O)C[C@H]3OC[C@@]3(OC(=O)C)[C@H]2[C@H](OC(=O)c4ccccc4)[C@]5(O)C[C@H](OC(=O)[C@H](O)[C@@H](NC(=O)c6ccccc6)c7ccccc7)C(=C1C5(C)C)C,853.91,,1.369567,7.344075,PACLITAXEL,8062,CHEMBL428647,CARDIOTOXIC
6,8417,ClCCN(CCCl)P1(=O)NCCCO1,261.09,,-1.245524,-0.878174,CYCLOPHOSPHAMIDE,8417,CHEMBL88,CARDIOTOXIC
7,11968,CN1C(=C(O)c2ccccc2S1(=O)=O)C(=O)Nc3ccccn3,331.35,,-0.895635,-0.36725,PIROXICAM,11968,CHEMBL527,HEPATOTOXIC
8,12506,CC[C@H]1OC(=O)[C@H](C)[C@@H](O[C@H]2C[C@@](C)(OC)[C@@H](O)[C@H](C)O2)[C@H](C)[C@@H](O[C@@H]3O[C@H](C)C[C@@H]([C@H]3O)N(C)C)[C@](C)(O)C[C@@H](C)C(=O)[C@H](C)[C@@H](O)[C@]1(C)O,733.93,,1.587377,-0.152065,ERYTHROMYCIN,12506,CHEMBL532,HEPATOTOXIC
9,15217,CCCC(CCC)C(=O)O,144.21,,-1.29764,-0.460263,VALPROIC ACID,15217,CHEMBL109,HEPATOTOXIC
