In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import tqdm
from rdkit.Chem import MolFromSmiles, Lipinski # NumHDonors, NumHAcceptors
from rdkit.Chem.rdMolDescriptors import CalcExactMolWt
from rdkit.Chem.Crippen import MolLogP

In [2]:
smi = "CC1(CCC(=C(C1)CN2CCN(CC2)C3=CC=C(C=C3)C(=O)NS(=O)(=O)C4=CC(=C(C=C4)NC(CCN5CCOCC5)CSC6=CC=CC=C6)S(=O)(=O)C(F)(F)F)C7=CC=C(C=C7)Cl)C"

In [3]:
mol = MolFromSmiles(smi)

In [4]:
print(Lipinski.NumHDonors(mol))
print(Lipinski.NumHAcceptors(mol))
print(CalcExactMolWt(mol))
print(MolLogP(mol))

2
11
973.29550982
8.833200000000005


### Lipinski's Rules (RO5)
- MW < 500 Da
- ClogP < 5
- H-bond donor < 5
- H-bond acceptor < 10

In [2]:
df = pd.read_csv("chembl_test_full.csv")

In [3]:
df

Unnamed: 0,smiles,jnk3,gsk3
0,c1cc2ccc3ncc(-c4ccc(-c5ccn[nH]5)cc4)cc3c2cn1,0.91,0.52
1,c1ccc2c(-c3ccncc3)c[nH]c2c1,0.64,0.81
2,c1ccc2cc(-c3n[nH]cc3-c3ccncc3)ccc2c1,0.74,0.72
3,c1cc(-c2nccs2)c2nc(Nc3ccc(-n4cnc(N5CCOCC5)n4)c...,0.98,0.52
4,c1ccc(-c2ccc3c(-c4ccnc(Nc5ccc6c(c5)OCCO6)n4)cn...,0.51,0.98
...,...,...,...
310,O=[N+]([O-])c1ccc(Nc2nccc(-c3cnn4ncccc34)n2)cc1,0.51,0.99
311,O=[N+]([O-])C=Cc1ccc2c(c1)OCO2,0.63,0.62
312,O=S(=O)(c1cccc2cnccc12)N1CCCNCC1,0.72,0.68
313,O=S(=O)(NCCNCC=Cc1ccc(Br)cc1)c1cccc2cnccc12,0.69,0.90


In [4]:
for i, smi in tqdm.tqdm(enumerate(df.loc[:,'smiles'].values), total=len(df)):
    
    mol = MolFromSmiles(smi)
    
    df.loc[i,'donor'] = Lipinski.NumHDonors(mol)
    df.loc[i,'acceptor'] = Lipinski.NumHAcceptors(mol)
    df.loc[i,'mwt'] = CalcExactMolWt(mol)
    df.loc[i,'logp'] = MolLogP(mol)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 315/315 [00:00<00:00, 1764.98it/s]


In [5]:
for i in tqdm.trange(len(df)):
    
    df.loc[i,'violation_Ro5'] = 0
    
    df.loc[i,'violation_Ro5'] += 0 if df.loc[i,'donor'] < 5 else 1
    df.loc[i,'violation_Ro5'] += 0 if df.loc[i,'acceptor'] < 10 else 1
    df.loc[i,'violation_Ro5'] += 0 if df.loc[i,'mwt'] < 500 else 1
    df.loc[i,'violation_Ro5'] += 1 if df.loc[i,'donor'] > 5 else 0

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 315/315 [00:00<00:00, 2920.57it/s]


In [6]:
df.loc[:,'violation_Ro5'].value_counts() / len(df)

0.0    0.815873
1.0    0.161905
2.0    0.019048
4.0    0.003175
Name: violation_Ro5, dtype: float64