### Libraries

In [1]:
!pip install joblib==1.2.0
!pip install MolVS==0.1.1
!pip install numpy==1.23.5
!pip install pandas==1.5.2
!pip install Pillow==9.3.0
!pip install pynndescent==0.5.8
!pip install python-dateutil==2.8.2
!pip install pytz==2022.7
!pip install rdkit==2022.9.3
!pip install scikit-learn==1.2.0
!pip install scipy==1.9.3
!pip install six==1.16.0
!pip install threadpoolctl==3.1.0
!pip install tqdm==4.64.1
!pip install umap-learn==0.5.3



In [2]:
from rdkit import RDLogger
from rdkit import Chem
from rdkit.Chem import SaltRemover
from molvs import Standardizer
from rdkit.Chem import AllChem
from rdkit import DataStructs

from time import time
import numpy as np
import pandas as pd
import umap

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [3]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

### Data preprocessing

In [4]:
molecules = pd.read_csv("compound-annotation.csv", sep=",")
molecules = molecules[["SMILES"]]
molecules = molecules.drop_duplicates(subset=['SMILES'], ignore_index=True)
molecules = molecules[molecules["SMILES"].notna()]

n = molecules.shape[0]
new_index = []
new_index = list(range(n))

molecules.index = new_index

molecules

Unnamed: 0,SMILES
0,CN(C)C(=O)CC1CC2(CCN(CC2)C(=O)N2CCCC2)Oc2ccccc12
1,Cc1c([nH]c2CC(CC(=O)c12)c1ccco1)C(=O)OCC1CCCO1
2,CNC(=O)CN1CCC11CCN(C1)C(=O)c1ccn(C)n1
3,Cn1cc(cn1)N1CCC2(CCN(C2)C(=O)c2ccncc2)C1=O
4,CC(C)CN1CC2CN(CC2C1)S(=O)(=O)c1ccccc1
...,...
47212,[Zn++].CCC(C)[C@H](N)C1=NCC(S1)C(=O)N[C@@H](CC...
47213,CCCNC(=O)[C@@H]1[C@@H](CO)[C@@H]2Cn3c(=O)c(ccc...
47214,C\C=C\c1ccc2[C@H]3N[C@H]([C@H](CO)[C@H]3Cn2c1=...
47215,CCN1[C@@H](C(=O)Nc2ccccc2)[C@H](CO)[C@H]2Cn3c(...


In [5]:
n = molecules.shape[0]
mol = []
mol1 = []



for k in range(n):
    m = Chem.MolFromSmiles(molecules['SMILES'][k])

    remover = SaltRemover.SaltRemover()  # remove salt
    m = remover.StripMol(m)
    s = Standardizer()  # standardize molecule
    m = s.standardize(m)

    mol.append(AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024))
    mol1.append(list(AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024)))


In [6]:
n = molecules.shape[0]
fps = mol
simis = []

for i in range(1, n):
    sims = DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])
    max_sims = np.max(np.array(sims))
    simis.append(max_sims)
simis.insert(0,1)

[1,
 0.17045454545454544,
 0.22666666666666666,
 0.3088235294117647,
 0.16417910447761194,
 0.15294117647058825,
 0.1686746987951807,
 0.20512820512820512,
 0.1917808219178082,
 0.2077922077922078,
 0.21518987341772153,
 0.5294117647058824,
 0.2857142857142857,
 0.19736842105263158,
 0.16071428571428573,
 0.1935483870967742,
 0.25,
 0.21951219512195122,
 0.26865671641791045,
 0.26582278481012656,
 0.28378378378378377,
 0.17857142857142858,
 0.24050632911392406,
 0.3157894736842105,
 0.20930232558139536,
 0.23529411764705882,
 0.3048780487804878,
 0.2222222222222222,
 0.24324324324324326,
 0.3,
 0.2571428571428571,
 0.24285714285714285,
 0.22093023255813954,
 0.20212765957446807,
 0.6507936507936508,
 0.28846153846153844,
 0.28205128205128205,
 0.22988505747126436,
 0.1917808219178082,
 0.2247191011235955,
 0.24,
 0.26229508196721313,
 0.41935483870967744,
 0.16666666666666666,
 0.49295774647887325,
 0.21428571428571427,
 0.26582278481012656,
 0.27710843373493976,
 0.33962264150943394,


In [10]:
df_tanimoto = molecules.copy()
df_tanimoto['Tanimoto similarity'] = simis
df_tanimoto['outlier'] = np.where(df_tanimoto['Tanimoto similarity'] <= 0.5, True, False)
df_nonoutliers = df_tanimoto[df_tanimoto.loc[:, 'outlier'] == False]

n = df_nonoutliers.shape[0]
new_index = []
new_index = list(range(n))

df_nonoutliers.index = new_index


df_nonoutliers

Unnamed: 0,SMILES,Tanimoto similarity,outlier
0,CN(C)C(=O)CC1CC2(CCN(CC2)C(=O)N2CCCC2)Oc2ccccc12,1.000000,False
1,C1CC(N(C1)C(=O)c1ccncc1)c1nnn2cc(ccc12)-c1ccccc1,0.529412,False
2,CN(C)C(=O)CC1CC2(CCN(CC2)C(=O)Nc2cccc(C)c2)Oc2...,0.650794,False
3,CN(C)C1COC2(C1)CCN(CC2)S(=O)(=O)C1CC1,0.609756,False
4,CNC(=O)CC1CC2(CCN(CC2)C(=O)COc2ccccc2)Oc2ccccc12,0.562500,False
...,...,...,...
35409,CCC(=O)N1[C@H]([C@H](CO)[C@H]2Cn3c(ccc(\C=C\C)...,1.000000,False
35410,CCCNC(=O)[C@@H]1[C@@H](CO)[C@@H]2Cn3c(=O)c(ccc...,0.704225,False
35411,C\C=C\c1ccc2[C@H]3N[C@H]([C@H](CO)[C@H]3Cn2c1=...,0.700000,False
35412,CCN1[C@@H](C(=O)Nc2ccccc2)[C@H](CO)[C@H]2Cn3c(...,1.000000,False


In [11]:
n = df_nonoutliers.shape[0]
nonoutliers = []
nonoutliers1 = []

for k in range(n):
    m = Chem.MolFromSmiles(df_nonoutliers['SMILES'][k])

    remover = SaltRemover.SaltRemover()  # remove salt
    m = remover.StripMol(m)
    s = Standardizer()  # standardize molecule
    m = s.standardize(m)

    nonoutliers.append(AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024))
    nonoutliers1.append(list(AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024)))

### Clustering

In [12]:
sample = nonoutliers1

In [13]:
t0 = time()
x_red = umap.UMAP(n_neighbors=100, min_dist=0.0, n_components=2, metric='jaccard',
                  random_state=42).fit_transform(sample)

clustering = AgglomerativeClustering(linkage='ward', n_clusters=7)
clustering.fit(x_red)
tf = time() - t0

  warn(


In [37]:
def assign_cluster_id(df_data, cluster_id):
    df_data['Cluster_ID'] = cluster_id.labels_
    return df_data

In [38]:
df_clusters = assign_cluster_id(df_nonoutliers, clustering)
df_clusters

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data['Cluster_ID'] = cluster_id.labels_


Unnamed: 0,smiles,Tanimoto similarity,outlier,Cluster_ID
0,N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)C...,1.000000,False,0
1,CCCCCCCO,0.615385,False,2
2,CCOP(=S)(OCC)SCSC(C)(C)C,0.607143,False,6
3,CCCCCCCI,0.615385,False,2
4,CCCCCC(C)O,0.941176,False,2
...,...,...,...,...
495,CC(=O)OCC(C)C,0.565217,False,6
496,ClC1:C:C:C(C2:C(Cl):C:C(Cl):C(Cl):C:2Cl):C(Cl)...,0.590909,False,3
497,CC1:C:C:C:C([N+](=O)[O-]):C:1,0.695652,False,5
498,CCSCCSP(=S)(OC)OC,0.666667,False,6


In [14]:
s1 = silhouette_score(x_red, clustering.labels_, metric='euclidean')
c1 = calinski_harabasz_score(x_red, clustering.labels_)
d1 = davies_bouldin_score(x_red, clustering.labels_)

In [15]:
df_metrics = pd.DataFrame(data=[[tf, s1, c1, d1, 7, "UMAP & Agglomerative Clustering", "Just non outliers molecules"]], columns=['Time for UMAP & Clustering', 'Silhouette', 'CH score', 'DB score', "Number of Clusters", "Method", "All Molecules"])
df_metrics

Unnamed: 0,Time for UMAP & Clustering,Silhouette,CH score,DB score,Number of Clusters,Method,All Molecules
0,2403.280006,0.506444,25371.072575,0.757673,7,UMAP & Agglomerative Clustering,Just non outliers molecules
