### Libraries

In [24]:
!pip install joblib==1.2.0
!pip install MolVS==0.1.1
!pip install numpy==1.23.5
!pip install pandas==1.5.2
!pip install Pillow==9.3.0
!pip install pynndescent==0.5.8
!pip install python-dateutil==2.8.2
!pip install pytz==2022.7
!pip install rdkit==2022.9.3
!pip install scikit-learn==1.2.0
!pip install scipy==1.9.3
!pip install six==1.16.0
!pip install threadpoolctl==3.1.0
!pip install tqdm==4.64.1
!pip install umap-learn==0.5.3



In [35]:
from rdkit import RDLogger
from rdkit import Chem
from rdkit.Chem import SaltRemover
from molvs import Standardizer
from rdkit.Chem import AllChem
from rdkit import DataStructs

from time import time
import numpy as np
import pandas as pd
import umap

  @numba.jit()
  @numba.jit()
  @numba.jit()
  @numba.jit()


In [26]:
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score, silhouette_score

### Data preprocessing

In [27]:
# Load the ESOL dataset, keep only the SMILES column and drop repeated
# structures so that every row is a unique molecule with a 0..n-1 index.
molecules = (
    pd.read_csv("ESOL_delaney-processed.csv", sep=",")[["smiles"]]
    .drop_duplicates(subset=['smiles'], ignore_index=True)
)
molecules

Unnamed: 0,smiles
0,N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)C...
1,CC1:O:C:C:C:1C(=O)NC1:C:C:C:C:C:1
2,CC(C)=CCCC(C)=CC=O
3,C1:C:C:C2:C(:C:1):C:C:C1:C:2:C:C:C2:C3:C:C:C:C...
4,C1:C:C:S:C:1
...,...
1112,FC(F)(F)C(Cl)Br
1113,CNC(=O)ON=C(SC)C(=O)N(C)C
1114,CCSCCSP(=S)(OC)OC
1115,CCC(C)C


In [28]:
# Build a 1024-bit Morgan fingerprint (radius 2) for every unique SMILES.
#   mol  -> RDKit bit vectors
#   mol1 -> the same bits as plain Python lists
n = molecules.shape[0]
mol = []
mol1 = []

# Both helpers are loop-invariant, so create them once instead of per molecule.
remover = SaltRemover.SaltRemover()  # remove salt
s = Standardizer()  # standardize molecule

for k, smi in enumerate(molecules['smiles']):
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # readable message instead of an opaque error inside StripMol.
        raise ValueError(f"RDKit could not parse SMILES at row {k}: {smi!r}")

    m = remover.StripMol(m)
    m = s.standardize(m)

    # Compute the fingerprint once and store it in both representations.
    fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024)
    mol.append(fp)
    mol1.append(list(fp))


In [29]:
# For each molecule, record the highest Tanimoto similarity to any molecule
# that appears earlier in the list. The first molecule has no predecessor and
# is assigned a similarity of 1.
# NOTE(review): the score only looks at earlier rows, so it depends on row
# order — confirm this is intended.
n = molecules.shape[0]
fps = mol

nearest_earlier = [
    np.max(np.array(DataStructs.BulkTanimotoSimilarity(fps[i], fps[:i])))
    for i in range(1, n)
]
simis = [1] + nearest_earlier
simis

[1,
 0.12307692307692308,
 0.06382978723404255,
 0.13157894736842105,
 0.1,
 0.17857142857142858,
 0.15384615384615385,
 0.12244897959183673,
 0.10714285714285714,
 0.12048192771084337,
 0.06153846153846154,
 0.34782608695652173,
 0.13793103448275862,
 0.17142857142857143,
 0.25,
 0.08333333333333333,
 0.2,
 0.21739130434782608,
 0.11904761904761904,
 0.47058823529411764,
 0.4074074074074074,
 0.18421052631578946,
 0.2222222222222222,
 0.2558139534883721,
 0.10714285714285714,
 0.6153846153846154,
 0.17777777777777778,
 0.25,
 0.25925925925925924,
 0.4,
 0.2,
 0.6071428571428571,
 0.2653061224489796,
 0.15,
 0.16279069767441862,
 0.375,
 0.3,
 0.2727272727272727,
 0.2553191489361702,
 0.6153846153846154,
 0.4,
 0.3684210526315789,
 0.3333333333333333,
 0.15151515151515152,
 0.2,
 0.9411764705882353,
 0.5,
 0.17142857142857143,
 0.20689655172413793,
 0.1935483870967742,
 0.5,
 0.3333333333333333,
 1.0,
 0.1836734693877551,
 0.44,
 0.32142857142857145,
 0.2,
 0.42105263157894735,
 0.6363

In [30]:
# Start from a fresh copy of the SMILES table: `df_tanimoto` was not defined
# anywhere before this cell, so the notebook failed on a fresh kernel.
df_tanimoto = molecules[['smiles']].copy()
df_tanimoto['Tanimoto similarity'] = simis

# A molecule is flagged as an outlier when its similarity score is <= 0.5.
df_tanimoto['outlier'] = df_tanimoto['Tanimoto similarity'] <= 0.5

# Take an explicit copy so later column assignments on `df_nonoutliers` do not
# raise SettingWithCopyWarning.
df_nonoutliers = df_tanimoto[~df_tanimoto['outlier']].copy()

df_nonoutliers

Unnamed: 0,smiles,Tanimoto similarity,outlier
0,N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)C...,1.000000,False
25,CCCCCCCO,0.615385,False
31,CCOP(=S)(OCC)SCSC(C)(C)C,0.607143,False
39,CCCCCCCI,0.615385,False
45,CCCCCC(C)O,0.941176,False
...,...,...,...
1107,CC(=O)OCC(C)C,0.565217,False
1109,ClC1:C:C:C(C2:C(Cl):C:C(Cl):C(Cl):C:2Cl):C(Cl)...,0.590909,False
1111,CC1:C:C:C:C([N+](=O)[O-]):C:1,0.695652,False
1114,CCSCCSP(=S)(OC)OC,0.666667,False


In [31]:
# Renumber the surviving rows 0..n-1 so they can be addressed by position.
n = df_nonoutliers.shape[0]
new_index = list(range(n))

df_nonoutliers.index = new_index
df_nonoutliers

Unnamed: 0,smiles,Tanimoto similarity,outlier
0,N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)C...,1.000000,False
1,CCCCCCCO,0.615385,False
2,CCOP(=S)(OCC)SCSC(C)(C)C,0.607143,False
3,CCCCCCCI,0.615385,False
4,CCCCCC(C)O,0.941176,False
...,...,...,...
495,CC(=O)OCC(C)C,0.565217,False
496,ClC1:C:C:C(C2:C(Cl):C:C(Cl):C(Cl):C:2Cl):C(Cl)...,0.590909,False
497,CC1:C:C:C:C([N+](=O)[O-]):C:1,0.695652,False
498,CCSCCSP(=S)(OC)OC,0.666667,False


In [32]:
# Build a 1024-bit Morgan fingerprint (radius 2) for every non-outlier SMILES.
#   nonoutliers  -> RDKit bit vectors
#   nonoutliers1 -> the same bits as plain Python lists
n = df_nonoutliers.shape[0]
nonoutliers = []
nonoutliers1 = []

# Both helpers are loop-invariant, so create them once instead of per molecule.
remover = SaltRemover.SaltRemover()  # remove salt
s = Standardizer()  # standardize molecule

# Iterate by position so the loop does not depend on the index labels.
for k, smi in enumerate(df_nonoutliers['smiles']):
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # MolFromSmiles signals a parse failure by returning None; fail with a
        # readable message instead of an opaque error inside StripMol.
        raise ValueError(f"RDKit could not parse SMILES at row {k}: {smi!r}")

    m = remover.StripMol(m)
    m = s.standardize(m)

    # Compute the fingerprint once and store it in both representations.
    fp = AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=1024)
    nonoutliers.append(fp)
    nonoutliers1.append(list(fp))

### Clustering

In [33]:
# Input for the first run: the bit-list fingerprints of the non-outlier molecules.
sample = nonoutliers1

In [36]:
# Time the full pipeline: 2-D UMAP embedding (Jaccard metric on the
# fingerprint bits) followed by Ward agglomerative clustering into 7 groups.
t0 = time()

reducer = umap.UMAP(
    n_neighbors=100,
    min_dist=0.0,
    n_components=2,
    metric='jaccard',
    random_state=42,
)
x_red = reducer.fit_transform(sample)

clustering = AgglomerativeClustering(linkage='ward', n_clusters=7)
clustering.fit(x_red)

tf = time() - t0

  warn(


In [37]:
def assign_cluster_id(df_data, cluster_id):
    """Return a copy of ``df_data`` with a ``Cluster_ID`` column added.

    Parameters
    ----------
    df_data : pandas.DataFrame
        Table with one row per clustered sample.
    cluster_id : object
        Fitted clustering model; only its ``labels_`` attribute is read. It
        must provide one label per row of ``df_data``.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the original columns plus ``Cluster_ID``. The
        input frame is not modified, which avoids SettingWithCopyWarning when
        a filtered slice is passed in and keeps the caller's table unchanged.
    """
    return df_data.assign(Cluster_ID=cluster_id.labels_)

In [38]:
# Attach the labels of the fitted clustering model to the non-outlier table and display it.
df_clusters = assign_cluster_id(df_nonoutliers, clustering)
df_clusters

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_data['Cluster_ID'] = cluster_id.labels_


Unnamed: 0,smiles,Tanimoto similarity,outlier,Cluster_ID
0,N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)C...,1.000000,False,0
1,CCCCCCCO,0.615385,False,2
2,CCOP(=S)(OCC)SCSC(C)(C)C,0.607143,False,6
3,CCCCCCCI,0.615385,False,2
4,CCCCCC(C)O,0.941176,False,2
...,...,...,...,...
495,CC(=O)OCC(C)C,0.565217,False,6
496,ClC1:C:C:C(C2:C(Cl):C:C(Cl):C(Cl):C:2Cl):C(Cl)...,0.590909,False,3
497,CC1:C:C:C:C([N+](=O)[O-]):C:1,0.695652,False,5
498,CCSCCSP(=S)(OC)OC,0.666667,False,6


In [39]:
# Internal validation scores of the clustering, computed on the 2-D embedding:
# silhouette (s1), Calinski-Harabasz (c1) and Davies-Bouldin (d1).
s1, c1, d1 = (
    silhouette_score(x_red, clustering.labels_, metric='euclidean'),
    calinski_harabasz_score(x_red, clustering.labels_),
    davies_bouldin_score(x_red, clustering.labels_),
)

In [40]:
# One-row summary table for the run on the non-outlier molecules.
metrics_row = {
    'Time for UMAP & Clustering': tf,
    'Silhouette': s1,
    'CH score': c1,
    'DB score': d1,
    "Number of Clusters": 7,
    "Method": "UMAP & Agglomerative Clustering",
    "All Molecules": "Just non outliers molecules",
}
df_metrics = pd.DataFrame([metrics_row], columns=list(metrics_row))
df_metrics

Unnamed: 0,Time for UMAP & Clustering,Silhouette,CH score,DB score,Number of Clusters,Method,All Molecules
0,8.196545,0.503605,1322.894734,0.679772,7,UMAP & Agglomerative Clustering,Just non outliers molecules


In [41]:
# Second run: same UMAP + Ward pipeline, this time on the fingerprints of all
# molecules. `x_red`, `clustering` and `tf` are rebound to the new results.
sample1 = mol1
t0 = time()

reducer = umap.UMAP(
    n_neighbors=100,
    min_dist=0.0,
    n_components=2,
    metric='jaccard',
    random_state=42,
)
x_red = reducer.fit_transform(sample1)

clustering = AgglomerativeClustering(linkage='ward', n_clusters=7)
clustering.fit(x_red)

tf = time() - t0

  warn(


In [42]:
# Attach the labels of the fitted clustering model to the full molecule table and display it.
# Note: this rebinds `df_clusters`, replacing the non-outlier result assigned earlier.
df_clusters = assign_cluster_id(molecules, clustering)
df_clusters

Unnamed: 0,smiles,Cluster_ID
0,N#CC(OC1OC(COC2OC(CO)C(O)C(O)C2O)C(O)C(O)C1O)C...,0
1,CC1:O:C:C:C:1C(=O)NC1:C:C:C:C:C:1,4
2,CC(C)=CCCC(C)=CC=O,2
3,C1:C:C:C2:C(:C:1):C:C:C1:C:2:C:C:C2:C3:C:C:C:C...,3
4,C1:C:C:S:C:1,3
...,...,...
1112,FC(F)(F)C(Cl)Br,5
1113,CNC(=O)ON=C(SC)C(=O)N(C)C,2
1114,CCSCCSP(=S)(OC)OC,2
1115,CCC(C)C,5


In [43]:
# Internal validation scores of the clustering, computed on the 2-D embedding:
# silhouette (s1), Calinski-Harabasz (c1) and Davies-Bouldin (d1).
s1, c1, d1 = (
    silhouette_score(x_red, clustering.labels_, metric='euclidean'),
    calinski_harabasz_score(x_red, clustering.labels_),
    davies_bouldin_score(x_red, clustering.labels_),
)

In [44]:
# Metrics of the run on all molecules, added as a second row of the summary.
new_row = {
    'Time for UMAP & Clustering': tf,
    'Silhouette': s1,
    'CH score': c1,
    'DB score': d1,
    "Number of Clusters": 7,
    "Method": "UMAP & Agglomerative Clustering",
    "All Molecules": "Yes",
}
# DataFrame.append is deprecated (and removed in pandas 2.0); pd.concat is the
# supported way to add rows and yields the same two-row table.
df2 = pd.concat([df_metrics, pd.DataFrame([new_row])], ignore_index=True)
df2

  df2 = df_metrics.append(new_row, ignore_index=True)


Unnamed: 0,Time for UMAP & Clustering,Silhouette,CH score,DB score,Number of Clusters,Method,All Molecules
0,8.196545,0.503605,1322.894734,0.679772,7,UMAP & Agglomerative Clustering,Just non outliers molecules
1,7.235954,0.417009,1919.801801,0.802765,7,UMAP & Agglomerative Clustering,Yes
