# PHASE 6: MOLECULAR SIMILARITY

"""

PHASE 6: Molecule similarity

Created on Tuesday June 20 2023 (v1)

Updated on Tuesday July 11 2023 (v2) - only clustering new drugs

Updated on Monday August 07 2023 (v3) - changed fingerprints from Morgan to MACCS


@author: Odifentse M Lehasa

Because only the new drugs were clustered in the previous phase, we cannot yet tell which cluster represents which drug class.
The next step, similarity scoring, is therefore needed to determine which cluster best represents which drug class.

The purpose of this notebook is to take the newly discovered molecules (in their clusters) and assess their similarity to the existing ACE inhibitors and ARBs.
This will allow us to see which clusters (from the previous step) are more ACE-like or ARB-like, based on their similarity scores.

"""

## STEP 0: Import libraries

In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs
import pandas as pd
from rdkit.Chem import MACCSkeys


## STEP 1: Import Data

### ACE Inhibitors - original

In [2]:
# Load the table of original ACE inhibitors (first CSV column is used as the index)
df_ace = pd.read_csv(
    '/Users/odilehasa/Hypertension/Final_Experiments/Data/ACE Inhibitors.csv',
    index_col=0,
)

# Parse every canonical SMILES string into an RDKit molecule object
ace_list = df_ace['Canonical SMILES']
ace = list(map(Chem.MolFromSmiles, ace_list))
ace


[<rdkit.Chem.rdchem.Mol at 0x7feb117e1640>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1340>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1e20>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1e80>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1a00>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1be0>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1ca0>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1ee0>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1fa0>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1f40>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1dc0>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1760>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1b20>,
 <rdkit.Chem.rdchem.Mol at 0x7feb117e1c40>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b9040>]

### ARBs - original

In [3]:
# Load the table of original ARBs (first CSV column is used as the index)
df_arbs = pd.read_csv(
    '/Users/odilehasa/Hypertension/Final_Experiments/Data/ARBs.csv',
    index_col=0,
)

# Parse every canonical SMILES string into an RDKit molecule object
arbs_list = df_arbs['Canonical SMILES']
arbs = list(map(Chem.MolFromSmiles, arbs_list))
arbs

[<rdkit.Chem.rdchem.Mol at 0x7feaf04b9220>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b9340>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b98e0>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b9940>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b94c0>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b9820>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b9760>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b99a0>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b9a00>,
 <rdkit.Chem.rdchem.Mol at 0x7feaf04b9640>]

### Clustered new molecules

In [4]:
# Load the new molecules together with the cluster label assigned to each one
# (the 'Clusters' column); the first CSV column is used as the index.
df_all = pd.read_csv(
    '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/5. Drug Clusters.csv',
    index_col=0,
)
df_all



Unnamed: 0,Canonical SMILES,Carboxyl Functional Group (No.),Sulfhydrl Functional Group (No.),Phosphinyl Functional Group (No.),Biphenyl Functional Group (No.),Benzimidazole Functional Group (No.),Tetrazol Functional Group (No.),Functional Count,Clusters
4,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1,0
6,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1,0
9,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,1,0,0,0,0,0,1,1
18,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1,0
20,NCCCC[C@@H](C(=O)N1[C@H](CO)C[C@H]2CCCC[C@@H]2...,0,0,0,0,0,1,1,0
...,...,...,...,...,...,...,...,...,...
5376,NCCCC[C@@H](C(=O)N1[C@H](C2CCCCC2)C[C@@H]2CCC[...,1,0,0,0,0,0,1,1
5387,C[C@H](CS)C(=O)N[C@@H](CCCCN)C(=O)N1[C@H](c2no...,0,1,0,0,0,0,1,1
5396,CCCCC(=O)O[C@@H](CCCCN)C(=O)N1[C@H](c2noc(=O)[...,1,0,0,0,0,0,1,1
5397,CCC(=O)O[C@@H](CCCCN)C(=O)N1[C@H](c2noc(=O)[nH...,1,0,0,0,0,0,1,1


## STEP 2: Create dataframes for each cluster and its respective molecules

In [6]:

# For each cluster label: keep the matching rows, pull out their SMILES
# column, and parse every SMILES string into an RDKit molecule.

# CLUSTER 0
df_clust0 = df_all.loc[df_all['Clusters'] == 0]
clust0_list = df_clust0['Canonical SMILES']
clust0_mols = list(map(Chem.MolFromSmiles, clust0_list))

# CLUSTER 1
df_clust1 = df_all.loc[df_all['Clusters'] == 1]
clust1_list = df_clust1['Canonical SMILES']
clust1_mols = list(map(Chem.MolFromSmiles, clust1_list))

# CLUSTER 2
df_clust2 = df_all.loc[df_all['Clusters'] == 2]
clust2_list = df_clust2['Canonical SMILES']
clust2_mols = list(map(Chem.MolFromSmiles, clust2_list))



## STEP 3: Use Tanimoto Similarity calculation for all new molecules


### Test similarity of clusters to original ACEI

#### (i) Cluster 0 and ACEI

In [48]:
# Score every molecule of a cluster against every reference molecule.

def calculate_similarity(mols1, ace_mols,
                         score_column='Similarity Score_0ACEI',
                         output_path='6.1 Similarity_Cluster 0 and ACEI.csv'):
    """Write pairwise MACCS-key similarity scores to a CSV file.

    Every molecule in ``mols1`` is compared with every molecule in
    ``ace_mols``. One row is produced per (new, original) pair, iterating
    over ``mols1`` in the outer position and ``ace_mols`` in the inner one.

    Parameters
    ----------
    mols1 : iterable of RDKit molecules
        Molecules of the cluster being scored.
    ace_mols : iterable of RDKit molecules
        Reference molecules to compare against.
    score_column : str, optional
        Header of the similarity column in the output file. The default
        keeps the name used for the cluster 0 / ACEI comparison.
    output_path : str, optional
        Destination CSV file. A relative path is resolved against the
        current working directory.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    def _smiles_and_fingerprints(mols):
        # The SMILES string and MACCS fingerprint depend on a single molecule
        # only, so compute them once per molecule instead of once per pair.
        # None entries (what Chem.MolFromSmiles gives for a SMILES it cannot
        # parse) are skipped because they cannot be fingerprinted.
        return [
            (Chem.MolToSmiles(mol), AllChem.GetMACCSKeysFingerprint(mol))
            for mol in mols
            if mol is not None
        ]

    new_entries = _smiles_and_fingerprints(mols1)
    original_entries = _smiles_and_fingerprints(ace_mols)

    # DataStructs.FingerprintSimilarity is called with its default metric.
    similarity_data = [
        (new_smiles, original_smiles,
         DataStructs.FingerprintSimilarity(new_fp, original_fp))
        for new_smiles, new_fp in new_entries
        for original_smiles, original_fp in original_entries
    ]

    df1 = pd.DataFrame(
        similarity_data,
        columns=['New molecules', 'Original molecules', score_column],
    )
    df1.to_csv(output_path, index=False, sep=',')


# Cluster 0 vs. original ACE inhibitors; the defaults already name the
# column and file for this pair. Pass score_column/output_path for other pairs.
calculate_similarity(clust0_mols, ace)




#### (ii) Cluster 1 and ACEI

In [50]:
# Score every molecule of a cluster against every reference molecule.

def calculate_similarity(mols1, ace_mols,
                         score_column='Similarity Score_1ACEI',
                         output_path='6.2 Similarity_Cluster 1 and ACEI.csv'):
    """Write pairwise MACCS-key similarity scores to a CSV file.

    Every molecule in ``mols1`` is compared with every molecule in
    ``ace_mols``. One row is produced per (new, original) pair, iterating
    over ``mols1`` in the outer position and ``ace_mols`` in the inner one.

    Parameters
    ----------
    mols1 : iterable of RDKit molecules
        Molecules of the cluster being scored.
    ace_mols : iterable of RDKit molecules
        Reference molecules to compare against.
    score_column : str, optional
        Header of the similarity column in the output file. The default
        keeps the name used for the cluster 1 / ACEI comparison.
    output_path : str, optional
        Destination CSV file. A relative path is resolved against the
        current working directory.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    def _smiles_and_fingerprints(mols):
        # The SMILES string and MACCS fingerprint depend on a single molecule
        # only, so compute them once per molecule instead of once per pair.
        # None entries (what Chem.MolFromSmiles gives for a SMILES it cannot
        # parse) are skipped because they cannot be fingerprinted.
        return [
            (Chem.MolToSmiles(mol), AllChem.GetMACCSKeysFingerprint(mol))
            for mol in mols
            if mol is not None
        ]

    new_entries = _smiles_and_fingerprints(mols1)
    original_entries = _smiles_and_fingerprints(ace_mols)

    # DataStructs.FingerprintSimilarity is called with its default metric.
    similarity_data = [
        (new_smiles, original_smiles,
         DataStructs.FingerprintSimilarity(new_fp, original_fp))
        for new_smiles, new_fp in new_entries
        for original_smiles, original_fp in original_entries
    ]

    df2 = pd.DataFrame(
        similarity_data,
        columns=['New molecules', 'Original molecules', score_column],
    )
    df2.to_csv(output_path, index=False, sep=',')


# Cluster 1 vs. original ACE inhibitors; the defaults already name the
# column and file for this pair. Pass score_column/output_path for other pairs.
calculate_similarity(clust1_mols, ace)




#### (iii) Cluster 2 and ACEI

In [51]:
# Score every molecule of a cluster against every reference molecule.

def calculate_similarity(mols1, ace_mols,
                         score_column='Similarity Score_2ACEI',
                         output_path='6.3 Similarity_Cluster 2 and ACEI.csv'):
    """Write pairwise MACCS-key similarity scores to a CSV file.

    Every molecule in ``mols1`` is compared with every molecule in
    ``ace_mols``. One row is produced per (new, original) pair, iterating
    over ``mols1`` in the outer position and ``ace_mols`` in the inner one.

    Parameters
    ----------
    mols1 : iterable of RDKit molecules
        Molecules of the cluster being scored.
    ace_mols : iterable of RDKit molecules
        Reference molecules to compare against.
    score_column : str, optional
        Header of the similarity column in the output file. The default
        keeps the name used for the cluster 2 / ACEI comparison.
    output_path : str, optional
        Destination CSV file. A relative path is resolved against the
        current working directory.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    def _smiles_and_fingerprints(mols):
        # The SMILES string and MACCS fingerprint depend on a single molecule
        # only, so compute them once per molecule instead of once per pair.
        # None entries (what Chem.MolFromSmiles gives for a SMILES it cannot
        # parse) are skipped because they cannot be fingerprinted.
        return [
            (Chem.MolToSmiles(mol), AllChem.GetMACCSKeysFingerprint(mol))
            for mol in mols
            if mol is not None
        ]

    new_entries = _smiles_and_fingerprints(mols1)
    original_entries = _smiles_and_fingerprints(ace_mols)

    # DataStructs.FingerprintSimilarity is called with its default metric.
    similarity_data = [
        (new_smiles, original_smiles,
         DataStructs.FingerprintSimilarity(new_fp, original_fp))
        for new_smiles, new_fp in new_entries
        for original_smiles, original_fp in original_entries
    ]

    df3 = pd.DataFrame(
        similarity_data,
        columns=['New molecules', 'Original molecules', score_column],
    )
    df3.to_csv(output_path, index=False, sep=',')


# Cluster 2 vs. original ACE inhibitors; the defaults already name the
# column and file for this pair. Pass score_column/output_path for other pairs.
calculate_similarity(clust2_mols, ace)




### Test similarity of clusters to original ARBs

#### (iv) Cluster 0 and ARB

In [52]:
# Score every molecule of a cluster against every reference molecule.

def calculate_similarity(mols1, arb_mols,
                         score_column='Similarity Score_0ARB',
                         output_path='6.4. Similarity_Cluster 0 and ARB.csv'):
    """Write pairwise MACCS-key similarity scores to a CSV file.

    Every molecule in ``mols1`` is compared with every molecule in
    ``arb_mols``. One row is produced per (new, original) pair, iterating
    over ``mols1`` in the outer position and ``arb_mols`` in the inner one.

    Parameters
    ----------
    mols1 : iterable of RDKit molecules
        Molecules of the cluster being scored.
    arb_mols : iterable of RDKit molecules
        Reference molecules to compare against.
    score_column : str, optional
        Header of the similarity column in the output file. The default
        keeps the name used for the cluster 0 / ARB comparison.
    output_path : str, optional
        Destination CSV file. A relative path is resolved against the
        current working directory.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    def _smiles_and_fingerprints(mols):
        # The SMILES string and MACCS fingerprint depend on a single molecule
        # only, so compute them once per molecule instead of once per pair.
        # None entries (what Chem.MolFromSmiles gives for a SMILES it cannot
        # parse) are skipped because they cannot be fingerprinted.
        return [
            (Chem.MolToSmiles(mol), AllChem.GetMACCSKeysFingerprint(mol))
            for mol in mols
            if mol is not None
        ]

    new_entries = _smiles_and_fingerprints(mols1)
    original_entries = _smiles_and_fingerprints(arb_mols)

    # DataStructs.FingerprintSimilarity is called with its default metric.
    similarity_data = [
        (new_smiles, original_smiles,
         DataStructs.FingerprintSimilarity(new_fp, original_fp))
        for new_smiles, new_fp in new_entries
        for original_smiles, original_fp in original_entries
    ]

    df4 = pd.DataFrame(
        similarity_data,
        columns=['New molecules', 'Original molecules', score_column],
    )
    df4.to_csv(output_path, index=False, sep=',')


# Cluster 0 vs. original ARBs; the defaults already name the column and
# file for this pair. Pass score_column/output_path for other pairs.
calculate_similarity(clust0_mols, arbs)


#### (v) Cluster 1 and ARB

In [53]:
# Score every molecule of a cluster against every reference molecule.

def calculate_similarity(mols1, arb_mols,
                         score_column='Similarity Score_1ARB',
                         output_path='6.5. Similarity_Cluster 1 and ARB.csv'):
    """Write pairwise MACCS-key similarity scores to a CSV file.

    Every molecule in ``mols1`` is compared with every molecule in
    ``arb_mols``. One row is produced per (new, original) pair, iterating
    over ``mols1`` in the outer position and ``arb_mols`` in the inner one.

    Parameters
    ----------
    mols1 : iterable of RDKit molecules
        Molecules of the cluster being scored.
    arb_mols : iterable of RDKit molecules
        Reference molecules to compare against.
    score_column : str, optional
        Header of the similarity column in the output file. The default
        keeps the name used for the cluster 1 / ARB comparison.
    output_path : str, optional
        Destination CSV file. A relative path is resolved against the
        current working directory.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    def _smiles_and_fingerprints(mols):
        # The SMILES string and MACCS fingerprint depend on a single molecule
        # only, so compute them once per molecule instead of once per pair.
        # None entries (what Chem.MolFromSmiles gives for a SMILES it cannot
        # parse) are skipped because they cannot be fingerprinted.
        return [
            (Chem.MolToSmiles(mol), AllChem.GetMACCSKeysFingerprint(mol))
            for mol in mols
            if mol is not None
        ]

    new_entries = _smiles_and_fingerprints(mols1)
    original_entries = _smiles_and_fingerprints(arb_mols)

    # DataStructs.FingerprintSimilarity is called with its default metric.
    similarity_data = [
        (new_smiles, original_smiles,
         DataStructs.FingerprintSimilarity(new_fp, original_fp))
        for new_smiles, new_fp in new_entries
        for original_smiles, original_fp in original_entries
    ]

    scores = pd.DataFrame(
        similarity_data,
        columns=['New molecules', 'Original molecules', score_column],
    )
    scores.to_csv(output_path, index=False, sep=',')


# Cluster 1 vs. original ARBs; the defaults already name the column and
# file for this pair. Pass score_column/output_path for other pairs.
calculate_similarity(clust1_mols, arbs)


#### (vi) Cluster 2 and ARB

In [54]:
# Score every molecule of a cluster against every reference molecule.

def calculate_similarity(mols1, arb_mols,
                         score_column='Similarity Score_2ARB',
                         output_path='6.6. Similarity_Cluster 2 and ARB.csv'):
    """Write pairwise MACCS-key similarity scores to a CSV file.

    Every molecule in ``mols1`` is compared with every molecule in
    ``arb_mols``. One row is produced per (new, original) pair, iterating
    over ``mols1`` in the outer position and ``arb_mols`` in the inner one.

    Parameters
    ----------
    mols1 : iterable of RDKit molecules
        Molecules of the cluster being scored.
    arb_mols : iterable of RDKit molecules
        Reference molecules to compare against.
    score_column : str, optional
        Header of the similarity column in the output file. The default
        keeps the name used for the cluster 2 / ARB comparison.
    output_path : str, optional
        Destination CSV file. A relative path is resolved against the
        current working directory.

    Returns
    -------
    None
        The result is only written to ``output_path``.
    """
    def _smiles_and_fingerprints(mols):
        # The SMILES string and MACCS fingerprint depend on a single molecule
        # only, so compute them once per molecule instead of once per pair.
        # None entries (what Chem.MolFromSmiles gives for a SMILES it cannot
        # parse) are skipped because they cannot be fingerprinted.
        return [
            (Chem.MolToSmiles(mol), AllChem.GetMACCSKeysFingerprint(mol))
            for mol in mols
            if mol is not None
        ]

    new_entries = _smiles_and_fingerprints(mols1)
    original_entries = _smiles_and_fingerprints(arb_mols)

    # DataStructs.FingerprintSimilarity is called with its default metric.
    similarity_data = [
        (new_smiles, original_smiles,
         DataStructs.FingerprintSimilarity(new_fp, original_fp))
        for new_smiles, new_fp in new_entries
        for original_smiles, original_fp in original_entries
    ]

    scores = pd.DataFrame(
        similarity_data,
        columns=['New molecules', 'Original molecules', score_column],
    )
    scores.to_csv(output_path, index=False, sep=',')


# Cluster 2 vs. original ARBs; the defaults already name the column and
# file for this pair. Pass score_column/output_path for other pairs.
calculate_similarity(clust2_mols, arbs)


## STEP 4: Combine Similarity Results

In [66]:
# Read the six pairwise similarity tables back in:
# df1-df3 are clusters 0-2 vs. ACEI, df4-df6 are clusters 0-2 vs. ARB.
df1, df2, df3, df4, df5, df6 = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/6.1 Similarity_Cluster 0 and ACEI.csv',
        '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/6.2 Similarity_Cluster 1 and ACEI.csv',
        '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/6.3 Similarity_Cluster 2 and ACEI.csv',
        '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/6.4. Similarity_Cluster 0 and ARB.csv',
        '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/6.5. Similarity_Cluster 1 and ARB.csv',
        '/Users/odilehasa/Hypertension/Final_Experiments/FINAL - October/Output/6.6. Similarity_Cluster 2 and ARB.csv',
    )
)


### ACEI cluster similarity scores

In [81]:
# Put the ACEI similarity-score column of each cluster side by side.
# The columns are matched on their row index, so clusters with fewer
# pairs end up with NaN in the trailing rows.
df_aceSimilarity = pd.DataFrame(
    [
        scores[column]
        for scores, column in (
            (df1, 'Similarity Score_0ACEI'),
            (df2, 'Similarity Score_1ACEI'),
            (df3, 'Similarity Score_2ACEI'),
        )
    ]
).T
df_aceSimilarity.columns = ['CLUSTER 0', 'CLUSTER 1', 'CLUSTER 2']

# Persist the combined table as CSV
df_aceSimilarity.to_csv('6. ACEIs all cluster similarity scores.csv')

df_aceSimilarity

Unnamed: 0,CLUSTER 0,CLUSTER 1,CLUSTER 2
0,0.575758,0.745455,0.719298
1,0.444444,0.535211,0.500000
2,0.561644,0.709677,0.636364
3,0.538462,0.703704,0.709091
4,0.625000,0.676923,0.608696
...,...,...,...
32095,,0.578947,
32096,,0.536585,
32097,,0.586667,
32098,,0.602564,


### ARB cluster similarity scores

In [80]:
# Put the ARB similarity-score column of each cluster side by side.
# The columns are matched on their row index, so clusters with fewer
# pairs end up with NaN in the trailing rows.
df_arbSimilarity = pd.DataFrame(
    [
        scores[column]
        for scores, column in (
            (df4, 'Similarity Score_0ARB'),
            (df5, 'Similarity Score_1ARB'),
            (df6, 'Similarity Score_2ARB'),
        )
    ]
).T
df_arbSimilarity.columns = ['CLUSTER 0', 'CLUSTER 1', 'CLUSTER 2']

# Persist the combined table as CSV
df_arbSimilarity.to_csv('6. ARBs all cluster similarity scores.csv')

df_arbSimilarity

Unnamed: 0,CLUSTER 0,CLUSTER 1,CLUSTER 2
0,0.652778,0.500000,0.486842
1,0.479592,0.427083,0.432990
2,0.488095,0.481013,0.433735
3,0.615385,0.512821,0.463415
4,0.486486,0.545455,0.600000
...,...,...,...
21395,,0.523810,
21396,,0.488372,
21397,,0.617978,
21398,,0.487179,


# ---END---