In [8]:
import pandas as pd
from scipy.spatial.distance import squareform
from rdkit import Chem, DataStructs
from rdkit.Chem import AllChem

In [2]:
# Example molecules, written as SMILES strings (one molecule per entry).
data = [
    "Cc1cccc(-c2ccc3c(c2)cc(C)n3-c2ccccc2)c1",
    "COc1ccc(NC(=O)c2cc(-n3ncc(C)c3C)ccc2F)cc1C",
    "CC(C#N)(c1ccccc1)c1ccccc1",
    "Cc1cc(C(=O)NCCCCN2CCN(c3cnc(N(C)C)nc3C)CC2)co1",
    "Cc1nn(C)c2c(=O)c3c(Cl)cccc3[nH]c12",
]

In [3]:
def transform_ECFP(smi, radius=2):
    """Convert a SMILES string into a Morgan (ECFP-style) fingerprint.

    Parameters
    ----------
    smi : str
        SMILES representation of the molecule.
    radius : int, optional
        Radius handed to ``AllChem.GetMorganFingerprint``. Defaults to 2,
        the value this function previously hard-coded.

    Returns
    -------
    The fingerprint object produced by ``AllChem.GetMorganFingerprint``;
    it can be passed directly to ``DataStructs.TanimotoSimilarity``.

    Raises
    ------
    ValueError
        If RDKit cannot parse ``smi`` into a molecule.
    """
    m = Chem.MolFromSmiles(smi)
    if m is None:
        # MolFromSmiles reports a parse failure by returning None instead of
        # raising; without this check the None reaches the fingerprint call
        # and surfaces as an opaque argument-type error that hides which
        # input string was at fault.
        raise ValueError("Could not parse SMILES: {!r}".format(smi))
    # NOTE(review): recent RDKit releases emit a deprecation warning for this
    # call in favour of rdFingerprintGenerator; confirm against the installed
    # version before migrating.
    return AllChem.GetMorganFingerprint(m, radius)

In [4]:
# One fingerprint per input molecule, kept in the same order as `data`.
fps = list(map(transform_ECFP, data))

In [7]:
# Long-format table of every ordered pair (X, Y), including self-pairs and
# both orientations of each pair, in row-major order.
records = [
    (row, col, DataStructs.TanimotoSimilarity(fp_row, fp_col))
    for row, fp_row in enumerate(fps)
    for col, fp_col in enumerate(fps)
]

df = pd.DataFrame.from_records(records, columns=["X", "Y", "Tanimoto"])
df

Unnamed: 0,X,Y,Tanimoto
0,0,0,1.0
1,0,1,0.230088
2,0,2,0.269663
3,0,3,0.103704
4,0,4,0.1875
5,1,0,0.230088
6,1,1,1.0
7,1,2,0.134615
8,1,3,0.175573
9,1,4,0.178218


In [10]:
# Condensed distance vector: one entry per unordered pair (i < j), visited
# row by row across the upper triangle, which is the order squareform expects.
records_v2 = [
    1 - DataStructs.TanimotoSimilarity(fp_a, fp_b)  # distance = 1 - similarity
    for pos, fp_a in enumerate(fps)
    for fp_b in fps[pos + 1:]
]

# squareform expands the condensed vector into a symmetric matrix with a zero
# diagonal, so converting back to similarity puts 1.0 on the diagonal.
distance_matrix = squareform(records_v2)
similarity_matrix = 1 - distance_matrix

df_v2 = pd.DataFrame(similarity_matrix)
df_v2

Unnamed: 0,0,1,2,3,4
0,1.0,0.230088,0.269663,0.103704,0.1875
1,0.230088,1.0,0.134615,0.175573,0.178218
2,0.269663,0.134615,1.0,0.04918,0.107143
3,0.103704,0.175573,0.04918,1.0,0.131579
4,0.1875,0.178218,0.107143,0.131579,1.0
