In [1]:
from rdkit import Chem
from rdkit.DataStructs import TanimotoSimilarity
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
import numpy as np
import time

In [2]:
def slow_similarity(a, b):
    if a is None or b is None: 
        return 0.0
    amol = Chem.MolFromSmiles(a)
    bmol = Chem.MolFromSmiles(b)
    if amol is None or bmol is None:
        return 0.0
    else:
        fp1 = GetMorganFingerprintAsBitVect(amol, 2, nBits=2048, useChirality=False)
        fp2 = GetMorganFingerprintAsBitVect(bmol, 2, nBits=2048, useChirality=False)
        return TanimotoSimilarity(fp1, fp2) 

In [3]:
class FastTanimotoOneToBulk:
    def __init__(self, bs):
        self.bs = bs
        self.b_fps = np.vstack([self._fingerprints_from_smi(smi) for smi in self.bs])
        
    def __call__(self, a):
        a_fp = self._fingerprints_from_smi(a)
        return (a_fp&self.b_fps).sum(axis=1) / (a_fp|self.b_fps).sum(axis=1)
        
    def _fingerprints_from_smi(self, smi):
        mol = Chem.MolFromSmiles(smi)
        fp = GetMorganFingerprintAsBitVect(mol, 2, nBits=2048, useChirality=False)
        nfp = np.array([b=='1' for b in fp.ToBitString()])
        return nfp

# Load the example data

In [4]:
with open('003_Fast_Tanimoto_Similarity_OneToBulk_data.txt') as f:
    bulks = [line.rstrip() for line in f.readlines()]

print(len(bulks))

50000


# Case1 : one-to-bulk

In [5]:
ex = 'ClC1=CC=C2C(C=C(C(C)=O)C(C(NC3=CC(NC(NC4=CC(C5=C(C)C=CC=C5)=CC=C4)=O)=CC=C3)=O)=C2)=C1'

In [6]:
start_time = time.time()

res_slow = [slow_similarity(ex, b) for b in bulks]
print(np.max(res_slow))
print(np.mean(res_slow))
print(np.min(res_slow))

print(f"{time.time() - start_time:.3f} sec")

0.423728813559322
0.13234524722503668
0.0
17.174 sec


In [7]:
start_time = time.time()

res_fast = FastTanimotoOneToBulk(bulks)(ex)
print(np.max(res_fast))
print(np.mean(res_fast))
print(np.min(res_fast))

print(f"{time.time() - start_time:.3f} sec")

0.423728813559322
0.13234524722503668
0.0
14.643 sec


# Case2: bulk-to-bulk

In [8]:
bulk2 = ['CC(=O)NCCNC(=O)C1=C(C2CC2)N(C2=CC=C(C)C(Cl)=C2)N=C1',
        'CC(C(=O)C1=C2C=CC=CC2=[NH+]C1)[NH+]1CCCC1C1CC=CS1',
        'CCN(CC1CCOC1)C(=O)C1=CC=NC(Cl)=C1',
        'CC1=CC=CC=C1CS(=O)CCCC1=CC=CC=C1',
        'CSCC(=O)NNC(=O)C1=C(O)C=C(Cl)C=C1Cl',
        'C1=CC=C(C2=CC(N3CC4C5CCC(O5)C4C3)=C3C=CC=CC3=[NH+]2)C=C1',
        'CC1=CC(CNC(=O)NC2=NOC(C3=CC=CC=C3)=C2)=NO1',
        'CC1=CC2=NC=C(C(=O)NC3=N[N-]C(C(F)(F)F)=N3)C(=O)N2C=C1',
        'CN(CC1=CC=NC=C1)C(=O)C1=CC=C(I)C=C1',
        'COC1=CC=CC=C1N1CCN(C2=NN(CC(=O)NC3CC3)C(=O)C=C2)CC1']

In [9]:
start_time = time.time()

for ex in bulk2:
    res_slow = [slow_similarity(ex, b) for b in bulks]

print(f"{time.time() - start_time:.3f} sec")

122.953 sec


In [10]:
start_time = time.time()

calc_sim = FastTanimotoOneToBulk(bulks)
for ex in bulk2:
    res_slow = calc_sim(ex)

print(f"{time.time() - start_time:.3f} sec")

16.234 sec
