In [None]:
import pandas as pd
import numpy as np
import sys
import os
import pytorch_lightning as pl
from tqdm import tqdm
from torch.utils.data import DataLoader
import torch
sys.path.append("../../lsm/")

from train_spectral import Finetune_SSSpectral
from msdatasets import MSDataset
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit import DataStructs

import torch.nn.functional as F

# Root directory of the data. Every input and cache location in this notebook is
# built as f'{path}/...' (processed_data/, trained_models/, train/, test/,
# tanimoto/, embeddings/).
path = '' # path to the data
# Evaluation split to score. It selects the '{mode}_df.pkl',
# '{mode}_modified_cosine_results.pkl', '{mode}.zarr' and '{mode}_tanimoto.pkl'
# inputs; the final cell only maps 'test', 'disjoint_test' and 'casmi' to an
# output file name.
mode = 'test' # name of the testing dataset

# Fix the random seed through Lightning's helper before anything else runs.
pl.seed_everything(42)

In [2]:
# Read the pickled training frame and the frame of the evaluation split chosen by `mode`.
processed_dir = f'{path}/processed_data'
train_df = pd.read_pickle(f'{processed_dir}/train_df.pkl')
test_df = pd.read_pickle(f'{processed_dir}/{mode}_df.pkl')

### Load ground-truth maximum possible similarities

### Calculate Tanimoto similarity with cosine lookup

In [60]:
# Modified-cosine lookup results for the selected split; the next cell reads its
# 'Train_SMILES' and 'Test_SMILES' columns.
cosine_results_file = f'{path}/processed_data/{mode}_modified_cosine_results.pkl'
cos_res = pd.read_pickle(cosine_results_file)

In [None]:
# Score every modified-cosine match: Tanimoto similarity between the Morgan
# fingerprints (radius 2, 2048 bits) of the matched training structure and of
# the test structure, stored in a new 'Tanimoto_Similarity' column of `cos_res`.
train_smiles_col = 'Train_SMILES'
test_smiles_col = 'Test_SMILES'


def _morgan_fp_from_smiles(smiles):
    """Parse `smiles` and return its radius-2, 2048-bit Morgan fingerprint.

    Raises:
        ValueError: if RDKit cannot parse the SMILES string.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        # MolFromSmiles reports a parse failure by returning None; fail with the
        # offending string instead of an opaque error inside the fingerprint call.
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
    return AllChem.GetMorganFingerprintAsBitVect(mol, 2, nBits=2048)


# The loop variables are deliberately NOT called `train_smiles` / `test_smiles`:
# the retrieval cells further down keep the training-set SMILES array in a global
# named `train_smiles`, and a loop variable of that name would silently replace
# it with a single string whenever this cell is (re-)run after them.
tanimoto_sim = []
smiles_pairs = zip(cos_res[train_smiles_col], cos_res[test_smiles_col])
for matched_smiles, query_smiles in tqdm(smiles_pairs, total=len(cos_res)):
    tanimoto_sim.append(
        DataStructs.TanimotoSimilarity(
            _morgan_fp_from_smiles(matched_smiles),
            _morgan_fp_from_smiles(query_smiles),
        )
    )

cos_res['Tanimoto_Similarity'] = tanimoto_sim

In [None]:
# Fraction of cosine-lookup rows whose Tanimoto similarity is > 0.6 and > 0.95
# (printed in that order).
total_rows = cos_res.shape[0]
for cutoff in (0.6, 0.95):
    print(cos_res[cos_res['Tanimoto_Similarity'] > cutoff].shape[0]/total_rows)

### Calculate Tanimoto similarity results with ms2lsm

In [3]:
model_name = 'spectral' # specify model name to load ckpt
model = Finetune_SSSpectral.load_from_checkpoint(f'{path}/trained_models/{model_name}_best.ckpt')
# The model is only used for inference in this notebook: switch it to evaluation
# mode so that train-time-only layers (dropout, batch-norm statistics updates --
# if the architecture has any) do not perturb the embeddings.
model.eval()

# Spectra to search in (training set) and spectra to query with (split chosen by `mode`).
train_dataset = MSDataset(dataset_path=f'{path}/train/final_train.zarr/', mode='spectral', tanimoto_path = f'{path}/tanimoto/train_tanimoto.pkl')
dataset = MSDataset(dataset_path=f'{path}/test/{mode}.zarr/', mode='spectral', tanimoto_path = f'{path}/tanimoto/{mode}_tanimoto.pkl')

# Create the dataloaders. shuffle=False keeps the batch order fixed, so the i-th
# embedding computed below corresponds to the i-th dataset item.
dataloader = DataLoader(
    dataset,
    batch_size= 128,
    shuffle=False,
    num_workers=0,
)
train_dataloader = DataLoader(
    train_dataset,
    batch_size= 256,
    shuffle=False,
    num_workers=0,
)


Data is (742049, 5) dimensions
Data is (12274, 5) dimensions


In [None]:
# Embed every training spectrum and keep, per spectrum, its SMILES string and
# precursor m/z. Produces `train_embeddings` (torch tensor, one row per spectrum),
# `train_smiles` (1-D numpy array of strings) and `precursors_list` (1-D numpy array).
#
# The next cell loads these arrays from disk whenever the cached embeddings file
# exists and ignores whatever was computed here, so the (expensive) inference is
# skipped in that case.
train_embeddings_file = f'{path}/embeddings/{model_name}_train_embeddings.npy'
if os.path.exists(train_embeddings_file):
    print(f'Found {train_embeddings_file}; skipping training-set inference.')
else:
    # Starting the list with an empty (0, 512) tensor keeps the original
    # contract (512-wide result, defined even without batches) while allowing a
    # single concatenation at the end instead of re-copying the growing tensor
    # on every batch.
    train_emb_batches = [torch.empty(size=(0, 512))]
    precursor_list = []
    smiles_list = []
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for batch in tqdm(train_dataloader):
            mz, inty, precursormz, smiles = batch['mz'], batch['inty'], batch['precursormz'], batch['smiles1']

            mz, inty, precursormz = mz.cuda(), inty.cuda(), precursormz.cuda()
            train_emb_batches.append(model(precursormz, mz, inty).detach().cpu())
            smiles_list.append(smiles)
            precursor_list.append(precursormz.detach().cpu())
    train_embeddings = torch.cat(train_emb_batches, dim=0)
    print(train_embeddings.shape)

    # Flatten the per-batch SMILES to one entry per spectrum. Going through
    # np.array(smiles_list) first is unsafe: with a shorter last batch the list is
    # ragged (an error on recent NumPy), and with equal-sized batches the strings
    # themselves would be iterated character by character.
    train_smiles = np.array([s for batch_smiles in smiles_list for s in batch_smiles])

    # Flatten each batch before concatenating so that a shorter last batch (or a
    # single batch) needs no special casing.
    precursors_list = torch.cat([p.flatten() for p in precursor_list]).numpy()

In [5]:
# Reuse the cached training embeddings when they exist; otherwise persist the
# arrays produced by the previous cell. Either way this cell leaves
# `train_embeddings` (torch tensor), `train_smiles` and `train_precursors`
# (numpy arrays) defined for the retrieval cells below.
embeddings_dir = f'{path}/embeddings'
train_embeddings_file = f'{embeddings_dir}/{model_name}_train_embeddings.npy'
train_smiles_file = f'{embeddings_dir}/{model_name}_train_smiles.npy'
train_precursors_file = f'{embeddings_dir}/{model_name}_train_precursors.npy'

if os.path.exists(train_embeddings_file):
    train_embeddings = torch.from_numpy(np.load(train_embeddings_file))
    train_smiles = np.load(train_smiles_file)
    train_precursors = np.load(train_precursors_file)
else: # otherwise save the training embeddings generated from last block
    # turn embeddings into numpy array
    numpy_embeddings = train_embeddings.numpy()
    os.makedirs(embeddings_dir, exist_ok=True)
    # save embeddings
    np.save(train_embeddings_file, numpy_embeddings)
    np.save(train_smiles_file, train_smiles)
    np.save(train_precursors_file, precursors_list)
    # The retrieval cell filters on `train_precursors`. Without this alias the
    # name only exists after a cache hit, so the first (uncached) run would fail
    # with a NameError.
    train_precursors = precursors_list

In [6]:
# Embed every test spectrum, collect its SMILES and precursor m/z, then compute
# the cosine similarity of each test embedding against each training embedding.
#
# Starting the list with an empty (0, 512) tensor keeps the original contract
# (512-wide result, defined even without batches) while concatenating once at
# the end instead of re-copying the growing tensor on every batch.
test_emb_batches = [torch.empty(size=(0, 512))]
test_precursors = []
test_smiles_batches = []
# Inference only: no autograd graph is needed.
with torch.no_grad():
    for batch in tqdm(dataloader):
        mz, inty, precursormz, smiles = batch['mz'], batch['inty'], batch['precursormz'], batch['smiles1']

        mz, inty, precursormz = mz.cuda(), inty.cuda(), precursormz.cuda()
        test_emb_batches.append(model(precursormz, mz, inty).detach().cpu())

        test_smiles_batches.append(smiles)
        test_precursors.append(precursormz.detach().cpu())
test_embeddings = torch.cat(test_emb_batches, dim=0)

# Flatten the per-batch SMILES to one entry per spectrum. Going through
# np.array(list_of_batches) first is unsafe: with a shorter last batch the list is
# ragged (a deprecation warning on older NumPy, an error on recent NumPy), and
# with equal-sized batches the strings themselves would be iterated character by
# character.
test_smiles_list = np.array([s for batch_smiles in test_smiles_batches for s in batch_smiles])
#clear cuda cache
torch.cuda.empty_cache()
# Flatten each batch before concatenating so that a shorter last batch (or a
# single batch) needs no special casing.
test_precursors_list = torch.cat([p.flatten() for p in test_precursors]).numpy()

# L2-normalise the rows so that the matrix product below is the cosine similarity.
train_embeddings_norm = F.normalize(train_embeddings, p=2, dim=1)
test_embeddings_norm = F.normalize(test_embeddings, p=2, dim=1)

# sim_matrix[i, j]: cosine similarity between test spectrum i and training spectrum j.
sim_matrix = torch.mm(test_embeddings_norm, train_embeddings_norm.t())

100%|██████████| 96/96 [00:23<00:00,  4.13it/s]
  test_smiles_list = np.array(test_smiles_list).flatten()


In [7]:
# For each test spectrum: restrict the training spectra to those whose precursor
# m/z lies inside a window centred on the test precursor (doubling the window
# until at least one candidate is found), take the candidate with the highest
# embedding cosine similarity, and record the Tanimoto similarity between the
# retrieved and the true structure.
scores = []
for query_idx in tqdm(range(len(sim_matrix))):
    window = 0.005  # full width of the initial precursor window
    query_precursor = test_precursors_list[query_idx]

    while True:
        half_window = window/2
        in_window = (train_precursors >= query_precursor - half_window) & (train_precursors <= query_precursor + half_window)
        candidates = np.where(in_window)

        if len(candidates[0]) == 0:
            # Nothing within the window: widen it and look again.
            window = window * 2
            continue

        # Position of the best candidate within the filtered similarities,
        # mapped back to an index into the training set.
        candidate_sims = sim_matrix[query_idx][candidates]
        best_train_idx = candidates[0][candidate_sims.argmax()]
        break

    retrieved_mol = Chem.MolFromSmiles(train_smiles[best_train_idx])
    query_mol = Chem.MolFromSmiles(test_smiles_list[query_idx])
    retrieved_fp = AllChem.GetMorganFingerprintAsBitVect(retrieved_mol, 2, nBits=2048)
    query_fp = AllChem.GetMorganFingerprintAsBitVect(query_mol, 2, nBits=2048)
    scores.append(DataStructs.TanimotoSimilarity(retrieved_fp, query_fp))

scores = np.array(scores)
frac_above_095 = len(scores[scores > 0.95])/len(scores)
frac_above_06 = len(scores[scores > 0.6])/len(scores)
print(f"Percentage of rows where tanimoto similarity is > 0.95: {frac_above_095}")
print(f"Percentage of rows where tanimoto similarity is > 0.6: {frac_above_06}")

100%|██████████| 12274/12274 [00:08<00:00, 1443.80it/s]

Percentage of rows where tanimoto similarity is > 0.95: 0.21704415838357505
Percentage of rows where tanimoto similarity is > 0.6: 0.3658139155939384





In [55]:
# Put both methods' per-spectrum Tanimoto scores side by side, one column per
# method ('LSM1-MS2' from the embedding retrieval, 'Cosine Similarity' from the
# cosine lookup), and write them to the results folder.
cosine_scores = cos_res['Tanimoto_Similarity']
df = pd.DataFrame(data={'LSM1-MS2': scores, 'Cosine Similarity': cosine_scores})

results_dir = '../../results/database_retrieval'
os.makedirs(results_dir, exist_ok=True)

# Output file prefix for each supported value of `mode`.
names = {'test':'unknown', 'disjoint_test':'known', 'casmi':'casmi'}

df.to_csv(f'{results_dir}/{names[mode]}_results.csv', index=False)