In [1]:
# change into the parent directory (the mhn-react repository root in the recorded run) so that
# the relative ./data/... paths and `python -m mhnreact.train` used below resolve
%cd ..

/system/user/publicwork/seidl/projects/mhn-react


# Fast and efficient retrieval using a trained model
After training, the model can be used as an embedding model, so templates can be retrieved efficiently from an index instead of always holding all templates in GPU memory.

In [4]:
# train a model if you haven't already (these parameters will produce an ok-ish model quickly ;)
!python -m mhnreact.train --model_type=mhn --device=best --fp_size=4096  --fp_type morgan --template_fp_type rdk --concat_rand_template_thresh -1 \
--exp_name rerun --dataset_type 50k --csv_path ./data/USPTO_50k_MHN_prepro.csv.gz --split_col split --ssretroeval False --seed 0 \
--hopf_association_activation None --fp_radius 2 --save_model True
# norm_asso should be True (it is by default, see the printed config below)
# concat_rand_template_thresh -1 means we don't add noise to the template classification to make templates better distinguishable;
# this reduces top-1 performance but should be better for retrieval from a new set of templates

seeded with 0
loading X, y from csv
test 5007 samples
valid 5001 samples
train 40008 samples
11800 templates
[759, 355, 241, 210, 138, 132, 90, 99, 69, 49, 44, 2821, 1852]
{'fingerprint_type': 'morgan', 'template_fp_type': 'rdk', 'num_templates': 11800, 'fp_size': 4096, 'fp_radius': 2, 'device': 'cuda:0', 'batch_size': 128, 'pooling_operation_state_embedding': 'mean', 'pooling_operation_head': 'max', 'dropout': 0.2, 'lr': 0.0005, 'optimizer': 'Adam', 'activation_function': 'ReLU', 'verbose': False, 'hopf_input_size': 4096, 'hopf_output_size': None, 'hopf_num_heads': 1, 'hopf_asso_dim': 512, 'hopf_association_activation': 'None', 'hopf_beta': 0.05, 'norm_input': True, 'norm_asso': True, 'hopf_n_layers': 1, 'mol_encoder_layers': 1, 'temp_encoder_layers': 1, 'encoder_af': 'ReLU', 'hopf_pooling_operation_head': 'mean'}
loading tfp from file ./data/cache/templ_emb_4096_rdk2_11800_3193087564066102630097097910572472321699534025650127616658827944690499074195974328066199306158252198615939076655

In [7]:
# now we can load in the model (this will take the first of your models in the model directory);
# you can also specify your own model dir by passing the model_path argument
import numpy as np  # used by the evaluation cells further down; don't rely on a wildcard import or a later cell for it

# explicit names instead of `from mhnreact.inspect import *`, so it is obvious where each helper
# comes from and the notebook namespace is not silently filled with unrelated names
from mhnreact.inspect import list_models, load_clf

# NOTE: list_models()[0] is simply the first model found, which is not necessarily the one trained above
clf = load_clf(model_fn=list_models()[0], model_type='mhn', device='cpu')

{'fingerprint_type': 'morgan', 'template_fp_type': 'rdk', 'num_templates': 11800, 'fp_size': 4096, 'fp_radius': 2, 'device': 'cpu', 'batch_size': 128, 'pooling_operation_state_embedding': 'mean', 'pooling_operation_head': 'max', 'dropout': 0.2, 'lr': 0.0005, 'optimizer': 'Adam', 'activation_function': 'ReLU', 'verbose': False, 'hopf_input_size': 4096, 'hopf_output_size': None, 'hopf_num_heads': 1, 'hopf_asso_dim': 512, 'hopf_association_activation': 'None', 'hopf_beta': 0.05, 'norm_input': True, 'norm_asso': True, 'hopf_n_layers': 0, 'mol_encoder_layers': 1, 'temp_encoder_layers': 1, 'encoder_af': 'ReLU', 'hopf_pooling_operation_head': 'mean'}


In [139]:
# get templates from USPTO-full (278,546 templates in the recorded run below)
# for USPTO_full_MHN_prepro_woiv run the notebook 03_prepro_uspto_full
import pandas as pd

# Path relative to the repository root (the first cell does `%cd ..`), in line with the USPTO-sm
# path below, instead of an absolute path that only exists on the original author's machine.
# NOTE(review): assumes notebook 03 writes its output into ./data/ -- adjust if the file lives elsewhere.
df = pd.read_csv("./data/USPTO_full_MHN_prepro_woiv.csv.gz")
#df = pd.read_csv("./data/USPTO_50k_MHN_prepro.csv.gz") # USPTO-sm

# one row per distinct (template, label) pair, ordered by label
tmp = (
    df[['reaction_smarts', 'label']]
    .drop_duplicates(subset=['reaction_smarts', 'label'])
    .sort_values('label')
)
# TODO: a "drop the ones from the test set" step was noted here but never implemented;
# templates are kept regardless of the split they occur in.

# label -> reaction SMARTS; if one label maps to several SMARTS the last one wins.
# NOTE(review): the evaluation cells compare `label` against row positions of the index, which
# presumably relies on labels being 0..N-1 without gaps -- confirm against the preprocessing.
tmp.index = tmp.label
template_list = tmp['reaction_smarts'].to_dict()

In [9]:
# flatten the label -> SMARTS mapping into a plain list (keeps the dict's insertion order)
templates = [*template_list.values()]
len(templates)

278546

In [None]:
# encode templates
# embeds every template SMARTS with the loaded model; the recorded run shows a (278546, 512)
# result, i.e. one row per template (512 matches 'hopf_asso_dim' in the printed model config)
xd = clf.encode_templates(templates)

In [11]:
# sanity check: expect (number of templates, embedding dimension)
xd.shape

(278546, 512)

In [10]:
# optional: install autofaiss (only needed for the commented-out autofaiss variant further below)
#%pip install autofaiss

In [143]:
# use faiss to build an exact (brute-force) inner-product index over the template embeddings
import faiss
import numpy as np

index = faiss.IndexFlatIP(xd.shape[1])
# faiss operates on C-contiguous float32 matrices, while the encoder output is float64 in the
# recorded run (see the size check below: 278546 * 512 * 8 bytes). Convert explicitly instead of
# depending on the installed faiss wrapper to do it; the temporary copy is released right after add().
index.add(np.ascontiguousarray(xd, dtype=np.float32))

In [17]:
# save to disk, so the index can be reloaded later without re-encoding all templates
faiss.write_index(index, "./data/templates.index")
# load from disk
# index = faiss.read_index("./data/templates.index")

In [None]:
# you can also use autofaiss to build index
# cosine similarity to find similar templates
# uses less memory and picks an appropriate index type automatically
# more prone to false positives than exact search --> experiment: USPTO-sm 70% (vs 90% exact template match accuracy)
#from autofaiss import build_index
#index, index_infos = build_index(xd, save_on_disk=True, 
#                            index_path="./data/templates.index", metric_type='ip', min_nearest_neighbors_to_retrieve=20, 
#                            use_gpu=False, make_direct_map=False) # ip = inner product (cosine sim if vectors are normalized)

In [18]:
# check the size of the saved index on disk (du reports disk usage, not RAM)
!du -sh ./data/templates.index

556M	./data/templates.index


In [19]:
# check in-memory size of the template embedding array in GB
import sys  # import kept so the name stays available to any other cell that relies on it
# ndarray.nbytes is the size of the array's data; sys.getsizeof only includes the data buffer
# when the array owns it (for a view it reports just the small object header)
xd.nbytes/1e9

1.140924536

In [144]:
# evaluate on the test split via retrieval (model trained on USPTO-sm, tested zero-shot on USPTO-full)
n_samples = 1000
# product SMILES of the first n_samples test reactions, embedded as query vectors
test_products = df.loc[df["split"] == "test", "prod_smiles"].iloc[:n_samples]
xq = clf.encode_smiles(test_products.tolist())

In [158]:
import numpy as np  # np is used here but was otherwise only imported in a later cell

# the template (label) that should have been retrieved for each of the first n_samples test reactions
y = np.array(df[df.split=="test"].label.values.tolist())[:n_samples]

In [68]:
%%time
# retrieve top k templates using the MHN-encoded molecule
# index.search returns (scores, indices); the scores are discarded. Row i of I holds the
# positions (in the order the vectors were added to the index) of the k best-scoring templates
# for query i, best first.
# NOTE: `k` is a notebook-global that the numpy retrieval cell further down reads again.
k=100
_, I = index.search(xq, k)

CPU times: user 7min 35s, sys: 578 ms, total: 7min 35s
Wall time: 7.05 s


In [138]:
# top-k accuracy of the faiss retrieval: a query counts as a hit if its true label is among
# the first top_k retrieved indices
# The loop variable is deliberately not named `k`: `k` is the retrieval depth set in the search
# cell and is read again by the numpy retrieval cell, so overwriting it here would silently
# change how many neighbours that cell returns.
for top_k in [1,2,3,5,10,20,50,100]:
    hit = (y[:,None]==I[:,:top_k]).any(axis=1)
    print(f"top-{top_k: 4d} accuracy: {hit.mean()*100: 6.2f}%")

top-   1 accuracy:   4.10%
top-   2 accuracy:   7.70%
top-   3 accuracy:  10.50%
top-   5 accuracy:  13.40%
top-  10 accuracy:  19.20%
top-  20 accuracy:  26.00%
top-  50 accuracy:  33.60%
top- 100 accuracy:  38.90%


In [114]:
%%time
# retrieve using a dot-droduct via numpy
# besides being slower, it also uses more memory
import numpy as np
I_np = np.argsort(np.dot(xq[:], xd.T), axis=1)[:,-k:][:,::-1]
# to do this more efficently one can used argpartion beforehand ;)

CPU times: user 40.2 s, sys: 4.45 s, total: 44.6 s
Wall time: 23.5 s


In [137]:
# top-k accuracy of the numpy retrieval
# This cell previously re-evaluated the faiss result `I`, so the numpy result `I_np` was never
# checked; it now scores `I_np`. The loop variable is not named `k` to avoid overwriting the
# notebook-global retrieval depth `k`.
for top_k in [1,2,3,5,10,20,50,100]:
    hit = (y[:n_samples,None]==I_np[:,:top_k]).any(axis=1)
    print(f"top-{top_k: 4d} accuracy: {hit.mean()*100: 6.2f}%")

top-   1 accuracy:   4.10%
top-   2 accuracy:   7.70%
top-   3 accuracy:  10.50%
top-   5 accuracy:  13.40%
top-  10 accuracy:  19.20%
top-  20 accuracy:  26.00%
top-  50 accuracy:  33.60%
top- 100 accuracy:  38.90%


In [51]:
#(y==I[:,0]).mean()*100 # top 1 accuracy
# top1 acc: 25% for USPTO-sm
# top100 acc: 90% for USPTO-sm

24.94507689235071