In [1]:
# -*- coding: utf-8 -*-
"""
Created on Tue Feb 25 15:36:30 2020

@author: stravsm
"""

import importlib
from importlib import reload
from tqdm import tqdm
import os


In [2]:
%reload_ext autoreload
%autoreload 2 

import tensorflow as tf
import numpy as np
import pandas as pd

from fp_management import database as db
from fp_management import mist_fingerprinting as fpr
from fp_management import fingerprint_map as fpm
import smiles_config as sc

# Register the evaluation-specific YAML file and reload the configuration.
# NOTE(review): presumably `config_reload()` re-reads every file listed in
# `sc.config_file` into `sc.config`, and presumably this must run before the
# `infrastructure` imports below — confirm against `smiles_config`.
sc.config_file.append("config.EULER-eval.yaml")
sc.config_reload()

import infrastructure.generator as gen
import infrastructure.decoder as dec

import time
from datetime import datetime
import pickle
import pathlib


from rdkit import RDLogger
# Only let CRITICAL RDKit messages through, to keep the cell output readable.
lg = RDLogger.logger()
lg.setLevel(RDLogger.CRITICAL)
import infrastructure.score as msc
import gc
import random

# Evaluation runs the model in non-training mode (per the original author this
# disables dropout). The config is patched in place at runtime for lack of a
# more elegant mechanism.
sc.config["model_config"]["training"] = False

# Stochastic sampling depends on the random state, so seed every generator in
# use whenever a global seed is configured (an empty string means "no seed").
random_seed = sc.config['random_seed_global']
if random_seed != '':
    for seed_fn in (random.seed,
                    np.random.seed,
                    tf.random.experimental.set_seed):
        seed_fn(random_seed)

# Logging: timestamped messages on the root handler, and a dedicated
# "MSNovelist" logger at INFO level used throughout the notebook.
import logging
logging.basicConfig(datefmt='%d-%b-%y %H:%M:%S',
                    format='%(asctime)s - %(message)s')
logger = logging.getLogger("MSNovelist")
logger.setLevel(logging.INFO)
logger.info("evaluation startup")

# Folder that receives all evaluation artefacts; created on demand.
eval_folder = pathlib.Path(sc.config["eval_folder"])
eval_folder.mkdir(exist_ok=True, parents=True)

# Both ids default to the current Unix timestamp. A configured `eval_id`
# replaces the evaluation id; a configured `eval_counter` makes the pickle id
# "<eval_id>-<eval_counter>".
pickle_id = eval_id = str(int(time.time()))
if sc.config['eval_id'] != '':
    eval_id = sc.config['eval_id']
if sc.config['eval_counter'] != '':
    pickle_id = sc.config['eval_id'] + "-" + sc.config['eval_counter']

# `weights` may be a single entry or a list; always work with a list.
weights_list = (sc.config['weights']
                if isinstance(sc.config['weights'], list)
                else [sc.config['weights']])
    


  warn("PubChem database not found or not connected (read-only path?)")
03-Jun-24 15:05:55 - evaluation startup


In [3]:

# First, do everything independent of weights

# Initialise the MIST fingerprinter and keep a handle to it; it is used below to
# decode stored fingerprints and to process prediction/reference data frames.
# NOTE(review): the init_instance()/get_instance() pair suggests a shared
# singleton — confirm in `fp_management.mist_fingerprinting`.
fpr.MistFingerprinter.init_instance()
fingerprinter = fpr.MistFingerprinter.get_instance()


In [4]:

  
# Decoding parameters from the evaluation config. Their roles follow the log
# line emitted before prediction: "Beam block size n*k*steps, sequences
# retrieved per sample: kk".
n = sc.config["eval_n"]              # queries per batch (used as batch_size)
n_total = sc.config["eval_n_total"]  # number of entries to evaluate, -1 = all
k = sc.config["eval_k"]              # each query is repeated k times for decoding
kk = sc.config["eval_kk"]            # sequences retrieved per sample
steps = sc.config["eval_steps"]

decoder_name = sc.config["decoder_name"]

evaluation_set = sc.config["evaluation_set"]

# File for CSI:FingerID validation data
data_eval_ = sc.config["db_path_eval"]
# Load dataset and process appropriately
db_eval = db.FpDatabase.load_from_config(data_eval_)
pipeline_options = db_eval.get_pipeline_options()

pipeline_encoder = sc.config['pipeline_encoder']
pipeline_reference = sc.config['pipeline_reference']

dataset_val = db_eval.get_grp(evaluation_set)
if n_total != -1:
    dataset_val = dataset_val[:n_total]
# Always record the number of entries actually selected. Previously a
# configured `eval_n_total` larger than the dataset was kept as-is, which made
# the progress-bar total and the "Predicting ... samples" log lines wrong.
n_total = len(dataset_val)


In [5]:
# Inspect the pipeline options reported by the evaluation database.
pipeline_options

{'embed_X': False,
 'unpackbits': False,
 'unpack': False,
 'fingerprint_selected': 'fingerprint_degraded'}

In [6]:
# On-the-fly translate the dataset :(
# 

def entry_for_row(row):
    """Copy a dataset row into a plain dict with decoded fingerprints.

    Every key of `row` is copied unchanged, then the "fingerprint" and
    "fingerprint_degraded" entries are replaced by the first row of what
    `fingerprinter.get_fp` returns for the stored value.

    Parameters
    ----------
    row : mapping-like
        Must provide `keys()` and item access, and contain the keys
        "fingerprint" and "fingerprint_degraded".

    Returns
    -------
    dict
        A new dict; `row` itself is not modified.
    """
    entry = {}
    for column in row.keys():
        entry[column] = row[column]
    # get_fp returns a 2-D object; only its first row is kept.
    for fp_column in ("fingerprint", "fingerprint_degraded"):
        entry[fp_column] = fingerprinter.get_fp(row[fp_column])[0,:]
    return entry

# Decode the fingerprints of every evaluation entry up front.
dataset_val_mapped = list(map(entry_for_row, dataset_val))

In [7]:
# Keep the first mapped entry around for a quick sanity check.
r = dataset_val_mapped[0]

In [8]:
# Sanity check: shape of one decoded degraded fingerprint.
r["fingerprint_degraded"].shape

(4096,)

In [9]:
%autoreload 2
# Build the evaluation dataset from the mapped entries, batched in groups of
# `n`, then zip it into the structure selected by `pipeline_encoder` and
# `pipeline_reference`.
# (The sampler is applied to this dataset so that evaluation from
# fingerprint_sampled is possible as well.)
fp_dataset_val_ = gen.smiles_pipeline(
    dataset_val_mapped,
    batch_size=n,
    **pipeline_options,
    map_fingerprints=False,
    degraded_fingerprint_type="uint8",
)

fp_dataset_val = gen.dataset_zip(
    fp_dataset_val_,
    pipeline_encoder,
    pipeline_reference,
    **pipeline_options,
)


03-Jun-24 15:05:59 - using unpickle_mf
03-Jun-24 15:06:00 - not using unpack
03-Jun-24 15:06:00 - not using fp_map
03-Jun-24 15:06:00 - not using embed_X
03-Jun-24 15:06:00 - Selecting fingerprint fingerprint_degraded


In [10]:
# Peek at the first batch; a fresh iterator is created just for this look.
next(iter(fp_dataset_val))

({'fingerprint_selected': <tf.Tensor: shape=(16, 4096), dtype=float32, numpy=
  array([[0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         ...,
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.],
         [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>,
  'mol_form': <tf.Tensor: shape=(16, 10), dtype=float32, numpy=
  array([[15.,  0.,  0.,  0.,  7.,  3.,  0.,  0.,  1., 19.],
         [53.,  0.,  0.,  0.,  0., 22.,  0.,  0.,  0., 90.],
         [47.,  0.,  0.,  0.,  2., 13.,  0.,  0.,  0., 56.],
         [54.,  0.,  0.,  0.,  0., 23.,  0.,  0.,  0., 92.],
         [16.,  0.,  0.,  0.,  2.,  3.,  0.,  0.,  1., 16.],
         [21.,  0.,  0.,  0.,  2.,  3.,  0.,  0.,  0., 28.],
         [23.,  0.,  0.,  2.,  7.,  1.,  0.,  0.,  0., 31.],
         [21.,  1.,  0.,  0.,  2.,  2.,  0.,  0.,  1., 19.],
         [58.,  0.,  0.,  0.,  8.,  8.,  0.,  0.,  0., 78.],
         [21.,  0.,  0.,  0.,  3.,  5

In [11]:

# Optional fingerprint sampler, chosen by name in the config. An empty name
# means no sampler is used.
sampler_name = sc.config['sampler_name']
# Default used when no sampler is configured; a sampler factory overrides it.
round_fingerprints = True
if sampler_name != '':
    logger.info(f"Sampler {sampler_name} loading")
    sampler_module = importlib.import_module(
        ".".join(("fp_sampling", sampler_name)), 'fp_sampling')
    sampler_factory = sampler_module.SamplerFactory(sc.config)
    round_fingerprints = sampler_factory.round_fingerprint_inference()
    sampler = sampler_factory.get_sampler()
    logger.info(f"Sampler {sampler_name} loaded")
    # NOTE(review): `fp_dataset_val` was zipped from `fp_dataset_val_` in an
    # earlier cell, so this mapping presumably does not reach it; also,
    # re-running this cell maps the sampler a second time. Confirm intent.
    fp_dataset_val_ = sampler.map_dataset(fp_dataset_val_)



03-Jun-24 15:06:00 - Sampler basic_tp_fp loading
03-Jun-24 15:06:01 - Sampler basic_tp_fp loaded


In [12]:

for weights_i, weights_ in enumerate(weights_list):
    # Identify this run. Both ids default to the current Unix timestamp; the
    # config may override the evaluation id and, via `eval_counter`, the id
    # used in output file names.
    eval_id = str(int(time.time()))
    pickle_id = eval_id
    if sc.config['eval_id'] != '':
        eval_id = sc.config['eval_id']
    if sc.config['eval_counter'] != '':
        pickle_id = sc.config['eval_id'] + "-" + sc.config['eval_counter']
        if len(weights_list) > 1:
            # Disambiguate the outputs of several weight files. `weights_i` is
            # the int index from enumerate() and must be converted to str;
            # concatenating it directly raised a TypeError.
            pickle_id = pickle_id + "-" + str(weights_i)

    picklepath = eval_folder / ("eval_" + pickle_id + ".pkl")
    logger.info(picklepath)
    logger.info(weights_)
    weights = os.path.join(sc.config["weights_folder"], weights_)

    # When set, the evaluation keeps only the best-scoring candidate among
    # duplicates of the same structure for a query.
    retain_single_duplicate = True

    fp_dataset_iter = iter(fp_dataset_val)
    blueprints = gen.dataset_blueprint(fp_dataset_val_)
    
    # Load models
    


03-Jun-24 15:06:01 - /data/MSNovelist-results/eval_1717427161.pkl
03-Jun-24 15:06:01 - w-05-0.071-0.069.hdf5


In [13]:
    import model

    # Three views on the same network, all built from the same blueprints and
    # config: the encoder and the transcoder receive the fingerprint-rounding
    # flag, the decoder does not take it.
    model_encode = model.EncoderModel(
        blueprints=blueprints,
        config=sc.config,
        round_fingerprints=round_fingerprints,
    )
    model_decode = model.DecoderModel(
        blueprints=blueprints,
        config=sc.config,
    )
    model_transcode = model.TranscoderModel(
        blueprints=blueprints,
        config=sc.config,
        round_fingerprints=round_fingerprints,
    )
    


using fingerprint rounding in model
using fingerprint rounding in model


In [14]:
# Show the structure (shapes and dtypes) of the zipped evaluation dataset.
fp_dataset_val

<ZipDataset shapes: ({fingerprint_selected: (None, 4096), mol_form: (None, 10), n_hydrogen: (None,)}, ((None,), (None, 4096))), types: ({fingerprint_selected: tf.float32, mol_form: tf.float32, n_hydrogen: tf.float32}, (tf.string, tf.float32))>

In [15]:
    # Build models by calling them
    # Each model is called once before any weights are loaded or copied.
    # NOTE(review): presumably required because the layers create their
    # variables on first call — confirm. The statement order below matters.
    y_ = model_transcode(blueprints)
    # Takes one batch from `fp_dataset_iter`; element [0] is the encoder input.
    enc = model_encode(next(fp_dataset_iter)[0])
    _ = model_decode(enc)
    
    # Load the stored weights into the transcoder (matched by layer name), then
    # copy them from the transcoder into the encoder and decoder models.
    model_transcode.load_weights(weights, by_name=True)
    model_encode.copy_weights(model_transcode)
    model_decode.copy_weights(model_transcode)
    

03-Jun-24 15:06:03 - Loading layer encoder weights
03-Jun-24 15:06:03 - Loaded
03-Jun-24 15:06:03 - Loading layer hydrogen_estimator weights
03-Jun-24 15:06:03 - Loaded
03-Jun-24 15:06:03 - Loading layer tokens_y weights
03-Jun-24 15:06:03 - Loaded


In [16]:

    # Look up the decoder class by its configured name and instantiate it for
    # full-size batches of `n` queries.
    decoder_class = dec.get_decoder(decoder_name)
    decoder = decoder_class(
        model_encode, model_decode, steps, n, k, kk, config = sc.config)
    logger.info("Decoder initialized")
    logger.info("Processing and scoring predictions")
    


03-Jun-24 15:06:03 - Decoder initialized
03-Jun-24 15:06:03 - Processing and scoring predictions


In [17]:
    logger.info(f"Predicting {n_total} samples - start")
    logger.info(f"Beam block size {n}*{k}*{steps}, sequences retrieved per sample: {kk}")
    result_blocks = []
    reference_blocks = []
    for data in tqdm(fp_dataset_val, total = (n_total -1) // n + 1):
        # A batch may hold fewer than n queries. Such a batch gets its own
        # decoder sized for the actual batch. The shared `decoder` is left
        # untouched so that later full-size batches, or a re-run of this cell,
        # never pick up a decoder built for a smaller batch.
        n_real = len(data[0]['n_hydrogen'])
        if n_real == n:
            batch_decoder = decoder
        else:
            batch_decoder = dec.get_decoder(decoder_name)(
                model_encode, model_decode, steps, n_real, k, kk, config = sc.config)

        # Repeat the input data k times for each query: every one of the k
        # samples is encoded individually because the encoding may be
        # probabilistic.
        data_k = {key: tf.repeat(x, k, axis=0) for key, x in data[0].items()}
        states_init = model_encode.predict(data_k)
        # Predict k sequences for each query.
        sequences, y, scores = batch_decoder.decode_beam(states_init)
        seq, score, length = batch_decoder.beam_traceback(sequences, y, scores)
        smiles = batch_decoder.sequence_ytoc(seq)
        result_blocks.append(batch_decoder.format_results(smiles, score))

        # data[1] holds the reference: byte-string SMILES and fingerprints.
        reference_blocks.append(batch_decoder.format_reference(
            [bytes.decode(x, 'UTF-8') for x in data[1][0].numpy()],
            list(data[1][1].numpy())))
    results = pd.concat(result_blocks)
    logger.info(f"Predicting {n_total} samples - done")

    # Raw predictions go next to the main pickle, under the same file name
    # with "_all" appended (unchanged from the previous behaviour).
    all_picklepath = picklepath.with_suffix("").with_name(picklepath.name + "_all")
    # Use a context manager so the file is flushed and closed deterministically.
    with open(all_picklepath, "wb") as pickle_file:
        pickle.dump(results, pickle_file)


03-Jun-24 15:06:03 - Predicting 200 samples - start
03-Jun-24 15:06:03 - Beam block size 16*64*128, sequences retrieved per sample: 10
100%|██████████| 13/13 [00:35<00:00,  2.73s/it]
03-Jun-24 15:06:38 - Predicting 200 samples - done


In [24]:
# Display the concatenated table of decoded candidates.
results

Unnamed: 0,smiles,score,id,n,k
0,CS(=O)(=O)N1CCc2nc(-c3cnc(N)nc3N3CCOCC3)ncc21,-3.732617,0,0,0
1,CS(=O)(=O)N1CCc2c(N)nc(-c3cnc(N)nc3N3CCOC3)cc21,-3.855933,1,0,1
2,CS(=O)(=O)N1CCc2c(N3CCOCC3)nc(-c3cnc(N)nc3)nc21,-3.999996,2,0,2
3,CS(=O)(=O)N1CCc2c(N)nc(-c3cnc(N4CCOCC4)nc3)nc21,-4.053775,3,0,3
4,CS(=O)(=O)N1CCc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc21,-4.145812,4,0,4
...,...,...,...,...,...
75,Cc1ccc(C(c2oc3ccccc3c(=O)c2O)c2c(O)c3ccccc3c(=...,-5.372528,75,7,5
76,Cc1ccc(C(c2c(O)c3ccccc3oc(=O)c2O)c2c(O)c3ccccc...,-5.419231,76,7,6
77,Cc1ccc(C(c2c(O)c3ccccc3oc2=O)C(=O)c2c(O)c3cccc...,-5.610552,77,7,7
78,Cc1ccc(C(c2oc3ccccc3c(=O)c2O)c2oc(=O)c3ccccc3c...,-5.880943,78,7,8


In [60]:
    %autoreload
    # Post-process every predicted block and match it against its reference.
    # Note that `n_total` counts samples (see the prediction log messages),
    # whereas this loop iterates over blocks (batches).
    logger.info(f"Evaluating {n_total} blocks - start")
    
    results_evaluated = []
    # block_id is simply the running index of the block.
    for block_, ref_, block_id in zip(tqdm(result_blocks), 
                                    reference_blocks,
                                    range(len(result_blocks))):
        # Make a block with molecule, MF, smiles for candidates and reference
        block = db.process_df(block_, fingerprinter,
                              construct_from = "smiles",
                              block_id = block_id)
        
        if retain_single_duplicate:
            # After sorting by descending score, keep the first entry for each
            # (query n, inchikey1) pair, i.e. the best-scoring duplicate.
            # NOTE(review): groupby drops rows with a missing key by default in
            # recent pandas, so candidates without an inchikey1 presumably
            # disappear here — confirm this is intended.
            block.sort_values("score", ascending = False, inplace = True)
            block = block.groupby(["n", "inchikey1"]).first().reset_index()
            
        ref = db.process_df(ref_, fingerprinter,
                              construct_from = "smiles",
                              block_id = block_id)
        # Also actually compute the true fingerprint for the reference

        if sc.config["eval_fingerprint_all"]:
            fingerprinter.process_df(ref,
                                    out_column = "fingerprint_ref_true",
                                    inplace=True)
            
        # Match ref to predictions
        # `on="n"` matches block["n"] against the index of `ref`; columns that
        # exist on both sides get the "_ref" suffix on the reference side.
        # NOTE(review): assumes the index of `ref` is the within-block query
        # number — confirm.
        block = block.join(ref, on="n", rsuffix="_ref")
        # Keep only correct formula
        block_ok = block.loc[block["inchikey1"].notna()].loc[block["mf"] == block["mf_ref"]]
        # Now actually compute the fingerprints, only for matching MF
        if sc.config["eval_fingerprint_all"]:
            fingerprinter.process_df(block_ok,
                                 inplace=True)
        # Attach the candidate fingerprints to the full block. For a column
        # present on both sides, the left one is renamed with "_ref" and the
        # right one keeps its name; rows that are not in block_ok get a missing
        # `fingerprint` (left join).
        block = block.merge(
            block_ok[["n","k","fingerprint"]],
            left_on = ["n", "k"],
            right_on = ["n", "k"],
            suffixes = ["_ref", ""],
            how = "left")
    
        results_evaluated.append(block)
        


03-Jun-24 15:27:08 - Evaluating 200 blocks - start
  0%|          | 0/13 [00:00<?, ?it/s]

failed parsing id 12 - s????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
failed parsing id 13 - c=???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
failed parsing id 14 - C?O??????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
failed parsing id 15 - CC(P?????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
failed parsing id 16 - CNCCC????????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
failed parsing id 17 - CC(C=C???????????????????????????????????????????????????????????????????????????????????????????????????????????????????????
failed parsing id 18 - COCC(=O????????????????????????????????????????????????????????????????????????????

  8%|▊         | 1/13 [00:00<00:06,  1.95it/s]

[{'data_id': 0, 'smiles_generic': 'CS(=O)(=O)N1CCc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc21', 'smiles_canonical': 'CS(=O)(=O)N1CCc2c(-c3cnc(N)nc3)nc(N3CCOCC3)nc21', 'fingerprint': b'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAEAAAgAEAAAAAgAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAQAEAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAEAAAAAAAAAAgAAAAAAABAAACAAAAAAAAACEAABAAAIAACAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAACAAAAAAAAAAAAAAAgAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAYAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAgAAAAEAAAAAAAEAAAAAAAAIAAAAAAAAAAAAAAIAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIQAAAQgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAACAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAA='}, {'data_id': 1, 'smiles_generic': 'CC(C)=CCCC(C)(OC1OC(COC2OCC(O)C(O)C2O)C(O)C(O)C1O)C1CCC2(C)C1C(O)CC1C3(C)CCC(OC4OC(CO)C(O)C(O)C4OC4OC(CO)

 15%|█▌        | 2/13 [00:01<00:06,  1.64it/s]

[{'data_id': 0, 'smiles_generic': 'CCOC(=O)C(C)Oc1ccc(Oc2cnc3cc(Cl)ccc3n2)cc1', 'smiles_canonical': 'CCOC(=O)C(C)Oc1ccc(Oc2cnc3cc(Cl)ccc3n2)cc1', 'fingerprint': b'AAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAgAAAQAAAAAAAAAAAAAAAAQAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAACAAABAACAIAAAAAgAAAAAEAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAgAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAEAACAAAAAAAQAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAgAAAAAAAAAAAAAAAAgAAAAAIAAAAIAAAAAAIBAAAAAAAAAAgAIAEAAAAAAAAAAAAAAAAACEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAQAAAAAAAAAAAAACAAAEAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAIAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAQAAAAAAAAAAAAAAAAAAAAAAAgAAACAAAAAAAAAAAAAAAABAAAQAAAAAAAAAAABAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAA='}, {'data_id': 1, 'smiles_generic': 'CC1(C)OCC(COC(=O)CCc2ccc(OCC(O)CNCCNC(=O)N3CCOCC3)cc2)O1', 'smiles_canonical': 'CC1(C)OCC(COC(=O)CCc2ccc(OCC(O)CNCC

 23%|██▎       | 3/13 [00:01<00:04,  2.01it/s]

[{'data_id': 0, 'smiles_generic': 'Oc1ccc(C=Cc2cc(O)cc3c2C(c2cc(O)cc(O)c2)C(c2ccc(O)cc2)O3)cc1', 'smiles_canonical': 'Oc1ccc(C=Cc2cc(O)cc3c2C(c2cc(O)cc(O)c2)C(c2ccc(O)cc2)O3)cc1', 'fingerprint': b'AAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAEAAAAAAAAAAAAAQAACAAAAAIAAACAAAAAAAAgAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAACAAAAAAAAABAAAAAAAAAAAGAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAIAAAAACAAAAAAAAABAAAAAAAAAAAAAAAAAAgAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAQAAAAAAAAAIAAAAAAAAAAAAACAAAAABAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAABAAAAAAEAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAEAAQAAAAAAAAAAAAAAQAAAAAQAAAgAAAAAAAAAAAIAAAAAAAAAAAAAAACAAAAAAAAAAAA='}, {'data_id': 1, 'smiles_generic': 'Cc1nn(C(C)c2ccccc2)c2c1C(c1ccccc1)SCC(=O)N2', 'smiles_canonical': 'Cc1nn(C(C)c2cc

 31%|███       | 4/13 [00:01<00:03,  2.27it/s]

[{'data_id': 0, 'smiles_generic': 'CC(C)(C)c1ccc(C(O)CCCN2CCC(C(O)(c3ccccc3)c3ccccc3)CC2)cc1', 'smiles_canonical': 'CC(C)(C)c1ccc(C(O)CCCN2CCC(C(O)(c3ccccc3)c3ccccc3)CC2)cc1', 'fingerprint': b'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAIABAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEBAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAgABAAAAAAAgAAAAAAAAABAAAAAAAAAAAQAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAACAAAAAAABAAAABAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAABggAAAAAAAAAAAAAAIAAAAAAIAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAgAAAAAAAAAAAACAAABAAAAAAAAAAAIAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAQAAAAACAAAAAAAAAACAAAAIAAACQAAAAAAAAAAAAAAAAAAIAAAA='}, {'data_id': 1, 'smiles_generic': 'COC1C=COC2(C)Oc3c(C)c(OC(=O)C4CCCCC4)c4c(c3C2=O)C(=O)C(N(C)C)=C(NC(=O)C(C)=CC=CC(C)C(

 38%|███▊      | 5/13 [00:02<00:03,  2.43it/s]

[{'data_id': 0, 'smiles_generic': 'CN1CCN(C(=O)OC2c3nccnc3C(=O)N2c2ccc(Cl)cn2)CC1', 'smiles_canonical': 'CN1CCN(C(=O)OC2c3nccnc3C(=O)N2c2ccc(Cl)cn2)CC1', 'fingerprint': b'AEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAwAAAAAAAAAAAAAAAAAAAAAAAAAEAAQAAAAAAAAAAAAAAAAAAACAEAAAAAIIAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAABAAAACAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAgAABAAAAAAAAAAAAAAAAABAIAAAACAABAAAAAAIIAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgBAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAAAAAAQIAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAIAAQAAAAgAAACAAAAAAAAAAAAAACAAAEAAAAAAAAAAAAAAABAgAAAAAAAAAAAAAAIAAAAAAICAAABAAAAAAIAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAABAIAAAAAAAAAAAAAAAAAAAAAAgAAAAA='}, {'data_id': 1, 'smiles_generic': 'CC(C)=CCc1c(-c2ccc(O)cc2O)oc2c(C3C=C(C)CC(c4ccc(O)cc4O)C3C(=O)c3ccc(O)cc3O)c(O)cc(O)c2c1=O', 'smiles_canoni

 46%|████▌     | 6/13 [00:02<00:02,  2.68it/s]

[{'data_id': 0, 'smiles_generic': 'CC(C)=CCCC(C)(OC1OC(COC2OC(CO)C(O)C2O)C(O)C(O)C1O)C1CCC2(C)C1C(O)CC1C3(C)CCC(OC4OC(CO)C(O)C(O)C4OC4OC(CO)C(O)C(O)C4O)C(C)(C)C3CCC12C', 'smiles_canonical': 'CC(C)=CCCC(C)(OC1OC(COC2OC(CO)C(O)C2O)C(O)C(O)C1O)C1CCC2(C)C1C(O)CC1C3(C)CCC(OC4OC(CO)C(O)C(O)C4OC4OC(CO)C(O)C(O)C4O)C(C)(C)C3CCC12C', 'fingerprint': b'AABAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAIACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAgCAAAAAAAAAQAAAAAAAEgAACAAAAAAAAAAAAAAAAAAIAAgAAAAAAAIAABAAABAAAAAAAAAAAAgAAAAAAAIAAAAAAgAAAAAAAAAAEAAAQAAAAAAAAAAAAAAAIABAASAAAAIAAAAAAAAAAAAAAAAAAQAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAQAAAAQgACAAAAAgACwAAAAAIAAAAAAAAAAAAAAAIAAAAAAAABAAAABAAAAAAAAIAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAgAAAAAAAAAAgAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAACAAAAAAAACAIAAAAIAAABCAAAAAAAAAAAAAAIKAAAAABIABAAAAAAAIAAAEAAAAAAAAAAAAAAEAAAAAAAAAAAUAAgAAAAAAAAgAAAAAAAACAAAAAAAgABAABAAAAAAAABAQAAAAQAAAAEAAAAAAAAEAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAA

 54%|█████▍    | 7/13 [00:02<00:02,  2.56it/s]

[{'data_id': 0, 'smiles_generic': 'COc1ccc(N2CCN(C3=C4NC(=O)C(C)=CC=CC(C)C(O)C(C)C(O)C(C)C(OC(C)=O)C(C)C(OC)C=COC5(C)Oc6c(C)c(O)c(c(c6C5=O)C3=O)C4=O)CC2)cc1', 'smiles_canonical': 'COc1ccc(N2CCN(C3=C4NC(=O)C(C)=CC=CC(C)C(O)C(C)C(O)C(C)C(OC(C)=O)C(C)C(OC)C=COC5(C)Oc6c(C)c(O)c(c(c6C5=O)C3=O)C4=O)CC2)cc1', 'fingerprint': b'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAESAAAAAAAAAAAAQAAAAAAQAAAAAAAAAAAAABQAAAAAAAAAAAAAAAAACAAAAABBAAAAEAAAAAAAAAAEAAACAAAAAAAAAAAAAAAAAAIAAAAAAAgAAAAAQEAAAAgAAAgAAAAAAAAAAAAAAAAAAEAAAACEAAABAAAAAAAAAAAAAEAAAAAAAAgIABAAAACAAAAAAAAAAABAAAAAAAAAAAAAAAAABAAAAAAAAAQACAAAAAAAAAAAAAAAAAAAAEAAAAEAAAAAIAAABEAAAIAAAAAAAAAAAAAAAAAAgAIAAQCAAAAAAAAAAAAAAIACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAgASAAAAAAAAAAQAAAAAAAAEAAAAgAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAQAAACAAAAAQAAAAAAAAAAgAACAAACAAAAAQAAAAAAAAABAgAAAAAAAAAAAAAAKAAkAABIAAAgAIAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAEQAEAgAAAAAAAAAQAAAAgAAAAAAAAAAAQgAAAEAIAAAAAAACAAgAABAAAAAQAAAIAAAAAAAAABAAAQAAAAAAAAAAAAAIAAAAAAAIgAAAAAAAAAAAAAAIAAAAAAAg

 62%|██████▏   | 8/13 [00:03<00:01,  2.66it/s]

[{'data_id': 0, 'smiles_generic': 'COc1nc(NC(=O)NS(=O)(=O)c2ccccc2C(F)(F)F)nc(C(F)(F)F)n1', 'smiles_canonical': 'COc1nc(NC(=O)NS(=O)(=O)c2ccccc2C(F)(F)F)nc(C(F)(F)F)n1', 'fingerprint': b'AAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAEAAAAABAAAAAAAAAAAAAAEgAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAEAAAAAAAAAAAgAAAAAAIAAACAAAAAAAAAAAAAEAAAAAAAAAAAAAAAEAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAACAAAAAACAAgAAAAAAAAAAAAQAAAABAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAgAAEAAAAAAAAAAAAQAAAAQAAAAAAAAAAAAAAAAAAABAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAABAAAIAAAAAAAAAAAAQAAAAAAAAAAAAAAIAAABAAAAAAQAAAAAAAAAAAAAAAAAAAAAAACAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAEAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAQAAAAAAAQIAAAAAAAAAAAAAAAAAIAAAAAAAAAA='}, {'data_id': 1, 'smiles_generic': 'COc1c(C2=CC(=O)N(C)C2=O)c2c(n1C)C(=O)c1ccccc1C2=O', 'smiles_canonical': 'COc1c(C2=CC(=O)N(C

 69%|██████▉   | 9/13 [00:03<00:01,  2.48it/s]

[{'data_id': 0, 'smiles_generic': 'CC(C)=CCCC(C)(OC1OC(COC2OC(CO)C(O)C2O)C(O)C(O)C1O)C1CCC2(C)C1C(O)CC1C3(C)CCC(OC4OC(CO)C(O)C(O)C4O)C(C)(C)C3CCC12C', 'smiles_canonical': 'CC(C)=CCCC(C)(OC1OC(COC2OC(CO)C(O)C2O)C(O)C(O)C1O)C1CCC2(C)C1C(O)CC1C3(C)CCC(OC4OC(CO)C(O)C(O)C4O)C(C)(C)C3CCC12C', 'fingerprint': b'AABAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAIACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAgCAAAAAAAAAQAAAAAAAEAAACAAAAAAAAAAAAAAAAAAIAAgAAAAAAAIAABAAABAAAAAAAAAAAAgAAAAAAAIAAAAAAAAAAAAAAAAAEAAAQAAAAAAAAAAAAAAAIABAASAAAAIAAAAAAAAAAAAAAAAAAQAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAQAAAAQgACAAAAAgACgAAAAAIAAAAAAAAAAAAAAAIAAAAAAAABAAAABAAAAAAAAIAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAgAAAAAAAAAAgAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAACAAAAAAAACAAAAAAIAAABCAAAAAAAAAAAAAAIKAAAAABIABAAAAAAAIAAAEAAAAAAAAAAAAAAEAAAAAAAAAAAUAAgAAAAAAAAgAAAAAAAACAAAAAAAgABAABAAAAAAAABAQAAAAAAAAAEAAAAAAAAEAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAACAAAAAAAAAAAAAA='}, {'data_

 77%|███████▋  | 10/13 [00:04<00:01,  2.06it/s]

[{'data_id': 0, 'smiles_generic': 'COCCOC1CCC(Nc2cc(=O)n(C)c3ccc(-c4cncs4)cc23)CC1', 'smiles_canonical': 'COCCOC1CCC(Nc2cc(=O)n(C)c3ccc(-c4cncs4)cc23)CC1', 'fingerprint': b'ACAAAAAgAEAAAAAAAACEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAEAAAAACAAAAAAAAAAQAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAKACAAAAAAAAAAAAAAAAAAAAAgAAAAgAAAAgAAAEAAAAAAAgAAAAAAAAgAAAABAAAAAAAAAAAAAAAAIACAAEAAAAAQQAAAQAAAAAAAAABAAAAIAAAACAAABAAAAAAAAAAAAAACAAAAAAAAACAAAAAgAAAIAAAAAAABAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAACAAAAAAAAIAABAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAACABAABAgAAAAAAAAAAAAAAIAAAAAAIAAAAAAAAAAAAgAAEAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAACAAAAAAAAAAAAAAAAAAAAAAAYAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAA='}, {'data_id': 1, 'smiles_generic': 'COCC=C1CN2C3CC45C(=Nc6c(OC)cc(OC)c(OC)c64)OCC3C1CC25', 'smiles_canonical': 'COCC=C1CN2C3CC45C(=Nc6c(OC)cc

 85%|████████▍ | 11/13 [00:05<00:01,  1.84it/s]

[{'data_id': 0, 'smiles_generic': 'CC(=O)C(C)=C(c1ccccc1)N(C=O)Cc1cnc(N)nc1SCCOC(=O)c1ccccc1', 'smiles_canonical': 'CC(=O)C(C)=C(c1ccccc1)N(C=O)Cc1cnc(N)nc1SCCOC(=O)c1ccccc1', 'fingerprint': b'ACAAAAEAAAAAAAAAAAAQAAAACAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAEIgAAAgAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgBAAAAAAAAAAAACAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAACAAAAAAAAAASAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAgAAAAAAIAAAAiAAAAAAAAAAAAAAAAAAAAAAARAAAAAAAAAAIAAAAAAAAAQAAABAAAAAAAEAAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAgAAAAAAAAAAAAAAEAAAAAQAAQAAAAAQAUAAAAAAAAAAAAAAACAAAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAUAECAAAgAAIAAAgAAAAAAAAAAAAACABAgAAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAgAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAIAAAIAAAAAAAAAAAAAAAAAAAAAAAA='}, {'data_id': 1, 'smiles_generic': 'CC(C=S)=C(CCOC(=O)c1ccccc1)N(C=O)Cc1cnc(N)nc1OCc1ccccc1', 'smiles_canonical': 'CC(C=S

 92%|█████████▏| 12/13 [00:05<00:00,  1.70it/s]

[{'data_id': 0, 'smiles_generic': 'CC=CC1=CC2=CC(=O)C(C)(O)C(OC(=O)c3c(C)cc(O)cc3O)C2CO1', 'smiles_canonical': 'CC=CC1=CC2=CC(=O)C(C)(O)C(OC(=O)c3c(C)cc(O)cc3O)C2CO1', 'fingerprint': b'AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAIAAAIAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEAAACAAAAAAAAAAAAAAAACAIAAAAAAAAAAAAgAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAgAAAAAABBAAAABAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAABCAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAACAEAAAAAAAAAAAAAAgAAAAAAAAAACAgAAAAAAAAAAAAAAAAAAAAAAwAAAAAAAIAAIAgAAAAAAAAAAgAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAEAgAAAAAAAAAAAAAAAAAAAAAAEAAACAAAAAAQAAAAAQAAAAAAAAAIAAAAAQACAAAAACBAAAAAAAAAAAABAAAAAAAAAAAAAAAAIAAAAABIAAAAAAAAAAAAAAAAAACAABAAAAAAAAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAEhAAAAAAAAAAAAAAAAAAABAAAIAAAAAAAAAAAAAAAAAAAAAIAAAAAAAgAAAAAAIAAAAAAAAAAAAQ='}, {'data_id': 1, 'smiles_generic': 'O=C(c1ccc(OCC2CCCN(C(=O)c3ccc(F)c(F)c3)C2)cc1)N1CCOCC1', 'smiles_canonical': 'O=C(c1ccc(OCC2C

100%|██████████| 13/13 [00:06<00:00,  2.16it/s]

[{'data_id': 0, 'smiles_generic': 'COC(=O)c1ccc2c(C(=Nc3ccc(N(C)C(=O)CN4CCN(C)CC4)cc3)c3ccccc3)c(O)[nH]c2c1', 'smiles_canonical': 'COC(=O)c1ccc2c(C(=Nc3ccc(N(C)C(=O)CN4CCN(C)CC4)cc3)c3ccccc3)c(O)[nH]c2c1', 'fingerprint': b'AEAAAAAAAAAAAAAAAACAAAAAAACAAAAAAAQAAAAAAAAAACAAAAAAAAAAAAAAAAAAJAAAAAAAAAEAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACAAAAAIAAIAAAAAAAAAIAAAAAAAgAAAAgAAAAAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAEAAAAAAAACAQgAAAAAAAAAABAAAAAAAAAAAAABAAAAACAAABAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAAAAIAAAAAAAAAACAAAAAAACBAAAAAAAAAAAAAAAAAAAAAAAQAAAAAAAAAAgAAAAAAAAAAACAAAAAAAAAAAAAAAAAEAAAQQBAAAAAAAAAAIAAAAAAACAAAAAAAAAAAAAAAAAAAAAAAAAAAAAABAAAAAAAAAAAAAgAAAAAAAAAAAAAAAABAAEAAAAAAAAAAQAAAAAAAAAAAAAAAAAAAAAAACAAAAAAAAAAAAAAgAABABAgAAAAAAAAAAAAAAAAAAAAAIAAAAAAQAEAAAAAAAAAAAAgAAAAABAAAAAAAAAAAAAAAAAAAABACAAAgAAAAAAAAAAAAAAAAAAAIAAAAAAAAAAAAAABCAAAAAAAAAAAAAAAAAIACAAQgAAAACYAAAAAAAAAAAAAEIAAAAAAAAAAAAAAAAAAAAAAAAAAAA='}, {'data_id': 1, 'smiles_generic': 'CCOC(=O)C(C)Oc1ccc(Oc2cnc3cc(Cl)ccc3n2)cc1', 'smiles_ca




In [61]:
    logger.info(f"Evaluating {n_total} blocks - merging")
    results_complete = pd.concat(results_evaluated)
    # Global query number: block index times batch size plus the position of
    # the query within its block.
    results_complete["nn"] = n * results_complete["block_id"] + results_complete["n"]
    results_complete["evaluation_set"] = evaluation_set

    logger.info(f"Pickling predictions from [{evaluation_set}]")
    # Use a context manager so the pickle is flushed and closed right away
    # instead of leaving the handle open until garbage collection.
    with open(picklepath, "wb") as pickle_file:
        pickle.dump(results_complete, pickle_file)

    # Candidates for which a fingerprint is available.
    results_ok = results_complete.loc[results_complete["fingerprint"].notna()].copy()


03-Jun-24 15:27:46 - Evaluating 200 blocks - merging
03-Jun-24 15:27:46 - Pickling predictions from [val]


In [62]:
# List the columns of the merged evaluation table.
results_complete.columns

Index(['n', 'inchikey1', 'smiles', 'score', 'id', 'k', 'smiles_in',
       'smiles_generic', 'smiles_canonical', 'mol', 'inchikey', 'mf',
       'block_id', 'smiles_ref', 'score_ref', 'id_ref', 'n_ref', 'k_ref',
       'fingerprint_ref', 'smiles_in_ref', 'smiles_generic_ref',
       'smiles_canonical_ref', 'mol_ref', 'inchikey_ref', 'inchikey1_ref',
       'mf_ref', 'block_id_ref', 'fingerprint_ref_true', 'fingerprint', 'nn',
       'evaluation_set'],
      dtype='object')

In [81]:
# Scratch check: np.shape() of a scalar is the empty tuple, while an array has
# a non-empty shape. The Tanimoto helper below relies on this to detect missing
# (scalar) fingerprint entries. Note that this rebinds `r`.
r = np.array([0,1,2])
rr= 2.
np.shape(r), np.shape(rr)

((3,), ())

In [100]:
# All-ones array handed to `msc.score_tanimoto` as its `stats` argument.
# NOTE(review): 4096 matches the fingerprint length seen above; the meaning of
# the three columns is defined by `infrastructure.score` — confirm there.
stats = np.ones((4096, 3))
# Calculate tanimoto score in addition
def tanimoto_or_not(fp1, fp2):
    """Tanimoto score of two fingerprints, or 0 when either one is missing.

    A fingerprint counts as missing when it is `None` or a scalar, i.e. when
    `np.shape` of it is the empty tuple (a NaN placeholder, for instance).

    Parameters
    ----------
    fp1, fp2 : array-like or None
        The two fingerprints to compare.

    Returns
    -------
    The first element of `msc.score_tanimoto(fp1, fp2, stats=stats)`, or the
    integer 0 if one of the inputs is missing.
    """
    if fp1 is None or fp2 is None:
        return 0
    if np.shape(fp1) == () or np.shape(fp2) == ():
        return 0
    # The fingerprints are passed as they are, without adding a batch axis.
    return msc.score_tanimoto(fp1, fp2, stats=stats)[0]



# Score every candidate fingerprint against the reference fingerprint of its
# query. Deliberately not against fingerprint_ref_true: that would be the
# oracle!
tanimoto = list(map(
    tanimoto_or_not,
    results_complete["fingerprint"],
    results_complete["fingerprint_ref"],
))

results_complete["score_tanimoto"] = tanimoto


In [102]:
# Export the full evaluation table as TSV into the configured results folder.
# The folder is taken from the config rather than from a hard-coded absolute
# path, so the export lands next to the pickles on any machine.
eval_folder = pathlib.Path(sc.config["eval_folder"])
out_path = eval_folder / ("eval_" + pickle_id + ".tsv")

results_complete.to_csv(out_path, sep='\t')

In [103]:
# Compact export: the model score is duplicated under the name "score_rnn" and
# written together with the identifiers and the Tanimoto score.
results_complete["score_rnn"] = results_complete["score"]
short_columns = ["smiles", "inchikey1", "inchikey1_ref", "nn", "score_rnn", "score_tanimoto"]
short_out_path = eval_folder / ("eval_" + pickle_id + "_short.tsv")
results_complete[short_columns].to_csv(short_out_path, sep='\t')