In [1]:
import os
import sys
import time
import tqdm
import pickle
import numpy as np
import pandas as pd
from datetime import datetime

In [2]:
# Switch off TensorFlow's oneDNN custom operations. The variable is written to
# the environment *before* the import so it is already in place when TensorFlow
# loads. (Presumably chosen for numerically reproducible results — TODO confirm.)
os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
import tensorflow as tf

2025-01-04 13:39:31.396919: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-04 13:39:31.396964: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-04 13:39:31.396968: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-04 13:39:31.399996: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Use one fixed seed for all random number generators so runs are repeatable.
SEED = 0
tf.random.set_seed(SEED)
tf.keras.utils.set_random_seed(SEED)

# Ask TensorFlow to run its ops deterministically as well.
tf.config.experimental.enable_op_determinism()

In [4]:
# Restrict this process to CUDA device "0", then list the GPUs TensorFlow sees.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
gpus = tf.config.experimental.list_physical_devices('GPU')

# Enable memory growth on every visible GPU.
for device in gpus:
    tf.config.experimental.set_memory_growth(device, True)

2025-01-04 13:39:32.067080: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 13:39:32.080380: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 13:39:32.080458: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [5]:
# Point XLA at the CUDA data directory of the environment this kernel runs in.
# The location is derived from sys.prefix instead of a hard-coded home-directory
# path, so the notebook works on any machine; inside the original
# "DECIMERv1" conda environment this resolves to the same <env>/lib directory.
cuda_data_dir = os.path.join(sys.prefix, 'lib')
os.environ['XLA_FLAGS'] = f"--xla_gpu_cuda_data_dir={cuda_data_dir}"

In [6]:
from Network import I2S_Model, I2S_Data, I2S_Utils

In [7]:
class Configs:
    """Static settings for the notebook, read as plain class attributes."""

    # --- Inputs -----------------------------------------------------------
    # One entry per training-set variant.
    filenames_training = [
        'training_data_original',
        'training_data_repaint',
        'training_data_rdkit',
        'training_data_randepict',
    ]

    # --- Training parameters (found after optimizing) ----------------------
    epochs, batch_size, buffer_size = 100, 600, 1000
    embedding_dim, units = 600, 1024

    # --- Feature dimensions -------------------------------------------------
    # Inception V3 is the base network, hence a feature shape of 2048 and an
    # attention shape of 64.
    features_shape, attention_features_shape = 2048, 64

In [8]:
# Build the image feature-extraction model once; the same object is handed to
# every trainer.evaluate(...) call further down. The Configs comment states the
# base network is Inception V3; the construction itself lives in I2S_Data.
image_features_extract_model = I2S_Data.get_image_features_extract_model()

2025-01-04 13:39:32.904038: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 13:39:32.904140: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 13:39:32.904183: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [9]:
# NOTE: pickle.load can execute arbitrary code; only load a tokenizer file
# that comes from a trusted source.
tokenizer_path = os.path.join('Network', 'tokenizer.pkl')
with open(tokenizer_path, "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

# Vocabulary size is one more than the number of entries in the word index
# (presumably index 0 is reserved, e.g. for padding — TODO confirm).
vocab_size = 1 + len(tokenizer.word_index)
print(f"vocab_size: {vocab_size}")

vocab_size: 69


In [10]:
## Init
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.0005) # lr will be ignored
encoder = I2S_Model.CNN_Encoder(Configs.embedding_dim)
decoder = I2S_Model.RNN_Decoder(Configs.embedding_dim, Configs.units, vocab_size)
trainer = I2S_Utils.Trainer(encoder, decoder, optimizer, tokenizer)

# Test dataset

In [11]:
# Load the evaluation set: a CSV file plus the matching image directory.
test_csv_path = os.path.join('Data', 'test_data.csv')
test_image_dir = os.path.join('Data', 'test_data')
test_smiles, test_img_name = I2S_Data.data_loader_eval(test_csv_path, test_image_dir)

# Alternative kept from the original notebook: to evaluate on the training
# images instead, point the two paths at 'training_data_original.csv' and
# 'training_data_original', then map I2S_Utils.deep2smi over test_smiles.

# Evaluation

In [12]:
# Directory that receives the prediction CSVs and the Tanimoto-score CSVs.
output_dir = 'Results'
# makedirs(exist_ok=True) is idempotent and avoids the race between checking
# for the directory and creating it.
os.makedirs(output_dir, exist_ok=True)

In [14]:
# Collects one score DataFrame per evaluated checkpoint, in evaluation order.
# The evaluation cells append to it and the Results cells read from it, so
# re-running this cell discards every score gathered so far.
frames = []

In [13]:
# Evaluate the checkpoint of every training-set variant on the test images.
# Per variant this writes two CSVs into output_dir (predictions and Tanimoto
# scores) and appends the score table to `frames`.
for filename in Configs.filenames_training:
    ## Load checkpoint
    # Each variant has its own checkpoint directory under 'ckpt'.
    checkpoint_path = os.path.join('ckpt', filename)
    trainer.load_checkpoint(checkpoint_path)

    ## Prediction
    test_pred = trainer.evaluate(test_img_name, image_features_extract_model, use_tqdm=True)
    pd.DataFrame({
        'smiles': test_pred,
        'molfile': None,
        # Keep only the file name of each image path.
        'image_file': [os.path.basename(x) for x in test_img_name],
    }).to_csv(os.path.join(output_dir, f'{filename}.csv'), index=False)

    ## Evaluation
    # Collect all rows first and build the DataFrame once; growing it row by
    # row through .loc reallocates on every insert and leaves object columns.
    score_rows = [
        (truth, pred, I2S_Utils.calc_tanimoto_similarity(truth, pred))
        for truth, pred in zip(test_smiles, test_pred)
    ]
    df_score = pd.DataFrame(score_rows, columns=['ground_truth', 'predicted', 'tanimoto_score'])
    frames.append(df_score)
    df_score.to_csv(os.path.join(output_dir, f'tanimoto_{filename}.csv'), index=False)

2025-01-04 13:02:59.977393: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907                                                                 | 0/300 [00:00<?, ?it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [00:41<00:00,  7.26it/s]
[13:03:41] Explicit valence for atom # 1 O, 3, is greater than permitted
[13:03:41] Explicit valence for atom # 2 C, 5, is greater than permitted
[13:03:41] Explicit valence for atom # 2 P, 26, is greater than permitted
[13:03:41] Explicit valence for atom # 6 C, 5, is greater than permitted
[13:03:41] Explicit valence for atom # 6 C, 8, is greater than permitted
[13:03:41] Explicit valence for atom # 2 C, 25, is greater than permitted
[13:03:41] Explicit valence for atom # 1 O, 3, is greater than permitted
[13:03:41] Explicit valence for atom # 5 N, 5, is greater than permitted


In [15]:
# Evaluate the checkpoint stored under Trained_Models/ on the test images.
# Writes the prediction CSV and the Tanimoto-score CSV into output_dir and
# appends the score table to `frames`.
filename = 'Trained_Model'

## Load checkpoint
checkpoint_path = os.path.join('Trained_Models', 'Trained_Models')
trainer.load_checkpoint(checkpoint_path)

## Prediction
test_pred = trainer.evaluate(test_img_name, image_features_extract_model, use_tqdm=True)
pd.DataFrame({
    'smiles': test_pred,
    'molfile': None,
    # Keep only the file name of each image path.
    'image_file': [os.path.basename(x) for x in test_img_name],
}).to_csv(os.path.join(output_dir, f'{filename}.csv'), index=False)

## Evaluation
# Collect all rows first and build the DataFrame once; growing it row by row
# through .loc reallocates on every insert and leaves object-typed columns.
score_rows = [
    (truth, pred, I2S_Utils.calc_tanimoto_similarity(truth, pred))
    for truth, pred in zip(test_smiles, test_pred)
]
df_score = pd.DataFrame(score_rows, columns=['ground_truth', 'predicted', 'tanimoto_score'])
frames.append(df_score)
df_score.to_csv(os.path.join(output_dir, f'tanimoto_{filename}.csv'), index=False)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 300/300 [01:09<00:00,  4.34it/s]
[13:42:31] Explicit valence for atom # 0 O, 6, is greater than permitted
[13:42:31] Explicit valence for atom # 6 N, 4, is greater than permitted
[13:42:31] Explicit valence for atom # 2 C, 5, is greater than permitted
[13:42:31] Explicit valence for atom # 2 C, 8, is greater than permitted
[13:42:31] Explicit valence for atom # 15 C, 9, is greater than permitted
[13:42:31] Explicit valence for atom # 15 N, 5, is greater than permitted
[13:42:31] Explicit valence for atom # 1 N, 4, is greater than permitted
[13:42:31] Explicit valence for atom # 1 C, 6, is greater than permitted
[13:42:31] Explicit valence for atom # 7 C, 7, is greater than permitted
[13:42:31] Explicit valence for atom # 10 N, 6, is greater than permitted
[13:42:31] Explicit valence for atom # 10 N, 5, is greater t

# Results

In [14]:
# Print mean and standard deviation of the Tanimoto score per training-set
# variant; frames and names are paired by position.
for model_name, score_table in zip(Configs.filenames_training, frames):
    tanimoto = score_table['tanimoto_score']
    print(f'{model_name}:\t{tanimoto.mean():.4f} ({tanimoto.std():.4f})')

training_data_original:	0.0340 (0.0533)
training_data_repaint:	0.0486 (0.0735)
training_data_rdkit:	0.0383 (0.0508)
training_data_randepict:	0.0369 (0.0542)


In [16]:
# Print mean and standard deviation of the Tanimoto score for Trained_Model.
# Its score table is the one appended to `frames` last, so take the final
# element; pairing the label with frames[0] would report the first
# training-set variant under the wrong name on a top-to-bottom run.
for df, filename in zip(frames[-1:], ['Trained_Model']):
    mu = df['tanimoto_score'].mean()
    std = df['tanimoto_score'].std()
    print(f'{filename}:\t{mu:.4f} ({std:.4f})')

Trained_Model:	0.0235 (0.0402)
