In [1]:
import os
import re
import time
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from datetime import datetime

In [2]:
# Turn off TensorFlow's oneDNN custom operations. The variable is written to the
# environment before `import tensorflow` so it is already present when the
# library is loaded.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = "0"
import tensorflow as tf

2025-01-04 09:37:03.720367: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2025-01-04 09:37:03.720391: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2025-01-04 09:37:03.720395: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-04 09:37:03.723444: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
# Restrict this process to the CUDA device with index 0.
os.environ['CUDA_VISIBLE_DEVICES'] = "0"
gpus = tf.config.list_physical_devices('GPU')

# Enable memory growth on every visible GPU so memory is allocated as needed
# rather than reserved in full at start-up.
for gpu in gpus:
    tf.config.experimental.set_memory_growth(gpu, True)

2025-01-04 09:37:04.373168: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 09:37:04.388064: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 09:37:04.388152: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

In [4]:
# Tell XLA where to find the CUDA data directory.
# NOTE(review): this is an absolute, machine-specific path (a particular user's
# home directory and conda environment); it has to be edited before the
# notebook can run on another machine or environment.
os.environ['XLA_FLAGS']="--xla_gpu_cuda_data_dir=/home/descartes/miniconda3/envs/DECIMERv1/lib"

In [5]:
from Network import I2S_Model, I2S_Data, I2S_Utils

# Configurations

In [6]:
class Configs:
    """Static settings for this run: input locations and model/training hyperparameters."""
    ## Inputs
    # Base name shared by the training CSV and the image directory under Data/.
    filename_training = 'training_data_original'
    # Alternative training set (currently disabled):
    #filename_training = 'training_data_randepict'
    # Data/<filename_training>.csv
    filepath_training = os.path.join('Data', f'{filename_training}.csv')
    # Data/<filename_training>/
    image_dir = os.path.join('Data', filename_training)
    ## Setting up training parameters, found after optimizing
    # Upper bound of the epoch range used by the training loop.
    epochs = 200
    # batch_size and buffer_size are passed to I2S_Utils.create_dataset.
    batch_size = 64
    buffer_size = 1000
    # embedding_dim and units are passed to the encoder/decoder constructors.
    embedding_dim = 600
    units = 1024
    ## Here, we are using Inception V3 as base so the feature shape is set to 2048 and the attention shape is set to 64
    # NOTE(review): these two values are not referenced by the cells visible in
    # this notebook; confirm whether anything else reads them.
    features_shape = 2048
    attention_features_shape = 64

# Load a Training Dataset

In [7]:
# Load the training set through the project loader. It returns image names and
# SMILES strings for a train part and a val part, plus the image feature
# extraction model that is later handed to trainer.evaluate.
# train_test_split=False is passed here (presumably no hold-out split is made,
# confirm in I2S_Data.data_loader); img_name_val / smi_val are not used by the
# cells visible below.
img_name_train, img_name_val, smi_train, smi_val, image_features_extract_model = I2S_Data.data_loader(Configs.filepath_training, Configs.image_dir, train_test_split=False)

2025-01-04 09:37:04.439506: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 09:37:04.439612: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysfs-bus-pci#L344-L355
2025-01-04 09:37:04.439661: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:894] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero. See more at https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/testing/sysf

Selected Data  2604 All data  2604
<start>CCC=C=CCC)PC63C<end> Data/training_data_original/CDK_Depict_40_9.png
<start>C=NC=CN=CPC5=N9<end> Data/training_data_original/CDK_Depict_5_28.png
<start>C=CCN=CN)SN<end> Data/training_data_original/CDK_Depict_47_200.png
<start>O=CNC=O)C=COCC=CC=CC=C6)))))))))N5<end> Data/training_data_original/CDK_Depict_42_86.png
<start>CCC=NCC=N)NCCNCC)C6=N))))))))=CS5<end> Data/training_data_original/CDK_Depict_38_218.png


2025-01-04 09:37:05.517148: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:442] Loaded cuDNN version 8907


# Tokenization

In [8]:
# Load the pre-built tokenizer that ships with the Network package.
# NOTE(review): pickle.load can execute arbitrary code contained in the file;
# only load tokenizer files that come from a trusted source.
tokenizer_path = os.path.join('Network', 'tokenizer.pkl')
with open(tokenizer_path, "rb") as fin:
    tokenizer = pickle.load(fin)

# Vocabulary tokens known to the tokenizer, and the vocabulary size handed to
# the decoder (number of indexed tokens plus one).
word_index = tokenizer.word_index
vocabs = word_index.keys()
vocab_size = len(word_index) + 1
print(f"vocab_size: {vocab_size}")

vocab_size: 69


In [9]:
# Split every training SMILES string into vocabulary tokens, then map the
# tokens to integer ids with the tokenizer.
tokens_train = [I2S_Data.split_by_vocabulary(smi, vocabs) for smi in smi_train]
seq_train = tokenizer.texts_to_sequences(tokens_train)

# Pad the id sequences at the end ('post') so they share one length.
cap_train = tf.keras.preprocessing.sequence.pad_sequences(seq_train, padding='post')

max_length = I2S_Data.calc_max_length(seq_train)
print(f"max_length: {max_length}")

max_length: 60


In [10]:
# Build the training dataset from the image names and the padded token-id
# sequences, using the batch size and buffer size from Configs. The training
# loop iterates it as (img_tensor, target) pairs.
# NOTE(review): buffer_size is presumably the shuffle buffer, confirm in
# I2S_Utils.create_dataset.
dataset = I2S_Utils.create_dataset(img_name_train, cap_train, Configs.batch_size, Configs.buffer_size)

# Build a Model

In [11]:
# Instantiate the two halves of the model from the project module: the CNN
# encoder takes the embedding size; the RNN decoder takes the embedding size,
# the number of recurrent units and the vocabulary size.
encoder = I2S_Model.CNN_Encoder(Configs.embedding_dim)
decoder = I2S_Model.RNN_Decoder(Configs.embedding_dim, Configs.units, vocab_size)

In [12]:
## Network Parameters
# Adam (legacy Keras implementation) with learning rate 1e-5; the commented-out
# line keeps an alternative, larger rate (0.00051).
#optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=0.00051)
optimizer = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5)
# from_logits=True: predictions are passed as raw logits.
# reduction='none': the loss is returned per element so that loss_function can
# apply its mask before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [13]:
def loss_function(real, pred):
    """Return the masked mean cross-entropy between `real` and `pred`.

    Positions where `real` equals 0 contribute zero loss. The mean is taken
    over all positions, including the masked ones.

    Args:
        real: target token ids; id 0 marks positions to ignore.
        pred: predictions passed to the module-level `loss_object`.

    Returns:
        A scalar tensor holding the mean of the masked per-element losses.
    """
    per_token = loss_object(real, pred)

    # 1 where the target is a real token, 0 where it is id 0.
    is_ignored = tf.math.equal(real, 0)
    weights = tf.cast(tf.math.logical_not(is_ignored), dtype=per_token.dtype)

    return tf.reduce_mean(per_token * weights)

# Load the foundation model (pretrained)

In [14]:
# Bundle encoder, decoder, optimizer and tokenizer into the project Trainer.
# It is used below for load_checkpoint, train_step and evaluate.
trainer = I2S_Utils.Trainer(encoder, decoder, optimizer, tokenizer)

In [15]:
# Load the pretrained (foundation) model from Trained_Models/Trained_Models.
# What exactly is restored is defined by I2S_Utils.Trainer.load_checkpoint.
trainer.load_checkpoint(os.path.join('Trained_Models', 'Trained_Models'))

# Initialize a checkpoint manager

In [16]:
## Setting up path to save checkpoint
# Checkpoints of this run go to ckpt/<filename_training>, separate from the
# pretrained weights under Trained_Models.
checkpoint_path = os.path.join('ckpt', Configs.filename_training)
# The checkpoint tracks the encoder, the decoder and the optimizer state.
ckpt = tf.train.Checkpoint(encoder=encoder, decoder=decoder, optimizer=optimizer)
# NOTE(review): max_to_keep=100 while Configs.epochs is 200 and the training
# loop saves once per epoch, so only the 100 most recent checkpoints remain
# on disk at the end of a full run.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=100)

In [17]:
## Resume from the newest checkpoint under checkpoint_path, if there is one.
start_epoch = 0
# Query the manager once and use that single path for the existence check, the
# restore and the epoch number, so all three always refer to the same file.
latest_checkpoint = ckpt_manager.latest_checkpoint
if latest_checkpoint:
    ckpt.restore(latest_checkpoint)
    # The path ends in "-<save number>"; the training loop saves once per
    # epoch, so that number is the count of epochs already completed.
    start_epoch = int(latest_checkpoint.split('-')[-1])
print(start_epoch)

0


# Training

In [18]:
# Number of full batches per epoch. This is a floor division, so a trailing
# partial batch (if the dataset keeps one) is not counted here.
num_steps = len(img_name_train) // Configs.batch_size
print(f"num_steps: {num_steps}")

num_steps: 40


In [19]:
## validation dataset
# Validation is switched off here. When enabled, the training loop scores the
# validation set on every fifth epoch.
do_validation = False

if do_validation:
    # val_smiles / val_img_name only exist when do_validation is True; the
    # training loop reads them only under the same flag.
    val_smiles, val_img_name = I2S_Data.data_loader_eval(os.path.join('Data', 'validation_data.csv'), os.path.join('Data', 'validation_data'))

In [20]:
## Training loop: one pass over `dataset` per epoch, one checkpoint per epoch.
## `history` collects the per-epoch training loss and validation score.
best_score = 0.
history = {'loss_training':[], 'score_validation':[]}
for epoch in range(start_epoch, Configs.epochs):
    start = time.time()
    total_loss = 0.
    num_batches = 0

    for img_tensor, target in dataset:
        batch_loss, t_loss = trainer.train_step(img_tensor, target, loss_function)
        total_loss += t_loss
        num_batches += 1

    # Average over the batches that were actually iterated. A precomputed
    # len(img_name_train) // batch_size undercounts whenever the dataset keeps
    # a partial final batch, which would inflate the reported loss.
    if num_batches == 0:
        raise ValueError("dataset yielded no batches; nothing to train on")
    total_loss = total_loss.numpy() / num_batches

    # The score stays 0. on epochs where validation is not run.
    score = 0.
    if do_validation and epoch % 5 == 0:
        val_pred = trainer.evaluate(val_img_name, image_features_extract_model)
        # Mean Tanimoto similarity between reference and predicted SMILES.
        for x, y in zip(val_smiles, val_pred):
            score += I2S_Utils.calc_tanimoto_similarity(x, y)
        score /= len(val_smiles)

        # Only the best score is tracked here; the checkpoint itself is saved
        # on every epoch further down, whether or not the score improved.
        if best_score < score:
            print(f'checkpoint is updated: {best_score:.3f} -> {score:.3f}')
            best_score = score

    ## storing the epoch end loss value to plot later
    history['loss_training'].append(total_loss)
    history['score_validation'].append(score)

    ckpt_manager.save()
    print(f">>> [{datetime.now().strftime('%Y/%m/%d %H:%M:%S')}] Epoch: {epoch+1}    Loss: {total_loss:.6f}   ({time.time() - start:.0f} sec)")

>>> [2025/01/04 09:38:11] Epoch: 1    Loss: 1.280049   (40 sec)
>>> [2025/01/04 09:38:18] Epoch: 2    Loss: 0.685794   (7 sec)
>>> [2025/01/04 09:38:21] Epoch: 3    Loss: 0.608453   (3 sec)
>>> [2025/01/04 09:38:24] Epoch: 4    Loss: 0.568957   (3 sec)
>>> [2025/01/04 09:38:28] Epoch: 5    Loss: 0.539069   (3 sec)
>>> [2025/01/04 09:38:31] Epoch: 6    Loss: 0.514472   (3 sec)
>>> [2025/01/04 09:38:36] Epoch: 7    Loss: 0.493284   (5 sec)
>>> [2025/01/04 09:38:43] Epoch: 8    Loss: 0.474990   (7 sec)
>>> [2025/01/04 09:38:50] Epoch: 9    Loss: 0.455647   (7 sec)
>>> [2025/01/04 09:38:56] Epoch: 10    Loss: 0.439177   (7 sec)
>>> [2025/01/04 09:39:03] Epoch: 11    Loss: 0.422073   (7 sec)
>>> [2025/01/04 09:39:10] Epoch: 12    Loss: 0.405732   (7 sec)
>>> [2025/01/04 09:39:16] Epoch: 13    Loss: 0.390010   (7 sec)
>>> [2025/01/04 09:39:23] Epoch: 14    Loss: 0.372296   (7 sec)
>>> [2025/01/04 09:39:30] Epoch: 15    Loss: 0.353713   (7 sec)
>>> [2025/01/04 09:39:36] Epoch: 16    Loss: 0.3

In [21]:
# Write the per-epoch training loss and validation score collected in
# `history` to ckpt/history_<filename_training>.csv, without the index column.
history_path = os.path.join('ckpt', f"history_{Configs.filename_training}.csv")
df_history = pd.DataFrame(history)
df_history.to_csv(history_path, index=False)