In [1]:
# Fine-tuned Keras model file that is loaded for generation.
model_path = "/home/grad/Desktop/pietro/denovo/new/risultati/fine/gba/model_final_fxar.h5"
# Pickled token <-> index mappings and the JSON vocabulary.
char2idx_path = "/home/grad/Desktop/pietro/denovo/new/attachments_3/2/da_caricare/originale/char2idx.pkl"
idx2char_path = "/home/grad/Desktop/pietro/denovo/new/attachments_3/2/da_caricare/originale/idx2char.pkl"
vocab_path    = "/home/grad/Desktop/pietro/denovo/new/attachments_3/2/da_caricare/originale/vocab.json"
# Fixed length of the token buffer used during generation (must be the same value used in training).
max_length = 90
# Text file read line by line; each line is parsed as one SMILES for the novelty reference set.
training_file = "/home/grad/Desktop/pietro/denovo/s4-for-de-novo-drug-design/datasets/fxar/fine.csv"

In [2]:
import os
import sys
import pickle
import json
import numpy as np
import tensorflow as tf
from rdkit import Chem
from rdkit.Chem import QED
import csv

# To force CPU execution, uncomment the following line:
# os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

# --- Custom object definitions (must be identical to the ones used in training) ---
from tensorflow.keras.layers import Layer, Dense, Dropout, Embedding, LayerNormalization, MultiHeadAttention
from tensorflow.keras.models import load_model

class DynamicPositionalEncoding(Layer):
    """Keras layer that adds a fixed sinusoidal positional table to its input.

    The table is computed once in ``build`` from the static sequence length
    found in the input shape (sine on even feature indices, cosine on odd
    ones) and is sliced to the runtime sequence length in ``call``.
    """

    def __init__(self, embed_dim, **kwargs):
        super().__init__(**kwargs)
        self.embed_dim = embed_dim

    def build(self, input_shape):
        """Precompute the (1, seq_len, embed_dim) float32 encoding table."""
        seq_len = input_shape[1]
        positions = np.arange(seq_len)[:, np.newaxis]
        dims = np.arange(self.embed_dim)[np.newaxis, :]
        # Consecutive feature pairs (2k, 2k+1) share the same frequency.
        exponents = (2 * (dims // 2)) / np.float32(self.embed_dim)
        table = positions * (1 / np.power(10000, exponents))
        table[:, 0::2] = tf.math.sin(table[:, 0::2])
        table[:, 1::2] = tf.math.cos(table[:, 1::2])
        self.pos_encoding = tf.cast(table[np.newaxis, ...], dtype=tf.float32)

    def call(self, inputs):
        """Return ``inputs`` plus the encoding rows for its current length."""
        current_len = tf.shape(inputs)[1]
        return inputs + self.pos_encoding[:, :current_len, :]

    def get_config(self):
        """Serialize the base layer config together with ``embed_dim``."""
        return {**super().get_config(), 'embed_dim': self.embed_dim}

class ImprovedTransformerBlock(Layer):
    """Causal self-attention block: attention -> add & norm -> feed-forward -> add & norm.

    Args:
        embed_dim: Width of the block output; also passed as ``key_dim`` to the
            attention layer and as the size of the last feed-forward Dense.
        num_heads: Number of attention heads.
        ffn_dim: Hidden size of the first (GELU) feed-forward Dense layer.
        rate: Dropout rate used inside the attention layer and by both Dropout layers.

    NOTE(review): the sub-layers are created in a fixed order in ``__init__``;
    restoring weights from the saved model presumably depends on that order, so
    keep this class in sync with the training code — confirm before refactoring.
    """
    def __init__(self, embed_dim, num_heads, ffn_dim, rate=0.1, **kwargs):
        super(ImprovedTransformerBlock, self).__init__(**kwargs)
        # Hyper-parameters are kept on the instance so get_config can serialize them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ffn_dim = ffn_dim
        self.rate = rate
        self.mha = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim, dropout=rate)
        # Position-wise feed-forward network that projects back to embed_dim.
        self.ffn = tf.keras.Sequential([
            Dense(ffn_dim, activation="gelu"),
            Dense(embed_dim)
        ])
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)
    def call(self, inputs, training=False):
        """Apply masked self-attention and the feed-forward network with residual connections.

        ``training`` is forwarded explicitly only to the two Dropout layers; the
        attention layer and the feed-forward stack are called without it.
        """
        seq_len = tf.shape(inputs)[1]
        # Build the causal (lower-triangular) mask so a position only attends to itself and earlier positions.
        causal_mask = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        attn_output = self.mha(inputs, inputs, attention_mask=causal_mask)
        attn_output = self.dropout1(attn_output, training=training)
        # Residual connection followed by layer normalization.
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)
    def get_config(self):
        """Return the base layer config extended with this block's constructor arguments."""
        config = super(ImprovedTransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ffn_dim": self.ffn_dim,
            "rate": self.rate
        })
        return config

class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Learning-rate schedule with a linear warm-up followed by inverse-sqrt decay.

    The rate is ``rsqrt(embed_dim) * min(rsqrt(step), step * warmup_steps**-1.5)``.
    Both constructor arguments are stored as float32 tensors.
    """

    def __init__(self, embed_dim, warmup_steps=10000):
        super().__init__()
        self.embed_dim = tf.cast(embed_dim, tf.float32)
        self.warmup_steps = tf.cast(warmup_steps, tf.float32)

    def __call__(self, step):
        """Return the learning rate for the given optimizer step."""
        # The tiny offset keeps rsqrt finite at step 0.
        current = tf.cast(step, tf.float32) + 1e-9
        decay_term = tf.math.rsqrt(current)
        warmup_term = current * (self.warmup_steps ** -1.5)
        scale = tf.math.rsqrt(self.embed_dim)
        return scale * tf.math.minimum(decay_term, warmup_term)

    def get_config(self):
        """Return the constructor arguments as NumPy scalars."""
        return {
            "embed_dim": self.embed_dim.numpy(),
            "warmup_steps": self.warmup_steps.numpy(),
        }

def smoothed_loss(y_true, y_pred):
    """Mean sparse softmax cross-entropy over positions whose label is not 0.

    ``y_pred`` is treated as logits. Positions with label 0 are masked out, and
    the sum of the remaining per-token losses is divided by their count (plus a
    small epsilon). Despite the name, no label smoothing is applied here.
    """
    per_token = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y_true, logits=y_pred)
    keep = tf.cast(tf.math.not_equal(y_true, 0), tf.float32)
    masked_total = tf.reduce_sum(per_token * keep)
    kept_count = tf.reduce_sum(keep)
    return masked_total / (kept_count + 1e-9)

# Names -> implementations that Keras needs to deserialize the saved model.
custom_objects = {
    "DynamicPositionalEncoding": DynamicPositionalEncoding,
    "ImprovedTransformerBlock": ImprovedTransformerBlock,
    "CustomSchedule": CustomSchedule,
    "smoothed_loss": smoothed_loss,
}

# --- Load the fine-tuned model ---
model = load_model(model_path, custom_objects=custom_objects)
print("Trained model loaded from folder:", model_path)

# --- Load the token mappings and the vocabulary ---
# SECURITY: pickle.load can execute arbitrary code; only load mapping files from a trusted source.
with open(char2idx_path, "rb") as f:
    char2idx = pickle.load(f)
with open(idx2char_path, "rb") as f:
    idx2char = pickle.load(f)
# NOTE(review): `vocab` is loaded here but not referenced by the code visible in this notebook.
with open(vocab_path, "r") as f:
    vocab = json.load(f)

# max_length is defined in the configuration cell (it must be the same value used in training)

# --- Generate a batch of SMILES in parallel ---
def generate_smiles_batch(model, char2idx, idx2char, max_length, batch_size=64, temperature=1.0):
    """Autoregressively sample ``batch_size`` SMILES strings from ``model``.

    Every sequence starts with ``<START>`` and is padded with ``<PAD>``. At each
    step the model is run on the whole buffer, the logits of the previous
    position are turned into a temperature-scaled distribution, and one token is
    sampled per unfinished sequence with NumPy's global RNG. A sequence stops
    growing once ``<END>`` is sampled; generation stops early when all are done.

    Args:
        model: Object with a Keras-style ``predict(x, batch_size=..., verbose=...)``
            returning logits indexed as ``[sequence, position, token]``.
        char2idx: Mapping token -> index; must contain ``<PAD>``, ``<START>``, ``<END>``.
        idx2char: Mapping index -> token.
        max_length: Length of the token buffer (including ``<START>``).
        batch_size: Number of sequences generated in parallel.
        temperature: Softmax temperature; must be strictly positive.

    Returns:
        A list of ``batch_size`` strings. Strings that RDKit can parse are
        returned in canonical form; the others are returned as generated.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature}")
    if batch_size <= 0:
        return []

    # Look the special tokens up once instead of once per decoded token.
    pad_token = char2idx['<PAD>']
    start_token = char2idx['<START>']
    end_token = char2idx['<END>']
    special_tokens = {pad_token, start_token, end_token}

    input_seqs = np.full((batch_size, max_length), pad_token, dtype=np.int32)
    input_seqs[:, 0] = start_token
    finished = np.zeros(batch_size, dtype=bool)

    for t in range(1, max_length):
        # Pass batch_size explicitly: predict() otherwise re-chunks the input into
        # groups of 32, i.e. many small forward passes per generation step.
        # NOTE: the whole generation batch must therefore fit on the device.
        logits = model.predict(input_seqs, batch_size=batch_size, verbose=0)
        step_logits = logits[:, t-1, :]
        step_probs = tf.nn.softmax(step_logits / temperature).numpy()

        # Only unfinished sequences draw a token (same draw order as a plain index loop).
        for i in np.flatnonzero(~finished):
            sampled = np.random.choice(len(step_probs[i]), p=step_probs[i])
            input_seqs[i, t] = sampled
            if sampled == end_token:
                finished[i] = True
        if finished.all():
            break

    smiles_list = []
    for seq in input_seqs.tolist():
        smi = ''.join(idx2char[idx] for idx in seq if idx not in special_tokens)
        mol = Chem.MolFromSmiles(smi)
        if mol is not None:
            smi = Chem.MolToSmiles(mol, canonical=True)
        smiles_list.append(smi)

    return smiles_list

# --- Generate several batches of SMILES, evaluate them, and save them ---
def evaluate_and_save_batches(model, char2idx, idx2char, max_length,
                              training_smiles_set, out_csv_path,
                              num_batches=10, batch_size=64, temperature=1.0):
    """Generate SMILES in batches, score them, save the novel ones, and print a report.

    Args:
        model, char2idx, idx2char, max_length, batch_size, temperature:
            Forwarded unchanged to ``generate_smiles_batch``.
        training_smiles_set: Container of canonical training SMILES used for
            the novelty check (membership test only).
        out_csv_path: CSV file that receives one unique, novel canonical SMILES
            per row, in first-seen generation order. Missing parent directories
            are created.
        num_batches: Number of calls to ``generate_smiles_batch``.

    Reported metrics:
        Validity = valid / generated; Novelty (raw) = valid SMILES not in the
        training set / valid (duplicates included); Originality = unique and
        novel / valid; average QED and SA over the unique and novel molecules.

    Returns:
        None. Results are written to ``out_csv_path`` and printed.
    """
    # Resolve the SA scorer once: a missing import is reported explicitly instead
    # of being hidden by a per-molecule exception handler (which silently gave SA = 0).
    sa_scorer = globals().get("sascorer")
    if sa_scorer is None:
        print("Warning: 'sascorer' is not imported; SA scores will not be computed.")

    all_generated = []
    for b in range(num_batches):
        generated = generate_smiles_batch(model, char2idx, idx2char,
                                          max_length, batch_size, temperature)
        all_generated.extend(generated)
        print(f"Batch {b+1}/{num_batches}: generated {len(generated)} SMILES")

    # Parse every string once, keeping the molecule for the scoring step.
    valid_smiles = []
    mol_by_smiles = {}
    for smi in all_generated:
        if not smi:
            # RDKit parses '' into an empty molecule; that is not a valid generation.
            continue
        mol = Chem.MolFromSmiles(smi)
        if mol is None or mol.GetNumAtoms() == 0:
            continue
        canon = Chem.MolToSmiles(mol, canonical=True)
        valid_smiles.append(canon)
        mol_by_smiles.setdefault(canon, mol)

    # Novelty is computed BEFORE removing duplicates.
    if valid_smiles:
        num_novel = sum(1 for smi in valid_smiles if smi not in training_smiles_set)
        novelty_raw = num_novel / len(valid_smiles)
    else:
        novelty_raw = 0.0

    # De-duplicate while keeping first-seen order so the CSV is reproducible,
    # and drop molecules already present in the training set.
    unique_and_novel = [smi for smi in dict.fromkeys(valid_smiles)
                        if smi not in training_smiles_set]

    # Score QED and SA; failures are counted and reported rather than ignored.
    qed_list, sa_list = [], []
    failed_qed = failed_sa = 0
    for smi in unique_and_novel:
        mol = mol_by_smiles[smi]
        try:
            qed_list.append(QED.qed(mol))
        except Exception:
            failed_qed += 1
        if sa_scorer is not None:
            try:
                sa_list.append(sa_scorer.calculateScore(mol))
            except Exception:
                failed_sa += 1
    if failed_qed or failed_sa:
        print(f"Warning: scoring failed for {failed_qed} molecule(s) (QED) "
              f"and {failed_sa} molecule(s) (SA); they are excluded from the averages.")

    # Make sure the destination exists so a long generation run is not lost at write time.
    out_dir = os.path.dirname(out_csv_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # Save only the unique and novel SMILES.
    with open(out_csv_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        for smi in unique_and_novel:
            writer.writerow([smi])
    print(f"Saved {len(unique_and_novel)} unique and novel SMILES to {out_csv_path}")

    # Metrics
    validity = len(valid_smiles) / len(all_generated) if all_generated else 0
    avg_qed = np.mean(qed_list) if qed_list else 0
    avg_sa = np.mean(sa_list) if sa_list else 0
    originality = len(unique_and_novel) / len(valid_smiles) if valid_smiles else 0

    print(f"""
Molecule Generation Report:
  Total generated:         {len(all_generated)}
  Validity:               {validity*100:.2f}% ({len(valid_smiles)}/{len(all_generated)})
  Unique & novel:         {len(unique_and_novel)}
  Average QED:            {avg_qed:.4f}
  Average SA:             {avg_sa:.4f}
  Novelty (raw):          {novelty_raw*100:.2f}% 
  Originality (final):    {originality*100:.2f}% 
""")






# --- Load the training-set SMILES (used to compute novelty) ---
# Canonical SMILES of the training molecules; generated molecules are compared against this set.
training_smiles_set = set()

print(f"Loading and canonicalizing SMILES extracted from file: {training_file}")
count_processed = 0
# NOTE(review): count_valid is incremented below but never reported; the summary uses len(training_smiles_set).
count_valid = 0
if os.path.exists(training_file):
    with open(training_file, "r") as f:
        for line in f:
            count_processed += 1
            # NOTE(review): the whole stripped line is parsed as one SMILES; a header row or
            # extra CSV columns would simply fail to parse and be skipped — confirm the file format.
            smi = line.strip()
            if not smi:
                continue
            try:
                mol = Chem.MolFromSmiles(smi)
                if mol:
                    # Canonicalize before adding to the set
                    canon_smi = Chem.MolToSmiles(mol, canonical=True)
                    training_smiles_set.add(canon_smi)
                    count_valid += 1
                # else: # Optional: log invalid SMILES found in the training set
                #     print(f"Warning: invalid SMILES in the training set ignored: {smi}")
            except Exception as e:
                # Optional: log parsing errors
                # print(f"Error while processing SMILES '{smi}': {e}")
                pass # Ignore SMILES that raise errors
    print(f"Processed {count_processed} rows, loaded {len(training_smiles_set)} valid canonical SMILES from training set.")
else:
    # With no training file the set stays empty, so every valid molecule will count as novel.
    print(f"Warning: Training file '{training_file}' not found.")


2025-07-03 15:23:23.571198: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-07-03 15:23:23.584758: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1751549003.601570 2810360 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1751549003.606553 2810360 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1751549003.622492 2810360 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

Trained model loaded from folder: /home/grad/Desktop/pietro/denovo/new/risultati/fine/gba/model_final_fxar.h5
Loading and canonicalizing SMILES extracted from file: /home/grad/Desktop/pietro/denovo/s4-for-de-novo-drug-design/datasets/fxar/fine.csv
Processed 883 rows, loaded 882 valid canonical SMILES from training set.


In [3]:
# --- Example run ---
# Generation settings passed to evaluate_and_save_batches in the following cell.
num_batches   = 10
batch_size    = 1000
temperature   = 1
# CSV file that receives the unique and novel generated SMILES.
out_csv_path  = "/home/grad/Desktop/pietro/denovo/s4-for-de-novo-drug-design/s4_loro/gen_mio/10000/gen_fxr_miol1.csv"

In [4]:
from rdkit import Chem, RDLogger
from rdkit.Chem import QED, RDConfig
# sascorer is imported from RDKit's Contrib/SA_Score directory, so that directory is added to sys.path first.
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
# Silence RDKit log messages (e.g. parse errors for invalid generated SMILES).
RDLogger.DisableLog('rdApp.*')
# evaluate_and_save_batches looks up `sascorer` as a global at call time, so this import must run before the call.
import sascorer
# Generate num_batches * batch_size sequences, write the unique and novel SMILES to out_csv_path, and print the report.
evaluate_and_save_batches(
    model, char2idx, idx2char, max_length,
    training_smiles_set, out_csv_path,
    num_batches=num_batches,
    batch_size=batch_size,
    temperature=temperature
)

I0000 00:00:1751549008.374322 2879859 service.cc:152] XLA service 0x779fd00035d0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
I0000 00:00:1751549008.374337 2879859 service.cc:160]   StreamExecutor device (0): NVIDIA RTX A2000 12GB, Compute Capability 8.6
2025-07-03 15:23:28.416286: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
I0000 00:00:1751549008.509414 2879859 cuda_dnn.cc:529] Loaded cuDNN version 90300








I0000 00:00:1751549013.014564 2879859 device_compiler.h:188] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.








KeyboardInterrupt



In [5]:
# Show the kernel's current working directory (IPython automagic for `%pwd`).
pwd

'/home/grad/Desktop/pietro/denovo/new/attachments_3/2/da_caricare'