In [1]:
cfg = dict(
    seq_length  = 160,
    d_model     = 256,
    latent_dim  = 128,   # latent dimension
    n_head      = 8,
    enc_layers  = 7,
    dec_layers  = 7,
    ff_dim      = 1024,
    dropout     = 0.05,
    emb_dropout = 0.05,

    # special token indices (match your vocabulary)
    pad_idx     = 0,
    sos_idx     = 2,
    eos_idx     = 3,
    # -------- regularization tweaks --------
    label_smoothing = 0.02,   # 0.0 to disable
    corruption_p     = 0.05,  # word-dropout on decoder inputs (train only)

    # -------- validation / decoding --------
    beam_every  = 5,   # run beam metrics every N epochs
    beam_size   = 5
)

In [2]:
import torch, torch.nn as nn
import model_bs as mdl
import data_utils as du

# --- paths/config you already have ---
vocab_path   = "/home/md_halim_mondol/LSTM_VAE_Paper/vocab.json"
ckpt_path    = "/home/md_halim_mondol/LSTM_VAE_Paper/checkpoints/best_model.pth"
test_csv     = "/home/md_halim_mondol/Data/Test.csv"

# cfg should already be defined; using its fields:
# cfg["seq_length"], ["d_model"], ["latent_dim"], ["pad_idx"], ["sos_idx"], ["eos_idx"],
# ["enc_layers"], ["dec_layers"], ["n_head"], ["dropout"], ["ff_dim"], ["beam_size"]

# --- load vocab ---
token_to_idx, idx_to_token = du.load_or_create_vocabulary(csv_paths=[], cache_path=vocab_path, test_smiles=None)
assert token_to_idx["<PAD>"] == cfg["pad_idx"]
assert token_to_idx["<SOS>"] == cfg["sos_idx"]
assert token_to_idx["<EOS>"] == cfg["eos_idx"]

# --- build the same architecture you trained ---
model = mdl.LSTM_VAE_Trans(
    vocab_size=len(token_to_idx),
    d_model=cfg["d_model"],
    latent_dim=cfg["latent_dim"],
    pad_idx=cfg["pad_idx"],
    sos_idx=cfg["sos_idx"],
    eos_idx=cfg["eos_idx"],
    enc_layers=cfg["enc_layers"],
    dec_layers=cfg["dec_layers"],
    nhead=cfg["n_head"],
    dropout=cfg["dropout"],
    max_len=cfg["seq_length"],
    dim_feedforward=cfg["ff_dim"])

# --- load weights robustly (handles 'module.' prefixes if any) ---
state = torch.load(ckpt_path, map_location="cpu")
try:
    model.load_state_dict(state, strict=True)
except RuntimeError:
    # remove a leading 'module.' if the checkpoint came from DataParallel
    from collections import OrderedDict
    new_state = OrderedDict()
    for k, v in state.items():
        new_state[k.replace("module.", "", 1)] = v
    model.load_state_dict(new_state, strict=True)

# --- device & optional DataParallel for speed (not required) ---
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model.to(device)
print("Trainable params:", du.count_parameters(model))
print(f"Encoder parameters: {du.count_parameters(model.encoder)}")
model.eval()

# If you want to keep everything single-GPU-friendly for beam_search, DON'T wrap in DataParallel.
# If you DO wrap, remember to pass model.module to functions that call custom methods.

[vocab] loaded cached vocabulary from /home/md_halim_mondol/LSTM_VAE_Paper/vocab.json (49 tokens)


  state = torch.load(ckpt_path, map_location="cpu")


Trainable params: 10280497
Encoder parameters: 2780928


LSTM_VAE_Trans(
  (encoder): EncoderBiLSTM(
    (emb): Embedding(49, 256, padding_idx=0)
    (emb_ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (emb_do): Dropout(p=0.1, inplace=False)
    (lstm): LSTM(256, 128, num_layers=7, batch_first=True, dropout=0.05, bidirectional=True)
    (out_do): Dropout(p=0.05, inplace=False)
    (seq_ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (pool_ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (to_mu): Linear(in_features=256, out_features=128, bias=True)
  (to_logvar): Linear(in_features=256, out_features=128, bias=True)
  (latent_to_token): Sequential(
    (0): Linear(in_features=128, out_features=256, bias=True)
    (1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (emb): Embedding(49, 256, padding_idx=0)
    (emb_ln): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
    (pe): PositionalEncoding(
      (dropout): Dropout(p=0.05, inplace=False)
  

In [3]:
Dye_smiles = [
'CC1=CC(=O)c2c(Br)cc(Br)c(S(=O)(=O)O)c2C1=O',
'Cc1c(Br)cc(Br)c(C(C)c2ccc(C(C)C)c(Br)c2O)c1S(=O)(=O)O',
'Cc1ccccc1N=Nc1ccc(C(N)=O)cc1',
'Cc1ccc(-c2cccc(O)c2C(C)c2ccc(S(=O)(=O)O)cc2)cc1',
'CN(C)c1ccc2c(c1)CN=C2c1ccccc1',
'O=C1c2ccccc2C(=O)c2c(O)cccc21',
'O=C1Nc2ccccc2S(=O)(=O)[N-]c2c1cccc2S(=O)(=O)[O-]',
'CN(C)C1=CS(=O)(=O)c2ccc(N(C)C)cc21',
'CN(C)c1ccc(C(C)(c2ccc(N(C)C)cc2)c2ccc(N(C)C)cc2)cc1',
'Cc1ccc(C(c2ccc(O)cc2)(c2ccc(O)cc2)c2ccc(C(C)C)cc2)cc1',
'CN(C)c1ccc2c(c1)CC1=CC(=[N+](C)C)C(=N2)C=C1',
'O=C1c2cc(=O)cccc2C(=O)c2c(Br)cc(Br)cc21',
'O=c1cc(O)ccc(-c2ccc3c(oc(=O)c4ccccc43)c2O)c1']

In [4]:
import pandas as pd
from inference import reconstruct_smiles_table, tensor_to_smiles
import metrics as met

# Use the *unwrapped* model object for beam_search
m = model  # (if you ever wrap with DataParallel, use: model.module)

df_rec = reconstruct_smiles_table(
    smiles_list=Dye_smiles,
    test_csv=None,
    model=m,
    token_to_idx=token_to_idx,
    idx_to_token=idx_to_token,
    seq_length=cfg["seq_length"],
    pad_idx=cfg["pad_idx"],
    sos_idx=cfg["sos_idx"],
    eos_idx=cfg["eos_idx"],
    device=device,
    mode="beam",
    beam_size=cfg["beam_size"])

# show a preview
display(df_rec.head(10))

# ------------------------------------------------------------------
# 1.  Token-level accuracy (micro-average over SMILES tokens)
# ------------------------------------------------------------------
def token_accuracy_row(gold, pred):
    g = du.tokenize_smiles(gold)
    p = du.tokenize_smiles(pred)
    L = min(len(g), len(p))
    if L == 0:                      # degenerate empty case
        return 0, 0
    correct = sum(gi == pi for gi, pi in zip(g[:L], p[:L]))
    total   = L
    return correct, total

tot_corr = tot_tok = 0
for g, p in zip(df_rec["input"], df_rec["reconstructed"]):
    c, t = token_accuracy_row(g, p)
    tot_corr += c
    tot_tok  += t

beam_token_acc = tot_corr / tot_tok if tot_tok else 0.0
print(f"Token level test accuracy (beam): {beam_token_acc:.4f}")

# ------------------------------------------------------------------
# 2.  Sequence-level (exact-match) accuracy
# ------------------------------------------------------------------
exact_match_acc = (df_rec["input"] == df_rec["reconstructed"]).mean()
print(f"Exact SMILES match accuracy (beam): {exact_match_acc:.4f}")


# ---- summary metrics (no retraining) ----
valid_ratio = (df_rec["valid"] == "yes").mean() if len(df_rec) else float("nan")
avg_lev     = df_rec["lev"].mean() if len(df_rec) else float("nan")

print(f"[beam] validity ratio: {valid_ratio:.3f}")
print(f"[beam] average Levenshtein: {avg_lev:.3f}")



Unnamed: 0,input,reconstructed,valid,lev
0,CC1=CC(=O)c2c(Br)cc(Br)c(S(=O)(=O)O)c2C1=O,CC1=CC(=O)c2c(Br)cc(Br)c(S(=O)(=O)O)c2C1=O,yes,0
1,Cc1c(Br)cc(Br)c(C(C)c2ccc(C(C)C)c(Br)c2O)c1S(=...,Cc1c(Br)cc(Br)c(C(C)c2ccc(C(C)C)c(Br)c2O)c1S(=...,yes,0
2,Cc1ccccc1N=Nc1ccc(C(N)=O)cc1,Cc1ccccc1Nc1ccc(C(N)=O)cc1,yes,2
3,Cc1ccc(-c2cccc(O)c2C(C)c2ccc(S(=O)(=O)O)cc2)cc1,Cc1ccc(-c2cccc(O)c2C(C)c2ccc(S(=O)(=O)O)cc2)cc1,yes,0
4,CN(C)c1ccc2c(c1)CN=C2c1ccccc1,CN(C)c1ccc2c(c1)CN=C2c1ccccc1,yes,0
5,O=C1c2ccccc2C(=O)c2c(O)cccc21,O=C1c2ccccc2C(=O)c2c(O)cccc21,yes,0
6,O=C1Nc2ccccc2S(=O)(=O)[N-]c2c1cccc2S(=O)(=O)[O-],O=C1Nc2ccccc2S(=O)(=O)[N-]c2c1cccc2S(=O)(=O)[O-],yes,0
7,CN(C)C1=CS(=O)(=O)c2ccc(N(C)C)cc21,CN(C)C1=CS(=O)(=O)c2ccc(N(C)C)cc21,yes,0
8,CN(C)c1ccc(C(C)(c2ccc(N(C)C)cc2)c2ccc(N(C)C)cc...,CN(C)c1ccc(C(C)(c2ccc(N(C)C)cc2)c2ccc(N(C)C)cc...,yes,0
9,Cc1ccc(C(c2ccc(O)cc2)(c2ccc(O)cc2)c2ccc(C(C)C)...,Cc1ccc(C(c2ccc(O)cc2)(c2ccc(O)cc2)c2ccc(C(C)C)...,yes,0


Token level test accuracy (beam): 0.9469
Exact SMILES match accuracy (beam): 0.6923
[beam] validity ratio: 0.923
[beam] average Levenshtein: 0.846
