In [None]:
import os
import sys
from rdkit import Chem
import time
import math
from tqdm import tqdm
import numpy as np
from sklearn.linear_model import LinearRegression 
from sklearn import metrics
import torch.optim as optim
import torch
from torch import nn
import numpy as np
import pandas as pd
# %matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from torch_geometric.loader import DataLoader
from rdkit.Chem import AllChem
from rdkit.Chem import RDConfig
from rdkit.Chem import Descriptors
from rdkit.Chem import rdDepictor
sys.path.append(os.path.join(RDConfig.RDContribDir, "SA_Score"))
import sascorer
sns.set_theme(style="white", palette=None)

from rdkit import RDLogger
RDLogger.DisableLog('rdApp.*')
from rdkit.Chem.Draw import IPythonConsole
# IPythonConsole.molSize = (200, 200)   # Change image size
IPythonConsole.ipython_useSVG = True  # Change output to SVG

from catcvae.utils import smiles_to_mol, mol_to_smiles
from dataset import _dataset

In [None]:
def display_molecule(molecules, title=None, texts=None):
    """Draw molecules on a five-column grid, one image per axis.

    Args:
        molecules: sequence of molecules accepted by ``Chem.Draw.MolToImage``.
        title: optional sequence of per-molecule axis titles, indexed like
            ``molecules``. Skipped when falsy.
        texts: optional sequence of per-molecule captions, indexed like
            ``molecules``, written at data coordinates (100, 350) of each axis.
            Skipped when falsy.

    Returns:
        None. The figure is created with ``plt.subplots`` and left for the
        active matplotlib backend to render.
    """
    n_mols = len(molecules)
    if n_mols == 0:
        # Nothing to draw: zero molecules would request a grid with zero rows
        # and a zero-height figure, which plt.subplots cannot build.
        return

    n_cols = 5
    n_rows = math.ceil(n_mols / n_cols)
    fig, axs = plt.subplots(n_rows, n_cols, figsize=(15, math.ceil(n_mols * 0.75)), dpi=300)
    fig.subplots_adjust(hspace=.5, wspace=.001)

    # ravel() flattens the (n_rows, n_cols) grid so axes can be addressed by molecule index.
    for i, ax in enumerate(axs.ravel()):
        if i < n_mols:
            ax.imshow(Chem.Draw.MolToImage(molecules[i]))
            if title:
                ax.set_title(title[i])
            if texts:
                ax.text(100, 350, texts[i], fontsize=12)
        # Axes are hidden for drawn molecules and for the unused trailing cells alike.
        ax.axis('off')

# Read file

In [None]:
# Run identifiers: dataset name, training-output folder and optimisation-run folder.
file = 'sm_test1'
seed = 0
folder = 'output_0_20250524_151005'
optimize_folder = 'optimize_smiles_20250525_104746'

# smiles.txt is read as space-separated values without a header row;
# the number of columns decides which set of column names applies.
df = pd.read_csv('dataset/'+file+'/'+folder+'/'+optimize_folder+'/smiles.txt', sep=' ', header=None)
column_layouts = {
    4: ['round', 'smiles', 'predicted_value', 'optimize_value'],
    6: ['round', 'smiles', 'predicted_value', 'optimize_value', 'starting_smiles', 'true_value'],
}
if len(df.columns) not in column_layouts:
    # Fail here with a clear message instead of a KeyError in a later cell
    # that looks up 'round' / 'smiles' on an unnamed frame.
    raise ValueError(
        f'smiles.txt has {len(df.columns)} columns; expected one of {sorted(column_layouts)}'
    )
df.columns = column_layouts[len(df.columns)]

df

In [None]:
# Keep one row per (round, smiles) pair, renumber the rows from 0, and show the row count.
df = df.drop_duplicates(subset=['round', 'smiles']).reset_index(drop=True)
len(df)

In [None]:
# finetune dataset
# Load the split file and partition it on column 's':
# rows marked 'test' go to df_dataset_test, all remaining rows stay in df_dataset.
df_dataset = pd.read_csv('dataset/'+file+'/datasets_dobj_split_0.csv')
df_dataset_test = df_dataset.loc[df_dataset['s'].eq('test')]
df_dataset = df_dataset.loc[df_dataset['s'].ne('test')]

print("dataset: ", len(df_dataset))
print("dataset_test: ", len(df_dataset_test))

In [None]:
# only not in training dataset
# Canonical SMILES (useChiral=False) of every catalyst in the non-test rows of the dataset.
smiles_dataset = [Chem.CanonSmiles(smiles, useChiral=False) for smiles in df_dataset['smiles_catalyst'].values]
# Set copy for membership tests: a list lookup per generated molecule is O(len(smiles_dataset)).
known_smiles = set(smiles_dataset)

# Collect row *positions* (not index labels) so that iloc below is correct
# even when df's index is not exactly 0..n-1.
novel_molecules_index = [
    position
    for position, smiles in enumerate(df['smiles'].values)
    if Chem.CanonSmiles(smiles, useChiral=False) not in known_smiles
]

df = df.iloc[novel_molecules_index]
df = df.reset_index(drop=True)
print("df_novel: ", len(df))

In [None]:
if 'starting_smiles' not in df.columns:
    # One row per SMILES: mean and std of 'predicted_value', mean of 'optimize_value'.
    df_group = df.groupby(['smiles']).agg({'predicted_value': ['mean', 'std'], 'optimize_value': 'mean'}).reset_index()
    df_group.columns = ['smiles', 'predicted_value', 'predicted_std', 'optimize_value']
    # Every NaN becomes 0; in particular the std of a SMILES seen only once is NaN.
    df_group = df_group.fillna(0)
    # Keep only rows whose mean optimize_value is below 100.
    df_group = df_group[df_group['optimize_value'] < 100]

    # Best predicted value first.
    df_group = df_group.sort_values(by='predicted_value', ascending=False)
    df_group = df_group.reset_index(drop=True)

    # Draw at most the top 20 molecules; the caption omits the std when it is 0.
    df_sample = df_group.head(20)
    mol = [Chem.MolFromSmiles(x) for x in df_sample['smiles']]
    texts = [f'{p:.2f}+/-{std:.2f}' if std != 0 else f'{p:.2f}'
             for p, std in zip(df_sample['predicted_value'], df_sample['predicted_std'])]

    if mol:
        # An empty selection would ask for a grid with zero rows, so skip drawing.
        display_molecule(mol, texts=texts)

    print(len(df_group))
    # save to excel with molecule image
    from rdkit.Chem import PandasTools
    # Select the columns first and copy afterwards, so the new column is added
    # to an independent frame rather than to a slice of df_group.
    df_excel = df_group[['smiles', 'predicted_value']].copy()
    df_excel['MolImage'] = df_excel['smiles'].apply(Chem.MolFromSmiles)
    PandasTools.SaveXlsxFromFrame(df_excel, 'dataset/'+file+'/'+folder+'/'+optimize_folder+'/optimize.xlsx', molCol='MolImage')

In [None]:
if 'starting_smiles' in df.columns:
    # Visit round ids 0..max; an empty frame has no rounds (max() of nothing would raise).
    r = range(max(df['round']) + 1 if len(df) else 0)
    for i in r:
        df_i = df[df['round'] == i]
        df_i = df_i.sort_values(by='predicted_value', ascending=False)
        df_i = df_i.reset_index(drop=True)
        print(f'round {i} : {len(df_i)}')
        if df_i.empty:
            # A round id without rows has no starting molecule to read.
            continue

        # First panel is the starting molecule with its true value;
        # the following panels are the best distinct candidates of this round.
        mols = [Chem.MolFromSmiles(df_i['starting_smiles'].values[0])]
        title = ['starting_smiles']
        texts = [df_i['true_value'].values[0]]
        smiles_unique = [df_i['starting_smiles'].values[0]]
        # Look at the top four rows at most; bounding the range replaces the
        # former bare `except` that hid the IndexError of short rounds.
        for j in range(min(4, len(df_i))):
            candidate = df_i['smiles'].values[j]
            if candidate not in smiles_unique:
                mols.append(Chem.MolFromSmiles(candidate))
                title.append('')
                texts.append(df_i['predicted_value'].values[j])
                smiles_unique.append(candidate)

        # display_molecule(mols, title=title, texts=texts)

    print(len(df))
    # save to excel with molecule image
    from rdkit.Chem import PandasTools
    # Select the columns first and copy afterwards, so the new columns are added
    # to an independent frame rather than to a slice of df.
    df_excel = df[['starting_smiles', 'true_value', 'smiles', 'predicted_value']].copy()
    df_excel['MolImage_starting'] = df_excel['starting_smiles'].apply(Chem.MolFromSmiles)
    df_excel['MolImage'] = df_excel['smiles'].apply(Chem.MolFromSmiles)
    # Both molecule columns are listed so that each one is embedded as an image;
    # a molecule column left out of molCol is written as the object's text repr.
    PandasTools.SaveXlsxFromFrame(df_excel, 'dataset/'+file+'/'+folder+'/'+optimize_folder+'/optimize.xlsx', molCol=['MolImage_starting', 'MolImage'])