In [1]:
import pandas as pd
import re
import math
import contextlib
import io
from IPython.display import HTML
from io import BytesIO
import base64

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor

from eval_functions import *

#### Import literature dataset

Read dataset using pandas

In [2]:
# Load the literature dataset from its Excel workbook into a DataFrame.
literature_dataset_path = 'data/literature-dataset.xlsx'
df = pd.read_excel(literature_dataset_path)

#### Prepare dataset for prediction

Canonicalize the SMILES

In [3]:
def _canonical_smiles(smiles):
    """Return the RDKit canonical, isomeric, non-kekulized SMILES for ``smiles``.

    Args:
        smiles: SMILES string to canonicalize.

    Returns:
        The canonical SMILES string produced by ``Chem.MolToSmiles``.

    Raises:
        ValueError: if RDKit cannot parse ``smiles``. ``Chem.MolFromSmiles``
            returns None in that case, and passing None on to
            ``Chem.MolToSmiles`` would fail with an error that does not say
            which entry was at fault.
    """
    mol = Chem.MolFromSmiles(smiles)
    if mol is None:
        raise ValueError(f'RDKit could not parse SMILES: {smiles!r}')
    return Chem.MolToSmiles(mol, isomericSmiles=True, kekuleSmiles=False)


df['SMILES'] = df['SMILES'].apply(_canonical_smiles)

Generate source SMILES

In [4]:
# Build the model input column from the canonical SMILES.
# `flatten` is not defined in this notebook, so it must come from the star import of eval_functions.
# NOTE(review): presumably it removes stereochemical information from the SMILES — confirm against eval_functions.
df['source'] = df['SMILES'].apply(flatten)

Tokenize SMILES

In [5]:
# Compiled once at definition time. The pattern is a raw string so that the
# regex escapes (\[, \(, \., ...) are passed to `re` untouched instead of being
# parsed as invalid Python string escapes, which newer Python versions warn about.
_SMI_TOKEN_REGEX = re.compile(
    r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
)


def smi_tokenizer(smi: str) -> str:
    """
    Tokenize a SMILES molecule or reaction. Modified for the special tagging character "!".

    Bracket atoms (e.g. ``[C@@H]``), two-letter halogens (``Cl``, ``Br``) and
    two-digit ring closures (``%12``) are each kept as a single token; every
    other recognised symbol becomes its own token.

    Args:
        smi: SMILES string to tokenize.

    Returns:
        The tokens joined by single spaces. Characters that match no token
        alternative are silently omitted, and an empty input gives ``''``.
    """
    return ' '.join(_SMI_TOKEN_REGEX.findall(smi))

In [6]:
# Space-separate the SMILES tokens: 'source' is tokenized in place, while
# 'target' is the tokenized form of the canonical 'SMILES' column.
for tokenized_column, smiles_column in (('source', 'source'), ('target', 'SMILES')):
    df[tokenized_column] = df[smiles_column].apply(smi_tokenizer)

#### Export dataset

Export source and target tokenized SMILES as text files

In [7]:
# Write each tokenized column to its own text file, one row per line,
# without the index or a header row.
for column_name, output_path in (
    ('source', 'data/opennmt/validation/source.txt'),
    ('target', 'data/opennmt/validation/target.txt'),
):
    df[column_name].to_csv(output_path, index=False, header=False)

#### Run predictions

Run predictions using the 5x augmented model. The `-verbose` flag prints each input sentence, its prediction and the prediction score in the cell output below.

In [8]:
# Translate the tokenized source file with the step-100000 checkpoint of the partial_augmented_5x model
# and write the predictions to data/opennmt/validation/predictions.txt.
# -verbose echoes each SENT / PRED pair and its PRED SCORE into the cell output.
# NOTE(review): -n_best 1 with -beam_size 1 should give exactly one prediction line per source line — confirm against the OpenNMT-py version in use.
!onmt_translate -model models/partial_augmented_5x/partial_augmented_5x_step_100000.pt -src data/opennmt/validation/source.txt -output data/opennmt/validation/predictions.txt -n_best 1 -beam_size 1 -verbose

[2025-02-04 09:58:05,396 INFO] Loading checkpoint from models/partial_augmented_5x/partial_augmented_5x_step_100000.pt
[2025-02-04 09:58:05,870 INFO] Loading data into the model
[2025-02-04 09:58:08,767 INFO] 
SENT 1: ['C', 'O', 'c', '1', 'c', 'c', '2', 'c', '(', 'c', '(', 'O', 'C', ')', 'c', '1', 'O', 'C', ')', '-', 'c', '1', 'c', 'c', 'c', '(', 'O', 'C', ')', 'c', '(', '=', 'O', ')', 'c', 'c', '1', 'C', '(', 'N', 'C', '(', 'C', ')', '=', 'O', ')', 'C', 'C', '2']
PRED 1: C O c 1 c c 2 c ( c ( O C ) c 1 O C ) - c 1 c c c ( O C ) c ( = O ) c c 1 [C@@H] ( N C ( C ) = O ) C C 2
PRED SCORE: -0.0857

[2025-02-04 09:58:08,767 INFO] 
SENT 2: ['C', 'C', '(', '=', 'C', 'c', '1', 'c', 's', 'c', '(', 'C', ')', 'n', '1', ')', 'C', '1', 'C', 'C', '2', 'O', 'C', '2', '(', 'C', ')', 'C', 'C', 'C', 'C', '(', 'C', ')', 'C', '(', 'O', ')', 'C', '(', 'C', ')', 'C', '(', '=', 'O', ')', 'C', '(', 'C', ')', '(', 'C', ')', 'C', '(', 'O', ')', 'C', 'C', '(', '=', 'O', ')', 'O', '1']
PRED 2: C / C ( = C \ c 1 

#### Import predictions

Add predictions to data frame

In [9]:
# Read one prediction per line with a plain line reader. pd.read_csv drops
# blank lines by default (and converts strings such as "NA" to NaN), so a single
# empty prediction would shift every later prediction onto the wrong row.
with open('data/opennmt/validation/predictions.txt', encoding='utf-8') as predictions_file:
    prediction_lines = [line.rstrip('\n') for line in predictions_file]

# Kept as a one-column DataFrame (column 0), the same shape read_csv produced.
predictions = pd.DataFrame({0: prediction_lines})

# Fail loudly on a row-count mismatch instead of silently padding with NaN.
if len(predictions) != len(df):
    raise ValueError(
        f'Expected {len(df)} predictions but read {len(predictions)} lines; '
        're-run the translation step before continuing.'
    )

# Assign by position: line i of predictions.txt belongs to row i of df.
df['prediction'] = predictions[0].to_numpy()

Clean formats

In [10]:
# Undo the tokenization: strip the separating spaces from every SMILES column
# so the strings can be compared and parsed directly.
for smiles_column in ('source', 'target', 'prediction'):
    df[smiles_column] = df[smiles_column].apply(lambda smiles: smiles.replace(' ', ''))

Add weighted accuracy

In [15]:
# Score every row by passing its source, target and prediction SMILES to per_stereocenter,
# which is not defined in this notebook and so must come from the star import of eval_functions.
# NOTE(review): the return type and scale of per_stereocenter are not visible here — confirm before interpreting top1_wt.
df['top1_wt'] = df.apply(lambda x: per_stereocenter(x['source'], x['target'], x['prediction']), axis=1)

#### Display results

In [16]:
# Parse the reference and predicted SMILES into RDKit molecules, stored in
# '<column>_mol'. RDKit yields None for a SMILES it cannot parse.
for smiles_column in ('target', 'prediction'):
    df[f'{smiles_column}_mol'] = df[smiles_column].apply(Chem.MolFromSmiles)

def mol_to_img_base64(mol, size=(400, 400)):
    """Render a molecule as an HTML ``<img>`` tag with an inline base64 PNG.

    Args:
        mol: RDKit molecule to draw, or None. ``Chem.MolFromSmiles`` returns
            None for a SMILES it cannot parse, which can happen for model
            predictions.
        size: ``(width, height)`` in pixels, used both for rendering and for
            the tag's ``width``/``height`` attributes.

    Returns:
        The ``<img>`` tag as a string, or ``''`` when ``mol`` is None so the
        corresponding table cell is left empty instead of aborting the export.
    """
    # Nothing to draw for an unparsable SMILES; Draw.MolToImage would raise.
    if mol is None:
        return ''
    img = Draw.MolToImage(mol, size=size)
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f'<img src="data:image/png;base64,{img_str}" width="{size[0]}" height="{size[1]}"/>'

# Pre-render every molecule as an inline <img> tag so the exported table is self-contained.
df['target_img'] = df['target_mol'].apply(mol_to_img_base64)
df['prediction_img'] = df['prediction_mol'].apply(mol_to_img_base64)

# escape=False keeps the <img> tags as live HTML instead of escaping them to text.
html_content = HTML(df.to_html(escape=False))

# Write with an explicit UTF-8 encoding so non-ASCII cell text does not depend
# on the platform's default locale encoding.
with open('new_assignments.html', 'w', encoding='utf-8') as f:
    f.write(html_content.data)