In [1]:
import pandas as pd
import re
import math
import contextlib
import io

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor

from evaluation_functions import *

#### Import literature dataset

Read dataset using pandas

In [2]:
# Load the literature dataset from the Excel workbook into a data frame.
df = pd.read_excel(
    'data/literature-dataset.xlsx',
)

#### Prepare dataset for prediction

Canonicalize the SMILES

In [3]:
def _canonical_smiles(smiles):
    """
    Return the canonical, isomeric, kekulized SMILES for one input SMILES.

    Raises:
        ValueError: if the value is not a string or RDKit cannot parse it.
            Without this check, the ``None`` returned by ``MolFromSmiles``
            would be passed to ``MolToSmiles`` and fail with an error that
            does not name the offending input.
    """
    mol = Chem.MolFromSmiles(smiles) if isinstance(smiles, str) else None
    if mol is None:
        raise ValueError(f"Could not parse SMILES: {smiles!r}")
    return Chem.MolToSmiles(mol, isomericSmiles=True, kekuleSmiles=True)


df['SMILES'] = df['SMILES'].apply(_canonical_smiles)

Generate source SMILES

In [4]:
def flatten(smiles):
    """
    Rewrite a SMILES string without its '@', '/' and backslash markers.

    Bracket atoms of K, B, Na, C, N, O, S, P, F, Cl, Br and I whose bracket
    contains only the element symbol followed by any run of '@', ',' or 'H'
    characters (plus '+' and '-' for Na) are replaced by a fixed spelling.
    Afterwards every remaining '@', '/' and backslash character is removed.

    Args:
        smiles: the SMILES string to rewrite.

    Returns:
        The rewritten SMILES string.
    """
    # (pattern, replacement) pairs, applied one after another in this order.
    rules = (
        (r'\[K[@,H]*\]', '[K]'),
        (r'\[B[@,H]*\]', 'B'),
        (r'\[Na[@,H,+,-]*\]', '[Na]'),
        (r'\[C[@,H]*\]', 'C'),
        (r'\[N[@,H]*\]', 'N'),
        (r'\[O[@,H]*\]', 'O'),
        (r'\[S[@,H]*\]', 'S'),
        (r'\[P[@,H]*\]', 'P'),
        (r'\[F[@,H]*\]', 'F'),
        (r'\[Cl[@,H]*\]', '[Cl]'),
        (r'\[Br[@,H]*\]', '[Br]'),
        (r'\[I[@,H]*\]', 'I'),
        (r'@', ''),
        (r'/', ''),
        (r'\\', ''),
    )

    flattened = smiles
    for regex, plain in rules:
        flattened = re.sub(regex, plain, flattened)
    return flattened

In [5]:
# Run flatten() over every entry of the 'SMILES' column to build 'source'.
df['source'] = df['SMILES'].map(flatten)

Tokenize SMILES

In [6]:
def smi_tokenizer(smi: str) -> str:
    """
    Tokenize a SMILES molecule or reaction. Modified for the special tagging character "!".

    Args:
        smi: the SMILES string to tokenize.

    Returns:
        The tokens joined by single spaces. Characters that match none of the
        token alternatives are dropped from the result.
    """
    # Raw string so the regex escapes (\[, \(, \., ...) reach the regex engine
    # verbatim instead of being invalid Python string escapes, which trigger
    # warnings on current Python versions. `\\` matches one literal backslash.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
    return ' '.join(re.findall(pattern, smi))

In [7]:
# Tokenize the 'source' column in place and derive 'target' from 'SMILES'.
for tokenized_column, origin_column in (('source', 'source'), ('target', 'SMILES')):
    df[tokenized_column] = df[origin_column].apply(smi_tokenizer)

#### Export dataset

Export source and target tokenized SMILES as text files

In [8]:
# Write each tokenized column to its own text file, without index or header.
export_targets = (
    ('source', 'data/opennmt/validation/source.txt'),
    ('target', 'data/opennmt/validation/target.txt'),
)
for column_name, output_path in export_targets:
    df[column_name].to_csv(output_path, index=False, header=False)

#### Run predictions

Run predictions using the non-augmented model (`models/not_augmented/not_augmented_step_200000.pt`). Save the verbose output in a log file.

In [9]:
# In-memory text buffer that receives everything written to Python's stdout
# while the translation command runs.
buffer = io.StringIO()

# NOTE(review): this assumes the IPython `!` shell escape forwards the
# command's output through sys.stdout so that redirect_stdout can capture it.
# Confirm this holds in the frontend used to run the notebook.
with contextlib.redirect_stdout(buffer):
    !onmt_translate -model models/not_augmented/not_augmented_step_200000.pt -src data/opennmt/validation/source.txt -output data/opennmt/validation/predictions.txt -n_best 1 -beam_size 1 -verbose

# Save the captured text to the log file.
output = buffer.getvalue()
with open("data/opennmt/validation/log.txt", "w") as file:
    file.write(output)

Helper functions to parse the log file

In [10]:
def log_prob_to_confidence(log_prob):
    """Return e raised to ``log_prob``, turning a log-probability into a confidence."""
    confidence = math.exp(log_prob)
    return confidence

def read_log_file(file_path):
    """
    Collect predictions and confidences from a translation log file.

    Every line is stripped of surrounding whitespace. A line of the form
    ``PRED <number>: <text>`` adds ``<text>`` to the predictions. A line that
    starts with ``PRED SCORE`` adds the number following ``PRED SCORE: ``,
    converted with ``log_prob_to_confidence``, to the scores.

    Args:
        file_path: path of the log file to read.

    Returns:
        A ``(predictions, scores)`` tuple of two lists.
    """
    pred_header = re.compile(r'^PRED \d+:\s*')

    with open(file_path, 'r') as handle:
        stripped_lines = [raw_line.strip() for raw_line in handle]

    predictions, scores = [], []
    for entry in stripped_lines:
        if pred_header.match(entry):
            # Everything after the "PRED <number>:" header is the prediction.
            predictions.append(pred_header.split(entry, maxsplit=1)[1])
            continue
        if entry.startswith('PRED SCORE'):
            raw_score = entry.split('PRED SCORE: ')[1].strip()
            scores.append(log_prob_to_confidence(float(raw_score)))

    return predictions, scores

#### Import predictions

Add predictions to data frame

In [11]:
# Parse the log file and attach both result lists to the data frame.
parsed_log = read_log_file('data/opennmt/validation/log.txt')
predictions, scores = parsed_log

df['prediction'], df['confidence'] = predictions, scores

Clean formats

In [12]:
# Remove the spaces between tokens in each SMILES column.
for smiles_column in ('source', 'target', 'prediction'):
    df[smiles_column] = df[smiles_column].apply(lambda text: text.replace(' ', ''))

Add weighted accuracy

In [13]:
def _row_weighted_accuracy(row):
    """Score one row by passing its 'target' and 'prediction' to chirality_weighted_accuracy."""
    return chirality_weighted_accuracy(row['target'], row['prediction'])


df['top1_wt'] = df.apply(_row_weighted_accuracy, axis=1)

#### Display results

Display the structures of the target and the predicted SMILES. The results are first saved to an Excel file.

In [14]:
# Save the data frame to an Excel workbook, without the index column.
df.to_excel(
    excel_writer='data/literature-dataset-pred.xlsx',
    index=False,
)