In [1]:
import pandas as pd
import re
import math
import contextlib
import io
from IPython.display import HTML
from io import BytesIO
import base64

from rdkit import Chem
from rdkit.Chem import PandasTools
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import rdDepictor

#### Import literature dataset

Read dataset using pandas

In [2]:
# Load the dataset from its CSV file into a DataFrame.
coconut_csv = 'data/coconut/coconut_incomplete.csv'
df = pd.read_csv(coconut_csv)

#### Prepare dataset for prediction

Canonicalize the SMILES

In [None]:
def _canonicalize(smiles):
    """Round-trip a SMILES string through RDKit (MolFromSmiles -> MolToSmiles)."""
    mol = Chem.MolFromSmiles(smiles)
    return Chem.MolToSmiles(mol, isomericSmiles=True, kekuleSmiles=False)


df['smiles'] = df['smiles'].apply(_canonicalize)

Tokenize SMILES

In [4]:
def smi_tokenizer(smi: str) -> str:
    """
    Tokenize a SMILES molecule or reaction. Modified for the special tagging character "!".

    Parameters
    ----------
    smi : str
        SMILES string to split into tokens.

    Returns
    -------
    str
        The matched tokens joined by single spaces. Characters that match
        no alternative of the pattern are not emitted.
    """
    # Raw string literal: the regex escapes (\[, \(, \., \+, ...) are not valid
    # Python string escapes, so a plain literal triggers an invalid-escape
    # warning. The compiled pattern is identical to the previous one; the
    # literal backslash alternative is written \\ here instead of \\\\.
    pattern = r"(\[[^\]]+]|Br?|Cl?|N|O|S|P|F|I|b|c|n|o|s|p|\(|\)|\.|=|#|-|\+|\\|\/|:|~|@|\?|>|\*|\!|\$|\%[0-9]{2}|[0-9])"
    regex = re.compile(pattern)
    return ' '.join(regex.findall(smi))

In [5]:
# Tokenize every canonical SMILES; the space-separated result is the model input.
df['source'] = df['smiles'].map(smi_tokenizer)

#### Export dataset

Export the tokenized source SMILES as a text file (one entry per line)

In [6]:
# Write the tokenized SMILES without index or header, one row per entry.
source_path = 'data/opennmt/partial/source.txt'
df['source'].to_csv(source_path, header=False, index=False)

#### Run predictions

Run predictions using the 5x augmented model. The `-verbose` flag prints the per-sentence output in the cell; redirect it to a file if it should be kept as a log.

In [None]:
# Translate data/opennmt/partial/source.txt with the step-100000 checkpoint and
# write the result to data/opennmt/partial/predictions.txt. Flags: -n_best 1,
# -beam_size 1, -verbose (output is printed in the cell, not redirected).
!onmt_translate -model models/partial_augmented_5x/partial_augmented_5x_step_100000.pt -src data/opennmt/partial/source.txt -output data/opennmt/partial/predictions.txt -n_best 1 -beam_size 1 -verbose

#### Import predictions

Add predictions to data frame

In [7]:
# Read the predictions as raw lines instead of going through the CSV parser:
# read_csv skips blank lines by default, so a single empty prediction would
# shift every following prediction onto the wrong molecule.
with open('data/opennmt/partial/predictions.txt', encoding='utf-8') as f:
    # Kept as a one-column frame (column 0) so `predictions[0]` still works.
    predictions = pd.DataFrame({0: [line.rstrip('\n') for line in f]})

# Fail loudly on a row-count mismatch rather than silently padding with NaN.
if len(predictions) != len(df):
    raise ValueError(
        f"Expected {len(df)} predictions (one per source row), "
        f"found {len(predictions)} in the predictions file."
    )

# Assign by position so the pairing does not depend on df's index labels.
df['prediction'] = predictions[0].to_numpy()

Clean formats

In [8]:
# Remove the token-separating spaces so both columns hold plain SMILES again.
for column in ('source', 'prediction'):
    df[column] = df[column].apply(lambda text: text.replace(' ', ''))

Generate the report as an HTML file (which can be printed to PDF if needed)

In [None]:
# Build RDKit molecules from both SMILES columns, drop rows whose
# prediction_mol is missing, and keep only the columns used in the report.
report_columns = ['identifier', 'source', 'prediction', 'source_mol', 'prediction_mol']
df = (
    df.assign(
        source_mol=df['source'].apply(Chem.MolFromSmiles),
        prediction_mol=df['prediction'].apply(Chem.MolFromSmiles),
    )
    .dropna(subset=['prediction_mol'])
    [report_columns]
)

def mol_to_img_base64(mol, size=(400, 400)):
    """Render a molecule as an inline HTML ``<img>`` tag.

    The molecule is drawn with ``Draw.MolToImage``, saved as PNG into an
    in-memory buffer, base64-encoded and embedded as a ``data:`` URI.

    Parameters
    ----------
    mol
        Molecule to draw, or ``None`` (e.g. a SMILES that failed to parse).
    size : tuple
        ``(width, height)`` passed to the drawing call and used for the
        ``width``/``height`` attributes of the tag.

    Returns
    -------
    str
        The ``<img>`` tag, or an empty string when ``mol`` is ``None`` so the
        corresponding table cell stays blank instead of aborting the report.
    """
    if mol is None:
        return ''

    img = Draw.MolToImage(mol, size=size)
    buffer = BytesIO()
    img.save(buffer, format="PNG")
    img_str = base64.b64encode(buffer.getvalue()).decode("utf-8")
    return f'<img src="data:image/png;base64,{img_str}" width="{size[0]}" height="{size[1]}"/>'

# Embed both depictions as inline <img> tags (base64 data URIs).
df['source_img'] = df['source_mol'].apply(mol_to_img_base64)
df['prediction_img'] = df['prediction_mol'].apply(mol_to_img_base64)

# to_html() honours display.max_colwidth (50 characters by default) and would
# truncate the base64 payloads, leaving broken images. Lift the limit for this
# call only. escape=False keeps the <img> markup unescaped.
with pd.option_context('display.max_colwidth', None):
    html_content = HTML(df.to_html(escape=False))

with open('partial_assignments.html', 'w', encoding='utf-8') as f:
    f.write(html_content.data)

Selected examples:

- CNP0107513.0
- CNP0107664.0
- CNP0107730.0
- CNP0108208.0
- CNP0213983.0
- CNP0215345.0
- CNP0216125.0
- CNP0375995.0