In [5]:
import utils
import ipywidgets as ipw
import shutil
import os
from IPython.display import display

from rdkit import Chem,DataStructs
from rdkit.Chem import AllChem
import csv
import pathlib as pl

In [None]:
# Module-level placeholder; it is set to None here and not used by the other cells shown in this notebook.
OPENBIS_MOLECULES = None
# ELN connection settings; the "url" and "token" entries are read just below.
CONFIG_ELN = utils.get_aiidalab_eln_config()
# Alternative (disabled): read the configuration from a local JSON file instead.
# CONFIG_ELN = utils.read_json("eln_config.json")
# connect_openbis returns a pair; only its first element (the session) is kept.
OPENBIS_SESSION, _ = utils.connect_openbis(CONFIG_ELN["url"], CONFIG_ELN["token"])

In [None]:
class Molecule:
    """A named RDKit molecule plus helpers for fingerprint-based similarity.

    Instance fields:
      name  -- label supplied by the caller
      strIn -- the raw structure input exactly as received
      mol   -- the parsed RDKit molecule, or a falsy value (None) when nothing
               could be parsed
      fp    -- the fingerprint; only exists after ``fingerprint()`` was called
    """

    def __init__(self, name, strIn):
        """Parse ``strIn`` as CDXML first and fall back to SMILES.

        The SMILES fallback is taken when the CDXML parser raises
        ``RuntimeError``. When the CDXML input contains several molecules the
        first one is used; when it contains none, ``mol`` is set to ``None``.
        """
        self.name = name
        self.strIn = strIn
        try:
            parsed = Chem.MolsFromCDXML(strIn)
            # Always reduce the parser's tuple to a single molecule (or None) so
            # that callers can rely on ``if molecule.mol`` and on ``mol`` being
            # usable by ``get_invariant``/``fingerprint``.
            self.mol = parsed[0] if len(parsed) > 0 else None
        except RuntimeError:
            self.mol = Chem.MolFromSmiles(strIn)

    def fingerprint(self, fpgen, **kwargs):
        """Compute the fingerprint with ``fpgen`` and store it in ``self.fp``.

        ``kwargs`` are forwarded to ``get_invariant`` (BCN, BrI, Topology) and
        the resulting list is passed as ``customAtomInvariants``.
        """
        self.fp = fpgen.GetFingerprint(self.mol, customAtomInvariants=self.get_invariant(**kwargs))

    def get_invariant(self, BCN=False, BrI=False, Topology=False):
        """Return one integer invariant per atom, starting from atomic numbers.

        BCN      -- map boron (5) and nitrogen (7) onto carbon (6)
        BrI      -- map iodine (53) onto bromine (35)
        Topology -- ignore elements entirely: every atom gets invariant 1
        """
        inv = [atom.GetAtomicNum() for atom in self.mol.GetAtoms()]
        if BCN:
            inv = [6 if number in (5, 7) else number for number in inv]
        if BrI:
            inv = [35 if number == 53 else number for number in inv]
        if Topology:
            inv = [1] * len(inv)
        return inv

    def tanimoto(self, refMol):
        """Tanimoto similarity between this fingerprint and ``refMol``'s."""
        return DataStructs.TanimotoSimilarity(self.fp, refMol.fp)

    def dice(self, refMol):
        """Dice similarity between this fingerprint and ``refMol``'s."""
        return DataStructs.DiceSimilarity(self.fp, refMol.fp)
    
def _build_molecule_description(molecule_props, empa_number, similarity):
    """Format the read-only text shown next to one search hit."""
    return (
        f"Name: {molecule_props.get('name', None)}\n"
        f"Empa Number: {empa_number}\n"
        f"IUPAC Name: {molecule_props.get('iupac_name', None)}\n"
        f"Sum Formula: {molecule_props.get('sum_formula', None)}\n"
        f"SMILES: {molecule_props.get('smiles', None)}\n"
        f"Similarity: {similarity}\n"
    )


def _load_preview_image(openbis_molecule, fallback_image):
    """Return the bytes of the molecule's ELN preview image, or ``fallback_image``.

    The first "ELN_PREVIEW" dataset is downloaded below ``images/``, its first
    file is read, and the download directory is removed again even when reading
    the file fails.
    """
    datasets = openbis_molecule.get_datasets(type="ELN_PREVIEW")
    try:
        dataset = datasets[0]
    except (IndexError, KeyError):
        # Depending on the container returned by get_datasets, indexing an empty
        # result may raise instead of yielding a falsy value; both mean "no preview".
        dataset = None
    if not dataset:
        return fallback_image

    dataset.download(destination="images")
    try:
        material_image_filepath = dataset.file_list[0]
        return utils.read_file(f"images/{dataset.permId}/{material_image_filepath}")
    finally:
        shutil.rmtree(f"images/{dataset.permId}/")


def get_similar_molecules(openbis_session):
    """Rank the molecules of all openBIS substances against a reference structure.

    The reference is the uploaded CDXML file if there is one, otherwise the text
    of the SMILES box. Search options are read from the module-level widgets
    (invariant checkboxes, bond-type checkbox, Morgan radius slider, number of
    hits slider). Every object of type "SUBSTANCE" whose linked molecule has a
    "smiles" property that RDKit can parse is fingerprinted with a Morgan
    generator and scored with Dice similarity. The best hits are written to
    ``similar_molecules_box.children`` as one text/image row each.

    Nothing is returned. A message is printed, and the result box is left
    untouched, when no structure was entered or the reference cannot be parsed.
    If fewer molecules are available than requested, all of them are shown.
    """
    cdxml_uploaded = len(cdxml_file_uploader.value) > 0
    if not cdxml_uploaded and len(smiles_textbox.value) == 0:
        print("No molecule structure entered.")
        return

    # select how to look for similarities
    invariant_options = {
        "BrI": BrIInv_checkbox.value,
        "BCN": BCNInv_checkbox.value,
        "Topology": topology_checkbox.value,
    }

    # choose fingerprint generator. Morgan seems to work best
    fpgen = AllChem.GetMorganGenerator(
        radius=radius_morgan_generator_intslider.value,
        useBondTypes=use_bondtypes_checkbox.value,
    )

    # Build and validate the reference molecule before querying openBIS, so an
    # unusable input is reported without any server round trips.
    if cdxml_uploaded:
        _, file_metadata = next(iter(cdxml_file_uploader.value.items()))
        ref_mol_structure = file_metadata['content']
    else:
        ref_mol_structure = smiles_textbox.value

    refMol = Molecule("Ref", ref_mol_structure)
    if not refMol.mol:
        print("The reference molecule structure could not be parsed.")
        return
    refMol.fingerprint(fpgen, **invariant_options)

    # empa_number -> (Molecule, openBIS molecule object). Keeping the openBIS
    # object here avoids fetching every substance again for each displayed hit.
    candidates = {}
    for substance in openbis_session.get_objects(type="SUBSTANCE"):
        subst_props = substance.props.all()
        subst_empa_number = subst_props.get("empa_number")
        subst_molecule_permid = subst_props.get("has_molecule")
        if not subst_molecule_permid:
            continue  # substance is not linked to a molecule
        openbis_molecule = openbis_session.get_object(subst_molecule_permid)
        mol_structure = openbis_molecule.props.all().get("smiles", None)
        if not mol_structure:
            continue
        candidate = Molecule(subst_empa_number, mol_structure)
        if candidate.mol:  # There are SMILES that cannot be converted using RDKit
            candidates[subst_empa_number] = (candidate, openbis_molecule)

    # create fingerprint for all molecules
    for candidate, _ in candidates.values():
        candidate.fingerprint(fpgen, **invariant_options)

    # rank by dice similarity to the reference molecule, best first
    ranked_hits = sorted(
        ((candidate, openbis_molecule, candidate.dice(refMol))
         for candidate, openbis_molecule in candidates.values()),
        key=lambda hit: hit[2],
        reverse=True,
    )

    fallback_image = utils.read_file("images/white_screen.jpg")
    molecule_widgets = []
    # Slicing (instead of indexing up to the slider value) tolerates inventories
    # with fewer usable molecules than the requested number of hits.
    for candidate, openbis_molecule, similarity in ranked_hits[:num_top_molecules_intslider.value]:
        molecule_description_string = _build_molecule_description(
            openbis_molecule.props.all(), candidate.name, similarity
        )
        fig = ipw.Image(
            value=_load_preview_image(openbis_molecule, fallback_image),
            width='200px',
            height='300px',
            layout=ipw.Layout(border='solid 1px #cccccc'),
        )
        textarea = utils.Textarea(
            disabled=True,
            layout=ipw.Layout(width='600px', height='200px'),
            value=molecule_description_string,
        )
        molecule_widgets.append(ipw.HBox([textarea, fig]))

    similar_molecules_box.children = molecule_widgets

# Find similar molecules in openBIS inventory

In [None]:
# --- Reference structure inputs: a SMILES text field or a single CDXML upload ---
smiles_textbox = utils.Text(
    description="SMILES",
    disabled=False,
    layout=ipw.Layout(width='950px'),
    placeholder="Write SMILES here...",
    style={'description_width': "48px"},
)
cdxml_label = ipw.Label(value="CDXML")
cdxml_file_uploader = ipw.FileUpload(multiple=False, accept='.cdxml')
cdxml_boxes = ipw.HBox([cdxml_label, cdxml_file_uploader])

# --- Search trigger: runs the similarity search against the open session ---
search_molecules_button = utils.Button(
    description='',
    disabled=False,
    button_style='',
    tooltip='Search',
    icon='search',
    layout=ipw.Layout(width='100px', height='50px'),
)
search_molecules_button.on_click(lambda _button: get_similar_molecules(OPENBIS_SESSION))

# --- Fingerprint options read by get_similar_molecules ---
BCNInv_checkbox = utils.Checkbox(
    description="BCN invariant",
    indent=False,
    layout=ipw.Layout(width="120px"),
)
BrIInv_checkbox = utils.Checkbox(
    description="BrI invariant",
    indent=False,
    layout=ipw.Layout(width="120px"),
)
topology_checkbox = utils.Checkbox(
    description="Invariant to any element",
    indent=False,
    layout=ipw.Layout(width="170px"),
)
use_bondtypes_checkbox = utils.Checkbox(
    description="Use bond types",
    indent=False,
    layout=ipw.Layout(width="120px"),
)

num_top_molecules_intslider = utils.IntSlider(
    value=1,
    description="Number of similar molecules",
    min=1,
    max=20,
    style={"description_width": "165px"},
    layout=ipw.Layout(width="400px"),
)
options_checkboxes = ipw.HBox(
    [BCNInv_checkbox, BrIInv_checkbox, topology_checkbox, use_bondtypes_checkbox]
)
radius_morgan_generator_intslider = ipw.IntSlider(
    value=1,
    description="Radius (Morgan Generator)",
    min=1,
    max=5,
    style={"description_width": "160px"},
    layout=ipw.Layout(width="400px"),
)

# --- Page layout: inputs, settings, then the search button ---
display(utils.Markdown(data="### Reference Molecule Structure"))
display(ipw.VBox([smiles_textbox, cdxml_boxes]))

display(utils.Markdown(data="### Settings"))
settings_box = [options_checkboxes, radius_morgan_generator_intslider, num_top_molecules_intslider]
display(ipw.VBox(settings_box))
display(search_molecules_button)

## Results

In [None]:
# Initially empty container; get_similar_molecules replaces its children with
# one row per search hit each time the search button is clicked.
similar_molecules_box = ipw.VBox()
display(similar_molecules_box)