In [None]:
#@title Fragmenstein
#@markdown Fragmenstein is a position-based fragment-merging python3 tool.
#@markdown In its merging/linking operation, under the coordination of the class Victor, 
#@markdown the class Monster finds spatially overlapping atoms and stitches them together (with RDKit),
#@markdown then the class Igor reanimates (minimises in PyRosetta) them within the protein site restraining the atoms
#@markdown to original positions.
#@markdown As this compound may not be purchasable, one can use the placement operation to
#@markdown make a stitched together molecule based on a template.

#@markdown This notebook does several operations.
#@markdown It optionally minimises the template structure,
#@markdown and optionally extracts the hits from provided PDB structures.
#@markdown It combines combinatorially the provided hits
#@markdown It then searches for the most similar molecules to the user chosen molecule
#@markdown in the Enamine Real database (via the API John Irwin's SmallWorld server)
#@markdown and places them.

#@markdown NB Whereas Fragmenstein can deal with covalent ligands
#@markdown and can interconvert a few cysteine reactive warheads
#@markdown this notebook does not do any of that due to corner case mayhem.

#@markdown Fragmenstein can _partially_ work without PyRosetta

#@markdown This notebook is from [https://github.com/matteoferla/Fragmenstein](https://github.com/matteoferla/Fragmenstein).

#@markdown It can be opened in Colabs via [https://colab.research.google.com/github/matteoferla/Fragmenstein/blob/main/colabs/colabs-pyrosetta-migrate_ligands.ipynb](https://colab.research.google.com/github/matteoferla/pyrosetta_help/blob/main/colabs/colabs-pyrosetta-migrate_ligands.ipynb)

#@markdown See also:

#@markdown * https://github.com/matteoferla/Fragmenstein
#@markdown * https://github.com/matteoferla/Python_SmallWorld_API
#@markdown * https://github.com/matteoferla/pyrosetta_help

In [None]:
#@title Installation

# Muppet-proofing: are we in colab?
# The class name of the running IPython shell tells the front-ends apart.
shell_name = get_ipython().__class__.__name__
if shell_name == 'TerminalInteractiveShell':
    raise RuntimeError('This is a colabs notebook. Why are you running it in the terminal?')
if shell_name not in ('Shell', 'ZMQInteractiveShell'):
    raise RuntimeError(f'This is a colabs notebook. not a {shell_name}')
# only the two supported shells get this far
modality = 'colab' if shell_name == 'Shell' else 'jupyter'

#@markdown ### Quicker loading option via Google Drive
#@markdown Installing PyRosetta with optional backup to your drive (way quicker next time!).
#@markdown Note that PyRosetta occupies some 10 GB, so you'll need to be on the 100 GB plan of Google Drive (it's one pound a month).

#@markdown NB. If `use_drive` is True, you will be prompted to give permission to 
#@markdown use Google Drive —_always_ remember to check strangers code against data theft: search and look for all instances of `http`, `requests` and `post` in the code.

#@markdown ### Download PyRosetta
#@markdown The following is not the real username and password. However, the format is similar.
username = 'boltzmann' #@param {type:"string"}
# Normalise what was typed in the form. The result has to be assigned back:
# `str.strip`/`str.lower` return new strings and leave the original untouched.
username = username.strip().lower()
password = 'constant' #@param {type:"string"}
#@markdown Are these the "normal" common credentials?
#@markdown If so their hash will be checked beforehand to check if they are correct.
hash_comparision_required = True #@param {type:"boolean"}

#@markdown Release to install:
_release = 'release-295' #@param {type:"string"}
#@markdown Use Google Drive for PyRosetta (way faster next time, but takes up space)
#@markdown (NB. You may be prompted to follow a link and possibly authenticate and then copy a code into a box
use_drive = True #@param {type:"boolean"}

import hashlib
import importlib
import os
import platform
import subprocess
import sys
from typing import Optional

import pip

def install_and_import(package_name: str, 
                       pypi_name: Optional[str]=None,
                       alias_name: Optional[str]=None):
    """Import a module into this notebook's globals, pip-installing it first if it is absent.

    If the module has a different name in pypi (`pypi_name`) 
    than its import name (`package_name`), specify it.
    
         pip install pypi_name
         import package_name as alias_name

    :param package_name: name used in the ``import`` statement
    :param pypi_name: name used by ``pip install``; defaults to ``package_name``
    :param alias_name: global name the module is bound to; defaults to ``package_name``
    :raises ImportError: if the package is present but a module *it* needs is missing
    :raises subprocess.CalledProcessError: if ``pip install`` fails
    """
    if pypi_name is None:
        pypi_name = package_name
    if alias_name is None:
        alias_name = package_name
    try:
        module = importlib.import_module(package_name)
    except ImportError as error:
        if error.name != package_name:
            # these are not the droids we are looking for:
            # the package itself was found, one of its dependencies was not.
            raise ImportError(f'Import of {package_name} requires module {error.name}...',
                              name=error.name) from error
        # Run pip as a subprocess of the running interpreter:
        # `pip._internal` is private and not a supported way of calling pip.
        subprocess.check_call([sys.executable, '-m', 'pip', 'install', pypi_name])
        # make the import system notice the freshly installed files
        importlib.invalidate_caches()
        module = importlib.import_module(package_name)
    globals()[alias_name] = module

# ================================================================
# install pyrosetta

# Decide which folder PyRosetta is installed from (`_path`).
if modality != 'colab':
    # jupyter
    _path = './'
elif not use_drive:
    _path = '/content'
else:
    # Colab with Google Drive: mount it and work from within it
    from google.colab import drive
    drive.mount('/content/drive')
    _path = '/content/drive/MyDrive'
    os.chdir(_path)

# install pyrosetta
# The PyRosetta archive/folder names used below embed the interpreter version
# without the dot (major followed by minor), so derive it from the running Python.
py_version = f'{sys.version_info.major}{sys.version_info.minor}'
try:
    # is pyrosetta installed?
    import pyrosetta
except ImportError: # pyrosetta is not installed.
    # is pyrosetta not downloaded?
    if not any('PyRosetta4.Release' in filename for filename in os.listdir()):
        # check if hash is right.
        if hash_comparision_required:
            # verify the username and password are correct without actually knowing them.
            hashed_username = hashlib.sha256(username.encode()).hexdigest()
            hashed_password = hashlib.sha256(password.encode()).hexdigest()
            expected_hashed_username = 'cf6f296b8145262b22721e52e2edec13ce57af8c6fc990c8ae1a4aa3e50ae40e'
            expected_hashed_password = '45066dd976d8bf0c05dc8dd4d58727945c3437e6eb361ba9870097968db7a0da'
            assert hashed_username == expected_hashed_username, 'The hash of the username is not as expected'
            assert hashed_password == expected_hashed_password, 'The hash of the password is not as expected'
        # download tar in the scratch folder... but extract to whatever the working path is!
        # NOTE(review): the scratch folder is hard-coded to `/content` — presumably absent outside Colab; confirm.
        # `os.system` returns the exit status: zero (falsy) means success.
        assert not os.system(f'curl -u {username}:{password} https://graylab.jhu.edu/download/PyRosetta4/archive/release/PyRosetta4.Release.python{py_version}.ubuntu/PyRosetta4.Release.python{py_version}.ubuntu.{_release}.tar.bz2 -o /content/a.tar.bz2')
        assert not os.system('tar -xf /content/a.tar.bz2') # destination --> _path
    assert not os.system(f'pip3 install -e {_path}/PyRosetta4.Release.python{py_version}.ubuntu.{_release}/setup/')

# ================================================================
# install other stuff
# Each call imports the module if it is importable, else pip-installs the PyPI
# distribution first (see `install_and_import` above). The order matters:
# rdkit has to be importable before the two `from rdkit ...` lines run.
install_and_import(package_name='Bio', pypi_name='biopython')
install_and_import(package_name='rdkit', pypi_name='rdkit-pypi')
from rdkit import Chem
from rdkit.Chem import PandasTools
# bound to the global name `ph`
install_and_import(package_name='pyrosetta_help', pypi_name='pyrosetta-help', alias_name='ph')
install_and_import(package_name='rdkit_to_params', pypi_name='rdkit-to-params')
install_and_import(package_name='smallworld_api', pypi_name='smallworld-api')
# same name on PyPI as for the import
install_and_import('fragmenstein')
# refresh imports
import site
site.main()

# ================================================================
# make folders in _path
working_folder = 'fragmenstein_data'
input_folder = 'input'
output_folder = 'output'
# Always end up inside the working folder, whether it already existed or not:
# entering it only when it was newly created would leave `input`/`output`
# in the parent folder on every later run.
# The basename check stops a re-run of this cell from nesting a second copy.
if os.path.basename(os.getcwd()) != working_folder:
    os.makedirs(working_folder, exist_ok=True)
    os.chdir(working_folder)
for folder in (input_folder, output_folder):
    os.makedirs(folder, exist_ok=True)

# ================================================================
# 3D viewer

install_and_import('py3Dmol')
from typing import *
from rdkit import Chem

def stylize(representation:str, color:str) -> Dict[str, Dict[str, Dict[str, str]]]:
    """
    Build the nested style dictionary passed to the py3Dmol calls below.

    A value containing 'carbon' (case-insensitive) is stored under 'colorscheme',
    anything else under 'color'.
    """
    color_key = 'colorscheme' if 'carbon' in color.lower() else 'color'
    return {'style': {representation: {color_key: color}}}

def make_3Dview(template_pdbblock, colormols:Dict[str, List[Chem.Mol]]) -> py3Dmol.view:
    """
    Show the template as a grey cartoon with every molecule as sticks on top.

    colormols is a dictionary of color/colorscheme to list of mols:
    each mol is added as a mol block in the style of its key.
    The view is zoomed on the `hetflag` selection before being returned.
    """
    hetero_selection = dict(hetflag=True)
    viewer = py3Dmol.view(js="https://3dmol.org/build/3Dmol.js")
    viewer.addModel(template_pdbblock, "pdb", stylize('cartoon','gainsboro'))
    viewer.setStyle(hetero_selection, stylize('stick', 'whiteCarbon'))
    for color in colormols:
        for mol in colormols[color]:
            molblock = Chem.MolToMolBlock(mol)
            viewer.addModel(molblock, "mol", stylize('stick', color))
    viewer.zoomTo(dict(hetflag=True))
    return viewer

# ================================================================
# mol grid

from rdkit.Chem import Draw
from rdkit.Chem import AllChem
from IPython.display import display

class HorizontalMols:
    """
    Side-by-side HTML display of molecules (name, SMILES and a 2D depiction each).

    Accepts molecules as separate arguments, ``HorizontalMols(mol1, mol2)``,
    or as a list/tuple, ``HorizontalMols([mol1, mol2])``.
    Each item may be a ``Chem.Mol``, a SMILES string, a binary (pickled) mol or None.
    """
    def __init__(self, *args):
        # flatten list/tuple arguments so a whole collection can be passed in one go
        flat_args = []
        for arg in args:
            if isinstance(arg, (list, tuple)):
                flat_args.extend(arg)
            else:
                flat_args.append(arg)
        self.mols = [self._convert(arg) for arg in flat_args]
        
    def _convert(self, mol_or_smiles: Union[str, Chem.Mol]) -> Chem.Mol:
        """
        Coerce the input into a ``Chem.Mol``.

        :raises ValueError: if a SMILES string cannot be parsed
        :raises TypeError: if the input is of an unsupported type
        """
        if isinstance(mol_or_smiles, Chem.Mol):
            return mol_or_smiles
        elif isinstance(mol_or_smiles, str):
            mol = Chem.MolFromSmiles(mol_or_smiles)
            if mol is None:
                # RDKit signals a parsing failure by returning None: fail here, not while drawing
                raise ValueError(f'Could not parse the SMILES {mol_or_smiles}')
            return mol
        elif isinstance(mol_or_smiles, bytes):
            return Chem.Mol(mol_or_smiles)
        elif mol_or_smiles is None:
            return Chem.Mol()
        else:
            raise TypeError(f'What is {mol_or_smiles}?')
        
    def _mol_to_svg(self, mol: Chem.Mol) -> str:
        """Draw a 250x200 SVG of a copy of the molecule with 2D coordinates."""
        d2d = Draw.rdMolDraw2D.MolDraw2DSVG(250, 200)
        # work on a copy: computing 2D coordinates would otherwise overwrite the mol's conformer
        mol2 = Chem.Mol(mol)
        AllChem.Compute2DCoords(mol2)
        d2d.DrawMolecule(mol2)
        d2d.FinishDrawing()
        return d2d.GetDrawingText()
    
    def _get_mol_name(self, mol: Chem.Mol) -> str:
        """Return the `_Name` property or a placeholder if the property is absent."""
        if mol.HasProp('_Name'):
            return mol.GetProp('_Name')
        else:
            return '「 no name 」'

    def _get_mol_details(self, mol: Chem.Mol) -> Dict[str, str]:
        """Gather the three fields of the HTML template: name, smiles and svg."""
        return dict(name=self._get_mol_name(mol),
                    smiles=Chem.MolToSmiles(mol),
                    svg=self._mol_to_svg(mol)
                   )
        
    def _repr_html_(self):
        """HTML shown by Jupyter: one left-floated block per molecule."""
        from html import escape
        # the heading needs a proper closing tag, else the rest is rendered as heading
        inner = '<h4>{name}</h4><p>{smiles}</p>{svg}'
        template = f'<div style="float: left; padding: 10px;">{inner}</div>'
        blocks = []
        for mol in self.mols:
            details = self._get_mol_details(mol)
            # names and SMILES are free text: escape them (the SVG is markup and stays as is)
            details['name'] = escape(details['name'])
            details['smiles'] = escape(details['smiles'])
            blocks.append(template.format(**details))
        return "\n".join(blocks)
    
    def show(self) -> None:
        """Display this object in the notebook."""
        from IPython.display import display
        display(self)

In [None]:
#@title Start PyRosetta
#@markdown Leave alone and just run it. 
#@markdown Only one that you might want to change is `ignore_waters`
#@markdown as merging a ligand with its watershell may be a reasonable thing to do
#@markdown in extreme circumstances —like not even Chuck Norris could find a followup.
import pyrosetta, logging
import pyrosetta_help as ph

#@markdown Do not optimise hydrogen on loading:
no_optH = False #@param {type:"boolean"}
#@markdown Ignore (True) or raise error (False) if novel residue (e.g. ligand) —  **don't tick this**.
ignore_unrecognized_res=False  #@param {type:"boolean"}
#@markdown Use autogenerated PDB residues? These are often weird (bad geometry, wrong match, protonated etc.): —best do it properly and parameterise it, so **don't tick this**.
load_PDB_components=False  #@param {type:"boolean"}
#@markdown Ignore all waters:
ignore_waters=True  #@param {type:"boolean"}

# Turn the form values into the option string handed to `pyrosetta.init` below.
# NOTE(review): `ex1=None`/`ex2=None` are presumably emitted as value-less flags — confirm in pyrosetta_help.
extra_options= ph.make_option_string(no_optH=no_optH,
                                  ex1=None,
                                  ex2=None,
                                  ignore_unrecognized_res=ignore_unrecognized_res,
                                  load_PDB_components=load_PDB_components,
                                  ignore_waters=ignore_waters)

# capture to log
logger = ph.configure_logger()
# first handler: only warnings and worse
logger.handlers[0].setLevel(logging.WARNING) # logging.WARNING = 30
pyrosetta.init(extra_options=extra_options, set_logging_handler=True)
# the level is set again on the 'rosetta' logger after initialisation
logger = logging.getLogger('rosetta')
logger.handlers[0].setLevel(logging.WARNING) # logging.WARNING = 30

In [None]:
#@title Upload Template model
#@markdown Upload PDB file of the template model used for placing the merger.
#@markdown Suggestion: use a model bound to the native ligand or similar
#@markdown as the side chains will be poised for action!
#@markdown NB. this PDB and those of the hits need to be superposed.

from google.colab import files
uploaded = files.upload()
assert len(uploaded) ==1, 'Upload only one template. Hits later.'
# `uploaded` maps filename to content: take the only entry
pdbblock = [*uploaded.values()][0]
# the content may arrive as bytes: the rest of the notebook uses it as text
pdbblock = pdbblock.decode('utf8') if isinstance(pdbblock, bytes) else pdbblock

In [None]:
#@title Optionally prepare it (1/3)
#@markdown This will be done by 

#@markdown 1. loading it in PyRosetta
#@markdown 2. Optionally energy minimising around a target
#@markdown 3. Optionally remove some molecules

#@markdown ### Step 1
#@markdown If the model has novel ligands, they will be loaded.
#@markdown But to do this a residue type  (=topology) needs to be made or loaded.
#@markdown These are saved as "params files".
#@markdown These following options control both the "acceptor" and "donor" poses (if uploaded).
#@markdown ### Params
#@markdown * Some compounds are parameterised in the database folder of rosetta,
#@markdown others in the PDB component database (if loaded).
#@markdown * Uses the params defined in the cell of the acceptor pose.
#@markdown * If there is no topology available one will be made.
#@markdown * If a params file is present in the working folder it will use it.
#@markdown * See below or visit https://params.mutanalyst.com/ to generate them (upload them with the folder icon on the left).

#@markdown This forces it (a bit silly):
force_parameterisation = False  #@param {type:"boolean"}
#@markdown If it needs to be parameterised make it protonated for pH 7?
neutralise_params=True #@param {type:"boolean"}
save_params=True #@param {type:"boolean"}

#@markdown If a params file is present in the working folder it will use it.
#@markdown Leave this blank... otherwise  (comma separated w/ no rando spaces):
extra_params_files_to_use = '' #@param {type:"string"}
# tolerate stray spaces around the commas all the same, and skip empty entries
extra_params = [f.strip() for f in extra_params_files_to_use.split(',') if f.strip()]
use_all_folder_params = False  #@param {type:"boolean"}
if use_all_folder_params:
    # `os.path.splitext` returns a (root, extension) tuple: it is the extension
    # that has to be compared, the tuple itself never equals a string.
    present_params = [filename for filename in os.listdir() if os.path.splitext(filename)[1] == '.params']
else:
    present_params = []
print('loading pose...')
# the params files listed in the form and those found in the folder are passed together
template_pose = ph.ligands.load.parameterised_pose_from_pdbblock(pdbblock,
                                 wanted_ligands = [],
                                 force_parameterisation=force_parameterisation,
                                 neutralise_params=neutralise_params,
                                 save_params=save_params,
                                 overriding_params=extra_params+present_params)

In [None]:
#@title Optional energy minimisation around a target (2/3)
#@markdown (Requires previous cell run)
assert 'template_pose' in globals(), 'Step 1 was not run'

#@markdown If use density map is true, you will be prompted to upload a density map.
#@markdown Upload a f0fc ccp4 or a mrc map. (not a ccp4 difference map, 
#@markdown a mtz reciprocal space map or a pirate treasure map)
#@markdown The map needs to be in the same position as the template.
use_density_map = True  #@param {type:"boolean"}
#@markdown The whole structure could be minimised, but that would be pointless costly timewise
#@markdown for this task.
#@markdown Specify what residue (amino acid or ligand) to centre around 
center_residue_index = 1  #@param {type:"integer"}
center_residue_chain = 'A'  #@param {type:"string"}
# convert the PDB numbering into the pose index; the assertion below treats 0 as "not found"
center_index : int = template_pose.pdb_info().pdb2pose(res=center_residue_index, chain=center_residue_chain)
assert center_index != 0, 'That residue does not exist!'

#@markdown Specify which neighbouring residues to select in one of three ways:

#@markdown (1) Cutoff distance for the neighbouring residues (in Ångströms) (centroid to centroid)?
#@markdown set to zero to not use.
neighborhood_radius = 1  #@param {type:"integer"}
#@markdown (2) Cutoff distance for the neighbouring residues (in Ångströms) (closest atom to closest atom)?
#@markdown set to zero to not use.
cc_neighborhood_radius = 0  #@param {type:"integer"}
#@markdown (3) Max number of neighbouring residues to choose?
#@markdown set to zero to not use.
n_neighbors = 0  #@param {type:"integer"}

#@markdown ## Minimisation
#@markdown How many cycles of FastRelax to use? 3–15
cycles = 3  #@param {type:"integer"}
#@markdown to change scorefunctions and so forth edit the code.

# Get map
if use_density_map:
    # The map is stored with the other inputs and read back from that very path
    # (writing it in one place and loading it from another would not find the file).
    map_filename = os.path.join(input_folder, 'uploaded_map.ccp4')
    uploaded = files.upload()
    assert len(uploaded) ==1, 'wrong number of files (only one plz)'
    mapblock = list(uploaded.values())[0]
    with open(map_filename, 'wb') as fh:
        fh.write(mapblock)
    # this can be done with `Igor.relax_with_ED`, but I wanted the option here to do
    # it with or without the map
    ed = ph.prep_ED(template_pose, map_filename)
    # the fit is that of the template pose loaded in step 1
    assert ed.matchPose(template_pose) > 0.5, 'This is a rubbish fit. Upload the right map.'
    
# prep scorefunction
scorefxn = pyrosetta.get_fa_scorefxn()
if use_density_map:
    scorefxn.set_weight(pyrosetta.rosetta.core.scoring.ScoreType.elec_dens_fast,
                        30)
    
# get neighbourhood selector
# The three cutoffs are those chosen in the form fields above:
# they are deliberately not assigned again here, as that would discard the user's choice.
selector = pyrosetta.rosetta.core.select.residue_selector
resi_sele = selector.ResidueIndexSelector(center_index)
if neighborhood_radius != 0:
    neighbor_sele = selector.NeighborhoodResidueSelector(resi_sele,
                                                         distance=neighborhood_radius,
                                                         include_focus_in_subset=True)
elif cc_neighborhood_radius != 0:
    neighbor_sele = selector.CloseContactResidueSelector()
    neighbor_sele.central_residue_group_selector(resi_sele)
    neighbor_sele.threshold(cc_neighborhood_radius)
elif n_neighbors != 0:
    neighbor_sele = selector.NumNeighborsSelector(n_neighbors, 20)
    # Ah. True. NumNeighborsSelector does not work in PyRosetta.
    raise NotImplementedError('Selecting by number of neighbours is not available: use one of the two distance cutoffs.')
else:
    raise ValueError('No neighbourhood specified: one of the three cutoffs must be non-zero.')

# relax
# only the selected neighbourhood is allowed to move (backbone and side chains)
neighbor_vector = neighbor_sele.apply(template_pose)
movemap = pyrosetta.MoveMap()
movemap.set_bb(allow_bb=neighbor_vector)
movemap.set_chi(allow_chi=neighbor_vector)
relax = pyrosetta.rosetta.protocols.relax.FastRelax(scorefxn, cycles)
relax.set_movemap(movemap)
relax.apply(template_pose)

# the template block used downstream is now the minimised one
pdbblock = ph.get_pdbstr(template_pose)

In [None]:
#@title Optional remove specified stuff (3/3)
#@markdown (Requires the cell two back to be run)
import re
from warnings import warn
unwanted_residue_names_raw = 'HOH' #@param {type:"string"}
# any non-word character (comma, semicolon etc.) is treated as a separator
unwanted_residue_names_raw = re.sub(r'[\W]',' ', unwanted_residue_names_raw)
unwanted_residue_names = unwanted_residue_names_raw.split()
selector = pyrosetta.rosetta.core.select.residue_selector
for unwanted_resn in unwanted_residue_names:
    sele = selector.ResidueNameSelector()
    sele.set_residue_name3(unwanted_resn)
    try:
        # it is the selector *instance* that gets applied, not the `selector` module
        unwanted_idxs = list(selector.ResidueVector(  sele.apply(template_pose)  ))
    except RuntimeError:
        # ResidueNameSelector: XYZ is not a valid residue type name.
        warn(f'There is no residue {unwanted_resn}!')
        # nothing to delete for this name: applying the selector again would only raise again
        continue
    # delete from the highest index down so the remaining indices stay valid
    for idx in reversed(unwanted_idxs):
        template_pose.delete_residue_slow(idx)
    assert len(selector.ResidueVector(  sele.apply(template_pose)  )) == 0
    
pdbblock = ph.get_pdbstr(template_pose)

In [41]:
#@title Upload hits or bound hits
#@markdown As the decision is based on extension, please no random extensions...
#@markdown `mol.try2.mol` is fine, but `mol.molecule` or `mol.mol.txt` are not.

#@markdown ### Option 1.
#@markdown If you have a single multientry sdf file, upload that.

#@markdown ### Option 2.
#@markdown If you have multiple mol files, upload those.
#@markdown RDKit handles mdl-mol files better than mol2 files.

#@markdown ### Option 3.
#@markdown If you require your ligand to be extracted from pdb files
#@markdown upload those and specify the three-letter code of the ligand
#@markdown (this is the 3-letter name within the provided PDB(s) not the intended name for the merger!):
ligand_residue_name = 'LIG' #@param {type:"string"}
#@markdown This latter option is über-recommended if you have a covalently bound ligand
#@markdown but your extracted mol files are not or lack bond order.
#@markdown Note that you'll have to also upload a SMILES file (`.smi`)
#@markdown —this is just a tab separated table of SMILES tab name.

#@markdown If your PDB lacks `CONECT` entries (you monster), tick this:
proximityBonding = False #@param {type:"boolean"}


# Go!
import os
from google.colab import files
from rdkit import Chem
from typing import *
from warnings import warn

# get
uploaded = files.upload()

# sort
# the uploads are split by extension: each entry keeps its filename and its text content
uploaded_split = {k: [] for k in ('smi', 'sdf', 'mol', 'mol2', 'pdb')}
for filename in uploaded:
    extension = os.path.splitext(filename)[1].lower()[1:]
    if extension not in uploaded_split:
        raise ValueError(f'The extension {extension} is not coded for')
    data = uploaded[filename] # str or bytes??
    if isinstance(data, bytes):
        data = data.decode('utf8')
    uploaded_split[extension].append( dict(filename=filename, data=data) )

# parse SMILES for PDB
# smilesdex? yes, I am going to hell for using not only Hungarian notation in Python, 
# but a Pokémon flavoured one.
# It maps name -> SMILES and is filled from the content of the `.smi` files themselves
# (not from whichever file happened to be the last in the loop above).
smilesdex : Dict[str, str] = {}
for smi_entry in uploaded_split['smi']:
    for line in smi_entry['data'].splitlines():
        # tab separated: SMILES tab name. Blank or single-column lines are skipped.
        fields = line.strip().split('\t')
        if len(fields) < 2:
            continue
        smilesdex[fields[1].strip()] = fields[0].strip()
    
# parsers
def read_sdf(filename:str, data:str) -> List[Chem.Mol]:
    """
    Save the SDF block in the input folder and read all its molecules back.

    :param filename: name of the uploaded file (used for the saved copy)
    :param data: text content of the SDF file
    :return: the successfully parsed molecules (hydrogens kept)
    """
    mols = []
    # I am pretty sure there's no way to run `Chem.SDMolSupplier` on a block
    # The file is read from the same path it is written to.
    path = os.path.join(input_folder, filename)
    with open(path, 'w') as fh:
        fh.write(data)
    with Chem.SDMolSupplier(path, removeHs=False) as suppl:
        for i, mol in enumerate(suppl):
            if mol is None:
                # the supplier yields None for records RDKit cannot parse
                warn(f'Entry number {i} of {filename} could not be parsed and was skipped')
                continue
            # do I need to add a name??
            mols.append(mol)
    return mols

def read_mol(filename:str, data:str, fun: Callable) -> Chem.Mol:
    """
    Parse a single-molecule block and name the molecule after the file.

    :param filename: name of the uploaded file; its stem becomes the `_Name` property
    :param data: text content of the file
    :param fun: parser taking the block and returning a mol (e.g. `Chem.MolFromMolBlock`)
    :raises ValueError: if the parser returns None (i.e. could not read the block)
    """
    mol =  fun(data)
    if mol is None:
        # fail with the filename rather than with an AttributeError on None
        raise ValueError(f'The file {filename} could not be parsed')
    name = os.path.splitext(filename)[0]
    mol.SetProp('_Name', name)
    return mol

def get_smiles(filename: str, smiledex: Dict[str, str]) ->  Union[None, str]:
    """
    Look up the SMILES stored under the filename minus its extension.

    :return: the SMILES, or None (with a warning) if the name is not in `smiledex`
    """
    stem, _extension = os.path.splitext(filename)
    try:
        return smiledex[stem]
    except KeyError:
        warn(f'Could not find matching SMILES to {stem}')
        return None
        

# parse molecules...
# `Victor` is needed for the extraction from PDB files
from fragmenstein import Victor

hits = []
for entry in uploaded_split['sdf']:
    hits.extend( read_sdf(entry['filename'], entry['data']) ) 
for entry in uploaded_split['mol']:
    hits.append( read_mol(entry['filename'], entry['data'], Chem.MolFromMolBlock) )
for entry in uploaded_split['mol2']:
    hits.append( read_mol(entry['filename'], entry['data'], Chem.MolFromMol2Block) )
for entry in uploaded_split['pdb']:
    # the hit is named after its file, the same key `get_smiles` looks up in the SMILES table
    name = os.path.splitext(entry['filename'])[0]
    smiles : Union[None, str] = get_smiles(entry['filename'], smilesdex)
    hits.append( Victor.extract_mol(name=name,
                                    block=entry['data'],
                                    smiles=smiles,
                                    ligand_resn=ligand_residue_name,
                                    proximityBonding=proximityBonding,
                                    throw_on_error = True)
               )
    
# pollute the directory with files
# with Chem.SDWriter(os.path.join(input_folder, 'provided.sdf')) as w:
#     for hit in hits:
#         w.write(hit)

# one positional argument per molecule
HorizontalMols(*hits).show()

ModuleNotFoundError: No module named 'google'

In [55]:
#@title Enter Victor Fragmenstein's laboratory!
#@markdown Three step process:

#@markdown 1. the hits are combined pairwise
#@markdown 2. the mergers are queried in the SmallWorld server against the Enamine REAL DB
#@markdown 3. the purchasable similars are placed

#@markdown In the documentation the example uses `sqlitedict.SqliteDict`
#@markdown as this avoids dramas from segfaults from `KeyboardInterrupt` or funky entries.

# Okay, the code below contains some black magic.
# a Chem.Mol is sent down the pipe to the subprocess pickled.
# But this loses its properties (`mol.HasProp`).
# unless this dark ritual is performed:
# https://github.com/matteoferla/Fragmenstein/blob/master/documentation/mol_properties.md

# =============================================================================================
# ## Define the process

joining_cutoff = 5 #@param {type:"integer"}
quick_renanimation = True #@param {type:"boolean"}
topN_to_pick = 10 #@param {type:"integer"}

import os, re
import pyrosetta, logging
from rdkit import Chem
from fragmenstein import Victor
# The settings are class attributes: set once here, they apply to every Victor made afterwards.
Victor.work_path = output_folder
Victor.monster_throw_on_discard= True  # stop this merger if a fragment cannot be used.
Victor.monster_joining_cutoff = joining_cutoff # Å
# NOTE(review): the attribute is spelt `quick_renanimation` — confirm this matches the
# attribute name of the installed Fragmenstein version, else the setting is silently ignored.
Victor.quick_renanimation = quick_renanimation # for the impatient
Victor.error_to_catch = Exception # stop the whole laboratory otherwise
#Victor.enable_stdout(logging.ERROR)
# warnings and worse are written to output/demo.log
Victor.enable_logfile(os.path.join(output_folder, 'demo.log'), logging.WARNING)
#Victor.log_errors()

def combine_subprocess(binary_hits: List[bytes]):
    """
    Combine (merge/link) the given hits with Victor. Meant to be run by a pool worker.

    :param binary_hits: the hits as binary strings (`mol.ToBinary`), since these pickle with their properties
    :return: on success, the dictionary from `Victor.summarise` plus the binarised
        unminimised (`unmin_binary`) and minimised (`min_binary`) molecules;
        on failure, a dictionary with only `error` and `name`
    """
    hits : List[Chem.Mol] = [Chem.Mol(bh) for bh in binary_hits]
    pyrosetta.distributed.maybe_init(extra_options='-no_optH false -mute all -ignore_unrecognized_res true -load_PDB_components false')
    try:
        v = Victor(hits=hits,
                   pdb_block=pdbblock, # global()
                   ligand_resn='LIG',
                   ligand_resi='1B',
                   covalent_resi='145A', # a random residue is **still** required for the constaint ref atom.
                  )
        v.combine()
        result : dict = v.summarise()
        binarize : Callable[[Chem.Mol], bytes] = lambda mol: mol.ToBinary(propertyFlags=0b00010111)
        result['unmin_binary'] = binarize(v.monster.positioned_mol)
        result['min_binary'] = binarize(v.minimised_mol)
        return result
        # v.make_pse()
    except Exception as error:
        # The names are *read* here with `GetProp`:
        # `SetProp` takes a value too, so it would raise within this very handler.
        name = '-'.join([mol.GetProp('_Name') if mol.HasProp('_Name') else '?' for mol in hits])
        error_msg = f'{error.__class__.__name__} {error}'
        Victor.journal.critical(f'*** {error_msg} for {name}')
        return dict(error=error_msg, name=name)

# =============================================================================================
# ## Iterate

from multiprocessing import Pool, cpu_count
import itertools, random, re
import pandas as pd
from typing import *

#@markdown How many processes to use?
#@markdown 0 = use `multiprocessing.cpu_count()`
#@markdown 1 = single
n_cores = 1 #@param {type:"integer"}
if n_cores < 1:
    n_cores = cpu_count()
    print(f'Using {n_cores} processes')

# https://github.com/matteoferla/Fragmenstein/blob/master/documentation/mol_properties.md
# the mol is binarised... to make it pickleable...
binarize : Callable[[Chem.Mol], bytes] = lambda mol: mol.ToBinary(propertyFlags=0b00010111)

# combinations of {A,B} are AB
# permutations of {A,B} are AB and BA
# products of {A,B} are AA AB BA BB
# Each ordered pair is tried, i.e. both AB and BA.
# The pool is used as a context manager so its workers are cleaned up once the map is done
# (`map` blocks until every result is in).
with Pool(n_cores, maxtasksperchild=1) as pool:
    results = pool.map(combine_subprocess, itertools.permutations(map(binarize, hits), 2))
combinations = pd.DataFrame(results)
# =============================================================================================
# ## plot results

import plotly.express as px
from IPython.display import display
def assign_ddG(value: float):
    """
    Label a ∆∆G value: 'crashed' if it prints as NaN,
    'unstable' if zero or positive, 'stable' if negative.
    """
    nan_as_text = str(float('nan'))
    if str(value) == nan_as_text:
        return 'crashed'
    return 'unstable' if value >= 0 else 'stable'

def _n_items(value) -> int:
    """Length of a table cell; cells without a length (NaN/None of crashed rows) count as empty."""
    try:
        return len(value)
    except TypeError:
        return 0

# Boolean masks are made once and up front.
# NB. `~` on the integer lengths would be a bitwise NOT (~0 = -1, ~2 = -3), not a logical one,
# hence the lengths are cast to bool before any negation.
_too_distant = combinations['disregarded'].apply(_n_items).astype(bool)
if 'error' in combinations.columns:
    _crashed = combinations['error'].apply(_n_items).astype(bool)
else:
    # the column exists only if at least one combination failed
    _crashed = pd.Series(False, index=combinations.index)

x = pd.DataFrame(dict(
                  rejection = _too_distant.map({True: 'too distant', False: 'close'}),
                  ddG = combinations['∆∆G'].apply(assign_ddG),
                  errored = _crashed.map({True: 'crashed', False: 'successful'}),
                  deviant = (combinations.comRMSD > 1).map({True: 'deviant', False: 'fixed'}),
                  acceptable=((combinations.comRMSD < 1)
                              & (combinations['∆∆G'] < 0)
                              & (~_too_distant)
                             ).map({True: 'accept', False: 'reject'})
                     ))\
.value_counts('rejection	errored	ddG	deviant acceptable'.split())
# turn the counted multi-index into a flat table: one row per outcome with its count
summary = x.index.to_frame().reset_index(drop=True)
summary['counts'] = x.values
summary_fig = px.sunburst(summary,
                    title=f'Outcome of {len(combinations)} combinations',
                    path='errored	rejection	ddG	deviant acceptable'.split(), 
                    color='acceptable',
                    color_discrete_map={'(?)':'lightgrey', 
                                        'accept':'turquoise', 
                                        'reject':'coral'},
                    values='counts')
summary_fig.show()

# =============================================================================================
# ## Reverse the warhead...
# this is really unusual and janky way of doing it as one ought to know the metadata already...

from rdkit.Chem import AllChem
from fragmenstein import Victor
from typing import *

def reverse_warhead(smiles) -> Tuple[str, str]:
    """
    Going backwards! Convert a covalently bound ligand into its unreacted form.

    :param smiles: SMILES of the merger; the attachment to the protein is marked by a `*`
    :return: tuple of the unreacted SMILES and the name of the matched warhead definition.
        Input without a `*` is returned unchanged with the label 'noncovalent';
        input that is not a string (e.g. NaN of a crashed row) gives an empty SMILES.
    :raises ValueError: if a `*` is present but no definition in `Victor.warhead_definitions` matches
    """
    # only the argument is inspected: the function does not depend on any global variable
    if not isinstance(smiles, str):
        return '', 'noncovalent'
    if not smiles or '*' not in smiles:
        return smiles, 'noncovalent'
    mol = Chem.MolFromSmiles(smiles)
    warhead_defs = []
    for warhead_def in Victor.warhead_definitions:
        if mol.HasSubstructMatch( Chem.MolFromSmiles(warhead_def['covalent'])):
            warhead_defs.append(warhead_def)
    if not warhead_defs:
        raise ValueError(f'Could not match {smiles} to a warhead definition in `Victor.warhead_definitions`')
    # most complex one first!
    warhead_def = sorted(warhead_defs, key= lambda d: -len(d['covalent_atomnames']))[0]
    # swap the reacted (covalent) pattern for its unreacted (noncovalent) counterpart
    unrxn_mol = AllChem.ReplaceSubstructs(mol=mol,
                                  query=Chem.MolFromSmiles(warhead_def['covalent']),
                                  replacement=Chem.MolFromSmiles(warhead_def['noncovalent'])
                                 )
    return Chem.MolToSmiles(unrxn_mol[0]), warhead_def['name']

warhead_names = []
unreacted_smiles = []
for i, row in combinations.iterrows():
    # each merger's own SMILES is converted (a bare `smiles` is not defined in this cell)
    unrxn, wn = reverse_warhead(row.get('smiles'))
    warhead_names.append(wn)
    unreacted_smiles.append(unrxn)
        
combinations['unreacted_smiles'] = unreacted_smiles
combinations['warhead_type'] = warhead_names
# ligand efficiency: ∆∆G divided by the sum of the constrained and unconstrained atom counts
combinations['LE'] = combinations.apply(lambda row: row['∆∆G']/(row.N_constrained_atoms+row.N_unconstrained_atoms),
                                        axis=1)
combinations.to_csv('combinations.csv')

# =============================================================================================
# ## top 10

# Number of disregarded hits per row; cells without a length (NaN of crashed rows) count as zero.
# NB. `~` on these integers would be a bitwise NOT, so they are compared with zero instead.
_n_disregarded = combinations.disregarded.apply(lambda cell: len(cell) if hasattr(cell, '__len__') else 0)
best_conbinations = combinations.loc[
                (combinations.comRMSD < 1) \
                & (combinations['∆∆G'] < 0) \
                & (_n_disregarded == 0)
                ].sort_values('LE').reset_index(drop=True).head(topN_to_pick)  # as set in the form above
print(f'Top {topN_to_pick} mergers/linkers sorted by ligand efficiency')
#PandasTools.AddMoleculeColumnToFrame(best_conbinations,'smiles','molecule',includeFingerprints=False)
# the binary molecules are columns (without `columns=` pandas would look for rows with those labels)
display(best_conbinations.drop(columns=['unmin_binary', 'min_binary']))

# =============================================================================================
# ### Place purchaisable similars

from smallworld_api import SmallWorld
from warnings import warn

def get_similars_df(row: pd.Series, db='REAL_DB_20Q2') -> Union[None, pd.DataFrame]:
    """
    Query SmallWorld for molecules similar to one merger and annotate the hits.

    :param row: a row of the merger table; ``unreacted_smiles``, ``regarded``,
        ``smiles``, ``∆∆G``, ``unmin_binary`` and ``min_binary`` are read from it.
    :param db: name of the SmallWorld database passed to ``search``.
    :return: the search results with the columns ``name``, ``inspirations``,
        ``merger``, ``merger_∆∆G``, ``merger_unmin_binary`` and
        ``merger_min_binary`` added. On any exception a warning is issued and
        an empty DataFrame is returned (``None`` is never returned).
    """
    try:
        found : pd.DataFrame = SmallWorld().search(row.unreacted_smiles, dist=25, db=db)
        n_found = len(found)
        # the second whitespace-separated token of `hitSmiles` is used as the name
        found['name'] = found.hitSmiles.str.split(expand=True)[1]
        # same merger-level annotations repeated on every hit
        found['inspirations'] = [row.regarded for _ in range(n_found)]
        found['merger'] = [row.smiles for _ in range(n_found)]
        found['merger_∆∆G'] = row['∆∆G']
        # NOTE(review): the result of this selection is discarded. As written it
        # only raises KeyError (caught below) when one of the columns is missing;
        # it was possibly meant to narrow the table — confirm the intent.
        found[['smiles', 'name', 'topodist','inspirations', 'merger', 'merger_∆∆G']]
        found['merger_unmin_binary'] = row.unmin_binary
        found['merger_min_binary'] = row.min_binary
    except Exception as error:
        warn(f'{error.__class__.__name__}: {error} for {row.smiles}')
        return pd.DataFrame()
    return found

def place_subprocess(data):
    """
    Place a single molecule with Victor; meant to be mapped over a worker pool.

    :param data: dictionary with the keys ``smiles`` (molecule to place),
        ``binary_hits`` (binary-serialised RDKit molecules used as hits)
        and ``long_name`` (name given to the placement).
    :return: the dictionary from ``Victor.summarise()`` with the extra keys
        ``unmin_binary`` and ``min_binary`` (binary-serialised positioned and
        minimised molecules). If anything fails in the placement, the error is
        logged to ``Victor.journal`` and ``dict(error=..., name=...)`` is
        returned instead.
    """
    smiles :str = data['smiles']
    # for smallworld placements this is the unminimised merger, not the original hits
    # as sw query was with the former not latter
    hits : List[Chem.Mol] = [Chem.Mol(bh) for bh in data['binary_hits']]
    long_name : str = data['long_name']
    pyrosetta.distributed.maybe_init(extra_options='-no_optH false -mute all -ignore_unrecognized_res true -load_PDB_components false')
    try:
        v = Victor(hits=hits,
                   pdb_block=pdbblock, # global()
                   ligand_resn='LIG',
                   ligand_resi='1B',
                   covalent_resi='145A', # a random residue is **still** required for the constraint ref atom.
                  )
        # keyword was misspelt `smailes`, which made every placement fail with a TypeError
        v.place(smiles=smiles, long_name=long_name)
        result : dict = v.summarise()
        # presumably 0b00010111 keeps mol, atom, bond and private properties
        # — confirm against RDKit's PropertyPickleOptions
        binarize : Callable[[Chem.Mol], bytes] = lambda mol: mol.ToBinary(propertyFlags=0b00010111)
        result['unmin_binary'] = binarize(v.monster.positioned_mol)
        result['min_binary'] = binarize(v.minimised_mol)
        return result
    except Exception as error:
        error_msg = f'{error.__class__.__name__} {error}'
        Victor.journal.critical(f'*** {error_msg} for {long_name}')
        return dict(error=error_msg, name=long_name)

def get_data(row: pd.Series, use_unminised_combination=True) -> dict:
    """
    Build the input dictionary for ``place_subprocess`` from a row of the similars table.

    :param row: a row with ``smiles``, ``name`` and either ``merger_unmin_binary``
        (when ``use_unminised_combination`` is true) or ``inspirations``.
    :param use_unminised_combination: if true the only hit is the unminimised merger;
        otherwise the hits are the molecules in the global ``hits``
        whose ``_Name`` property is listed in ``row.inspirations``.
    :return: dictionary with the keys ``smiles``, ``binary_hits`` and ``long_name``.
    """
    binary_hits = []
    if use_unminised_combination:
        binary_hits.append( row.merger_unmin_binary )
    else:
        for name in row.inspirations:
            for hit in hits: # global #type: Chem.Mol
                if hit.GetProp('_Name') == name:
                    # same property flags as used when serialising in `place_subprocess`
                    binary_hits.append( hit.ToBinary(propertyFlags=0b00010111) )
    # `row.name` would be the index label of the Series, not the `name` column
    return dict(smiles=row.smiles,
                binary_hits=binary_hits,
                long_name=row['name'])
    
# Pool the SmallWorld results of every top merger into a single table.
similars = pd.concat(objs=[get_similars_df(row) for i, row in best_conbinations.iterrows()],
                     ignore_index=True, axis=0)
display(similars)

# Place each similar molecule: the payloads built by `get_data` carry
# `smiles`/`long_name`, which is what `place_subprocess` reads.
results = pool.map(place_subprocess, [get_data(row) for _, row in similars.iterrows()])
placements = pd.DataFrame(results)
placements.to_csv('placements.csv')
display(placements)
n_placed_atoms = placements['N_constrained_atoms'] + placements['N_unconstrained_atoms']
placements['const_ratio'] = placements['N_constrained_atoms'] / n_placed_atoms
# the placements are ranked by `LE` further down: compute it the same way
# as for the combinations unless the summary already provides it.
if 'LE' not in placements.columns:
    placements['LE'] = placements['∆∆G'] / n_placed_atoms

placements['hit_mols'] = None
placements['merger_unminimized_mol'] = None
placements['merger_minimized_mol'] = None

# Keep only placements for which no hit was disregarded.
# `~series.apply(len)` is a bitwise NOT on integers, not a test for emptiness;
# rows without a sized `disregarded` value (e.g. NaN) are rejected
# rather than crashing `len`.
placement_uses_all_hits = placements.disregarded.apply(
    lambda names: hasattr(names, '__len__') and len(names) == 0
)
# NB. no backslash continuations: inside brackets they are not needed
# and a comment after a backslash is a syntax error.
best_placements = placements.loc[
    (placements.comRMSD < 1)
    & (placements['∆∆G'] < 0)
    & (placements.const_ratio > 2/3)  # more than 2 in 3 is actually uncommon
    & placement_uses_all_hits
].sort_values('LE').reset_index(drop=True).head(10)
print('Top 10 placements sorted by ligand efficiency')
#PandasTools.AddMoleculeColumnToFrame(best_conbinations,'smiles','molecule',includeFingerprints=False)
# the placeholder and binary columns are dropped from the displayed table only
display(best_placements.drop(columns=['hit_mols', 'merger_unminimized_mol', 'merger_minimized_mol',
                                      'unmin_binary', 'min_binary']))

# =============================================================================================
# ### Results redux

from IPython.display import clear_output, HTML, display
def headerify(header: str) -> HTML:
    """Wrap the given text in a level 3 HTML header for display."""
    return HTML(f'<h3>{header}</h3>')

# Wipe the output accumulated so far and show a condensed recap of the results.
clear_output()
display(headerify('Provided hits'))
HorizontalMols(hits)
display(headerify('Step 1. Combine'))
summary_fig.show()
print('Top 10 mergers/linkers sorted by ligand efficiency')
# `columns=` is required because `drop` targets row labels by default
display(best_conbinations.drop(columns=['unmin_binary', 'min_binary']))
display(headerify('Step 2. Placement of purchasable similars'))
#display(similars)
display(headerify('Top 10 Placements'))
# same columns hidden as when the placements were first displayed
display(best_placements.drop(columns=['hit_mols', 'merger_unminimized_mol', 'merger_minimized_mol',
                                      'unmin_binary', 'min_binary']))

8

In [None]:
#@title Inspect specific

# from rdkit.Chem import PandasTools
from rdkit.Chem import PandasTools
#PandasTools.AddMoleculeColumnToFrame(best_conbinations,'smiles','molecule',includeFingerprints=False)
