In [None]:
#@title Installation
# ------------------------------------------------

import os


def parse_os_release(lines):
  """Parse ``KEY=value`` lines (the /etc/os-release format) into a dict.

  Blank lines, ``#`` comments and lines without ``=`` are skipped.
  Only the first ``=`` separates key from value, so values may contain ``=``.
  Surrounding whitespace (including the trailing newline) and one matching
  pair of single or double quotes are removed from each value.
  Shell escape sequences inside quoted values are not interpreted.

  :param lines: iterable of text lines, e.g. an open file handle.
  :return: dict mapping each key to its unquoted value.
  """
  parsed = {}
  for raw in lines:
    entry = raw.strip()
    if not entry or entry.startswith('#') or '=' not in entry:
      continue
    key, _, value = entry.partition('=')
    value = value.strip()
    # values are optionally quoted: PRETTY_NAME="Ubuntu 22.04 LTS"
    if len(value) >= 2 and value[0] == value[-1] and value[0] in '"\'':
      value = value[1:-1]
    parsed[key.strip()] = value
  return parsed


# Python equivalent of: source /etc/os-release && echo $PRETTY_NAME
with open('/etc/os-release') as fh:
  os.environ.update(parse_os_release(fh))
print(f'Running {os.environ["PRETTY_NAME"]}')

gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

# ------------------------------------------------

from IPython.display import display, clear_output
import time, sys
# Time the whole installation: the elapsed time is printed once the log has been cleared.
tick = time.time()
# Install mamba without resetting the kernel (as condacolab would)
# NOTE(review): the Mambaforge installer has reportedly been retired upstream in favour of
# Miniforge3 — confirm that this `latest` download URL still resolves.
!wget -qnc https://github.com/conda-forge/miniforge/releases/latest/download/Mambaforge-Linux-x86_64.sh
!bash Mambaforge-Linux-x86_64.sh -bfp /usr/local
# Let the running kernel import packages from the site-packages directory under /usr/local.
# NOTE(review): the path hard-codes python3.10 — confirm it matches the kernel's Python version.
sys.path.append('/usr/local/lib/python3.10/site-packages')
!mamba config --set auto_update_conda false
# NOTE(review): none of the installs below pin a version, so a re-run may pick up different releases.
!pip -q install rdkit
!pip install biopython
#!pip -q install pdb2pqr
!pip install -q meeko
# The course package, imported further down as `dtc`
!pip install -q git+https://github.com/matteoferla/DTC-compchem-practical.git
!mamba install -y -c conda-forge -c bioconda vina oddt openbabel
!pip install -q prolif MDAnalysis
# adfr-suite presumably provides the `prepare_receptor` / `prepare_ligand` commands called later — TODO confirm
!mamba install -y -c hcc adfr-suite
!mamba install -y -c conda-forge plip --no-deps
tock = time.time()
# Wipe the verbose installer output so that only the timing line remains.
clear_output()
print(f'Installation time: {tock - tick}')

# ------------------------------------------------
from typing import Sequence, List, Set, Dict, Tuple, Optional, Union
import warnings
with warnings.catch_warnings():
  warnings.simplefilter("ignore")
  # last year used pyrosetta... the warning is distracting.
  import DTC_compchem_practical as dtc

## Preparation

Let's dock QRU into Mac1.
The ligand `QRU` was derived from 6FZ and BHA.
`QRU` is a chemical component identifier, but its Enamine REAL catalogue number is `Z5021668601`.
You can find details of the structure and ligand in the PDB: https://www.rcsb.org/structure/5SQJ and
 https://www.rcsb.org/ligand/QRU.
The hits 6FZ and BHA were used as templates for the derivative QRU, which is a merger of the two.
So we will use their combined coordinates to get the centroid for the docking box.

We should not cheat: we should start with the SMILES of QRU. Find it!


In [None]:
#@title Get ligand and centroid
# so we will use their centroid for placement
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
import numpy.typing as npt

# Load the two template hits and merge them into a single (disconnected) molecule.
template_hits = [Chem.MolFromMolBlock(dtc.get_data(hit_name)) for hit_name in ('6FZ.mol', 'BHA.mol')]
combo: Chem.Mol = Chem.CombineMols(*template_hits)
# The docking box is centred on the centroid of the combined hits...
center = list(AllChem.ComputeCentroid(combo.GetConformer()))
# ...and its edge is the largest interatomic distance (truncated to an int) plus a 10 Å margin.
# wait a sec? there is a function already?! Well, it was fun to do manually, no?
size = 10 + int(AllChem.Get3DDistanceMatrix(combo).max())
print(f'Box size: {size}x{size}x{size}')

# The crystal pose is for reference only!
crystal: Chem.Mol = Chem.MolFromMolBlock(dtc.get_data('QRU.mol'))
# Let's start from a SMILES which you found _online_
ligand_filename = 'QRU.pdb'
ligand_smiles = '👾👾👾'
ligand = Chem.MolFromSmiles(ligand_smiles)
# generate 3D coordinates and save them as a PDB file
AllChem.EmbedMolecule(ligand)
Chem.MolToPDBFile(ligand, ligand_filename)
# should we have added hydrogens or not here?

In [None]:
# Wait... We forgot to centre the ligand!
# Luckily we aced rototranslations in the previous notebook
from rdkit import Geometry

lig_conf: Chem.Conformer = ligand.GetConformer()
# Keep the centroid as a Point3D: a plain list has no .x/.y/.z attributes.
lig_center: Geometry.Point3D = AllChem.ComputeCentroid(lig_conf)
print('This is actually zero zero zero... ', list(lig_center))
# Always play it safe though!
# `center` (the centroid of the template hits) is a plain [x, y, z] list, so it is indexed.
# The translation moves the ligand centroid onto the box centre.
dx = center[0] - lig_center.x
dy = center[1] - lig_center.y
dz = center[2] - lig_center.z
for i in range(ligand.GetNumAtoms()):
    pos: Geometry.Point3D = lig_conf.GetAtomPosition(i)
    lig_conf.SetAtomPosition(i, Geometry.Point3D(pos.x + dx, pos.y + dy, pos.z + dz))
# overwrite the PDB file with the centred coordinates
Chem.MolToPDBFile(ligand, ligand_filename)

In [None]:
#@title Convert to PDBQT
# This is normally a system call.
# RDKit has no converter for PDBQT.
# OpenBabel can convert to PDBQT, but that is unwise for the protein ("receptor")

import pkg_resources, os, subprocess
import shlex
from pathlib import Path

prepped_template_filaname = 'template.pdbqt'
prepped_ligand_filaname = 'ligand.pdbqt'
template_filename: str = pkg_resources.resource_filename('DTC_compchem_practical', 'data/mac1-stripped.pdb')


def run_prep_command(command: str, expected_output: str) -> None:
    """Run a preparation command in a shell and check that it produced its output file.

    The command's stdout is printed.

    :param command: full shell command line (arguments already quoted).
    :param expected_output: path of the file the command should write;
        a relative path is resolved against the current working directory,
        i.e. where the command itself writes it.
    :raises AssertionError: if the command exits non-zero (message carries its stderr)
        or if the expected output file does not exist afterwards.
    """
    completed = subprocess.run(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    print(completed.stdout.decode('utf-8'))
    assert completed.returncode == 0, f"Error: {completed.stderr.decode('utf-8')}"
    assert Path(expected_output).exists(), f'{command.split()[0]} failed??'


# shlex.quote keeps paths with spaces or shell metacharacters intact
run_prep_command(f'prepare_receptor -r {shlex.quote(template_filename)} '
                 f'-o {shlex.quote(prepped_template_filaname)}',
                 prepped_template_filaname)
# `-A hydrogens` is passed through to prepare_ligand unchanged
run_prep_command(f'prepare_ligand -l {shlex.quote(ligand_filename)} '
                 f'-o {shlex.quote(prepped_ligand_filaname)} -A hydrogens',
                 prepped_ligand_filaname)
print(f'pdbqt files {prepped_template_filaname} and {prepped_ligand_filaname} made')

In [None]:
#@title Call Vina

from rdkit import Chem
from rdkit.Chem import AllChem
from vina import Vina

# `sf_name` picks the scoring function (Vina's own here); see help(Vina) for more
vina = Vina(sf_name='vina')
vina.set_receptor(prepped_template_filaname)
vina.set_ligand_from_file(prepped_ligand_filaname)
# cubic search box of edge `size`, centred on the centroid of the template hits
vina.compute_vina_maps(center=center, box_size=[size] * 3)

# 1. score the ligand as placed
energy = vina.score()
print(f'Score before minimization: {energy[0]:.3f} (kcal/mol)')

# 2. local optimisation of that placement
energy_minimized = vina.optimize()
print(f'Score after minimization : {energy_minimized[0]:.3f} (kcal/mol)')
vina.write_pose('vina_minimized.pdbqt', overwrite=True)

# 3. docking: up to 20 poses are requested, 10 are written to file
vina.dock(exhaustiveness=32, n_poses=20)
vina.write_poses('vina_out.pdbqt', n_poses=10, overwrite=True)
energy = vina.score()
print(f'Score of best : {energy[0]:.3f} (kcal/mol)')

> What is the effect of a bigger box size? Why not do blind docking?

👾👾👾

In [None]:
#@title Convert from PDBQT
# The format needs converting from PDBQT to RDKit objects
# One way is to use a OpenBabel system call, but another is doing it Pythonically.
# This would allow us to keep hold of some metadata...

import io
from openbabel import openbabel as ob
from typing import Dict, List


def parse_remarks(pdb_block: str) -> Dict[str, str]:
    """Collect the ``REMARK key: value`` lines of a PDB block into a dict.

    Only the first colon separates key from value. REMARK lines without a colon
    or with an empty key are skipped, so they no longer break the parsing.

    :param pdb_block: PDB-format text.
    :return: dict of stripped keys to stripped values (a repeated key keeps its last value).
    """
    remarks: Dict[str, str] = {}
    for line in pdb_block.split('\n'):
        if 'REMARK' not in line:
            continue
        key, colon, value = line.replace('REMARK ', '').partition(':')
        key = key.strip()
        if not colon or not key:
            continue
        remarks[key] = value.strip()
    return remarks


crystal = Chem.MolFromMolBlock(dtc.get_data('QRU.mol'))
poses: List[Chem.Mol] = []

obConversion = ob.OBConversion()
obConversion.SetInAndOutFormats("pdbqt", "pdb")
obmol = ob.OBMol()
notatend = obConversion.ReadString(obmol, vina.poses())
# if you want to read a file it would be:
#obConversion.ReadFile(obmol, 'vina_out.pdbqt')
while notatend:
    pdb_string = obConversion.WriteString(obmol)
    # Oh no! Does Chem.MolFromPDBBlock strip hydrogens?
    # if it does we need to tell it not do that. How will we find the argument? Help!
    mol = Chem.MolFromPDBBlock(pdb_string)
    if mol is None:
        # fail with a clear message rather than an AttributeError on None further down
        raise ValueError(f'RDKit could not parse pose number {len(poses)} converted from PDBQT')
    # keep the metadata from the REMARK lines as molecule properties
    for k, v in parse_remarks(pdb_string).items():
        mol.SetProp(k, v)
    poses.append(mol)

    # a fresh OBMol for the next pose in the same input
    obmol = ob.OBMol()
    notatend = obConversion.Read(obmol)

# let's have a look
display(poses[0])

> What is going on?

👾👾👾

Case A:
> The hydrogens are now radicals — we probably should have paid attention to all that yammering on about hydrogens. That is awkward. We were meant to use a structure with hydrogens, right?

👾👾👾

Case B:
Phew!

In [None]:
# Let's fix the chemistry anyway
# NB. AssignBondOrdersFromTemplate does not modify `mol` in place:
# it returns a corrected copy, which has to be stored back in the list.
for i, mol in enumerate(poses):
  poses[i] = AllChem.AssignBondOrdersFromTemplate(crystal, mol)

# We just corrected the bond order (i.e. double bonds etc) from a template.
# RDKit natively does not have a function to perceive bond orders
# from geometry, but OpenBabel does: `OBMol.PerceiveBondOrders()`
# OpenBabel is also better at handling proteins —multimodel, altloc, residue connectivity.
# Hence why some tools such as ODDT and PLIP require it.

In [None]:
#@title Taking a gander
# One line per pose: RMSD to the crystal ligand and the docking score.
for i, mol in enumerate(poses):
    # the first number of the 'VINA RESULT' remark is the score
    e = float( mol.GetProp('VINA RESULT').split()[0] )
    # RMSD of *this* pose (not always the first one) against the crystal pose
    rmsd = AllChem.CalcRMS(crystal, mol)
    print(f'The pose number {i} has an RMSD of {rmsd} Å and a score of {e} kcal/mol')

i = 0 # change me!
print(f'Showing pose number {i}')
view = dtc.get_protein_view(dtc.get_data('mac1-stripped.pdb'), resn='HOH')
# crystal ligand with cyan carbons, chosen pose with white carbons
dtc.add_mols(view, cyanCarbon=crystal, whiteCarbon=poses[i])
view.zoomTo({'model': -1})
view.show()

In [None]:
#@title Get interactions
# Say we have many compounds and we want to look at the diversity of interactions.
# Then we can enumerate the interaction via PLIP, oddt.interactions or ProLIF.

import os
from pathlib import Path
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools

from functools import singledispatchmethod
from typing import Tuple, Dict, List, Union, Sequence, Optional, Any
# defaultdict lives in `collections`; a second import from a non-existent
# `collection` module used to follow and has been dropped.
from collections import Counter, defaultdict
from plip.structure.preparation import PDBComplex, PLInteraction
from openbabel.pybel import Atom, Residue
from openbabel.pybel import ob
from Bio.PDB import PDBParser, Structure
from Bio.PDB.PDBIO import PDBIO
import io

# let's combine a PDB file and a mol file.
# using a different system again, Biopython
# previous we make `template_hits` (List[Chem.Mol])

# But first to spice things up let's assign residue name and index in RDKit
# this can be done in BioPython, but let's use RDKit.
# This is because RDKit Chem.Mol is a simple molecule and does not care
# it is a single unit or not. In fact, we saw before we could have unconnected mols
# in a mol!
# as a result RDKit does not store PDB Info at the Chem.Mol level but at the Atom level.

mol: Chem.Mol = Chem.Mol(crystal)  # <--- Change me later
resn = 'LIG'  # pymol calls it resn, it is a chemical component identifier
chain = 'X'
resi = 1   # residue PDB index, starts at 1, Fortran style. Can be discontinuous
# pose index is continuous and in many systems starts at 0, C style.
# lets simply assign a number to each atom by element symbol
count = defaultdict(int)
for atom in mol.GetAtoms():
    symbol = atom.GetSymbol()
    count[symbol] += 1
    n = count[symbol]
    atom.SetIntProp('_c', n)
    # four-character atom name: right-aligned symbol + left-aligned counter, e.g. ' C1 '
    atom_name = f'{symbol: >2}{str(n): <2}'
    # AtomPDBResidueInfo (not the generic AtomMonomerInfo) is the class that carries
    # residue name, number and chain; residueNumber has to be an int, not a str.
    info = Chem.AtomPDBResidueInfo(atom_name,
                                   residueName=resn,
                                   residueNumber=resi,
                                   chainId=chain,
                                   isHeteroAtom=True)
    atom.SetPDBResidueInfo(info)

# Okay now let's make a holo protein in BioPython
parser = PDBParser()
apo_structure: Structure.Structure = parser.get_structure("FOO", io.StringIO(dtc.get_data('mac1-stripped.pdb')))

# the ligand with its freshly assigned residue info (`mol`), written out as PDB text
block: str = Chem.MolToPDBBlock(mol)
lig: Structure.Structure = parser.get_structure("BAR", io.StringIO(block))

# A complex is ONE model containing both the protein chains and the ligand chain:
# the ligand must not be added as a second model, let alone as a structure within a structure.
holo_structure: Structure.Structure = Structure.Structure("combined")
holo_model = apo_structure[0].copy()  # copy, so the apo structure is left untouched
for lig_chain in lig[0]:
    # NB. this raises if the protein already has a chain with the ligand's chain id
    holo_model.add(lig_chain.copy())
holo_structure.add(holo_model)

pdb_buffer = io.StringIO()
pdbio = PDBIO()
pdbio.set_structure(holo_structure)
pdbio.save(pdb_buffer)
# getvalue returns the whole buffer regardless of the cursor position: no seek is needed
holo_block = pdb_buffer.getvalue()

# now let's use a system that uses OBabel, PLIP (ODDT.interaction also does)
# load the protein-ligand complex PDB block `holo_block` (as_string=True) or a filename (as_string=False)
# NB. `block` is the ligand on its own: it is the combined `holo_block` that has to be analysed.
holo = PDBComplex()
holo.load_pdb(holo_block, as_string=True)
# analyse
holo.analyze()
# the interaction sets are keyed by 'resn:chain:resi' of each ligand
interaction_set: PLInteraction = holo.interaction_sets[':'.join([resn, chain, str(resi)])]
details: List[Dict[str, Any]] = []

# Pretend you don't see this...
def summarize_interaction(self, intxn, atom_names: Sequence[str]) -> Dict[str, Any]:
    """
    Condense one interaction record into a flat dictionary.

    The kind of interaction is taken from the class name of ``intxn``.
    For kinds with two possible roles the ``type`` is refined
    (e.g. ``hbond`` becomes ``hbond_acceptor`` or ``hbond_donor``).

    See also:
    https://github.com/openbabel/openbabel/blob/master/data/atomtyp.txt
    https://blog.matteoferla.com/2023/07/a-note-no-plip-interactions.html

    :param self: not used by the body (presumably a leftover from when this was a method).
    :param intxn: interaction record; the attributes read depend on its class name.
    :param atom_names: atom names, looked up with the 1-based ``idx`` of each atom involved.
    :return: dict with keys ``type``, ``protein_resn``, ``protein_resi``, ``protein_chain``,
        ``atom_names``, ``babel_atom_types`` and ``distance``.
    :raises TypeError: for any other kind of interaction (metal_complex basically).
    """

    def names_of(atoms) -> list:
        # idx is 1-based, atom_names is 0-based
        return [atom_names[atom.idx - 1] for atom in atoms]

    kind = intxn.__class__.__name__
    summary = {'type': kind,
               'protein_resn': intxn.restype,
               'protein_resi': intxn.resnr,
               'protein_chain': intxn.reschain}
    subtype = kind  # refined below where the atoms can play one of two roles

    if kind == 'hbond':
        if intxn.protisdon:
            names, subtype, babel_types = names_of([intxn.a]), 'hbond_acceptor', [intxn.atype]
        else:
            names, subtype, babel_types = names_of([intxn.d]), 'hbond_donor', [intxn.dtype]
        distance = intxn.distance_ad  # distance_ad is to the donor, distance_ah to the hydrogen
    elif kind == 'hydroph_interaction':
        names, babel_types = names_of([intxn.ligatom]), [intxn.ligatom.type]
        distance = intxn.distance
    elif kind == 'pistack':
        names = names_of(intxn.ligandring.atoms)
        babel_types = [atom.type for atom in intxn.ligandring.atoms]
        distance = intxn.distance
    elif kind == 'waterbridge':
        if intxn.protisdon:
            names, subtype, babel_types = names_of([intxn.a]), 'water_acceptor', [intxn.atype]
            distance = intxn.distance_aw
        else:
            names, subtype, babel_types = names_of([intxn.d]), 'water_donor', [intxn.dtype]
            distance = intxn.distance_dw
    elif kind == 'saltbridge':
        if intxn.protispos:
            group, subtype = intxn.negative, 'saltbridge_negative'
        else:
            group, subtype = intxn.positive, 'saltbridge_positive'
        names = names_of(group.atoms)
        babel_types = [atom.type for atom in group.atoms]
        distance = intxn.distance
    elif kind == 'pication':
        if intxn.protcharged:
            group, subtype = intxn.ring, 'pication_ring'
        else:
            group, subtype = intxn.charge, 'pication_charge'
        names = names_of(group.atoms)
        babel_types = [atom.type for atom in group.atoms]
        distance = intxn.distance
    elif kind == 'halogenbond':
        names, babel_types = names_of([intxn.don]), [intxn.don.type]
        distance = intxn.distance
    else:  # metal_complex basically.
        raise TypeError(kind)

    summary['atom_names'] = names
    summary['type'] = subtype
    summary['babel_atom_types'] = babel_types
    summary['distance'] = distance
    return summary

# `summarize_interaction` takes three arguments: an unused `self` placeholder, the interaction
# and the ligand atom names, which it looks up by the 1-based atom index of the interaction.
# NOTE(review): this assumes PLIP keeps the ligand atoms in the same order as the RDKit `mol` — confirm.
atom_names = [atom.GetPDBResidueInfo().GetName().strip() for atom in mol.GetAtoms()]
for intxn in interaction_set.all_itypes:
    details.append(summarize_interaction(None, intxn, atom_names))

print(details)

This was rather circuitous, but we got there.
Now if the above were a function accepting an RDKit `Chem.Mol` and returning a `pd.DataFrame`, we could nicely compare different compounds.
There are three in the dataset...

> Do `6FZ.mol` and `BHA.mol` share interactions? What new interactions are in `QRU.mol` but not in the other two?