In [None]:
#@title Installations and function declaration
#@markdown # Aims
#@markdown A problem with compounds uploaded to Fragalysis is that
#@markdown the template and a virtual compound may appear to clash because of induced fit,
#@markdown yet the induced-fit structure cannot be provided,
#@markdown making it hard to tell whether the fit is really induced or a clash was simply left in.
#@markdown Therefore, ideally the hits are ranked allowing for induced fit,
#@markdown but shown as if lock-and-key, so as not to confuse anyone.

#@markdown ### Preface on Colab

#@markdown ← Press the button with the play icon to run a _cell_.
#@markdown In this cell: install requirements.

#@markdown This is a _Colab notebook_, a variant of a Jupyter notebook.
#@markdown If you are not in Colab press [this](https://colab.research.google.com/github/matteoferla/Fragment-hit-follow-up-chemistry/blob/main/colab/upload_fix.ipynb).
#@markdown For the preparation of an upload file
#@markdown see [this](https://colab.research.google.com/github/matteoferla/Fragment-hit-follow-up-chemistry/blob/main/colab/upload_prep.ipynb).

#@markdown Colab runs in Google's servers, hence why you will get asked
#@markdown to sign in if not done so already.
#@markdown Likewise it will ask if you trust the author (Matteo Ferla),
#@markdown if unsure about whether you should trust anything I do
#@markdown [click here for details](https://www.youtube.com/watch?v=dQw4w9WgXcQ).

#@markdown The menu bar can be shown/hidden via the chevron in the top right.

#@markdown To inspect code press `show code` ↓

#@markdown Still confused about notebooks? Ask your friendly demonstrators for more!


!pip install rdkit requests fragmenstein pandas tqdm plotly -q

from pathlib import Path
import requests
import zipfile
import io
import re
import os
from typing import Dict
from rdkit import Chem
from rdkit.Chem import AllChem, Draw
from rdkit.Chem.Draw import IPythonConsole
from fragmenstein import Wictor  # Victor, but RDKit only
import logging
import operator
import pandas as pd
import plotly.express as px
from tqdm.notebook import tqdm
from google.colab import files

# Configure Fragmenstein's stdout logging with the ERROR level
# (presumably so that only errors are echoed in the notebook — confirm against Fragmenstein docs).
Wictor.enable_stdout(logging.ERROR)

def get_target_data(target: str) -> dict:
    """
    Fetch the record of a single target from the Fragalysis API.

    :param target: target title as known to Fragalysis, e.g. ``A71EV2A``.
    :return: the only entry of ``results`` in the JSON reply.
    :raises requests.HTTPError: if the server answers with an error status.
    :raises AssertionError: if the reply is paginated or does not hold exactly one target.
    """
    # ``params`` makes requests URL-encode the title instead of trusting raw interpolation
    response: requests.Response = requests.get('https://fragalysis.diamond.ac.uk/api/targets/',
                                               params={'title': target})
    # fail on 4xx/5xx here rather than on a confusing JSON/KeyError further down
    response.raise_for_status()
    info = response.json()
    assert info['next'] is None, '?? New feature: capped?'
    assert len(info['results']) == 1, 'ambiguous?'
    return info['results'][0]


def get_apo_pdbblocks(zip_url: str) -> Dict[str, str]:
    """
    Fetch the apo pdbblocks from Fragalysis
    Get url from ``get_target_data``.

    The archive is downloaded into memory and every member whose name contains
    ``_apo.pdb`` is considered, in sorted order. Only the first entry per
    crystal is kept (code x0123_0A is the same structure as x0123_1A).

    :param zip_url: URL of the zip archive of the target.
    :return: mapping of XChem code (e.g. ``x0469_0A``) to the PDB block as text.
    :raises requests.HTTPError: if the download answers with an error status.
    """
    apo_pdbblocks: Dict[str, str] = {}
    # set, not list: membership is tested once per archive member
    seen_crystals = set()
    response: requests.Response = requests.get(zip_url)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        for filename in sorted(archive.namelist()):
            if '_apo.pdb' not in filename:
                continue
            # aligned/A71EV2A-x0469_0A/A71EV2A-x0469_0A_apo.pdb -> x0469_0A
            xchem_code = filename.split('/')[-1].split('-')[-1].replace('_apo.pdb', '')
            # x0469_0A -> x0469: the part shared by all entries of one crystal
            crystal = xchem_code.split('_')[0]
            if crystal in seen_crystals:
                continue
            seen_crystals.add(crystal)
            apo_pdbblocks[xchem_code] = archive.read(filename).decode('utf-8')
    return apo_pdbblocks

def remove_altloc(pdbblock: str, remove_hetatm: bool=True) -> str:
    """Removes all altlocs and actually segi duplicates.
    Test line:

    ATOM      1  N   MET A   1       0.000  0.000  0.000  1.00 60.69           N
    """
    lines: List[str] = []
    seen: List[Tuple[str, str, int]] = []
    for line in pdbblock.split('\n'):
        if 'ANISOU' in line:
            continue # skip
        if line[:4] != 'ATOM' and line[:6] != 'HETATM':
            lines.append(line)
            continue
        if remove_hetatm and line[:6] == 'HETATM':
            continue
        atom_info = line[12:16].strip(), line[21].strip(), int(line[22:26].strip())
        if atom_info not in seen:
            lines.append(f'{line[:16]} {line[17:]}')
            seen.append(atom_info)
        else: # skip
            pass
    return '\n'.join(lines)

def score(vc: Chem.Mol, template_name:str, apo_pdbblock: str, **settings) -> dict:
    """
    Minimise the compound (Fragalysis upload ready),
    in a frozen neighbourhood of the provided template.
    The compounds need to be in position already.

    Done using frozen neighbourhood discussed here:
    https://www.blopig.com/blog/2023/11/the-workings-of-fragmensteins-rdkit-neighbour-aware-minimisation/

    :param vc: the virtual compound; must carry the properties ``original SMILES`` and ``_Name``.
    :param template_name: name of the apo template, stored on the result as ``ref_pdb``/``template``.
    :param apo_pdbblock: PDB block of the apo template.
    :param settings: forwarded as keyword arguments to ``Wictor``.
    :return: ``Wictor.summarize()`` extended with the keys ``mol`` (the minimised molecule
        carrying the properties of ``vc``), ``template`` and ``query``.
    """
    wicky = Wictor([vc], pdb_block=apo_pdbblock, **settings)
    wicky.place(vc.GetProp('original SMILES'), long_name=f'{vc.GetProp("_Name")}_on_{template_name}')
    info = wicky.summarize()
    minimized = wicky.minimized_mol
    # carry the properties of the input over to the minimised molecule, keeping their type
    for k, v in vc.GetPropsAsDict().items():
        if isinstance(v, float):
            minimized.SetDoubleProp(k, v)
        elif isinstance(v, int):
            minimized.SetIntProp(k, v)
        else:
            # SetProp takes a string: coerce whatever else GetPropsAsDict handed back
            minimized.SetProp(k, str(v))
    minimized.SetProp('ref_pdb', template_name)
    info['mol'] = minimized
    info['template'] = template_name
    info['query'] = vc.GetProp('_Name')
    return info

In [None]:
#@title Upload file

target = 'A71EV2A' #@param {type:"string"}

from google.colab import files

uploaded = files.upload()

# ``uploaded`` maps file name -> content; it may be empty (e.g. the dialog was dismissed),
# in which case an explicit message beats an opaque IndexError.
if not uploaded:
    raise ValueError('No file was uploaded: re-run this cell and choose an SDF file.')

# only the first uploaded file is used
filename = list(uploaded.keys())[0]



In [None]:
#@title Run!

print('Fetching PDB blocks')
data: dict = get_target_data(target)
project_id: str = data['project_id']
apo_pdbblocks: Dict[str, str] = get_apo_pdbblocks( data['zip_archive'] )
apo_pdbblocks = {k: remove_altloc(block) for k, block in apo_pdbblocks.items() }

print('Reading mols')
with Chem.ForwardSDMolSupplier(io.BytesIO(uploaded[filename])) as sdfh:
    entries = list(sdfh)
if not entries or entries[0] is None:
    raise ValueError(f'{filename} is empty or its first (header) entry could not be parsed')
# the first entry is the header, the remainder are the virtual compounds
header = entries[0]
# the supplier yields None for entries it cannot parse: scoring those would crash
vcs = [mol for mol in entries[1:] if mol is not None]
n_unparsed = len(entries) - 1 - len(vcs)
print(f'{len(vcs)} molecules provided')
if n_unparsed:
    print(f'{n_unparsed} entries could not be parsed and were skipped')

print('Scoring every molecule in every template')
all_scores = []
for vc in tqdm(vcs):
    scores = [score(vc, template, apo_pdbblock) for template, apo_pdbblock in apo_pdbblocks.items()]
    all_scores.append(scores)
# per compound keep the template with the lowest ∆∆G
best = [min(scores, key=operator.itemgetter('∆∆G')) for scores in all_scores if scores]
if not best:
    raise ValueError('Nothing was scored: no parsable molecules or no apo templates')

print('Scores:')
df = pd.DataFrame(best)
print(f'{len(df)} VCs, {sum(df["∆∆G"] < 0.)} acceptably placed. {sum(df["comRMSD"] < 1)} with minor deviation')
px.scatter(df, '∆∆G', 'comRMSD',
           title='Fragmenstein Wictor placed compounds (best template)').show()

In [None]:
#@title Download

output_filename='template-adjusted.sdf' #@param {type:"string"}
method_suffix='template-adjusted' #@param {type:"string"}

# Writing file
# the header is copied so the uploaded one is left untouched; only its method name is extended
method_name = header.GetProp('method')
new_header = Chem.Mol(header)
new_header.SetProp('method', f'{method_name}{method_suffix}')
n = 0
# was ``out_filename``, an undefined name (NameError)
with Chem.SDWriter(output_filename) as sdfh:
    sdfh.write(new_header)
    for info in best:
        # keep only placements with negative ∆∆G ...
        if info['∆∆G'] >= 0.:
            continue
        # ... and a comRMSD below 2
        if info['comRMSD'] >= 2.:
            continue
        sdfh.write(info['mol'])
        n+=1
print(f'wrote {n} molecules')

# download
files.download(output_filename)