In [None]:
#@title Install

#@markdown ### Preface

#@markdown &larr; Press the button with the play icon to run a _cell_.
#@markdown In this case, it installs the requirements.

#@markdown This is a _Colab notebook_, a variant of a Jupyter notebook.
#@markdown If you are not in Colab press [this](https://colab.research.google.com/github/matteoferla/Fragment-hit-follow-up-chemistry/blob/main/colab/upload_prep.ipynb).

#@markdown Colab runs on Google's servers, which is why you will be asked
#@markdown to sign in if you have not done so already.
#@markdown Likewise it will ask if you trust the author (Matteo Ferla),
#@markdown if unsure about whether you should trust anything I do
#@markdown [click here for details](https://www.youtube.com/watch?v=dQw4w9WgXcQ).
#@markdown To inspect code press `show code` &darr;

#@markdown ### Aims
#@markdown This notebook takes your SDF file and prepares it for upload to Fragalysis
#@markdown compliant with version 1.2.

print('Installing gist-import rdkit')
!pip install -q gist-import rdkit

from gist_import import GistImporter
from types import ModuleType

script_url: str = 'https://raw.githubusercontent.com/matteoferla/Fragment-hit-follow-up-chemistry/main/followup/prep_fragalysis.py'
fragalysis_prep: ModuleType = GistImporter.from_github(script_url).to_module('fragalysis_prep')

In [None]:
#@title Configure header
#@markdown ### Configure
#@markdown **Method name**. Your `submitter_name` will be appended to it,
#@markdown but the target name will not, so two targets cannot share the same cset.
#@markdown A name such as `Protein123-macaroni-art` is therefore recommended.
# Colab form fields (`#@param`). Every value defined here is passed as a keyword
# argument to `fragalysis_prep.generate_header` at the bottom of this cell.
method = "macaroni-art" #@param {type:"string"}
ref_url="https://www.noradsanta.org/en/" #@param {type:"string"}

#@markdown **submitter**.
# Contact details of the person submitting the compound set.
submitter_name = "Santa Claus" #@param {type:"string"}
submitter_email = "saint.nicholaus@gchq.gov.uk" #@param {type:"string"}
submitter_institution = "Santa Village, Groenland" #@param {type:"string"}

#@markdown **Extra field names**, comma separated. These are fields you want to
#@markdown appear in the Fragalysis table/sort modal.

wanted_keys_str = "rationale" #@param {type:"string"}
# Turn the comma-separated form value into a list of field names:
#   * surrounding whitespace is stripped from every entry;
#   * blank entries (empty field, trailing comma) are dropped, so no column named '' is created;
#   * the built-in fields 'ref_pdb', 'ref_mols' and 'original SMILES' are removed;
#   * duplicates are collapsed while keeping the order in which the user typed them
#     (`dict.fromkeys` preserves insertion order, unlike a `set`).
wanted_keys = list(dict.fromkeys(
    key
    for key in map(str.strip, wanted_keys_str.split(','))
    if key and key not in ('ref_pdb', 'ref_mols', 'original SMILES')
))

# ----------------------------------------

from rdkit import Chem
import fragalysis_prep  # hack from above

# Build the header entry from the form values above.
# Each extra field is passed as a {name: name} pair, i.e. the field name is also
# used as its own value in the `extras` mapping.
header: Chem.Mol = fragalysis_prep.generate_header(
    method=method,
    ref_url=ref_url,
    submitter_name=submitter_name,
    submitter_email=submitter_email,
    submitter_institution=submitter_institution,
    extras={key: key for key in wanted_keys},
)

In [None]:
#@title Upload and configure

#@markdown ### Upload SDF
#@markdown Press play and you will be prompted.

from google.colab import files
from typing import Dict
from rdkit import Chem
from rdkit.Chem import AllChem, Draw, PandasTools
import io, operator

# Ask the user for a file through the Colab upload widget.
# The result maps each uploaded filename to its content as bytes.
uploaded: Dict[str, bytes] = files.upload()
if not uploaded:
  # Nothing was uploaded (e.g. the dialog was cancelled): fail with a clear
  # message rather than an opaque IndexError on the line below.
  raise ValueError('No file was uploaded: please upload an SDF file')

# If several files were uploaded, only the last one is used.
sdf_bytes: bytes = list(uploaded.values())[-1]
sdf_stream = io.BytesIO(sdf_bytes)
supplier = Chem.ForwardSDMolSupplier(sdf_stream)
# Entries that come back as None (records that could not be read) are skipped.
mols = [mol for mol in supplier if mol is not None]
print(f'{len(mols)} molecules uploaded')

# Limits used further down in this cell: compound names are cut to
# `max_name_length` characters and only the first `max_compounds` rows
# are handed to `fragalysis_prep.prep`.
max_name_length = 50 #@param {type:"slider", min:1, max:63, step:1}
max_compounds = 200 #@param {type:"slider", min:1, max:1000, step:1}

#@markdown Name of the SDF property holding the `rationale`; leave blank if absent.
#@markdown The `rationale_display_word` will be prefixed to it.
#@markdown It has not been tested whether emoji work.
rationale_prop_name="" #@param {type:"string"}
rationale_display_word="info" #@param {type:"string"}
max_rationale_length = 250 #@param {type:"slider", min:1, max:255, step:1}

#@markdown Template. Leave `ref_pdb` and `ref_mols` blank if they are already present
#@markdown as the `ref_pdb` and `ref_mols` keys. The latter is comma separated
#@markdown and does not contain the target prefix or colon + alt name.
#@markdown If they are stored as properties in the SDF, give the property names in
#@markdown `ref_mols_prop_name` and `ref_pdb_prop_name`.
ref_pdb='fooo' #@param {type:"string"}
ref_mols='' #@param {type:"string"}
ref_pdb_prop_name='ref_pdb' #@param {type:"string"}
ref_mols_prop_name='ref_mols' #@param {type:"string"}
target_name='protein_123A' #@param {type:"string"}

#@markdown NaN value.
# Replacement returned by `get_key` for a missing property or a NaN-like string.
fill_na='-1' #@param {type:"string"}

# ----------------------------------------
import operator, functools
import pandas as pd

import operator, re
import pandas as pd

# One row per parsed molecule, held in the `mol` column; the columns required
# by `fragalysis_prep.prep` are added to this table below.
# `max_name_length` is deliberately NOT redefined here: it is set once by the
# slider near the top of this cell, and a second `#@param` line for the same
# variable would silently override the value chosen there.
df = pd.DataFrame([{'mol': mol} for mol in mols])

def bleach_name(name: str) -> str:
  """Sanitise a compound name.

  Every run of characters that are neither word characters nor underscores
  is collapsed into a single underscore, then the result is cut to the
  module-level ``max_name_length``.

  :param name: raw compound name.
  :return: the sanitised, length-limited name.
  """
  non_word_run = re.compile(r'[^\w_]+')
  return non_word_run.sub('_', name)[:max_name_length]

# The compound name is the molecule's `_Name` property, sanitised by `bleach_name`.
df['name'] = df['mol'].apply(lambda entry: bleach_name(entry.GetProp('_Name')))

def get_bleached_rationale(mol):
  """Build the rationale text for one molecule.

  The text is ``rationale_display_word`` followed by a space and, when the
  molecule has the property named by ``rationale_prop_name``, the value of
  that property. The result is cut to ``max_rationale_length`` characters.

  :param mol: object exposing ``HasProp`` and ``GetProp``.
  :return: the prefixed and truncated rationale.
  """
  pieces = [rationale_display_word, ' ']
  if mol.HasProp(rationale_prop_name):
    pieces.append(mol.GetProp(rationale_prop_name))
  return ''.join(pieces)[:max_rationale_length]

# The `rationale` column is only built when a source property name was given in the form.
if rationale_prop_name:
  df['rationale'] = df['mol'].apply(get_bleached_rationale)

# Decide which template PDB to use.
#   * A value typed in the form is used as is.
#   * Otherwise the molecules must carry it themselves in the property named by
#     `ref_pdb_prop_name` (checked on the first molecule); `ref_pdb` is then set
#     to None, which is what gets passed to `fragalysis_prep.prep` as `ref_pdb_name`.
#     NOTE(review): presumably `prep` then reads the per-molecule property — confirm.
#   * With neither, the upload cannot be prepared.
# The check uses `ref_pdb_prop_name`, not `ref_mols_prop_name`: having reference
# molecules says nothing about the template PDB being present.
if not ref_pdb:
  if mols and mols[0].HasProp(ref_pdb_prop_name):
    ref_pdb = None
  else:
    raise ValueError('You need to specify a template ref_pdb')


def bleach_ref_mols(mol: 'Chem.Mol') -> str:
  """Return the reference molecules of ``mol`` as a clean comma-separated string.

  The raw value is the molecule's ``ref_mols_prop_name`` property when present,
  otherwise the ``ref_mols`` form value. The ``{target_name}-`` prefix is removed
  and every ``X`` is lower-cased to ``x``.

  Entries may be separated by commas, whitespace or both (``'a,b'``, ``'a b'``,
  ``'a, b'``); empty entries are dropped so the result never contains ``,,``.

  :param mol: molecule exposing ``HasProp`` and ``GetProp``.
  :return: the entries joined by a single comma, or ``''`` if there are none.
  """
  if mol.HasProp(ref_mols_prop_name):
    raw = mol.GetProp(ref_mols_prop_name)
  else:
    raw = ref_mols
  raw = raw.replace(f'{target_name}-', '').replace('X', 'x')
  # Treat commas as separators too: splitting on whitespace alone turned
  # 'a, b' into 'a,,b'.
  templates = raw.replace(',', ' ').split()
  return ','.join(templates)

# Reference molecules for every row, normalised by `bleach_ref_mols`.
df['ref_mols'] = df['mol'].apply(bleach_ref_mols)


def get_key(mol, key):
  """Read property ``key`` from ``mol``, substituting ``fill_na`` for missing data.

  ``fill_na`` is returned when the molecule lacks the property or when its
  value is one of the strings 'NaN', 'None', 'undefined' or 'none'.

  :param mol: object exposing ``HasProp`` and ``GetProp``.
  :param key: name of the property to read.
  :return: the property value, or ``fill_na``.
  """
  if not mol.HasProp(key):
    return fill_na
  found = mol.GetProp(key)
  return fill_na if found in ('NaN', 'None', 'undefined', 'none') else found

# Copy every requested extra field from the molecule properties into its own column.
# A column that already exists is left untouched: `rationale` in particular may
# have been built above (prefixed and truncated) and must not be overwritten by
# the raw property value, and `mol` / `name` / `ref_mols` must never be clobbered.
for key in wanted_keys:
  if key in df.columns:
    continue
  df[key] = df.mol.apply(functools.partial(get_key, key=key))

# Hand the first `max_compounds` rows, together with the header, to
# `fragalysis_prep.prep`, with 'fragalysis_ready.sdf' as the output file.
fragalysis_prep.prep(
    df.head(max_compounds),
    header,
    mol_col='mol',
    name_col='name',
    outfile='fragalysis_ready.sdf',
    ref_pdb_name=ref_pdb,
    extras=wanted_keys,
)




In [None]:
#@title Download

#@markdown Download file!

from google.colab import files

# Send 'fragalysis_ready.sdf' (the `outfile` given to `fragalysis_prep.prep`
# in the previous cell) to the browser as a download.
files.download('fragalysis_ready.sdf')
