In [1]:
from rdkit.Chem import AllChem, PandasTools

import pandas as pd

# Load the pickled placement results and the paired similarity-search results.
# Each frame is given a fresh 0..n-1 index; the stored index is discarded.
placements = pd.read_pickle('fragmenstein_placed_paired.pkl.gz')
placements = placements.reset_index(drop=True)
similars = pd.read_pickle('fragmenstein_similars_paired.pkl.gz')
similars = similars.reset_index(drop=True)

In [2]:
# Columns copied from the similarity-search table onto the placement table.
similars_columns = [
    'id', 'hitSmiles', 'qrySmiles', 'qryMappedSmiles', 'hitMappedSmiles',
    'atomMap', 'atomScore', 'anonIdx', 'mf', 'mw', 'dist', 'ecfp4',
    'daylight', 'topodist', 'mces', 'tdn', 'tup', 'rdn', 'rup', 'ldn',
    'lup', 'mut', 'maj', 'min', 'hyb', 'sub', 'smiles',
    'query_index', 'query_smiles', 'catalogue', 'query_name',
    'minimized_merger', 'unminimized_merger', 'custom_map',
]

# Column assignment aligns on the index, so the rows of the two tables are
# paired by index label only. If the tables differed in length, pandas would
# silently drop the surplus rows or fill the gaps with NaN instead of failing,
# hence the explicit check before copying anything.
assert len(similars) == len(placements), \
    f'similars ({len(similars)} rows) and placements ({len(placements)} rows) are not paired'

for k in similars_columns:
    placements[k] = similars[k]

In [3]:
# Row counts of the two tables, shown side by side for comparison
similars.shape[0], placements.shape[0]

(61449, 61449)

In [4]:
from fragmenstein_merge_sw_place import score
import json, operator

# Scoring weights, read from a JSON file
with open('weights.json') as fh:
    weights = json.load(fh)

hit_replacements = pd.read_pickle('fragmenstein_hit_replacements_2.pkl.gz')

# The first element of every `hit_mols` entry is what gets passed on as the hits
first_hit_mols = [mols[0] for mols in hit_replacements.hit_mols]

# Score a fresh copy of the table.
# NOTE(review): the return value of `score` is not captured, so it presumably
# writes its result columns into `placements` in place - confirm.
placements = placements.copy()
score(placements, hit_replacements, suffix='_2', hits=first_hit_mols, weights=weights)





INFO: Pandarallel will run on 124 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
PyRosetta-4 2023 [Rosetta PyRosetta4.Release.python311.ubuntu 2023.27+release.e3ce6ea9faf661ae8fa769511e2a9b8596417e58 2023-07-07T12:00:46] retrieved from: http://www.pyrosetta.org
(C) Copyright Rosetta Commons Member Institutions. Created in JHU by Sergey Lyskov and PyRosetta Team.


In [21]:
# Display the scoring weights loaded from `weights.json`
weights

{'N_rotatable_bonds': 3,
 '∆∆G': 3,
 'interaction_uniqueness_metric': -20,
 'N_unconstrained_atoms': 0.5,
 'N_constrained_atoms': -0.2,
 'N_interactions': -5,
 'N_interactions_lost': 10,
 'max_hit_Tanimoto': -1}

In [23]:
# For the acceptable placements: order by ascending `ad_hoc_penalty`, keep the
# first row of each cluster, and show the weighted terms next to the penalty.
(
    placements[placements['outcome'] == 'acceptable']
    .sort_values(by='ad_hoc_penalty')
    .drop_duplicates(subset='cluster')
    .loc[:, [*weights, 'ad_hoc_penalty']]
    .head()
)

Unnamed: 0,N_rotatable_bonds,∆∆G,interaction_uniqueness_metric,N_unconstrained_atoms,N_constrained_atoms,N_interactions,N_interactions_lost,max_hit_Tanimoto,ad_hoc_penalty
15820,5,-7.998571,0.078042,4.0,23.0,8,0.0,0.301237,-53.457788
13483,2,-7.374585,0.092267,1.0,19.0,9,1.5,0.135572,-51.404674
23144,3,-6.589362,0.108231,3.0,20.0,9,1.0,0.3049,-50.737607
15818,4,-6.903683,0.099624,3.0,23.0,7,0.0,0.311277,-49.114814
8612,2,-8.906276,0.038759,3.0,18.0,6,0.5,0.480043,-49.074055


In [None]:
# List every column currently present in the placement table
placements.columns.tolist()

In [18]:
# Display the scoring weights (metric name -> weight)
weights

{'N_rotatable_bonds': 3,
 '∆∆G': 3,
 'interaction_uniqueness_metric': -20,
 'N_unconstrained_atoms': 0.5,
 'N_constrained_atoms': -0.2,
 'N_interactions': -5,
 'N_interactions_lost': 10,
 'max_hit_Tanimoto': -1}

In [17]:
from gist_import import GistImporter

# Pull the Fragalysis-preparation helpers straight from the GitHub repository.
# NOTE(review): this runs code fetched from the `main` branch at execution time;
# consider pinning the URL to a commit so re-runs stay reproducible.
fmodule = GistImporter.from_github(
    'https://raw.githubusercontent.com/matteoferla/Fragment-hit-follow-up-chemistry/main/followup/prep_fragalysis.py'
)

# `prep` writes the SDF file; show its signature and docstring
prep = fmodule['prep']
help(prep)

# `generate_header` builds the header molecule that `prep` expects
generate_header = fmodule['generate_header']
help(generate_header)

Help on function prep in module gist_import.execution:

prep(df: 'pd.DataFrame', header: 'Chem.Mol', mol_col: 'str', name_col: 'str', outfile: 'str' = 'for_fragalysis.sdf', ref_mol_names: 'Optional[str]' = None, ref_pdb: 'Optional[str]' = None, extras: 'Optional[dict]' = None) -> 'None'
    Prepare a SDF file for Fragalysis.
    
    
    :param df: dataframe with molecules
    :param header: Chem.Mol generated by ``generate_header`` for example
    :param mol_col: name of the column containing the molecules
    :param name_col: name of the column containing the names
    :param outfile: name of the output file
    :param ref_mol_names: comma separated list of names of the reference molecules (for all hits). Ignored if present.
    :param ref_pdb: name of the protein to use. Ignored if present.
    :param extras: Extra fields to add to the SDF file, these need to be in the ``header`` Chem.Mol
    :return:

Help on function generate_header in module gist_import.execution:

generate_head

In [36]:
# Distinct catalogue names found in the placement table
placements.catalogue.unique()

array(['Enamine-SC-Stock-Mar2022.smi.anon',
       'Enamine-BB-Stock-Mar2022.smi.anon', 'REAL-Database-22Q1.smi.anon',
       'mcule_purchasable_building_blocks_230120.smi.anon',
       'mcule_purchasable_virtual_230121.smi.anon',
       'MculeUltimate-20Q2.smi.anon'], dtype=object)

In [38]:
# Catalogue names grouped by vendor
enamine_catalogues = [
    'Enamine-SC-Stock-Mar2022.smi.anon',
    'Enamine-BB-Stock-Mar2022.smi.anon',
    'REAL-Database-22Q1.smi.anon',
]
mcule_catalogues = [
    'mcule_purchasable_building_blocks_230120.smi.anon',
    'mcule_purchasable_virtual_230121.smi.anon',
    'MculeUltimate-20Q2.smi.anon',
]

# 0/1 integer flags telling which vendor group each row's catalogue belongs to
placements['Enamine'] = placements['catalogue'].isin(enamine_catalogues).astype(int)
placements['Mcule'] = placements['catalogue'].isin(mcule_catalogues).astype(int)

# continue with a fresh copy of the table now that the columns are added
placements = placements.copy()

In [58]:
# Target identifier; it is stripped out of the hit names and compound names below
target_name = 'XX01ZVNS2B'
# Method label passed to the header and used as the stem of the output SDF file name
method_name = 'Fragmenstein-pairs-no_curation'

import numpy as np
from rdkit import Chem
from typing import List

def fix(mol: Chem.Mol) -> Chem.Mol:
    """
    Remove the properties of a molecule, in place, and return that same molecule.

    The computed properties are cleared, then every property reported by
    ``mol.GetPropNames()`` is cleared one by one.

    :param mol: an RDKit molecule with at least one atom; it is modified in place
    :return: the very same ``mol`` object, so that ``Series.apply(fix)`` yields
        molecules rather than a column of ``None``
    :raises AssertionError: if ``mol`` is not a ``Chem.Mol`` or has no atoms
    """
    assert isinstance(mol, Chem.Mol)
    assert mol.GetNumAtoms()
    mol.ClearComputedProps()
    # snapshot the names first, as properties are removed during the loop
    for name in list(mol.GetPropNames()):
        mol.ClearProp(name)
    return mol

# Hard cut-offs a placement has to pass to be considered at all
passes_filters = (
    (placements.outcome == 'acceptable')
    & (placements['∆∆G'] <= -5)
    & (placements.largest_ring <= 10)
    & (placements.ad_hoc_penalty <= 0)
)

# Lowest-penalty placement of each cluster among those passing the cut-offs.
# The selection is computed once, with a single sort, and shared by the pickle
# and the Fragalysis table so that both are guaranteed to hold the same rows.
best_per_cluster = (
    placements.loc[passes_filters]
    .sort_values('ad_hoc_penalty')
    .drop_duplicates('cluster')
)

# Saved before any molecule is touched by `fix` further down
best_per_cluster.to_pickle('fragmenstein_placed_paired_filtered.pkl.gz')


with pd.option_context('mode.chained_assignment', None):

    # tuple column labels are flattened into colon-joined strings
    flattened_labels = {c: ':'.join(map(str, c)) for c in placements.columns if isinstance(c, tuple)}
    # keep only the 500 rows with the lowest penalty
    df = best_per_cluster.rename(columns=flattened_labels)\
                         .reset_index()\
                         .head(500).copy()
    # list of str to str w/ comma-separator, with the target prefix removed
    df['ref_mols'] = df.hit_names.apply(lambda l: ','.join([v.replace(f'{target_name}-', '') for v in l]))
    # `fix` clears the properties of each minimised molecule in place;
    # the `washed_mol` column itself is not used
    df['washed_mol'] = df.minimized_mol.apply(fix)
    # keep only the part of the name that precedes '-<target_name>'
    df['name'] = df['name'].apply(lambda v: v.split('-'+target_name)[0])
        
# Columns whose value in the first row is a Python or NumPy int/float scalar
numeric_types = (int, np.int64, float, np.float64)
numericals = [label for label, value in df.iloc[0].items() if type(value) in numeric_types]

# Colon-named columns are kept only if their column sum is positive
colon_columns = [label for label in df.columns if ':' in label]
has_positive_sum = (df[colon_columns].sum() > 0).to_dict()
nonempty = [label for label, flag in has_positive_sum.items() if flag]

# Extra SDF fields: the plain numeric columns followed by the non-empty colon-named ones
plain_numericals = [label for label in df.columns if label in numericals and ':' not in label]
extras: List[str] = plain_numericals + nonempty

# Every extra field must cast to float without leaving any NaN behind
contains_nan = df[extras].astype(float).isna().any().to_dict()
bad_columns = [label for label, flag in contains_nan.items() if flag]
assert not bad_columns, 'Some entries are not numeric'

# Each extra field is declared in the header under its own name
header: Chem.Mol = generate_header(
    method_name,
    ref_url='https://github.com/matteoferla/Fragment-hit-follow-up-chemistry',
    submitter_name='Matteo Ferla',
    submitter_email='matteo.ferla@stats.ox.ac.uk',
    extras={label: label for label in extras},
)

# NOTE(review): the `name` column is cleaned up earlier in this cell, yet `id`
# is the column passed as `name_col` - confirm this is the intended one.
prep(
    df,
    header,
    mol_col='minimized_mol',
    name_col='id',
    outfile=f'{method_name}.sdf',
    ref_pdb='x0051_0B',
    extras=extras,
)

[20:19:14] Molecule does not have explicit Hs. Consider calling AddHs()
