In [None]:
import pandas as pd

double = pd.read_pickle('fragmenstein_placed_paired_filtered.pkl.gz')
triple = pd.read_pickle('fragmenstein_placed_2A.pkl.gz')

combined = pd.concat([double.reset_index(drop=True), triple.reset_index(drop=True)], ignore_index=True).reset_index(drop=True).copy() 

In [110]:
from rdkit import Chem
import operator, json
from fragmenstein_merge_sw_place import score

def get_first(mols):
    ersatzmolekül = Chem.Mol()
    ersatzmolekül.SetProp('_Name', 'error')
    # replaces operator.itemgetter(0)
    if not isinstance(mols, list):
        return ersatzmolekül
    elif len(mols) == 0:
        return ersatzmolekül
    else:
        return mols[0]
        
hit_replacements = pd.read_pickle(f'fragmenstein_hit_replacements_2.pkl.gz')
extracted_hits = hit_replacements.hit_mols.apply(get_first).to_list()

with open('weights.json') as fh:
    weights = json.load(fh)
    weights['strain_per_HA'] = weights['∆∆G'] / 3
    # every 3 kJ/mol straight is as bad as losing 1 kcal/mol free energy?
    weights['N_PAINS'] = 20  # a single PAINS violation is as bad as 

#del sample['ad_hoc_penalty']
score(combined, hit_replacements, '_test', extracted_hits, weights)

In [111]:
combined.loc[combined.outcome == 'acceptable']\
          .sort_values('ad_hoc_penalty')\
          .drop_duplicates('cluster')[list(weights.keys()) + ['ad_hoc_penalty']].head()

Unnamed: 0,N_rotatable_bonds,∆∆G,interaction_uniqueness_metric,N_unconstrained_atoms,N_constrained_atoms,N_interactions,N_interactions_lost,max_hit_Tanimoto,strain_per_HA,N_PAINS,ad_hoc_penalty
6,7,-8.934376,2.43847,9.0,19.0,6,1.0,0.253448,3.581136,0,-70.544848
0,5,-7.998571,0.157572,4.0,23.0,8,0.0,0.301237,6.249713,0,-48.798672
3,4,-6.903683,0.184044,3.0,23.0,7,0.0,0.311277,2.392016,0,-48.411187
2,3,-6.589362,0.22098,3.0,20.0,9,1.0,0.3049,5.154831,0,-47.837751
5,0,-6.164977,0.225711,3.0,15.0,6,0.5,0.496403,4.510615,0,-45.494939


In [112]:
from gist_import import GistImporter

fmodule = GistImporter.from_github('https://raw.githubusercontent.com/matteoferla/Fragment-hit-follow-up-chemistry/main/followup/prep_fragalysis.py')
prep = fmodule['prep']
generate_header = fmodule['generate_header']

In [113]:
placements = combined

In [114]:
placements['catalogue'].unique()

array(['REAL-Database-22Q1.smi.anon', 'Enamine-BB-Stock-Mar2022.smi.anon',
       'mcule_purchasable_virtual_230121.smi.anon',
       'MculeUltimate-20Q2.smi.anon', 'Enamine-SC-Stock-Mar2022.smi.anon',
       'mcule_purchasable_building_blocks_230120.smi.anon', nan],
      dtype=object)

In [115]:
placements['Enamine'] = placements['catalogue'].apply(lambda n: n in ['Enamine-SC-Stock-Mar2022.smi.anon',
       'Enamine-BB-Stock-Mar2022.smi.anon', 'REAL-Database-22Q1.smi.anon']).astype(int)
placements['Mcule'] = placements['catalogue'].apply(lambda n: n in ['mcule_purchasable_building_blocks_230120.smi.anon',
       'mcule_purchasable_virtual_230121.smi.anon',
       'MculeUltimate-20Q2.smi.anon']).astype(int)
placements = placements.copy()

In [122]:
target_name='XX01ZVNS2B'
method_name = 'Fragmenstein-no_curation-NSP2B'
N_top = 500

import numpy as np
from rdkit import Chem
from typing import List

def fix(mol: Chem.Mol):
    assert isinstance(mol, Chem.Mol)
    assert mol.GetNumAtoms()
    mol.ClearComputedProps()
    for name in mol.GetPropNames():
        mol.ClearProp(name)

placements.loc[(placements.outcome == 'acceptable') \
                            & (placements['∆∆G'] <= -5) \
                            & (placements.largest_ring <= 10) \
                            & (placements.ad_hoc_penalty <= 0)
                            ]\
                   .sort_values('ad_hoc_penalty')\
          .sort_values('ad_hoc_penalty')\
          .drop_duplicates('cluster')\
          .copy()\
          .to_pickle('fragmenstein_placed_combined_filtered.pkl.gz')


with pd.option_context('mode.chained_assignment', None):

    df = placements.loc[(placements.outcome == 'acceptable') \
                            & (placements['∆∆G'] <= -5) \
                            & (placements.largest_ring <= 10) \
                            & (placements.ad_hoc_penalty <= 0)
                            ]\
                       .sort_values('ad_hoc_penalty')\
                       .drop_duplicates('cluster')\
                       .rename(columns={c: ':'.join(map(str, c)) for c in placements.columns if isinstance(c, tuple)})\
                       .reset_index()\
                       .head(N_top).copy()
    # list of str to str w/ comma-separator
    df['ref_mols'] = df.hit_names.apply(lambda l: ','.join([v.replace(f'{target_name}-', '') for v in l]))
    # not used
    df['washed_mol'] = df.minimized_mol.apply(fix)
    df['name'] = df['name'].apply(lambda v: v.split('-'+target_name)[0])
        
numericals = [k for k, v in df.iloc[0].items() if type(v) in (int, np.int64, float, np.float64)]
nonempty = [c for c, v in (df[[c for c in df.columns if ':' in c]].sum() > 0).to_dict().items() if v]
extras: List[str] = [c for c in df.columns if c in numericals and ':' not in c] + nonempty
bad_columns = [x for x, v in df[extras].astype(float).isna().any().to_dict().items() if v]
if not bad_columns:
    print(f'Some entries are not properly numeric, forcing: {bad_columns}')
    for col in bad_columns:
        df[col] = df[col].fillna(0).astype(float)
header: Chem.Mol = generate_header(method_name,
                         ref_url='https://github.com/matteoferla/Fragment-hit-follow-up-chemistry',
                         submitter_name='Matteo Ferla',
                         submitter_email='matteo.ferla@stats.ox.ac.uk',
                         extras=dict(zip(extras, extras))
                        )
prep(df, header, mol_col='minimized_mol', name_col='id',
     outfile=f'{method_name}.sdf',
     ref_pdb='x0051_0B',
     extras=extras
    )
print('DONE')

[12:39:16] Molecule does not have explicit Hs. Consider calling AddHs()


In [123]:
print('DONE')

DONE


In [3]:
!ls *combined*

fragmenstein_placed_combined.pains.pkl.gz
fragmenstein_placed_combined_filtered.pkl.gz


In [3]:
import pandas as pd

combined = pd.read_pickle('fragmenstein_placed_combined_filtered.pkl.gz')

In [4]:
[c for c in combined.columns if isinstance(c, tuple) and c[2] in critical]

[('pistack', 'TYR', 161),
 ('hydroph_interaction', 'TYR', 161),
 ('hydroph_interaction', 'ASP', 83),
 ('hbond', 'TYR', 130),
 ('hbond', 'TYR', 161),
 ('hydroph_interaction', 'TYR', 130),
 ('saltbridge', 'ASP', 83),
 ('hbond', 'ASP', 83),
 ('pication', 'TYR', 161),
 ('hbond', 'LEU', 128),
 ('halogenbond', 'TYR', 130),
 ('halogenbond', 'LEU', 128),
 ('hbond', 'GLY', 82),
 ('halogenbond', 'TYR', 161),
 ('halogenbond', 'GLY', 82),
 ('halogenbond', 'ASP', 83),
 ('hydroph_interaction', 'LEU', 128),
 ('pistack', 'TYR', 130)]

In [5]:
critical = [161, 128, 130, 82, 83]
secondary = [150, 135, 132, 51, 152, 36, 52]
#'pistack 161, #150, 51, 
# hydroph_interaction

col_names = [c for c in combined.columns if isinstance(c, tuple) and c[2] in critical+secondary]

def score(row):
    score = 0
    for col in col_names:
        if not (row[col] > 0):
            continue
        elif col[0] == 'hydroph_interaction':
            w = 0.5
        elif col[0] == 'pistack':
            w = 2.
        else:
            w = 1.
        if col[2] in critical:
            w = w * 2.
        score += w * row[col]
    return score
            

# x0969 x0089 x1098
combined['intxn_scored'] = combined.apply(score, axis=1)

In [22]:
target_name='XX01ZVNS2B'
method_name = 'Fragmenstein-no_curation-intxn-NSP2B'
N_top = 500

import numpy as np
from rdkit import Chem
from typing import List
from gist_import import GistImporter

fmodule = GistImporter.from_github('https://raw.githubusercontent.com/matteoferla/Fragment-hit-follow-up-chemistry/main/followup/prep_fragalysis.py')
prep = fmodule['prep']
generate_header = fmodule['generate_header']

def fix(mol: Chem.Mol):
    assert isinstance(mol, Chem.Mol)
    assert mol.GetNumAtoms()
    mol.ClearComputedProps()
    for name in mol.GetPropNames():
        mol.ClearProp(name)


with pd.option_context('mode.chained_assignment', None):

    df = combined.loc[(combined.outcome == 'acceptable') \
                & (combined.N_PAINS == 0) \
                & (combined.Enamine) \
                & (combined.strain_per_HA <= 15.) \
                & (combined['∆∆G'] <= -5) \
                & (combined.largest_ring <= 10) \
                & (combined.ad_hoc_penalty <= 0)
    ].sort_values('intxn_scored', ascending=False) \
                       .sort_values('intxn_scored', ascending=False)\
                       .drop_duplicates('cluster')\
                       .rename(columns={c: ':'.join(map(str, c)) for c in combined.columns if isinstance(c, tuple)})\
                       .reset_index()\
                       .head(N_top).copy()
    # list of str to str w/ comma-separator
    df['ref_mols'] = df.hit_names.apply(lambda l: ','.join([v.replace(f'{target_name}-', '') for v in l]))
    # not used
    df['washed_mol'] = df.minimized_mol.apply(fix)
    df['name'] = df['name'].apply(lambda v: v.split('-'+target_name)[0])
        
numericals = [k for k, v in df.iloc[0].items() if type(v) in (int, np.int64, float, np.float64)]
nonempty = [c for c, v in (df[[c for c in df.columns if ':' in c]].sum() > 0).to_dict().items() if v]
extras: List[str] = [c for c in df.columns if c in numericals and ':' not in c] + nonempty
bad_columns = [x for x, v in df[extras].astype(float).isna().any().to_dict().items() if v]
if not bad_columns:
    print(f'Some entries are not properly numeric, forcing: {bad_columns}')
    for col in bad_columns:
        df[col] = df[col].fillna(0).astype(float)
header: Chem.Mol = generate_header(method_name,
                         ref_url='https://github.com/matteoferla/Fragment-hit-follow-up-chemistry',
                         submitter_name='Matteo Ferla',
                         submitter_email='matteo.ferla@stats.ox.ac.uk',
                         extras=dict(zip(extras, extras))
                        )
prep(df, header, mol_col='minimized_mol', name_col='id',
     outfile=f'{method_name}.sdf',
     ref_pdb='x0051_0B',
     extras=extras
    )
print('DONE')

Some entries are not properly numeric, forcing: []


[12:03:05] Molecule does not have explicit Hs. Consider calling AddHs()


DONE


In [None]:
SUPRESSED_EXCEPTION = Exception

from typing import List
import contextlib
from rdkit import Chem
from rdkit.Chem import AllChem
from rdkit.Chem.rdfiltercatalog import FilterCatalogParams, FilterCatalog, FilterCatalogEntry
params = FilterCatalogParams()
params.AddCatalog(FilterCatalogParams.FilterCatalogs.PAINS)
catalog = FilterCatalog(params)

def get_pains(mol) -> List[str]:
    with contextlib.suppress(Exception):
        entry: FilterCatalogEntry
        if not isinstance(mol, Chem.Mol) or mol.GetNumHeavyAtoms() == 0:
            return []
        AllChem.SanitizeMol(mol)
        return [entry.GetDescription() for entry in catalog.GetMatches(mol)]

def UFF_Gibbs(mol):
    # free energy cost of bound conformer
    if not isinstance(mol, Chem.Mol) or mol.GetNumHeavyAtoms() == 0:
        return float('nan')
    with contextlib.suppress(SUPRESSED_EXCEPTION):
        AllChem.SanitizeMol(mol)
        # this is actually UFF
        copy = Chem.Mol(mol)
        return Monster.MMFF_score(None, mol, True)
    return float('nan')

combined['PAINSes'] = combined.minimized_mol.apply(get_pains)
combined['N_PAINS'] = combined.PAINSes.apply(len)
combined['UFF_Gibbs'] = combined.minimized_mol.apply(UFF_Gibbs)
combined['strain_per_HA'] = combined.UFF_Gibbs / (combined.N_constrained_atoms + combined.N_unconstrained_atoms)