## Manual

Because SmallWorld (SW) was down, all jobs crashed.
Instead of re-running all the mergers from scratch and shortlisting them crudely, a somewhat better filter was applied.

Unfortunately, PLIP was not run.

In [None]:
import numpy as np
import functools
import numpy.typing as npt
import pandas as pd
from pathlib import Path
from rdkit import Chem, Geometry, DataStructs
from rdkit.Chem import AllChem, Draw, PandasTools
from rdkit.Chem import rdMolDescriptors as rdmd
from rdkit.ML.Cluster import Butina
from sklearn.cluster import KMeans
from collections import defaultdict
from fragmenstein import Laboratory
from smallworld_api import SmallWorld, NoMatchError

In [None]:
# Collect every merger table in the working directory, label each row with the
# experiment encoded in the file name and keep only the 'acceptable' outcomes.
MERGER_PREFIX = 'fragmenstein_mergers_'
MERGER_SUFFIX = '.pkl.gz'

path: Path
dfs = []
for path in Path('.').glob('fragmenstein_mergers_*pkl.gz'):
    mergers = pd.read_pickle(path)
    # the experiment label is whatever sits between the fixed prefix and suffix
    mergers['experiment'] = path.name[len(MERGER_PREFIX):-len(MERGER_SUFFIX)]
    dfs.append(mergers[mergers['outcome'] == 'acceptable'])

df = pd.concat(dfs, ignore_index=True)

In [None]:
# Running tally of how many mergers survive each filtering step.
stats = dict(initial=len(df))

In [None]:
# Amount added to ∆∆G for every rotatable bond of the merger.
ROTATABLE_BOND_PENALTY = 0.7

# Plain column arithmetic: no need for a row-by-row apply.
df['rototweaked_ddG'] = df['∆∆G'] + df['N_rotatable_bonds'] * ROTATABLE_BOND_PENALTY
# Best (most negative) adjusted energy first; later per-cluster ranks rely on this order.
df = df.sort_values('rototweaked_ddG').reset_index(drop=True).copy()

import plotly.io as pio
# remote jupyter notebook weirdness
pio.renderers.default='iframe'  
import plotly.express as px

px.histogram(df, 'rototweaked_ddG', title='Energy', template='plotly_white')

In [None]:
# Keep only mergers whose adjusted energy is -3 or better.
passes_energy = df['rototweaked_ddG'] <= -3
df = df[passes_energy].copy()
stats['dG_cull'] = len(df)

In [None]:
from rdkit.ML.Cluster import Butina
from rdkit import Chem, Geometry, DataStructs
from rdkit.Chem import rdMolDescriptors as rdmd

def butina_cluster(mol_list, cutoff=0.20):
    """
    Assign a Butina cluster number to every molecule.

    Molecules are compared by Tanimoto distance between their 2048-bit Morgan
    fingerprints (radius 3, hydrogens removed).

    Adapted from
    https://github.com/PatWalters/workshop/blob/master/clustering/taylor_butina.ipynb

    :param mol_list: list of RDKit molecules
    :param cutoff: distance cutoff passed on to ``Butina.ClusterData``
    :return: list of 1-based cluster numbers, in the same order as ``mol_list``
    """
    fingerprints = [rdmd.GetMorganFingerprintAsBitVect(AllChem.RemoveAllHs(mol), 3, nBits=2048)
                    for mol in mol_list]
    n_mols = len(fingerprints)
    # flattened lower triangle of the distance matrix, built row by row
    distances = [1 - similarity
                 for row in range(1, n_mols)
                 for similarity in DataStructs.BulkTanimotoSimilarity(fingerprints[row], fingerprints[:row])]
    clusters = Butina.ClusterData(distances, n_mols, cutoff, isDistData=True)
    assignments = [0] * n_mols
    for cluster_number, members in enumerate(clusters, start=1):
        for position in members:
            assignments[position] = cluster_number
    return assignments

# Anything that is not an RDKit molecule is swapped for an empty placeholder
# so that fingerprinting does not fail on it.
m = df.minimized_mol.map(lambda mol: mol if isinstance(mol, Chem.Mol) else Chem.Mol())
cluster_numbers = butina_cluster(m.to_list())
df['FP_cluster'] = cluster_numbers
# one representative per fingerprint cluster: the first row encountered
df = df.drop_duplicates(subset='FP_cluster').copy()
stats['butina_cull'] = len(df)

In [None]:
from rdkit import Chem

# Read the fragmented substrate, keyed by each entry's title (`_Name`).
with Chem.SDMolSupplier('7DA6-fragmented.sdf') as sdfh:
    substrate_blocks = {}
    for block in sdfh:
        substrate_blocks[block.GetProp('_Name')] = block
    # NOTE(review): 'ver_1.2' is presumably a header record rather than a residue — confirm
    substrate_blocks.pop('ver_1.2')

# Cluster mergers

Normally I don't cluster mergers, but there are so many of them...
Unfortunately there is no PLIP data, so the clustering uses distances to the substrate residues instead.

In [None]:
import numpy as np
import functools
import numpy.typing as npt
from rdkit import Chem, Geometry, DataStructs
from rdkit.Chem import AllChem, Draw, PandasTools

def get_mindist(molA: Chem.Mol, molB: Chem.Mol) -> float:
    """
    Shortest 3D distance between any atom of ``molA`` and any atom of ``molB``.

    Both molecules are combined into one and its 3D distance matrix is
    computed; only the block pairing atoms of ``molA`` (rows) with atoms of
    ``molB`` (columns) is inspected, so intramolecular distances never count.

    :param molA: first molecule
    :param molB: second molecule
    :return: the smallest intermolecular distance, or NaN if either molecule has no atoms
    """
    n_a = molA.GetNumAtoms()
    distances = np.asarray(AllChem.Get3DDistanceMatrix(Chem.CombineMols(molA, molB)))
    # rows: atoms of molA; columns: atoms of molB
    cross_block = distances[:n_a, n_a:]
    if cross_block.size == 0:
        # nothing to compare: report NaN instead of reducing an empty block
        return np.nan
    return np.nanmin(cross_block)
    
# One `<residue>_distance` column per substrate block: the closest approach of
# each minimised merger to that block.
for name, target in substrate_blocks.items():
    distance_to_target = functools.partial(get_mindist, target)
    df[f'{name}_distance'] = df.minimized_mol.apply(distance_to_target)

In [None]:
import numpy as np
from sklearn.cluster import KMeans
from collections import defaultdict

# Group mergers into six clusters by their distance to each substrate residue.
pocket_distance_cols = 'PHE_1_distance\tARG0_distance\tGLY1_distance\tLYS2_distance'.split()
kmeans = KMeans(n_clusters=6, random_state=0, n_init='auto')
df['pocket_cluster'] = kmeans.fit_predict(df[pocket_distance_cols])

# 1-based order of appearance of each row within its cluster.
df['pocket_cluster_rank'] = df.groupby('pocket_cluster').cumcount() + 1

In [None]:
# I don't really want squared, so linear deviation
summed_distance_cols = 'PHE_1_distance\tARG0_distance\tGLY1_distance'.split()
df['combined_distance'] = df[summed_distance_cols].sum(axis=1)

In [None]:
# Spread of the summed substrate distance within each pocket cluster.
px.violin(df, x='pocket_cluster', y='combined_distance')

In [None]:
# shortlisted: the first 99 members of every pocket cluster, provided the
# summed substrate distance stays below 8
is_top_ranked = df['pocket_cluster_rank'] < 100
is_close = df['combined_distance'] < 8
combinations = df.loc[is_top_ranked & is_close]
stats['shortlisted'] = len(combinations)
print(stats)

## Get analogues

In [None]:
# PDB text of the template structure; handed to the placement step further down.
pdbblock = Path('x0310_apo.pdb').read_text()

In [None]:
from smallworld_api import SmallWorld, NoMatchError

catalogue = 'REAL-Database-22Q1.smi.anon'

# Positional view of the shortlist. `query_index` is looked up against this
# 0..n-1 index for EVERY per-query column: `combinations` keeps the gappy index
# left over from the earlier filtering steps, so mapping through it would pair
# analogues with the wrong merger or with nothing at all.
# Assumes `query_index` is the position of the query in the submitted list.
queries = combinations.reset_index(drop=True)

sws = SmallWorld()
analogs = sws.search_many(queries.smiles.to_list(),
                          dist=15,
                          length=1_000,
                          db=catalogue,
                          tolerated_exceptions=Exception)

analogs['catalogue'] = catalogue
analogs['query_name'] = analogs.query_index.map(queries['name'].to_dict())
analogs['hits'] = analogs.query_index.map(queries['hit_mols'].to_dict())
# a query without a list of hits is labelled 'error' rather than raising
analogs['hit_names'] = analogs.hits.apply(lambda m: [mm.GetProp('_Name') for mm in m] if isinstance(m, list) else ['error'])
analogs['minimized_merger'] = analogs.query_index.map(queries['minimized_mol'].to_dict())
analogs['unminimized_merger'] = analogs.query_index.map(queries['unminimized_mol'].to_dict())
analogs['name'] = analogs['id'] + ':' + analogs['query_name']
# hitSmiles may carry extra whitespace-separated fields; keep the first one
analogs['smiles'] = analogs.hitSmiles.str.split(expand=True)[0]
# NOTE(review): `get_custom_map` is not defined in the visible cells — confirm it exists before running.
analogs['custom_map'] = analogs.loc[~analogs.hits.isna()].apply(get_custom_map, axis=1)
analogs.to_pickle('fragmenstein_analogues_manual.REAL.pkl.gz')

In [None]:
# Shell listing of the files in the working directory whose name contains "analog".
!ls *analog*

In [None]:
# Stack the analogue tables from the three searches into one frame.
analog_pickles = ['fragmenstein_analogs_fragpairsA.pkl.gz',
                  'fragmenstein_analogs_fullpairs.pkl.gz',
                  'fragmenstein_analogues_manual.REAL.pkl.gz']
analog_tables = [pd.read_pickle(filename).reset_index(drop=True) for filename in analog_pickles]
analogs = pd.concat(analog_tables, ignore_index=True)

In [None]:
# Show how many analogues were loaded. The commented lines are an optional
# filter for rows without hits and an optional dump of the combined table.
#analogs = analogs.loc[~analogs.hits.isna()].reset_index(drop=True)
len(analogs)
#analogs.copy().to_pickle('fragmenstein_analogues_combined.pkl.gz')

In [None]:
import os

# There are too many analogues. Done on multiple nodes.
# Rows without hits cannot be placed, so they are dropped first.
analogs = analogs.loc[~analogs.hits.isna()]
# os.cpu_count() may return None; leave one core free but never ask for fewer than one.
n_cores = max(1, (os.cpu_count() or 2) - 1)
placements: pd.DataFrame = Laboratory._place_ops(analogs=analogs,
                                                 pdbblock=pdbblock,
                                                 n_cores=n_cores,
                                                 timeout=240,
                                                 suffix='manual')

hit_replacements = pd.read_pickle('fragmenstein_hit_replacements_fragpairs.pkl.gz')
# NOTE(review): `settings` is not defined in the visible cells — confirm it is set before this cell runs.
Laboratory.score(placements, hit_replacements, **settings)
placements.to_pickle('fragmenstein_placements_manual.pkl.gz')

## Try again

In [None]:
import pandas as pd
from pathlib import Path

# Gather every placement table, skipping the filtered table that this notebook
# itself writes further down: on a re-run it would also match the glob and be
# concatenated with the raw tables it was derived from.
placement_paths = sorted(path for path in Path('.').glob('*_placements*.pkl.gz')
                         if not path.name.endswith('_filtered.pkl.gz'))
placements: pd.DataFrame = pd.concat([pd.read_pickle(path) for path in placement_paths], ignore_index=True)

In [None]:
# Discard every placement whose outcome is not 'acceptable'.
is_acceptable = placements['outcome'] == 'acceptable'
placements = placements.loc[is_acceptable]
len(placements) # from 251130

In [None]:
# Keep placements with a negative ad hoc penalty.
has_negative_penalty = placements['ad_hoc_penalty'] < 0
placements = placements[has_negative_penalty].copy()
len(placements)

In [None]:
from fragmenstein.laboratory._score import UniquenessMeter

# Interaction columns are the ones whose label is a tuple.
intxn_names = list(filter(lambda c: isinstance(c, tuple), placements.columns))
# per-interaction totals over all placements, handed to the meter
tallies = placements[intxn_names].sum()
ratioed = UniquenessMeter(tallies, intxn_names, k=0.5)
placements['interaction_uniqueness_metric'] = placements.apply(ratioed, axis=1)

In [None]:
from fragmenstein.laboratory._score import PenaltyMeter

# Weight given to each column when the ad hoc penalty is computed.
penalty_weights = {
    "N_rotatable_bonds": 1,
    "∆∆G": 1,
    "interaction_uniqueness_metric": -2.5,
    "N_unconstrained_atoms": 0.2,
    "N_constrained_atoms": -0.05,
    "N_interactions": -1.5,
    "N_interactions_lost": 2,
    "max_hit_Tanimoto": -0.5,
    "N_PAINS": 5,
    "strain_per_HA": 1,
}
penalize = PenaltyMeter(weights=penalty_weights)
placements['ad_hoc_penalty'] = placements.apply(penalize, axis=1)

In [None]:
import plotly.io as pio
pio.renderers.default='iframe'  
import plotly.express as px

# Interaction columns whose third element (the residue number) is 105 or 124.
pocket_residue_numbers = (105, 124)
pocket_cols = [c for c in placements.columns
               if isinstance(c, tuple) and c[2] in pocket_residue_numbers]
# True when a placement has at least one interaction with those residues.
placements['deep_P1_sidechain'] = placements[pocket_cols].sum(axis=1).gt(0)

px.scatter(placements,
           x='ad_hoc_penalty',
           y='∆∆G',
           color='deep_P1_sidechain',
           opacity=0.2,
           template='plotly_white')

In [None]:
# Display the interaction columns that were selected for residues 105 and 124.
pocket_cols

In [None]:
# Keep placements whose recomputed ad hoc penalty is negative.
has_negative_penalty = placements['ad_hoc_penalty'] < 0
placements = placements[has_negative_penalty].copy()
len(placements)

In [176]:
# Save the filtered placements so later sessions can start from here.
placements.to_pickle('fragmenstein_placements_filtered.pkl.gz')

## Inspect

In [None]:
from rdkit import Chem
from rdkit.Chem import Draw

# 2D depictions of the ten placements with the lowest ad hoc penalty.
best_smiles = placements.sort_values('ad_hoc_penalty').head(10).smiles
best_mols = best_smiles.apply(Chem.MolFromSmiles)
Draw.MolsToGridImage(best_mols)

In [None]:
import py3Dmol
from fragmenstein.mol3d_display import monkey_patch
from fragmenstein.branding import divergent_colors
n = 20
# palette sized from n so the two cannot drift apart
colors = divergent_colors[n]


viewer = py3Dmol.view()
monkey_patch(viewer)
top_placements = placements.sort_values('ad_hoc_penalty').head(n).reset_index(drop=True)
for i, row in top_placements.iterrows():
    # row['name'] is the compound name; `row.name` would be the row label (the integer i)
    viewer.add_mol(row['minimized_mol'], name=row['name'], carbon_color=colors[i])
# the template is the same for every molecule: read it and add it once
viewer.add_template(pdbblock = Path('x0310_apo.pdb').read_text())
viewer.show()

In [None]:
#placements['far_pocket'] = placements.name.apply(lambda name: any([n in name for n in 'x0473 x0911 x0853 x0929'.split()]))
wanted_cols = [c for c in placements.columns if isinstance(c, tuple) and c[2] in (18, 124, 82, 100, 98, 131, 83)]
# NOTE(review): `ser105_col` was not defined anywhere in the visible cells; it is
# taken here to be the interaction columns of residue 105 — confirm this was the intent.
ser105_col = [c for c in placements.columns if isinstance(c, tuple) and c[2] == 105]
# more than one interaction with the wanted residues AND at least one with residue 105
has_wanted = placements[wanted_cols].sum(axis=1) > 1
has_ser105 = placements[ser105_col].sum(axis=1) > 0
subbed = placements.loc[has_wanted & has_ser105].sort_values('ad_hoc_penalty').head(n).reset_index()
from IPython.display import display

display(subbed[['name', 'ad_hoc_penalty', '∆∆G']+wanted_cols])
colors = divergent_colors[len(subbed)]

viewer = py3Dmol.view()
monkey_patch(viewer)
for i, row in subbed.iterrows():
    # row['name'] is the compound name; `row.name` would be the row label (the integer i)
    viewer.add_mol(row['minimized_mol'], name=row['name'], carbon_color=colors[i])
# the template is the same for every molecule: read it and add it once
viewer.add_template(pdbblock = Path('x0310_apo.pdb').read_text())
viewer.show()

## Remove similar

In [None]:
from fragmenstein.laboratory._score import butina_cluster
# Entries that are not RDKit molecules are replaced by an empty molecule.
m = placements.minimized_mol.map(lambda mol: mol if isinstance(mol, Chem.Mol) else Chem.Mol())
cluster_numbers = butina_cluster(m.to_list(), 0.2)
placements['cluster'] = cluster_numbers
# highest cluster number assigned
placements.cluster.max()

In [None]:
# Within each cluster keep only the placement with the lowest ad hoc penalty.
ranked_placements = placements.sort_values('ad_hoc_penalty').reset_index(drop=True)
placements = ranked_placements.drop_duplicates(subset='cluster')

In [None]:
from collections import defaultdict
from Bio.SeqUtils import seq1

def narrate(row: pd.Series):
    """
    Summarise the interactions of one placement as a short text.

    Only entries whose label is a ``(interaction type, residue name, residue
    number)`` tuple are considered; entries that are zero or missing (NaN) are
    ignored. Residues are written as one-letter code plus number, grouped by
    interaction type, with the types in alphabetical order.

    :param row: one row of the placements table
    :return: e.g. ``'hbond:S105; hydroph:V124; '`` (empty string if there are no interactions)
    """
    grouped = defaultdict(list)
    for name, value in row.items():
        # a missing value means "no interaction recorded", not an interaction
        if not isinstance(name, tuple) or pd.isna(value) or value == 0.:
            continue
        itxn_type, resn, resi = name
        grouped[itxn_type].append(seq1(resn, undef_code="X")+str(resi))
    return ''.join(f'{itxn_type}:{"+".join(residues)}; '
                   for itxn_type, residues in sorted(grouped.items()))

# Human-readable interaction summary per placement, prefixed with 'info '.
narratives = placements.apply(narrate, axis=1)
placements['rationale'] = 'info ' + narratives

## Cost

In [None]:
import json

# Load the price cache from earlier runs. The file must be opened for READING:
# opening it in 'w' mode would truncate the cache and make json.load fail.
try:
    with open('cost.json', 'r') as fh:
        costs = json.load(fh)
except FileNotFoundError:
    # first run: no prices cached yet
    costs = {}

from gist_import import GistImporter

# Helper module for price lookups, imported straight from its GitHub source.
store = GistImporter.from_github('https://raw.githubusercontent.com/matteoferla/Fragment-hit-follow-up-chemistry/main/fragment_elaboration_scripts/enamine_store.py')\
                    .to_module()

In [None]:
import json
import time

#costs = {}

# The catalogue Id is the part of the name before the first '-x'.
placements['Id'] = placements.name.str.split('-x',expand=True)[0] 

try:
    for code in placements.sort_values('ad_hoc_penalty').reset_index().Id.to_list()[:1000]:
        already_priced = code in costs
        if not already_priced:
            # the catalogue to query is picked from the letters in the Id
            if 'Z' in code or 'PV' in code:
                catalogue = store.StoreCatalog.REALDB
            elif 'EN' in code:
                catalogue = store.StoreCatalog.BB
            else:
                raise ValueError(f'Cannot tell which catalogue {code!r} belongs to')
            costs[code] = store.get_price(code, catalogue=catalogue, currency=store.StoreCurrency.USD)
        print(code, costs[code])
        if not already_priced:
            # pause after every remote lookup
            time.sleep(10)
finally:
    # always persist what was fetched, even if the loop was interrupted
    with open('cost.json', 'w') as fh:
        json.dump(costs, fh)

In [None]:
# Look up the cached price of every compound by its catalogue Id.
placements['$/mg'] = placements['Id'].map(costs)

## Twice: w/ and w/o S105 & V124

In [None]:
import numpy as np
from scipy.cluster.vq import kmeans, vq
from collections import defaultdict

def intxn_cluster(df, k, seed=None):
    """
    Cluster the rows of ``df`` by their interactions (k-means), in place.

    Every interaction column (those labelled by a tuple) is divided by its
    column total, so each interaction is on a probability-like scale. Missing
    values, and columns whose total is zero, count as 0.

    Two columns are added to ``df``:

    * ``intxn_cluster``: the cluster label of the row;
    * ``intxn_cluster_rank``: the 1-based order of appearance of the row within
      its cluster.

    :param df: table with tuple-labelled interaction columns; modified in place
    :param k: number of clusters requested from k-means
    :param seed: optional seed forwarded to ``scipy.cluster.vq.kmeans`` to make
        the clustering reproducible (default ``None``: unseeded, as before)
    :return: None
    """
    intxn_cols = [c for c in df.columns if isinstance(c, tuple)]
    counts = df[intxn_cols].fillna(0).to_numpy(dtype=float)
    totals = counts.sum(axis=0)
    with np.errstate(divide='ignore', invalid='ignore'):
        scaled = counts / totals
    # 0/0 (an interaction nobody makes) becomes 0
    scaled[np.isnan(scaled)] = 0.0
    centroids, _ = kmeans(scaled, k, seed=seed)
    labels, _ = vq(scaled, centroids)
    df['intxn_cluster'] = labels
    df['intxn_cluster_rank'] = df.groupby('intxn_cluster').cumcount() + 1

In [None]:
# Flag placements that have any interaction in the pocket columns.
pocket_interaction_count = placements[pocket_cols].sum(axis=1)
placements['deep_P1_sidechain'] = pocket_interaction_count.astype(bool)

In [None]:
# Placements flagged as deep_P1_sidechain, clustered on their own.
in_pocket = placements['deep_P1_sidechain']
df1 = placements[in_pocket].copy()
intxn_cluster(df1, k=8)

In [None]:
# The remaining placements (not flagged as deep_P1_sidechain), clustered on their own.
outside_pocket = ~placements['deep_P1_sidechain']
df2 = placements[outside_pocket].copy()
intxn_cluster(df2, k=8)

In [None]:
# This is slow... inadvisable
placements['intxn_cluster'] = {**(df1['intxn_cluster'] + 100).to_dict(), **(df2['intxn_cluster'] + 200).to_dict()}
placements['intxn_cluster_rank'] = {**df1['intxn_cluster_rank'].to_dict(), **df2['intxn_cluster_rank'].to_dict()}

In [None]:
# Save again, now with the added columns; this overwrites the file of the same name written earlier.
placements.to_pickle('fragmenstein_placements_filtered.pkl.gz')

### Prep

In [None]:
from gist_import import GistImporter

# fu for fragalysis upload
PREP_FRAGALYSIS_URL = 'https://raw.githubusercontent.com/matteoferla/Fragment-hit-follow-up-chemistry/main/fragment_elaboration_scripts/prep_fragalysis.py'
fu = GistImporter.from_github(PREP_FRAGALYSIS_URL).to_module()
# functions used below:
# generate_header
# floatify_columns
# prep

In [None]:
# Pick which subset is exported and the method name it is uploaded under.
# Swap the commented pair in to export the other subset instead.
df = df1
method_name = 'A71-Fragmenstein-iter2-at-S105-V124'

#df = df2
#method_name = 'A71-Fragmenstein-iter2-not-S105-V124'

In [None]:
import operator

# Columns written to the SDF and the type each one is cast to.
wanted_key_types = {
    'rationale': str,
    'intxn_cluster': int,
    'intxn_cluster_rank': int,
    'N_interactions': int,
    '∆∆G': float,
    '$/mg': float,
    'comRMSD': float,
    'N_rotatable_bonds': int,
}

df = df.copy()
# Every tuple-labelled interaction column gets a flat 'type:resn:resi' twin,
# added in order of residue number.
intxn_columns = [col for col in df.columns if isinstance(col, tuple)]
intxn_columns.sort(key=operator.itemgetter(2))
for col in intxn_columns:
    name = ':'.join(str(part) for part in col)
    df[name] = df[col]
    wanted_key_types[name] = int

for k, ktype in wanted_key_types.items():
    df[k] = df[k].astype(ktype)
df = df.copy()

wanted_keys = list(wanted_key_types)

# reference molecules: the first eight characters of each hit name, comma separated
df['ref_mols'] = df.hit_names.apply(lambda ns: ','.join(n[:8] for n in ns))

header: Chem.Mol = fu.generate_header(
    method=method_name,
    ref_url='https://github.com/matteoferla/EV-A71-2A-elaborations',
    submitter_name='Matteo Ferla',
    submitter_email='matteo.ferla@stats.ox.ac.uk',
    submitter_institution='University of Oxford',
    extras={key: key for key in wanted_keys},
)

# export the first hundred rows after sorting by rank within interaction cluster
export_table = df.sort_values('intxn_cluster_rank').reset_index().iloc[:100]
fu.prep(export_table,
        header,
        mol_col='minimized_mol',
        name_col='Id',
        outfile=f'{method_name}.sdf',
        ref_pdb_name='x0310_0A',
        extras=wanted_keys)

In [None]:
#!pip install git+https://github.com/retostauffer/python-colorspace

n = 100

import numpy as np
from colorspace.colorlib import HCL, hexcols

# n evenly spaced hues starting at 15 degrees, wrapped back into [0, 360)
hues : np.ndarray = np.linspace(0, 360, n + 1) + 15
hues = np.where(hues >= 360, hues - 360, hues)

# constant chroma and luminance; the final hue duplicates the first and is dropped
colors = HCL(H=hues[:-1], C=[100] * n, L=[65] * n)
colors.to('hex')
# one colouring command per state (presumably meant for PyMOL — confirm)
for i, color in enumerate(colors.colors()):
    hex_digits = color[1:]
    print(f'color 0x{hex_digits}, resn UNK and state {i} and element C')

In [175]:
# Print the degree sign (U+00B0).
print('\u00B0')

°
