In [None]:
from sqlitedict import SqliteDict
from rdkit.Chem import PandasTools
import json
import pandas as pd
from fragmenstein import Victor

import numpy as np
import os
from rdkit import Chem
from rdkit.Chem import AllChem


def old_ranker(row):
    """
    Legacy ranking score for a single result row.

    The score is ``∆∆G/5 + comRMSD + N_unconstrained_atoms/5 - N_constrained_atoms/10``.

    :param row: record (e.g. a row of the results table) giving ``'∆∆G'`` by key and
        ``comRMSD``, ``N_unconstrained_atoms`` and ``N_constrained_atoms`` as attributes.
    :return: the score as a float, or ``nan`` when a field is missing or not numeric.
    """
    try:
        return (float(row['∆∆G']) / 5
                + float(row.comRMSD)
                + row.N_unconstrained_atoms / 5
                - row.N_constrained_atoms / 10)
        #return float(row['∆∆G'])/(row.N_unconstrained_atoms + row.N_constrained_atoms * 0.5)*10 + float(row.comRMSD)
    except Exception:
        # Best effort: a row that cannot be scored gets NaN instead of aborting the caller.
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt/SystemExit propagate.
        return float('nan')
    

# Weights of the terms summed by `ranker`.
rank_weights = {'LE': 1., 'comRMSD': 2., 'atom_bonus': 3. , 'novelty_penalty': 5.}
def ranker(row):
    """
    Weighted ranking score for a single result row.

    Terms (weights from ``rank_weights``):

    * ``+ LE``
    * ``+ comRMSD``
    * ``- atom_bonus``, where ``atom_bonus = N_constrained_atoms / (20 + N_constrained_atoms)``
    * ``+ novelty_penalty``, where ``novelty_penalty = N_unconstrained_atoms / N_constrained_atoms``

    :param row: record exposing ``LE``, ``comRMSD``, ``N_constrained_atoms`` and
        ``N_unconstrained_atoms`` as attributes.
    :return: the score as a float, or ``nan`` when it cannot be computed
        (missing/non-numeric field, or a division error).
    """
    try:
        atom_bonus = row.N_constrained_atoms / (20 + row.N_constrained_atoms)
        novelty_penalty = row.N_unconstrained_atoms / row.N_constrained_atoms
        return (rank_weights['LE'] * float(row.LE)
                + rank_weights['comRMSD'] * float(row.comRMSD)
                - rank_weights['atom_bonus'] * atom_bonus
                + rank_weights['novelty_penalty'] * novelty_penalty)
    except Exception:
        # Best effort: unscoreable rows get NaN. `except Exception` (not a bare `except:`)
        # lets KeyboardInterrupt/SystemExit propagate.
        return float('nan')
    
def LE(row):
    """
    Ligand efficiency of a single result row: ``∆∆G`` divided by the total atom count
    (``N_unconstrained_atoms + N_constrained_atoms``).

    :param row: record giving ``'∆∆G'`` by key and the two atom counts as attributes.
    :return: the ratio as a float, or ``nan`` when it cannot be computed
        (missing/non-numeric field, or a division error).
    """
    try:
        atom_count = row.N_unconstrained_atoms + row.N_constrained_atoms
        return float(row['∆∆G']) / atom_count
    except Exception:
        # Best effort: unscoreable rows get NaN. `except Exception` (not a bare `except:`)
        # lets KeyboardInterrupt/SystemExit propagate.
        return float('nan')

def get_mol3D(name):
    """
    Read the minimised molfile ``<Victor.work_path>/<name>/<name>.minimised.mol``.

    :param name: name of the result; used both as the sub-folder and as the file stem.
    :return: the sanitised RDKit molecule, or ``None`` when the file does not exist,
        cannot be parsed, or raises while loading/sanitising (the error is printed).
    """
    mol_file = os.path.join(Victor.work_path, name, name+'.minimised.mol')
    if not os.path.exists(mol_file):
        return None
    try:
        molecule = Chem.MolFromMolFile(mol_file, sanitize=True)
        if molecule is not None:
            Chem.SanitizeMol(molecule, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL)
        return molecule
    except Exception as error:
        print(f'{type(error)}: {error}')
        return None


def get_table(db_name, mols=True, mol_only=True):
    """
    Load the results stored in a SqliteDict database into a ranked table.

    Adds the columns ``LE`` (via ``LE``), ``%Rank`` (rank of the ``ranker`` score, scaled so
    that the largest rank is 100) and ``N_hits`` (length of ``regarded``), drops the rows
    without a ``smiles`` and sorts by ascending ``%Rank``.

    :param db_name: path of the sqlite file; its values are JSON-encoded records.
    :param mols: if True, add ``mol3D`` (via ``get_mol3D`` on ``name``) and ``mol2D``
        (built from ``smiles``) columns.
    :param mol_only: if True (and ``mols`` is True), keep only the rows that have a ``mol3D``.
    :return: the resulting ``pandas.DataFrame``.
    """
    # Read everything inside a `with` block so the database connection is closed as soon
    # as the records are in memory (previously it was left open).
    with SqliteDict(db_name, encode=json.dumps, decode=json.loads, autocommit=True) as results:
        result_table = pd.DataFrame(results.values())
    # number of records, number of records with a ∆∆G
    print(len(result_table), sum(~result_table['∆∆G'].isna()))
    result_table['LE'] = result_table.apply(LE, axis=1)
    rank = result_table.apply(ranker, axis=1).rank()
    m = np.nanmax(rank.values)
    result_table['%Rank'] = rank / m * 100
    # `regarded` is either a sequence of hit names or NaN
    result_table['N_hits'] = result_table.regarded.apply(lambda x: len(x) if str(x) != 'nan' else float('nan'))
    result_table = result_table.loc[~result_table.smiles.isna()].sort_values(['%Rank'], axis=0)
    if mols:
        result_table['mol3D'] = result_table['name'].apply(get_mol3D)
        #result_table['mol2D'] = result_table['name'].apply(get_mol2D)
        PandasTools.AddMoleculeColumnToFrame(result_table,'smiles','mol2D')
        if mol_only:
            result_table = result_table.loc[~result_table.mol3D.isna()]
    return result_table

In [None]:
##############################################
# Select the project: its name is both Fragmenstein's working folder and the stem
# of the sqlite results file loaded into `result_table`.
from fragmenstein import Victor

project = 'mergers'
db_name = f'{project}.sqlite'
Victor.work_path = project

result_table = get_table(db_name, mols=True)
result_table

In [None]:
# Number of atoms of each hit, keyed by the molfile name without its extension.
atom_Ns = {}

for folder in ('newinputs',): #'input', 'UCSF2-hits', 'frags'):
    for file in os.listdir(folder):
        # Match on the real extension: a substring test for '.mol' also picked up files
        # such as 'x.mol2' or 'x.mol.bak' and produced mangled keys ('x2', 'x.bak').
        hit_name, extension = os.path.splitext(file)
        if extension != '.mol':
            continue
        mol = Chem.MolFromMolFile(os.path.join(folder, file), sanitize=False)
        if mol is None:
            atom_Ns[hit_name] = 0 # float nan?
        else:
            mol = Chem.GetMolFrags(mol, asMols=True)[0] # just in case: count the first fragment only
            atom_Ns[hit_name] = mol.GetNumAtoms()

In [None]:
def hit_counter(hits):
    """Sum of the ``atom_Ns`` entries of the given hit names."""
    return sum(atom_Ns[hit] for hit in hits)


def merge_counter(row):
    """``N_hit_atoms`` minus the unconstrained and the constrained atom counts of ``row``."""
    return row.N_hit_atoms - row.N_unconstrained_atoms - row.N_constrained_atoms


# N_diff_atoms is given as a callable so that it is computed on the frame that already
# holds N_hit_atoms.
result_table = result_table.assign(
    N_hit_atoms=result_table.regarded.apply(hit_counter),
    N_diff_atoms=lambda table: table.apply(merge_counter, axis='columns'),
)
result_table

## Michelanglo

In [None]:
import os
from getpass import getpass

from michelanglo_api import MikeAPI

# Credentials are not written in the notebook: they come from the environment variables
# MICHELANGLO_USERNAME / MICHELANGLO_PASSWORD, or are asked for interactively.
mike_username = os.environ.get('MICHELANGLO_USERNAME') or input('Michelanglo username: ')
mike_password = os.environ.get('MICHELANGLO_PASSWORD') or getpass('Michelanglo password: ')
mike = MikeAPI(mike_username, mike_password)
del mike_password  # do not keep the password around in the kernel namespace
#p = mike.convert_pdb('6WOJ') # new
p = mike.get_page('XXXXXX') # edit existing -- placeholder: put the identifier of the page to edit
p.retrieve()
p.show_link()

In [None]:
import os, re

# Names and locations used by the export cells below.
task = 'Fragmenstein_NSP3_mergers'
#repo_name = 'Data_for_own_Michelanglo_pages2'
repo_name = 'NSP3-macrodomain'
folder = 'molfiles'   # sub-folder of the git repository receiving the exported files
title = 'Fragmenstein NSP3 Mergers'
#gitfolder=f'/well/brc/matteo/{repo_name}'
gitfolder = '/well/brc/matteo/NSP3/git-repo'
sdfile = f'{gitfolder}/{folder}/mergers.sdf'
xlsfile = f'{gitfolder}/{folder}/mergers.xlsx'
targetfolder = f'{gitfolder}/{folder}'

# makedirs(exist_ok=True) also creates missing parent folders and does not fail when the
# folder already exists (the exists()+mkdir() pair did neither safely).
os.makedirs(targetfolder, exist_ok=True)

In [None]:
# Which export the cells below prepare.
target = 'Mike'
#target = 'Excel'
#target = 'Frag'


# fragments have _Greek
def degreekify(s):
    """Remove every underscore followed by a lowercase Greek letter from ``s``."""
    return re.sub('_[α-ω]', '', s)


def deprefixify(s):
    """Remove every occurrence of ``mArh-`` from ``s``."""
    return re.sub('mArh-', '', s)


def clean_refs(x):
    """
    Comma-separated reference codes for the hit names in ``x``.

    Only the names containing ``mArh-x`` are kept, each stripped with ``degreekify``
    and ``deprefixify``. When nothing is left, ``'x0104_0'`` is returned.
    """
    kept = [deprefixify(degreekify(xx)) for xx in x if 'mArh-x' in xx]
    return ','.join(kept) or 'x0104_0'

from datetime import datetime, date

# Rows to export: best %Rank first, with a 3D molecule and at least one regarded hit.
outgoing = result_table.sort_values(['%Rank'], axis=0).loc[~result_table.mol3D.isna()]
# .copy() makes `outgoing` an independent frame, so the column assignments below do not
# write into a slice of `result_table` (SettingWithCopyWarning / chained assignment).
outgoing = outgoing.loc[outgoing.regarded.apply(lambda x: len(x) >= 1)].copy()
outgoing['ref_mols'] = outgoing.regarded.apply(clean_refs)
# lists of names -> comma-separated strings
outgoing['regarded'] = outgoing.regarded.apply(lambda x: ','.join(x))
outgoing['disregarded'] = outgoing.disregarded.apply(lambda x: ','.join(x))
# outgoing['ref_mols'] = outgoing.apply(lambda row: row.regarded+','+row.disregarded if row.disregarded else row.regarded, axis=1)
# regex=False: these are literal substrings, independent of the pandas default for `regex`
outgoing['name'] = outgoing['name'].str.replace('-covalent', '', regex=False)
outgoing['name'] = outgoing['name'].str.replace('mArh-', '', regex=False)
outgoing['ref_pdb'] = 'x0104_0' #'6WOJ_0_A' #'X0104_0_A' #
outgoing['original SMILES'] = outgoing['smiles']
# Fields of the header record:
# ref_url - the url to the forum post that describes the work
# submitter_name - the name of the person submitting the compounds
# submitter_email - the email address of the submitter
# submitter_institution - the submitter's institution
# generation_date - the date that the file was generated in format yyyy-mm-dd
# method
# ref_mols - a comma separated list of the fragments that inspired the design of the new molecule (codes as they appear in fragalysis - e.g. x0104_0,x0692_0)
# ref_pdb - either (a) a filepath (relative to the sdf file) to an uploaded pdb file (e.g. Mpro-x0692_0/Mpro-x0692_0_apo.pdb) or (b) the code to the fragment pdb from fragalysis that should be used (e.g. x0692_0)
# original SMILES


# Header record: submission details plus a description of each exported column.
# Two typos in the emitted text are corrected: 'Universtity' -> 'University', 'RSMD' -> 'RMSD'.
metadata = {'name':           'ver_1.2',
            'submitter_name': 'Matteo Ferla',
            'submitter_email': 'matteo@well.ox.ac.uk',
            'submitter_institution': 'University of Oxford',
            'generation_date': date.today().isoformat(),
            'method': 'Fragmenstein-top-automerger',
            'original SMILES': 'molecule smiles',
            #'smiles': 'molecule smiles used',
            'ref_url': 'https://github.com/matteoferla/NSP3-macrodomain',
            'ref_mols': 'all reference molecules',
            'ref_pdb': 'All ligands were evaluated against the apo of Rosetta-ED-guided-minimised 6WOJ',
            'N_hits': 'Number of hits used',
            'N_constrained_atoms': 'Number of atoms in the submission that were constrained',
            'N_diff_atoms': 'Difference in number of heavy atoms between the merger and the hits (negative: atoms added, positive: atoms merged)',
            #'N_unconstrained_atoms': 'Number of heavy atoms in the submission that were NOT constrained',
            #'runtime': 'seconds it took',
            'regarded': 'Fragments used for mapping',
            'disregarded': 'Fragments rejected for mapping',
            'comRMSD': 'Combined RMSD from the atoms of the fragments that contributed to the position of the followup',
            '∆∆G': 'Difference in Gibbs Free energy relative to unbound molecule in kcal/mol (ref2015 scorefxn; negative=Good)',
            #'∆G_bound': 'Gibbs Free energy of ligand bound',
            #'∆G_unbound': 'Gibbs Free energy of ligand unbound',
            'LE': 'Ligand efficiency (kcal/mol/N_heavy)',
            # built from rank_weights so the description follows the weights actually used
            '%Rank': (f"Sorted by {rank_weights['comRMSD']}x RMSD (high is bad) + "
                      f"{rank_weights['LE']}x ligand efficiency (high is bad) - "
                      f"{rank_weights['atom_bonus']}x N_constrained_atoms/(20+N_constrained_atoms) + "
                      f"{rank_weights['novelty_penalty']}x N_unconstrained_atoms/N_constrained_atoms"),
            # placeholder molecules for the header record
            'mol3D': Chem.MolFromSmiles('FOO'),
            'mol2D': Chem.MolFromSmiles('FOO'),
          }

# Trim the header record for the chosen target; 'Frag' also keeps only the first 500 rows.
if target == 'Mike':
    metadata.pop('mol2D')
elif target == 'Frag':
    for unwanted in ('regarded', 'disregarded', 'mol2D'):
        metadata.pop(unwanted)
    outgoing = outgoing.head(500)

# add first compound: the header record becomes row 0 of the table
outgoing = pd.concat([pd.DataFrame([metadata]), outgoing], ignore_index=True)

In [None]:
## MAKE SDF

# For the 'Excel' target the `mol2D` key is still in `metadata`, so it would be listed
# among the SDF properties below.
assert target != 'Excel', 'Requires mol2D'
from rdkit.Chem import PandasTools
#Fragmenstein_permissive_rescored_20200609.sdf
# Keep the properties in the order they are declared in `metadata`: going through a set
# made the order of the SDF fields change from one kernel session to the next.
sdf_properties = [key for key in metadata if key not in ('name', 'mol3D')]
PandasTools.WriteSDF(outgoing, sdfile, molColName='mol3D', idName='name',
                     properties=sdf_properties, allNumeric=False)

In [None]:
## Make Excel

assert target == 'Excel', 'Requires mol2D'
# Columns follow the order in which they are declared in `metadata`: going through a set
# made the column order of the spreadsheet change from one kernel session to the next.
xlsx_columns = [key for key in metadata if key != 'mol3D']
PandasTools.SaveXlsxFromFrame(outgoing[xlsx_columns], xlsfile, molCol='mol2D', size=(300, 300))

In [None]:
# --- Page text and layout ---
p.description = f'''
## {title} {date.today().isoformat()}

[Fragmenstein](https://github.com/matteoferla/Fragmenstein) auto-mergers.

> For files and notes see [NSP3 data on GitHub](https://github.com/matteoferla/NSP3-macrodomain).

'''
p.loadfun = ''
p.title = title
p.columns_viewport = 6
p.columns_text = 6

# --- Files derived from the SDF ---
p.sdf_to_mols(sdfile=sdfile, targetfolder=targetfolder, skip_first=True)

# SDF property -> default passed alongside it (presumably used when the property is
# missing -- confirm against michelanglo_api). Order matters: it fixes the key order.
json_defaults = {
    'regarded': '',
    '∆∆G': 999.,
    'LE': 999.,
    'N_hits': 0,
    'N_constrained_atoms': 0,
    'N_diff_atoms': 0,
    'comRMSD': 999.,
    '%Rank': 100.,
}
p.sdf_to_json(sdfile=sdfile,
              keys=tuple(json_defaults.keys()),
              key_defaults=tuple(json_defaults.values()),
              filename=f'{targetfolder}/data.json')

# --- Table of compounds, then push the page ---
# NOTE(review): sort_col=8 looks like the '%Rank' column if column 0 is the name -- confirm.
p.make_fragment_table(
    sdfile=sdfile,
    username='matteoferla',
    repo_name=repo_name,
    foldername=folder,
    protein_sele='81:A',
    sort_col=8,
    sort_dir='asc',
    template_row=-1,
    fragment_row=1,
    jsonfile='data.json',
)
p.commit()

In [None]:
%%bash

# Stop if the repository folder is not there: otherwise the rm / git add / git push
# below would run in whatever directory the cell was started from.
cd /well/brc/matteo/NSP3/git-repo || exit 1
# tar -czvf 'Fragmenstein_NSP3_quicktest2/mergers.sdf.gz' 'Fragmenstein_NSP3_quicktest2/mergers.sdf'
# The SDF itself is removed before committing; -f so a missing file (e.g. Excel target) is not an error.
rm -f 'molfiles/mergers.sdf'
git add .
git commit -m ':truck: data!'
git push