In [2]:
import os
import time
from time import process_time

import numpy as np
import pandas as pd
import rdkit
import rdkit.Chem
import rdkit.Chem.rdMolAlign
from cresset import flare

In [14]:
def get_testset(path):
    """Load the rescoring test set, building it from the raw tables if needed.

    If ``<path>/rescore_result.csv`` exists it is read and returned unchanged.
    Otherwise ``<path>/metadata.csv`` and ``<path>/Mpro_soaks.csv`` are joined
    on the real crystal name, restricted to rows with a fluorescence IC50
    below 99, reduced to one row per real crystal name and given a ``pIC50``
    column. The frame built this way is not written back to disk.

    NOTE(review): this notebook defines ``get_testset`` twice; whichever cell
    ran last is the one in effect. The frame built here calls the per-chain
    identifier ``name``, while the other helpers in this notebook index on
    ``sub_crystal_name`` -- confirm which schema downstream code expects.

    Parameters
    ----------
    path : str
        Directory holding the CSV files.

    Returns
    -------
    pandas.DataFrame
        Either the cached table, or a frame with columns ``site_name``,
        ``name``, ``real_name``, ``SMILES_meta``, ``CID``, ``R_IC50``,
        ``F_IC50`` and ``pIC50``.
    """
    testset_path = os.path.join(path, 'rescore_result') + '.csv'
    print(testset_path)

    if os.path.exists(testset_path):
        print('Testset exists')
        return pd.read_csv(testset_path)

    meta = pd.read_csv(os.path.join(path, 'metadata') + '.csv')
    Mpro_soaks = pd.read_csv(os.path.join(path, 'Mpro_soaks') + '.csv')

    meta = meta.rename(columns={'crystal_name': 'name',
                                'RealCrystalName': 'real_name',
                                'alternate_name': 'CID',
                                'smiles': 'SMILES_meta',
                                'new_smiles': 'new_SMILES_meta'})
    Mpro_soaks = Mpro_soaks.rename(columns={'Compound ID': 'CID',
                                            'Rapid Fire avg IC50 (uM)': 'R_IC50',
                                            'Fluorescence avg IC50 (uM)': 'F_IC50',
                                            'Sample Name': 'real_name'})

    # 'real_name' is the only key both renamed tables share. Both also carry a
    # 'CID' column, so the suffixes keep the metadata one under the plain name
    # and move the soaks one to 'CID_soaks'.
    testset = pd.merge(meta, Mpro_soaks, how='left', on='real_name',
                       suffixes=('', '_soaks'))
    columns = ['site_name', 'name', 'real_name', 'SMILES_meta', 'CID', 'R_IC50', 'F_IC50']
    testset = (testset[testset['F_IC50'].notnull()]
               .sort_values(by=['site_name', 'name'])[columns]
               .reset_index(drop=True))

    # Manual compound-ID corrections, keyed by real crystal name.
    trueCID = {'Mpro-P0243': 'EDJ-MED-d08626de-4',
               'Mpro-P0793': 'EDG-MED-5d232de5-8',
               'Mpro-P0816': 'EDG-MED-5d232de5-7'}
    for real_name, cid in trueCID.items():
        testset.loc[testset['real_name'] == real_name, 'CID'] = cid

    testset = testset[testset['F_IC50'] < 99]
    testset = testset.drop_duplicates(subset=['real_name'], keep='first')
    # F_IC50 is divided by 1e6 before taking -log10 (the source column header
    # gives the unit as uM).
    testset = testset.assign(pIC50=-np.log10(testset['F_IC50'] / 1000000))
    return testset.reset_index(drop=True)

In [4]:
def get_protein(project, title, nb_dir):
    """Load ``<nb_dir>/protein/<title>.pdb`` into the project, prepare and minimise it.

    The structure is appended to ``project.proteins``, titled ``title``, run
    through ``flare.ProteinPrep`` and then ``flare.Minimization``. Both
    calculations are started and waited on; no options are set here beyond the
    target protein.

    Parameters
    ----------
    project : flare.Project
        Project that receives the protein.
    title : str
        Base name of the PDB file (without extension) and title given to the
        loaded protein.
    nb_dir : str
        Working directory that contains the ``protein`` sub-folder.

    Returns
    -------
    The last entry of ``project.proteins`` once both calculations finished.

    Raises
    ------
    FileNotFoundError
        If the PDB file does not exist. Without this check the function would
        hand back whatever protein happened to be last in the project.
    """
    protein_path = os.path.join(nb_dir, 'protein', title) + '.pdb'
    if not os.path.exists(protein_path):
        raise FileNotFoundError('Protein structure not found: ' + protein_path)

    project.proteins.extend(flare.read_file(protein_path))
    protein = project.proteins[-1]
    protein.title = title

    print('\nPreparing complex ' + title)
    prep = flare.ProteinPrep()
    prep.proteins = [protein]
    prep.start()
    prep.wait()

    print('Minimising complex ' + title)
    minimization = flare.Minimization()
    minimization.protein = protein
    minimization.start()
    minimization.wait()

    return project.proteins[-1]

In [5]:
def get_ligands(project, testset, path):
    """Create the raw, redock and dock ligand entries for every test-set row.

    For each ``sub_crystal_name`` in ``testset`` the ligand is read from
    ``<path>/aligned/<title>/<title>.pdb`` and three entries are added to
    ``project.ligands``:

    * ``<title>_raw``    -- the ligand as read from file, with hydrogens added;
    * ``<title>_redock`` -- a second entry appended from the raw one;
    * ``<title>_dock``   -- a ligand rebuilt from the SMILES of the redock
      entry, with hydrogens added.

    Parameters
    ----------
    project : flare.Project
        Project that receives the ligands.
    testset : pandas.DataFrame
        Must provide a ``sub_crystal_name`` column.
    path : str
        Working directory that contains the ``aligned`` folder.

    Returns
    -------
    tuple of three lists
        ``(raw_list, redock_list, dock_list)``, index-aligned with the rows of
        ``testset``.
    """
    raw_list = []
    redock_list = []
    dock_list = []
    for _, row in testset.iterrows():
        title = row['sub_crystal_name']
        ligand_path = os.path.join(path, 'aligned', title, title) + '.pdb'

        project.ligands.extend(flare.read_file(ligand_path))
        raw_ligand = project.ligands[-1]
        raw_ligand.title = title + '_raw'
        raw_ligand.add_hydrogens()
        raw_list.append(raw_ligand)

        # NOTE(review): this relies on project.ligands.append() storing a
        # separate copy of an entry that is already in the project -- confirm
        # against the Flare API; otherwise the '_redock' title below would
        # overwrite the '_raw' one.
        project.ligands.append(raw_ligand)
        redock_ligand = project.ligands[-1]
        redock_ligand.title = title + '_redock'
        redock_list.append(redock_ligand)

        # Use the `project` argument here; the previous code read a global
        # named `proj`, which only worked when the caller happened to use
        # that variable name.
        rdmol = project.ligands[-1].to_rdmol()
        smiles = rdkit.Chem.rdmolfiles.MolToSmiles(rdmol)
        project.ligands.extend(flare.read_string(smiles, 'smi'))
        dock_ligand = project.ligands[-1]
        dock_ligand.add_hydrogens()
        dock_ligand.title = title + '_dock'
        dock_list.append(dock_ligand)
    return raw_list, redock_list, dock_list

In [6]:
def get_gridbox(project, title, path, padding=15):
    """Build a cubic docking box centred on a reference ligand.

    The reference is read from ``<path>/protein/<title>.pdb``, temporarily
    added to ``project.ligands`` and removed again before returning. The box
    centre is the unweighted mean position of every atom whose name does not
    contain the letter ``H``.

    NOTE(review): the name test also drops heavy atoms whose name merely
    contains an ``H``; switch to an element-based test if the Flare atom API
    offers one.

    Parameters
    ----------
    project : flare.Project
        Project used as scratch space for loading the reference.
    title : str
        Base name of the reference PDB file (without extension).
    path : str
        Working directory that contains the ``protein`` folder.
    padding : float, optional
        Half edge length of the box, in the same units as the atom
        coordinates. Defaults to 15, the value previously hard-coded.

    Returns
    -------
    tuple
        ``((xmin, ymin, zmin), (xmax, ymax, zmax))``.

    Raises
    ------
    ValueError
        If the reference has no atom passing the name test (the centroid
        would otherwise silently become NaN).
    """
    ref_path = os.path.join(path, 'protein', title) + '.pdb'
    project.ligands.extend(flare.read_file(ref_path))
    reference = project.ligands[-1]
    try:
        # Copy the coordinates out before the ligand is removed again.
        heavy_positions = [
            (atom.pos[0], atom.pos[1], atom.pos[2])
            for atom in reference.atoms
            if 'H' not in atom.name
        ]
    finally:
        # Always clean up the scratch entry, even if reading atoms failed.
        project.ligands.remove(reference)

    count = len(heavy_positions)
    if count == 0:
        raise ValueError('No heavy atoms found in reference ligand ' + ref_path)

    x = np.divide(sum(pos[0] for pos in heavy_positions), count)
    y = np.divide(sum(pos[1] for pos in heavy_positions), count)
    z = np.divide(sum(pos[2] for pos in heavy_positions), count)
    print('Binding site centroid coordinates ', x, y, z)
    return ((x - padding, y - padding, z - padding),
            (x + padding, y + padding, z + padding))

In [7]:
def get_testset(path):
    """Load the docking test set, building and caching it on first use.

    If ``<path>/testset.csv`` exists it is read and returned. Otherwise the
    table is assembled from ``<path>/cluster_table.csv``,
    ``<path>/Mpro/metadata.csv`` and ``<path>/Mpro/Mpro_soaks.csv``: metadata
    and soak results are joined on the real crystal name, rows without a
    fluorescence IC50 are dropped, cluster labels are attached, three compound
    IDs are corrected by hand, rows with ``F_IC50 >= 99`` are removed and one
    row per ``crystal_name`` is kept. The result is written to
    ``<path>/testset.csv``.

    NOTE(review): this notebook defines ``get_testset`` twice; whichever cell
    ran last is the one in effect.

    Parameters
    ----------
    path : str
        Working directory holding the CSV inputs.

    Returns
    -------
    pandas.DataFrame
        The cached table, or a frame with columns ``site_name``,
        ``sub_crystal_name``, ``crystal_name``, ``SMILES``,
        ``compound_ID_meta``, ``compound_ID_soaks``, ``R_IC50``, ``F_IC50``
        and ``cluster``.
    """
    testset_path = os.path.join(path, 'testset') + '.csv'
    if os.path.exists(testset_path):
        return pd.read_csv(testset_path)

    cluster = pd.read_csv(os.path.join(path, 'cluster_table') + '.csv')
    meta = pd.read_csv(os.path.join(path, 'Mpro', 'metadata') + '.csv')
    Mpro_soaks = pd.read_csv(os.path.join(path, 'Mpro', 'Mpro_soaks') + '.csv')

    meta = meta.rename(columns={'crystal_name': 'sub_crystal_name',
                                'RealCrystalName': 'crystal_name',
                                'alternate_name': 'compound_ID_meta',
                                'smiles': 'smiles_meta_0',
                                'new_smiles': 'smiles_meta_1'})
    Mpro_soaks = Mpro_soaks.rename(columns={'Compound ID': 'compound_ID_soaks',
                                            'Rapid Fire avg IC50 (uM)': 'R_IC50',
                                            'Fluorescence avg IC50 (uM)': 'F_IC50',
                                            'Sample Name': 'crystal_name'})

    testset = pd.merge(meta, Mpro_soaks, how='left', on='crystal_name')
    # NOTE(review): 'SMILES' is not produced by either rename above, so it has
    # to exist under that exact name in one of the input files -- confirm.
    columns = ['site_name', 'sub_crystal_name', 'crystal_name', 'SMILES',
               'compound_ID_meta', 'compound_ID_soaks', 'R_IC50', 'F_IC50']
    testset = (testset[testset['F_IC50'].notnull()]
               .sort_values(by=['site_name', 'crystal_name'])[columns]
               .reset_index(drop=True))
    testset = pd.merge(testset, cluster[['sub_crystal_name', 'cluster']],
                       how='left', on='sub_crystal_name')

    # Manual compound-ID corrections, keyed by real crystal name.
    trueCID = {'Mpro-P0243': 'EDJ-MED-d08626de-4',
               'Mpro-P0793': 'EDG-MED-5d232de5-8',
               'Mpro-P0816': 'EDG-MED-5d232de5-7'}
    for crystal, cid in trueCID.items():
        row = (testset['crystal_name'] == crystal)
        testset.loc[row, 'compound_ID_meta'] = cid
        testset.loc[row, 'compound_ID_soaks'] = cid

    testset = testset[testset['F_IC50'] < 99]
    # The keyword is `subset`; the former `subsets=` raised TypeError.
    testset = (testset.drop_duplicates(subset=['crystal_name'], keep='first')
               .reset_index(drop=True))
    testset.to_csv(testset_path, index=False)
    return testset

In [8]:
def getDisSquare(cord1, cord2):
    """Return the squared Euclidean distance between two coordinate sequences.

    Components are paired with ``zip``, so any surplus components of the
    longer sequence are ignored; two empty sequences give ``0``.
    """
    return sum((np.square(p - q) for p, q in zip(cord1, cord2)), 0)

In [9]:
def get_RMSD(refLig, dockLig, df, protocol):
    """Record the in-place heavy-atom RMSD of every docked pose against a reference.

    Atoms of the reference and of each pose are paired by position in their
    atom lists. Pairs in which either atom name contains ``H`` are skipped;
    the remaining pairs must carry identical names. The RMSD is
    ``sqrt(sum(d_i ** 2) / N)`` over the ``N`` retained pairs.

    Parameters
    ----------
    refLig
        Reference ligand; needs ``title`` and ``atoms``. The title minus its
        last ``_suffix`` selects the row of ``df`` that is updated.
    dockLig
        Docked ligand; needs ``poses``, each with ``atoms``.
    df : pandas.DataFrame
        Result table with a ``sub_crystal_name`` column. It is not modified.
    protocol : str
        Suffix for the generated column names.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Pose<n>_<protocol>`` columns and, when at
        least one pose exists, ``BestRMSD_<protocol>``. If a retained atom
        pair has mismatching names a message is printed and the unmodified
        ``df`` is returned instead.
    """
    title = refLig.title.rsplit('_', 1)[0]
    df_result = df.copy().set_index(['sub_crystal_name'], drop=True)
    RMSD_list = []
    for n in range(len(dockLig.poses)):
        squared_sum = 0
        heavy_count = 0
        for atom_r, atom_d in zip(refLig.atoms, dockLig.poses[n].atoms):
            if 'H' in atom_r.name or 'H' in atom_d.name:
                continue
            if atom_r.name != atom_d.name:
                print('exception, return initial df')
                return df
            heavy_count += 1
            squared_sum += sum(np.square(a - b) for a, b in zip(atom_r.pos, atom_d.pos))
        # Root of the MEAN squared deviation; the former sqrt(sum) / N
        # under-reported the value by a factor of sqrt(N).
        RMSD = np.sqrt(squared_sum / heavy_count) if heavy_count else float('nan')
        RMSD_list.append(RMSD)
        df_result.loc[title, 'Pose' + str(n) + '_' + protocol] = RMSD
    # A ligand without poses has no best value; skip instead of min([]).
    if RMSD_list:
        df_result.loc[title, 'BestRMSD' + '_' + protocol] = min(RMSD_list)
    return df_result.reset_index()

In [10]:
def get_rdkit_RMSD(refLig, dockLig, df, protocol, failed_rmsd=9999):
    """Record the RDKit ``AlignMol`` RMSD of every docked pose against a reference.

    Both molecules are converted with ``to_rdmol()``, stripped of hydrogens
    with ``RemoveAllHs`` and passed to ``rdMolAlign.AlignMol``.

    NOTE(review): ``AlignMol`` superimposes the probe onto the reference
    before reporting the RMSD, so the value is not the in-place deviation of
    the docked pose from the crystal pose. Confirm this is the intended
    metric.

    Parameters
    ----------
    refLig
        Reference ligand; needs ``title`` and ``to_rdmol()``. The title minus
        its last ``_suffix`` selects the row of ``df`` that is updated.
    dockLig
        Docked ligand; needs ``poses``, each with ``to_rdmol()``.
    df : pandas.DataFrame
        Result table with a ``sub_crystal_name`` column. It is not modified.
    protocol : str
        Suffix for the generated column names.
    failed_rmsd : number, optional
        Value stored when RDKit cannot compute an RMSD for a pose. Defaults
        to 9999, the sentinel previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with ``Pose<n>_<protocol>`` columns and, when at
        least one pose exists, ``BestRMSD_<protocol>``.
    """
    title = refLig.title.rsplit('_', 1)[0]
    df_result = df.copy().set_index(['sub_crystal_name'], drop=True)
    ref = refLig.to_rdmol()
    RMSD_list = []
    for n in range(len(dockLig.poses)):
        lig = dockLig.poses[n].to_rdmol()
        try:
            RMSD_noH = rdkit.Chem.rdMolAlign.AlignMol(
                rdkit.Chem.rdmolops.RemoveAllHs(lig),
                rdkit.Chem.rdmolops.RemoveAllHs(ref),
            )
        except Exception as exc:
            # Keep the run going, but say why the pose failed. A bare
            # `except:` here also swallowed KeyboardInterrupt, which made a
            # long loop impossible to stop.
            print('RMSD failed for ' + title + ' pose ' + str(n) + ': ' + repr(exc))
            RMSD_noH = failed_rmsd
        RMSD_list.append(RMSD_noH)
        df_result.loc[title, 'Pose' + str(n) + '_' + protocol] = RMSD_noH
    if RMSD_list:
        df_result.loc[title, 'BestRMSD' + '_' + protocol] = min(RMSD_list)
    return df_result.reset_index()

In [11]:
def get_prop(ligand, df, protocol):
    """Copy four ligand properties into the result table.

    The properties are taken by position, counted from the end of
    ``ligand.properties.items()``, and written to the row whose
    ``sub_crystal_name`` equals the ligand title minus its last ``_suffix``:

    * ``dG_<protocol>`` from position -4
    * ``RS_<protocol>`` from position -5
    * ``VS_<protocol>`` from position -3
    * ``LE_<protocol>`` from position -2

    NOTE(review): the lookup depends on the order in which the properties are
    listed; looking them up by name would be sturdier -- confirm the property
    names against the Flare documentation before changing this.

    Returns a new frame; ``df`` itself is left untouched. The ligand title is
    printed as a progress marker.
    """
    entries = ligand.properties.items()
    print(ligand.title)
    row_key = ligand.title.rsplit('_', 1)[0]
    indexed = df.set_index(['sub_crystal_name'], drop=True)
    # (column prefix, position from the end of the property list)
    for prefix, position in (('dG_', -4), ('RS_', -5), ('VS_', -3), ('LE_', -2)):
        indexed.loc[row_key, prefix + protocol] = entries[position][1].value
    return indexed.reset_index()

In [13]:
if __name__ == "__main__":
    # Driver: load the test set, prepare the 6lu7 protein, create the ligand
    # entries, run a redocking job and a from-scratch docking job with the
    # same settings, then collect scores and RMSDs into one CSV.

    # Raw string: the plain literal contained the invalid escape sequences
    # \J, \B and \F. The resulting path is unchanged.
    # NOTE(review): machine-specific absolute path; consider making this
    # configurable.
    nb_dir = r'D:\JupyterNotebook\BSc\Flare'

    testset = get_testset(nb_dir)
    test_result = testset.copy()
    proj = flare.Project()

    gridbox = get_gridbox(proj, '6lu7_ligand', nb_dir)
    redock = flare.Docking()
    dock = flare.Docking()

    get_protein(proj, '6lu7', nb_dir)
    raw_list, redock_list, dock_list = get_ligands(proj, testset, nb_dir)

    # Both jobs share every setting and differ only in their input ligands.
    for job, job_ligands in ((redock, redock_list), (dock, dock_list)):
        job.protein = proj.proteins[-1]
        job.ligands = job_ligands
        job.max_poses = 5
        job.system.quality = flare.LeadFinderSystem.Quality.ExtraPrecision
        job.system.grid_box = gridbox
        job.sequences = proj.proteins[-1].sequences

    # The jobs run one after the other. Progress is printed on a 120-second
    # grid measured from `stdtime`.
    stdtime = time.time()
    for message, job in (('Redocking ...', redock),
                         ('Docking from scratch ...', dock)):
        print(message)
        job.start()
        while job.is_running():
            print(job.progress())
            time.sleep(120 - ((time.time() - stdtime) % 120))
    tottime = time.time() - stdtime
    print('Time', tottime)

    proj.save(os.path.join(nb_dir, 'redock') + '.flr')
    print('Project saved')
    print('Solving results')
    for protocol, docked_list in (('redock', redock_list), ('dock', dock_list)):
        for docked_lig, raw_lig in zip(docked_list, raw_list):
            test_result = get_prop(docked_lig, test_result, protocol)
            test_result = get_rdkit_RMSD(raw_lig, docked_lig, test_result, protocol)

    test_result.to_csv(os.path.join(nb_dir, 'redock_rescore_result') + '.csv', index=False)
    print('Result saved')

In [17]:
# Re-collect scores and RMSDs from the docking jobs and rewrite the result CSV.
# NOTE(review): this cell depends on kernel state left behind by the driver
# cell (`redock`, `dock`, `raw_list`, `test_result`) and fails on a fresh
# kernel if that cell has not been run.

# Raw string: the plain literal contained the invalid escape sequences \J, \B
# and \F. The resulting path is unchanged.
nb_dir = r'D:\JupyterNotebook\BSc\Flare'
# NOTE(review): `testset` is reloaded here but `test_result` is not rebuilt
# from it, so the columns below are added to the existing `test_result`.
testset = get_testset(nb_dir)

for protocol, job in (('redock', redock), ('dock', dock)):
    for docked_lig, raw_lig in zip(job.ligands, raw_list):
        test_result = get_prop(docked_lig, test_result, protocol)
        test_result = get_rdkit_RMSD(raw_lig, docked_lig, test_result, protocol)

test_result.to_csv(os.path.join(nb_dir, 'redock_rescore_result') + '.csv', index=False)
print('Result saved')