# Next improvement attempts
### _Introduce structural information_

In [None]:
# temporary -- all code needed up to this point
import pandas as pd
import random
from random import sample
import re
import pprint

from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem.Descriptors import *

#from rdkit.Chem import MCS
from rdkit.Chem import rdFMCS
from rdkit.Chem import fmcs
from rdkit.Chem import Draw
from rdkit.Chem import PandasTools
import xlsxwriter

import EFGs
from EFGs import mol2frag,ifg,identify_functional_groups

# Load the source table; later cells read its 'Name' and 'smile' columns.
data = pd.read_csv('hob_data_set_MSMcleanup.csv')

In [None]:
# Add a 'murcko' column holding the SMILES of each compound's Murcko scaffold
from rdkit.Chem.Scaffolds import MurckoScaffold

df = data  # alias, not a copy: the new column is visible through `data` as well
for index, smiles in df['smile'].items():
    scaffold = MurckoScaffold.GetScaffoldForMol(Chem.MolFromSmiles(smiles))
    df.loc[index, 'murcko'] = Chem.MolToSmiles(scaffold)

In [None]:
''' # CLEAN SMILES '''
import re
def clean_smiles(smilesx,option=1):
    """Simplify numbered attachment-point markers (e.g. ``[1*]``) in a SMILES string.

    option == 2 keeps every marker but drops its number (``16*`` -> ``*``).
    Any other option deletes the marker together with an optional enclosing
    ``[...]`` and an optional enclosing ``(...)``.

    Returns the rewritten string; the input is not modified.
    """
    if option == 2:
        return re.sub(r'[0-9]+\*', '*', smilesx)
    return re.sub(r'\(?\[?[0-9]+\*\]?\)?', '', smilesx)

def clean_smiles_from_mol_list(mol_list,option=1):
    """Convert molecules to SMILES and simplify their attachment-point markers.

    Parameters
    ----------
    mol_list : iterable of RDKit molecules
        Each one is written out with ``Chem.MolToSmiles``.
    option : int
        2 -> keep every attachment point but drop its number (``16*`` -> ``*``).
        3 -> delete the attachment points (as for option 1), then also delete
             parenthesised branches made only of letters, e.g. ``(C)`` or ``(Cl)``.
        anything else -> delete each numbered attachment point together with an
             optional enclosing ``[...]`` and ``(...)``.

    Returns
    -------
    list of str
        One cleaned SMILES per input molecule, in input order.
    """
    # compiled once per call instead of once per molecule
    numbered_dummy = re.compile(r'[0-9]+\*')
    wrapped_dummy = re.compile(r'\(?\[?[0-9]+\*\]?\)?')
    # [A-Za-z], not [A-z]: the A-z range also covers "[", "\", "]", "^", "_" and "`",
    # which made e.g. "(\C)" a removable branch while "(/C)" was not.
    letter_branch = re.compile(r'\([A-Za-z]+\)')

    list_out = []
    for mol in mol_list:
        smiles = Chem.MolToSmiles(mol)
        if option == 2:
            cleaned = numbered_dummy.sub('*', smiles)
        else:
            cleaned = wrapped_dummy.sub('', smiles)
            if option == 3:
                cleaned = letter_branch.sub('', cleaned)
        list_out.append(cleaned)
    return list_out

''' # BRICS FRAGMENTATION '''
from rdkit.Chem import BRICS

def gimme_BRICS(drugmol,sorting=True):
    """Return the BRICS fragments of *drugmol* as a list of RDKit molecules.

    Parameters
    ----------
    drugmol : RDKit molecule to decompose.
    sorting : bool
        When true, fragments are ordered from most to fewest atoms
        (``GetNumAtoms``).

    Returns
    -------
    list
        One molecule per fragment SMILES produced by ``BRICS.BRICSDecompose``.
    """
    # BRICSDecompose hands back its fragment SMILES as a set by default, so the
    # iteration order is not stable between sessions. Sorting the strings first
    # makes the result (and the order of equally sized fragments) reproducible.
    fragMols = [Chem.MolFromSmiles(smi) for smi in sorted(BRICS.BRICSDecompose(drugmol))]

    if sorting:
        # list.sort is stable, so ties keep the SMILES order established above
        fragMols.sort(reverse=True, key=lambda mol: mol.GetNumAtoms())
    return fragMols

### build fragments dictionary
* We will make a Drug Dict, and a Frag Dict
* Drug Dict (PK = `drug_name`) will be used to store list of BRICS frags
* Frag Dict (PK = `frag_smiles`) will store: 
  * drug_list = list of each `drug_name`
  * size = `frag_mol.GetNumHeavyAtoms()`
  * frequency = `len(drug_list)`
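* Each fragment is entered in Frag Dict once per cleaned SMILES variant: v1 (numbered attachment points removed), v2 (attachment points kept as `*`), and v3 (v1 with simple parenthesised branches also removed)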

In [None]:
# Build three lookup tables from `data`:
#   drug_dict        drug name -> molecule, query patterns, BRICS / Murcko fragments
#   frag_dict        fragment SMILES -> drugs containing it, size, mol, pattern, sources
#   frag_pairs_dict  fragment SMILES -> partner fragment SMILES -> drugs / pair size
import EFGs
from EFGs import mol2frag,ifg,identify_functional_groups


def _generic_query(mol):
    """Clear the isotope labels of *mol* in place and return its generic query form.

    The returned pattern has generic bonds and its dummy atoms turned into queries.
    """
    for atom in mol.GetAtoms():
        atom.SetIsotope(0)
    params = Chem.AdjustQueryParameters()
    params.makeBondsGeneric = True
    params.makeDummiesQueries = True
    return Chem.AdjustQueryProperties(mol, params)


def _record_fragment(table, key, drugname, frag_mol, frag_patt, source):
    """Create or update the *table* entry for fragment *key* seen in *drugname*.

    A new entry stores the molecule, its pattern and heavy-atom count; an
    existing entry only gains the drug name and the *source* tag.
    """
    entry = table.get(key)
    if entry is None:
        table[key] = {
            'size': frag_mol.GetNumHeavyAtoms(),
            'frequency': 1,
            'drugs': [drugname],
            'frag_mol': frag_mol,
            'frag_patt': frag_patt,
            'source': {source},
        }
    else:
        # NOTE: a drug is appended once per (fragment, variant) hit, so the same
        # name can occur several times in 'drugs' and is counted in 'frequency'.
        entry['drugs'].append(drugname)
        entry['frequency'] = len(entry['drugs'])
        entry['source'].add(source)


drug_dict = {}
frag_dict = {}
fragment_tracker = set()
frag_pairs_dict = {}

df = data
for index, row in df.iterrows():
    drugname = row['Name']
    drugSMILES = row['smile']
    Mcore_smiles = row['murcko']
    drugmol = Chem.MolFromSmiles(drugSMILES)

    # ---- whole drug -------------------------------------------------------
    drug_patt = _generic_query(drugmol)

    fragMols = gimme_BRICS(drugmol)
    # SMILES are written while the fragments still carry their numbered
    # attachment points; _generic_query below clears those labels in place.
    fragSmiles1 = clean_smiles_from_mol_list(fragMols,option=1)
    fragSmiles2 = clean_smiles_from_mol_list(fragMols,option=2)
    fragSmiles3 = clean_smiles_from_mol_list(fragMols,option=3)
    frag_patts = [_generic_query(frag) for frag in fragMols]

    drugFrags_EFGS = mol2frag(drugmol)[0]

    # ---- Murcko scaffold (BRICS on the core gives simplified fragments) ----
    murckomol = Chem.MolFromSmiles(Mcore_smiles)
    murcko_patt = _generic_query(murckomol)

    murckofragMols = gimme_BRICS(murckomol)
    # Same order as for the drug fragments: SMILES first, label stripping second.
    # The clean-up regexes look for a number in front of '*', so they would not
    # remove attachment points whose labels had already been cleared.
    murckofragSmiles1 = clean_smiles_from_mol_list(murckofragMols,option=1)
    murckofragSmiles3 = clean_smiles_from_mol_list(murckofragMols,option=3)
    murckofrag_patts = [_generic_query(frag) for frag in murckofragMols]

    murckoFrags_EFGS = mol2frag(murckomol)[0]

    # side chains = drug fragments that do not also occur among the core fragments
    sidechains = [smi for smi in fragSmiles3 if smi not in murckofragSmiles3]

    drug_entry = {
        'smiles': drugSMILES,
        'mol': drugmol,
        'Mcore_smiles': Mcore_smiles,
        'drug_patt': drug_patt,
        'BRICS_mols': fragMols,
        'BRICS_smilesV1': fragSmiles1,
        'BRICS_smilesV2': fragSmiles2,
        'BRICS_smilesV3': fragSmiles3,
        'BRICS_count': len(fragMols),
        'frag_patts': frag_patts,
        'frags_EFGS': drugFrags_EFGS,
        # filled in by the pairing loop at the end of this iteration
        'BRICS_pairs_smile': [],
        'BRICS_pairs_size': [],
        'BRICS_pairs_frequency': [],
        'BRICS_pairs_mol': [],
        'murcko_mol': murckomol,
        'murcko_patt': murcko_patt,
        'murckoFrag_mols': murckofragMols,
        'murckoFrag_patts': murckofrag_patts,
        'murckoFrag_smiles1': murckofragSmiles1,
        'murckoFrag_smiles3': murckofragSmiles3,
        'murckoFrag_count': len(murckofragMols),
        'murckoFrags_EFGS': murckoFrags_EFGS,
        'sidechain_frags': sidechains,
        'all_distinct_frags': set().union(
            fragSmiles1, fragSmiles2, fragSmiles3, murckofragSmiles1, murckofragSmiles3
        ),
    }
    drug_dict[drugname] = drug_entry

    # ---- frag_dict: every drug fragment under each of its SMILES variants ----
    for i, frag_mol in enumerate(fragMols):
        variants = (
            (fragSmiles1[i], 'v1'),
            (fragSmiles2[i], 'v2'),
            (fragSmiles3[i], 'v3'),
        )
        for key, version in variants:
            _record_fragment(frag_dict, key, drugname, frag_mol, frag_patts[i],
                             f"smiles_{version}")

    # ---- frag_dict: the Murcko core itself ----
    _record_fragment(frag_dict, Mcore_smiles, drugname, murckomol, murcko_patt,
                     "murcko_core")

    # ---- frag_dict: fragments of the Murcko core ----
    # Keyed by the SMILES string (not the (smiles, version) pair), tagged with the
    # variant actually being stored, and sized from the fragment itself.
    for i, frag_mol in enumerate(murckofragMols):
        variants = (
            (murckofragSmiles1[i], 'v1'),
            (murckofragSmiles3[i], 'v3'),
        )
        for key, version in variants:
            _record_fragment(frag_dict, key, drugname, frag_mol, murckofrag_patts[i],
                             f"murckofrag_{version}")

    # ---- frag_pairs_dict: every ordered pair of distinct fragment positions ----
    # fragSmiles3 is used because it is the most simplified variant
    for i1, frag_x in enumerate(fragSmiles3):
        size_x = fragMols[i1].GetNumHeavyAtoms()
        partners = frag_pairs_dict.setdefault(frag_x, {})

        for i2, frag_y in enumerate(fragSmiles3):
            if i2 == i1:
                continue
            size_y = fragMols[i2].GetNumHeavyAtoms()

            pair = partners.get(frag_y)
            if pair is None:
                pair = {
                    'pair_size': size_x + size_y,
                    'frequency': 1,
                    'drugs': [drugname],
                }
                partners[frag_y] = pair
            else:
                pair['drugs'].append(drugname)
                pair['frequency'] = len(pair['drugs'])

            drug_entry['BRICS_pairs_smile'].append((frag_x, frag_y))
            drug_entry['BRICS_pairs_size'].append(size_x + size_y)
            drug_entry['BRICS_pairs_frequency'].append(len(pair['drugs']))
            drug_entry['BRICS_pairs_mol'].append((fragMols[i1], fragMols[i2]))

In [None]:
# Persist the three dictionaries built above, one pickle file each
import pickle

for filename, payload in (
    ('frag_dict_20220827new.pickle', frag_dict),
    ('drug_dict_20220827.pickle', drug_dict),
    ('frag_pairs_dict_20220827.pickle', frag_pairs_dict),
):
    with open(filename, "wb") as handle:
        pickle.dump(payload, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Refresh each fragment's heavy-atom count in frag_dict; the value stored so far
# is kept alongside under 'size (old)'
import pickle

for entry in frag_dict.values():
    current_size = entry['frag_mol'].GetNumHeavyAtoms()
    entry['size (old)'] = entry['size']
    entry['size'] = current_size

# write the updated frag_dict back to the same pickle file as before
filename = 'frag_dict_20220827new.pickle'
with open(filename, "wb") as out_file:
    pickle.dump(frag_dict, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Recalculate drug/fragment matches by substructure search and record them in
# frag_dict: 'druglist3' lists the drugs whose pattern contains the fragment
# pattern, 'frequency3' is the length of that list.
matches = []

fragList = list(frag_dict.keys())
drugList = list(drug_dict.keys())

# only fragments carrying at least one of these source tags are searched
searched_sources = {'murckofrag_v3','smiles_v3','murcko_core'}

for frag_key in fragList:
    entry = frag_dict[frag_key]
    entry['druglist3'] = []
    if entry['source'].isdisjoint(searched_sources):
        continue

    fragPattern = entry['frag_patt']
    for drugname in drugList:
        if drug_dict[drugname]['drug_patt'].HasSubstructMatch(fragPattern):
            entry['druglist3'].append(drugname)

    # Count the list filled just above. (Reading 'druglist2' here raised a
    # KeyError: no cell above writes that key.) As before, 'frequency3' is only
    # set for fragments with at least one match.
    if entry['druglist3']:
        entry['frequency3'] = len(entry['druglist3'])

In [None]:
# Backup: write the current frag_dict to its own pickle file
import pickle

filename = 'frag_dict_20220827new3.pickle'
with open(filename, "wb") as backup_file:
    pickle.dump(frag_dict, backup_file, protocol=pickle.HIGHEST_PROTOCOL)

### <font color='red'> Some missing code belongs here (below) </font>

In [None]:
# Write frag_dict and drug_dict to their 'new1' pickle files
for save_as, object_to_save in (
    ('frag_dict_20220829_new1.pickle', frag_dict),
    ('drug_dict_20220829_new1.pickle', drug_dict),
):
    with open(save_as, 'wb') as handle:
        pickle.dump(object_to_save, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
import pickle

# --- reload drug_dict and frag_dict from their pickle files ---
with open('drug_dict_20220829_new1.pickle', 'rb') as handle:
    drug_dict = pickle.load(handle)
with open('frag_dict_20220829_new1.pickle', 'rb') as handle:
    frag_dict = pickle.load(handle)

# one row per fragment (outer dict keys become the index after the transpose)
frag_dict_df = pd.DataFrame.from_dict(frag_dict).transpose()

# one row per drug, with the drug name moved into a regular 'drug_name' column
drug_dict_df = (
    pd.DataFrame.from_dict(drug_dict)
    .transpose()
    .reset_index()
    .rename(columns={'index':'drug_name'})
)

In [None]:
# For every drug, keep the 'fragments_final' tuples whose third element
# (unpacked as frag_freq) exceeds 2 and whose second element (frag_size)
# lies strictly between 2 and 25; store the kept list and its length.
drugs = list(enumerate(drug_dict.keys()))
for idx, drug in drugs:
    kept = [
        (frag, frag_size, frag_freq)
        for (frag, frag_size, frag_freq) in drug_dict[drug]['fragments_final']
        if frag_freq > 2 and 2 < frag_size < 25
    ]
    drug_dict[drug]['fragments_subset'] = kept
    drug_dict[drug]['count_frags_subset'] = len(kept)

In [None]:
 
# Checkpoint drug_dict to the 'new2' pickle file
save_as = 'drug_dict_20220829_new2.pickle'
object_to_save = drug_dict
with open(save_as, 'wb') as out_file:
    pickle.dump(object_to_save, out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Second filter pass: from each drug's 'fragments_subset' keep the tuples whose
# third element is below 1000 and whose first element (the fragment string)
# contains none of the characters  [  ]  ,  *  @  #
import re

# raw string: the same character set as before, without invalid string escapes
excluded_chars = re.compile(r'[\[\],*@#]')

drugs = list(enumerate(drug_dict.keys()))
for idx, drug in drugs:
    # iterate this drug's own list (the loop previously referred to an
    # undefined name, `mylist`)
    drug_fragset = drug_dict[drug]['fragments_subset']

    revised = [
        (x, y, z)
        for (x, y, z) in drug_fragset
        if z < 1000 and not excluded_chars.search(x)
    ]

    drug_dict[drug]['fragments_subset2'] = revised

In [None]:
# Rebuild the per-drug table from drug_dict: one row per drug, name in 'drug_name'
drug_dict_df = (
    pd.DataFrame.from_dict(drug_dict)
    .transpose()
    .reset_index()
    .rename(columns={'index':'drug_name'})
)

In [None]:
# columns of drug_dict_df that are carried into the merge below
export_columns = [
    'drug_name',
    'fragments_final_set',
    'fragment_count',
    'fragments_subset',
    'count_frags_subset',
    'fragments_subset2',
    'frags_EFGS',
    'BRICS_smilesV3',
    'BRICS_count',
]
df_out = drug_dict_df[export_columns]
df_out.head()

In [None]:
# Left-join the fragment columns onto the original table by drug name
df_merged = df.merge(df_out, how='left', left_on='Name', right_on='drug_name')
df_merged.head()

In [None]:
# columns removed from the merged table
columns_to_drop = [
    'drug_name',
    'drugmol',
    'murcko',
    'murckomol',
    'count_frags_subset',
    'BRICS_count',
]

# old column name -> new column name
new_column_names = {
    'Name':'drug_name',
    'smile':'drug_smiles',
    'value (oral BA %)':'ba_pct',
    'fragments_final_set':'frags_all',
    'fragment_count':'num_frags_all',
    'fragments_subset':'frags_better',
    'fragments_subset2':'frags_best',
    'frags_EFGS':'frags_efgs',
    'BRICS_smilesV3':'frags_brics',
}

df_merged = df_merged.drop(columns=columns_to_drop).rename(columns=new_column_names)

In [None]:
# Save the merged table (fragment columns still hold lists of tuples at this point)
df_merged.to_pickle('bioavailabilityData_w_Frags.pkl')

In [None]:
''' # Simplify fragment columns [having lists of tuples] to columns of simple lists '''
data = df_merged  # alias: the cells of df_merged are rewritten in place

fragment_columns = ('frags_all', 'frags_better', 'frags_best')

# every cell in these columns holds 3-tuples; keep only each tuple's first element
for index, row in data.iterrows():
    # build all three replacement lists before writing any of them back
    simplified = {
        column: [x for (x, y, z) in row[column]]
        for column in fragment_columns
    }
    for column, values in simplified.items():
        data.at[index, column] = values

In [None]:
# remove the fragment-count column, which is not kept in the simplified table
data = data.drop('num_frags_all', axis=1)

In [None]:
# Save the simplified table (fragment columns as plain lists) to its own pickle file
data.to_pickle('bioavailabilityData_w_Frags_simpler.pkl')