In [1]:
from rdkit import Chem
from rdkit.Chem import AllChem
import numpy as np
from scipy import stats
import pandas as pd
from tqdm import tqdm
from constants import *

In [2]:
DATABASE_PATH_DICT = {
    r"ALL": f"{DATA_PATH}/nps_intersected",
    r"ALMANAC-skin": f"{DATA_PATH}/nps_intersected/ALMANAC/skin",
    r"ALMANAC-ovary": f"{DATA_PATH}/nps_intersected/ALMANAC/ovary",
    r"ALMANAC-lung": f"{DATA_PATH}/nps_intersected/ALMANAC/lung",
    r"ALMANAC-breast": f"{DATA_PATH}/nps_intersected/ALMANAC/breast",
    r"FRIEDMAN": f"{DATA_PATH}/nps_intersected/FRIEDMAN",
}

In [3]:
def remove_useless_descriptor(df, threshold):
    des1 = len(df.columns)

    h = df.columns.tolist()
    df = df.to_numpy().astype(np.float)
    df = np.array(df)

    STDEV = np.std(df, axis=0)
    idx = [idx for idx, val in enumerate(STDEV) if val > threshold]
    df2 = df[:, idx]
    hx = np.array(h)[idx]

    df = pd.DataFrame(df2, columns=[hx])

    des2 = len(df.columns)

    print("from Remove useless descriptor")
    print(
        "The initial set of "
        + str(des1)
        + " descriptors has been reduced to "
        + str(des2)
        + " descriptors."
    )

    return df, des1, des2


def correlation(df, threshold):
    des3 = len(df.columns)
    corr = stats.pearsonr
    col_corr = set()  # Set of all the names of deleted columns
    corr_matrix = df.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if corr_matrix.iloc[i, j] >= threshold:
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
                if colname in df.columns:
                    del df[colname]  # deleting the column from the dataset
    des4 = len(df.columns)

    print("from Remove correlation")
    print(
        "The initial set of "
        + str(des3)
        + " descriptors"
        + " has been reduced to "
        + str(des4)
        + " descriptors."
    )

    return df, des3, des4

def get_ecfp6_map(smiles, nBits=2048, r=3):
    print(smiles)
    smiles_unique = pd.Series(np.unique(smiles))
    mol = smiles_unique.apply(Chem.MolFromSmiles)
    fp1 = mol.apply(lambda m: AllChem.GetMorganFingerprintAsBitVect(m, r, nBits=nBits))
    map_smiles_to_fp = dict(zip(smiles_unique.tolist(), fp1.tolist()))
    return map_smiles_to_fp

def get_ecfp6_list(smileslist, r=3, nBits=2048):
    # ret = np.zeros((len(smileslist), nBits))
    # for idx, smiles in tqdm(enumerate(smileslist)):
    #     ret[idx] = get_ecfp6(smiles, nBits, r)
    # return ret
    print("Getting map...")
    map_smiles_to_fp = get_ecfp6_map(smileslist)
    print("Getting stack of ecfp6...")
    ret = np.stack(pd.Series(smileslist).map(map_smiles_to_fp).tolist())
    print("Done")
    return ret


def clean_and_save(database):
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    print(f"Cleaning and Saving {database}")
    print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
    # load dataset
    print("Loading dataset...")
    database_path = DATABASE_PATH_DICT[database]
    ID = np.load(f"{database_path}/idx_row.npy", allow_pickle=True)
    dataframe = pd.read_pickle(f"{EXPORT_DATA_PATH}/data_drugcomb.pkl")
    smiles_col = dataframe["molecule_structures_col"].loc[ID].values
    smiles_row = dataframe["molecule_structures_row"].loc[ID].values

    smiles = np.concatenate((smiles_row, smiles_col))
    print(smiles.shape)
    print("Done load dataset!")

    # get ecfp6
    print("Converting dataset to ecfp6...")
    data_col = get_ecfp6_list(smiles_col)
    data_row = get_ecfp6_list(smiles_row)
    print("Done converting dataset to ecfp6!")

    print("Saving..")
    np.save(f"{database_path}/deepsyn_drug_row.npy", data_row)
    np.save(f"{database_path}/deepsyn_drug_col.npy", data_col)
    print("Saved!")

In [4]:
for s in DATABASE_PATH_DICT.keys():
    clean_and_save(s)

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Cleaning and Saving ALL
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
Loading dataset...
(239200,)
Done load dataset!
Converting dataset to ecfp6...
Getting map...
['Cn1nnc2c(C(N)=O)ncn2c1=O' 'Cn1nnc2c(C(N)=O)ncn2c1=O'
 'Cn1nnc2c(C(N)=O)ncn2c1=O' ...
 'N#CC[C@H](C1CCCC1)n1cc(-c2ncnc3[nH]ccc23)cn1'
 'N#CC[C@H](C1CCCC1)n1cc(-c2ncnc3[nH]ccc23)cn1'
 'N#CC[C@H](C1CCCC1)n1cc(-c2ncnc3[nH]ccc23)cn1']
Getting stack of ecfp6...
Done
Getting map...
['CN1CCc2cc3c(cc2[C@H]1[C@@H]1OC(=O)c2c1ccc1c2OCO1)OCO3'
 'CCCN1CCO[C@@H]2c3cc(O)ccc3OC[C@H]21.Cl'
 'CC(=O)C1=C(O)[C@]2(C)C(=CC1=O)Oc1c(C(C)=O)c(O)c(C)c(O)c12' ...
 'Cc1ccccc1C(=O)Nc1ccc(C(=O)N2CCCC(O)c3cc(Cl)ccc32)c(C)c1'
 'Cc1cccc(Nc2ccncc2S(=O)(=O)NC(=O)NC(C)C)c1'
 'CCOC(=O)N1c2ccc(C(F)(F)F)cc2[C@@H](N(Cc2cc(C(F)(F)F)cc(C(F)(F)F)c2)C(=O)OC)C[C@H]1CC']
Getting stack of ecfp6...
Done
Done converting dataset to ecfp6!
Saving..
Saved!
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~