# Compounds split with "Date" or "Random"
***

* Some datasets are not allowed to be uploaded.
* Therefore, some datasets have been removed from the publication environment.
* Code locations that do not work without these datasets are commented out or marked (e.g. `# not-working`).
* Outputs that differ from the publication environment are also commented out.

In [1]:
# import libraries
import copy
import random

import pandas as pd


In [2]:
def standalize_df(df:pd.core.frame.DataFrame):
    """ return the column-wise z score (population std, ddof=0) """
    centered = df - df.mean()
    spread = df.std(ddof=0)
    return centered / spread

def index_intersection(df:pd.core.frame.DataFrame,df2:pd.core.frame.DataFrame):
    """ return same index dataframe (lower)

    Lower-cases the index labels of both frames and keeps only the rows whose
    lower-cased label occurs in both of them.

    Parameters
    ----------
    df, df2 : DataFrames whose index labels are strings (``.lower()`` is
        called on every label).

    Returns
    -------
    (df, df2) : new frames restricted to the shared labels. Both use the same
        row order: the order in which the labels first appear in ``df``.

    The frames passed in are not modified; the lower-cased index is set on
    copies.
    """
    df = df.copy()
    df2 = df2.copy()
    df.index = [i.lower() for i in df.index]
    df2.index = [i.lower() for i in df2.index]
    in_second = set(df2.index)
    # dict.fromkeys removes repeated labels while keeping first-seen order, so
    # the row order no longer depends on set iteration order (which changes
    # between interpreter runs because of string hash randomisation)
    ind_new = [i for i in dict.fromkeys(df.index) if i in in_second]
    return df.loc[ind_new,:], df2.loc[ind_new,:]

def load_X(comp_thresh:float=0, feature_thresh:float=0, join:str='inner'):
    """ load every dataset csv and concatenate them column-wise

    Each ``dataset/<name>.csv`` is read with its first column as index, passed
    through nan_threshold along the rows (comp_thresh) and then the columns
    (feature_thresh), and its columns are renamed to ``<name>_<column>``.
    The frames are joined one after another with ``join``; columns that end up
    entirely NaN are dropped from the result.
    """
    names = ["drugbank", "ctd", "semmed", "l1000", "mold2", "mol2vec", "mordred", "pubchem", "admet"] # not-working 

    df_mol = None
    for name in names:
        frame = pd.read_csv(f"dataset/{name}.csv", index_col=0)
        frame = nan_threshold(frame, threshold=comp_thresh,axis=0)
        frame = nan_threshold(frame, threshold=feature_thresh,axis=1)
        print(f"{name} : {frame.shape}")
        frame.columns=[f"{name}_{str(col)}" for col in frame.columns]
        if df_mol is None:
            # first dataset starts the combined frame
            df_mol = frame
        else:
            df_mol = pd.concat([df_mol,frame],axis=1,sort=False, join=join)
    return df_mol.dropna(how="all",axis=1)

def nan_threshold(df:pd.core.frame.DataFrame,threshold:float=0,axis:int=0):
    """ remove missing features and compounds

    Parameters
    ----------
    df : frame to clean.
    threshold : 0 drops every row/column that contains any NaN. Otherwise a
        row/column is kept only while its NaN count is strictly below
        ``threshold`` times the length of that row/column.
    axis : 0 filters rows, 1 filters columns. With a non-zero threshold any
        other value returns ``df`` unchanged.

    Returns
    -------
    The filtered frame. Column dtypes are preserved.
    """
    if threshold==0:
        return df.dropna(how="any",axis=axis)
    if axis==0:
        # count the NaN per row directly; transposing the frame twice would
        # copy all data and upcast columns of mixed dtype
        n_missing = df.isnull().sum(axis=1)
        return df.loc[n_missing<(len(df.columns)*threshold),:]
    if axis==1:
        n_missing = df.isnull().sum(axis=0)
        return df.loc[:,n_missing<(len(df.index)*threshold)]
    return df

def load_df(comp_thresh:float=0, feature_thresh:float=0, join:str='inner', file_sider:str="dataset/sider.csv"):
    """ main module to load and concatenate all files

    Builds the feature frame with load_X, reads the csv at ``file_sider``
    (first column as index) and returns both restricted to their shared,
    lower-cased index labels as ``(X, y)``.
    """
    features = load_X(comp_thresh=comp_thresh, feature_thresh=feature_thresh, join=join)
    labels = pd.read_csv(file_sider, index_col=0)
    features, labels = index_intersection(features, labels)
    return features, labels

def date_split_number(df_dates:pd.core.frame.DataFrame, number:int=10):
    """ date split from date dataframe

    Sorts by the 'date' column, newest first. The first ``number`` entries of
    the 'name' column become the test compounds, the rest the train compounds.
    Returns ``(train_comp, test_comp)`` as lists.
    """
    newest_first = df_dates.sort_values(by=['date'], ascending=False)['name'].tolist()
    test_comp, train_comp = newest_first[:number], newest_first[number:]
    return train_comp, test_comp

In [3]:
# load data
X, y=load_df(comp_thresh=1, feature_thresh=1, join='inner')
print(f"all data : {X.shape}")
compounds = X.index.tolist()

# load date information
df = pd.read_csv("dataset/drugbank_date_information.csv", index_col=0)
# keep the shared compounds as a sorted list: a set has no reproducible order
# between interpreter runs (string hash randomisation), and pandas >= 2.0
# refuses a set as .loc indexer
compounds_inter = sorted(set(df["name"].tolist()) & set(compounds))
df.index = df["name"]
df = df.loc[compounds_inter,:]

drugbank : (791, 882)
ctd : (984, 21650)
semmed : (1179, 5155)
l1000 : (900, 958)
mold2 : (791, 644)
mol2vec : (1089, 300)
mordred : (791, 1517)
pubchem : (1087, 30)
admet : (791, 106)
all data : (451, 31242)


In [4]:
# date split: 20 % of the compounds go to the test set
n_test = int(.2*len(X.index))
train_comp, test_comp = date_split_number(df, number=n_test)
for part in (train_comp, test_comp):
    print(len(part))

361
90


In [5]:
# loc
# index with a sorted list: pandas >= 2.0 rejects a set as .loc indexer, and
# sorting gives X and y the same, reproducible row order
selected = sorted(compounds_inter)
X = X.loc[selected,:]
y = y.loc[selected,:]

In [6]:
# Export all data for next step (output will change without some datasets)
pd.to_pickle(X, "data/X.pickle") # Changed
pd.to_pickle(y, "data/y.pickle")
pd.to_pickle(X.columns.tolist(), "data/all_features.pickle")
# store the compounds sorted: the seeded random split samples from this list,
# so its order must not depend on set iteration order
pd.to_pickle(sorted(compounds_inter), "data/all_comp.pickle")
pd.to_pickle([train_comp, test_comp], "data/comp_split/date.pickle")

In [22]:
# compound split by random
seed = 24771
compounds_all = pd.read_pickle("data/all_comp.pickle")
# same 20 % test fraction as the date split (90 of the 451 published
# compounds) instead of a fixed 90, so a reduced compound list still works
n_test = int(.2*len(compounds_all))
for i in range(20):
    random.seed(seed)
    test_comp = random.sample(compounds_all, n_test)
    # keep the train compounds in their stored order rather than the
    # arbitrary order of a set difference
    held_out = set(test_comp)
    train_comp = [c for c in compounds_all if c not in held_out]
    pd.to_pickle([train_comp, test_comp], f"data/comp_split/random_{str(i)}.pickle")
    seed += 1

# filtering features
* Calculate and export the filtered features of each dataset
***

In [1]:
# import libraries
import copy
import random
import numpy as np
import pandas as pd

from scipy.spatial.distance import pdist
from scipy.spatial.distance import squareform

In [2]:
# filtering modules
def same_feature(df):
    """ delete same feature (columns whose values repeat an earlier column) """
    repeated = df.T.duplicated()
    redundant = df.columns[repeated.values].tolist()
    return df.drop(redundant, axis=1)

def cv_limit(df, threshold=0.05):
    """ delete low CV feature

    CV is taken as std / mean(|x|) per column; columns below ``threshold``
    are removed. A column whose CV is NaN is kept, because NaN never compares
    below the threshold.
    """
    cv = df.std() / df.abs().mean()
    low_variation = cv[cv < threshold].index
    return df.drop(low_variation, axis=1)

def corr(df):
    """ calculate correlation between every pair of columns """
    labels = df.columns
    # pdist 'correlation' is a distance (1 - correlation), so turn it back
    distances = squareform(pdist(df.T, 'correlation'))
    similarity = 1 - distances
    return pd.DataFrame(similarity, index=labels, columns=labels)

def corr_limit(df, threshold=0.85):
    """ delete high correlation feature

    Walks over the columns in their original order. Every column that is
    still kept acts as a reference, and each later column whose correlation
    with that reference is greater than ``threshold`` is removed. The signed
    correlation is compared, so strongly negative correlations are not
    removed, and a NaN correlation never exceeds the threshold.

    Parameters
    ----------
    df : frame whose columns are the features.
    threshold : correlation above which the later feature of a pair is dropped.

    Returns
    -------
    A new frame without the dropped columns.
    """
    # nothing to compare with fewer than two features
    if len(df.columns)<2:
        return df.copy()

    # calc correlation
    corr_matrix = corr(df)
    labels = corr_matrix.columns
    values = corr_matrix.values

    # drop phase
    # positional bookkeeping keeps the reference and the compared columns in
    # step, also when a correlation is exactly 1
    removed = np.zeros(len(labels), dtype=bool)
    for ref in range(len(labels)-1):
        if removed[ref]:
            continue
        removed[ref+1:] |= values[ref, ref+1:]>threshold
    drop = labels[removed].tolist()
    del corr_matrix, values
    return df.drop(drop, axis=1)

def filter(df, cv:float=0.05, corr:float=0.85):
    """ main feature filtering module

    Runs the three filters in order: same_feature, then cv_limit with
    threshold ``cv``, then corr_limit with threshold ``corr``.
    """
    deduplicated = same_feature(df)
    varying = cv_limit(deduplicated, threshold=cv)
    return corr_limit(varying, threshold=corr)

In [3]:
# pickled feature frame read by the cells below
file_X = "data/X.pickle"
# dataset names; used below to pick feature columns and output folders
names = [
    "drugbank",
    "ctd",
    "semmed",
    "l1000",
    "mold2",
    "mol2vec",
    "mordred",
    "pubchem",
    "admet",
] # not-working 

In [4]:
# load
X = pd.read_pickle(file_X)

# select each dataset's columns by their "<name>_" prefix; a substring (regex)
# match on the bare name would also catch columns of other datasets whose
# feature name merely contains the dataset name
dataset_cols = {name: X.columns.str.startswith(f"{name}_") for name in names}

# date split
train_comp, test_comp = pd.read_pickle("data/comp_split/date.pickle")
for name in names:
    X_train = copy.deepcopy(X.loc[train_comp, dataset_cols[name]])
    X_train = filter(X_train, cv=0.05, corr=0.85)
    filtered_feature = X_train.columns.tolist()
    pd.to_pickle(filtered_feature, f"data/filtered_feature/{name}/date.pickle")

# random split
for i in range(20):
    train_comp, test_comp = pd.read_pickle(f"data/comp_split/random_{str(i)}.pickle")
    for name in names:
        X_train = copy.deepcopy(X.loc[train_comp, dataset_cols[name]])
        X_train = filter(X_train, cv=0.05, corr=0.85)
        filtered_feature = X_train.columns.tolist()
        pd.to_pickle(filtered_feature, f"data/filtered_feature/{name}/random_{str(i)}.pickle")
   

## For Concat dataset (without protein interaction datasets)

In [None]:
folder = "concat_bind"
# load
X = pd.read_pickle(file_X)

# date: pool the per-dataset filtered features, then filter the pooled set again
train, test = pd.read_pickle("data/comp_split/date.pickle")
features = [
    feature
    for name in names
    for feature in pd.read_pickle(f"data/filtered_feature/{name}/date.pickle")
]
X_train = filter(copy.deepcopy(X.loc[train, features]), cv=0.05, corr=0.85)
features = X_train.columns.tolist()
pd.to_pickle(features, f"data/filtered_feature/{folder}/date.pickle")

# random: same procedure for each of the 20 random splits
for i in range(20):
    train, test = pd.read_pickle(f"data/comp_split/random_{str(i)}.pickle")
    features = [
        feature
        for name in names
        for feature in pd.read_pickle(f"data/filtered_feature/{name}/random_{str(i)}.pickle")
    ]
    X_train = filter(copy.deepcopy(X.loc[train, features]), cv=0.05, corr=0.85)
    features = X_train.columns.tolist()
    pd.to_pickle(features, f"data/filtered_feature/{folder}/random_{str(i)}.pickle")

## Binding data intersection
* For the binding datasets (semmed, ctd, drugbank), generate filtered features restricted to the features they have in common.
* It will not work because 2 of the 3 datasets are not allowed to be uploaded.
* Only the filtered features of the DrugBank dataset are uploaded.

In [None]:
# columns of the three binding datasets, selected by their "<name>_" prefix
# (a substring match could also pick up columns of other datasets)
drugbank = X.columns[X.columns.str.startswith("drugbank_")].tolist()
ctd = X.columns[X.columns.str.startswith("ctd_")].tolist()
semmed = X.columns[X.columns.str.startswith("semmed_")].tolist()

# strip only the leading prefix; str.split would cut at every occurrence of
# the prefix inside the feature name
drugbank = [i[len("drugbank_"):] for i in drugbank]
ctd = [i[len("ctd_"):] for i in ctd]
semmed = [i[len("semmed_"):] for i in semmed]

# sorted: set order differs between interpreter runs, and the correlation
# filter applied afterwards depends on the column order
feature_intersection = sorted(set(drugbank)&set(ctd)&set(semmed))
drugbank_features = [f"drugbank_{i}" for i in feature_intersection]
ctd_features = [f"ctd_{i}" for i in feature_intersection]
semmed_features = [f"semmed_{i}" for i in feature_intersection]

names = ["drugbank_inter", "ctd_inter", "semmed_inter"]
features_lst = [drugbank_features, ctd_features, semmed_features]

In [None]:
# load
X = pd.read_pickle(file_X)

# date split: filter every intersection feature set on the train compounds
train_comp, test_comp = pd.read_pickle("data/comp_split/date.pickle")
for name, features in zip(names, features_lst):
    X_train = filter(copy.deepcopy(X.loc[train_comp, features]), cv=0.05, corr=0.85)
    filtered_feature = X_train.columns.tolist()
    pd.to_pickle(filtered_feature, f"data/filtered_feature/{name}/date.pickle")

# random split: same for each of the 20 random splits
for i in range(20):
    train_comp, test_comp = pd.read_pickle(f"data/comp_split/random_{str(i)}.pickle")
    for name, features in zip(names, features_lst):
        X_train = filter(copy.deepcopy(X.loc[train_comp, features]), cv=0.05, corr=0.85)
        filtered_feature = X_train.columns.tolist()
        pd.to_pickle(filtered_feature, f"data/filtered_feature/{name}/random_{str(i)}.pickle")