# Calculate permutation importance (AUROC) for test compounds
* Some datasets are not allowed to be uploaded.
* Therefore, some datasets have been removed from the publication environment.
* Code locations that do not work without these datasets are marked with a comment.
* Output locations that differ from the publication environment are also marked with a comment.
***

In [1]:
import copy
import itertools

import numpy as np
import pandas as pd
from numpy import random as rnd
from tqdm import tqdm

from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler

import xgboost as xgb

In [2]:
def load(target:str, file_split:str="", file_X:str="", file_y:str="", file_filtered_feature:str="", fillna:bool=True):
    """
    Read the pickled feature / label tables and cut them into train and test parts.

    Parameters
    ----------
    target : column of the label table that is returned as y
    file_split : pickle holding a (train compounds, test compounds) pair of row labels
    file_X : pickle of the feature table
    file_y : pickle of the label table
    file_filtered_feature : pickle holding the feature columns to keep
    fillna : if True, both feature tables are passed through preprocess()

    Returns
    -------
    X_train, X_test, y_train, y_test
        The feature columns are renamed to their position ("0", "1", ...).
    """
    features = pd.read_pickle(file_X)
    labels = pd.read_pickle(file_y)
    compounds_train, compounds_test = pd.read_pickle(file_split)
    kept_columns = pd.read_pickle(file_filtered_feature)

    row_groups = (compounds_train, compounds_test)
    X_train, X_test = (features.loc[rows, kept_columns] for rows in row_groups)
    y_train, y_test = (labels.loc[rows, target] for rows in row_groups)

    if fillna:
        X_train, X_test = preprocess(X_train, X_test)

    # replace the original feature names by their position, written as strings
    positional_names = list(map(str, range(X_train.shape[1])))
    X_train.columns = positional_names
    X_test.columns = list(positional_names)
    return X_train, X_test, y_train, y_test

def preprocess(x_train:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame):
    """
    Standardize both tables and fill missing values with 0.

    The scaler is fitted on x_train only and then applied to x_train and x_test.
    Returns the two transformed tables with their original index and columns.
    """
    scaler = StandardScaler().fit(x_train)

    def _scaled(frame):
        # re-wrap the transformed values with the frame's own index and columns
        values = scaler.transform(frame)
        return pd.DataFrame(values, index=frame.index, columns=frame.columns).fillna(0)

    return _scaled(x_train), _scaled(x_test)

# XGBoost
def train(X:np.ndarray, y:np.ndarray, X_val:np.ndarray, y_val:np.ndarray, params:dict=None,
          num_boost_round:int=100000, early_stopping_rounds:int=20):
    """
    Train an XGBoost booster with early stopping on a validation set.

    Parameters
    ----------
    X, y : training features and labels
    X_val, y_val : validation features and labels, watched for early stopping
    params : booster parameters handed to xgb.train(); None means no explicit parameters
    num_boost_round : upper limit of boosting rounds
    early_stopping_rounds : number of rounds without improvement on the
        validation set after which training stops

    Returns
    -------
    the object returned by xgb.train()
    """
    # a fresh dict per call instead of a shared mutable default argument
    params = {} if params is None else params
    dtrain = xgb.DMatrix(X, label=y)
    dval = xgb.DMatrix(X_val, label=y_val)
    # the validation set is listed last: per the xgboost documentation, the last
    # entry of `evals` is the one used for early stopping
    watchlist = [(dtrain, 'train'), (dval, 'eval')]
    model = xgb.train(
        params,
        dtrain,
        num_boost_round=num_boost_round,
        early_stopping_rounds=early_stopping_rounds,
        evals=watchlist,
        verbose_eval=0,
    )
    return model

def test(X:np.ndarray, y:np.ndarray, model:xgb.core.Booster):
    """
    Score `model` on (X, y).

    Returns 1 - AUROC of the model's predictions, i.e. a loss where lower is better.
    """
    predictions = model.predict(xgb.DMatrix(X))
    auroc = roc_auc_score(y, predictions)
    return 1 - auroc

class featureselect():
    """
    Permutation importance of features, measured on separate target data.

    A model is trained on every cross-validation fold of the training data. Each
    feature column of the target data is then shuffled repeatedly, and the increase
    of the loss returned by test() (1 - AUROC) over the unshuffled loss is recorded.

    Attributes
    ----------
    model : never assigned by this class (initialised to None)
    perm_imp : list, mean loss increase per feature (over all folds and repeats)
    perm_imp_std : list, standard deviation of the loss increase per feature
    imp_features : names of the features whose mean importance is >= threshold
    """
    def __init__(self):
        self.model=None
        self.perm_imp = None
        self.perm_imp_std = None
        self.imp_features = None

    def perm_selection(self, X:pd.core.frame.DataFrame, y:pd.core.frame.DataFrame, X_target:pd.core.frame.DataFrame, y_target:pd.core.frame.DataFrame, n_repeat:int=100, threshold:float=0, params:dict=None, stratify:bool=True, kfold:int=5, random_state:int=None):
        """
        Feature selection with the permutation importance method.

        Parameters
        ----------
        X, y : training features / labels used to fit one model per fold
        X_target, y_target : data on which the permutation importance is measured
        n_repeat : number of shuffles per feature and fold
        threshold : features with mean importance >= threshold are kept
        params : XGBoost parameters; deep-copied, so the caller's dict is not touched
        stratify : use StratifiedKFold if True, plain KFold otherwise
        kfold : number of folds
        random_state : optional seed for the fold split and the shuffles;
            None keeps using numpy's global random state

        Results are stored in perm_imp, perm_imp_std and imp_features.
        """
        params = {} if params is None else copy.deepcopy(params)
        self.__calc(X.values, y.values, X_target.values, y_target.values,
                    n_repeat=n_repeat, params=params,
                    stratify=stratify, kfold=kfold, random_state=random_state)
        features = X.columns.tolist()
        perm_imp = pd.DataFrame({
            "feature": features,
            "importance": self.perm_imp,
            "importance_std": self.perm_imp_std,
        })
        imp_features = perm_imp.loc[perm_imp["importance"] >= threshold, "feature"].tolist()
        self.imp_features = imp_features
        print("{} / {} features extracted".format(len(imp_features),len(features)))

    def __calc(self, X:np.ndarray, y:np.ndarray, X_target:np.ndarray, y_target:np.ndarray, n_repeat:int=100, params:dict=None, stratify:bool=False, kfold:int=5, random_state:int=None):
        """
        Calculate the permutation importance as loss increase over n_repeat shuffles.

        Works on plain arrays (samples x features). Stores the per-feature mean and
        standard deviation in self.perm_imp / self.perm_imp_std.

        Raises
        ------
        ValueError : if X and X_target do not have the same number of features
        """
        params = {} if params is None else params

        # optional reproducibility: one generator drives both the split and the shuffles
        if random_state is None:
            rng = None
            shuffle = rnd.shuffle
        else:
            rng = np.random.RandomState(random_state)
            shuffle = rng.shuffle

        if stratify:
            kf = StratifiedKFold(n_splits=kfold, shuffle=True, random_state=rng)
        else:
            # previously no splitter was defined on this path, so it could not run
            from sklearn.model_selection import KFold
            kf = KFold(n_splits=kfold, shuffle=True, random_state=rng)

        n_samples, n_features = X_target.shape
        if X.shape[1] != n_features:
            raise ValueError(
                "X has {} features but X_target has {}".format(X.shape[1], n_features)
            )

        # shuffle inside a private copy: the caller's array is never modified
        # (it may be a view of, or a read-only export from, a DataFrame)
        X_work = np.array(X_target, copy=True)
        agents = np.arange(n_samples)
        # one bucket per feature, filled fold after fold
        perm_imps = [[] for _ in range(n_features)]

        for tr, val in kf.split(X, y):
            model = train(X[tr,:], y[tr], X[val,:], y[val], params=params)
            loss = test(X_work, y_target, model)

            for i in tqdm(range(n_features)):
                original_col = X_target[:,i]
                for _ in range(int(n_repeat)):
                    shuffle(agents)
                    X_work[:,i] = original_col[agents]
                    perm_imps[i].append(test(X_work, y_target, model) - loss)
                # put the column back before moving on to the next feature
                X_work[:,i] = original_col

        # export
        self.perm_imp = [np.mean(v) for v in perm_imps]
        self.perm_imp_std = [np.std(v) for v in perm_imps]

# Parameters handed to xgb.train() via train(); 'seed' is overwritten before every run.
xgb_params = {
    # tree growth constraints
    "max_depth": 5,
    "min_child_weight": 3,
    "gamma": 1e-5,
    "max_delta_step": 5,
    # L2 / L1 regularisation
    "lambda": 1e-5,
    "alpha": 1e-5,
    # row / column subsampling
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    # learner setup
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.03,
    "seed": 24771,
    "n_jobs": -1,
}
    

In [3]:
# paths and prediction target
# main_folder: folder the data files were exported to; it is prepended as-is,
# so a non-empty value has to end with a path separator.
main_folder = ""
target = 'Hepatobiliary disorders'

file_X = f"{main_folder}data/X.pickle"
file_y = f"{main_folder}data/y.pickle"

# feature sets to process, one sub-folder of data/filtered_feature/ each
# (flagged "not-working" by the authors)
target_folder = [
    "drugbank_inter",
    "ctd_inter",
    "semmed_inter",
    "l1000",
    "mold2",
    "mol2vec",
    "mordred",
    "pubchem",
    "admet",
    "concat",
]

In [None]:
import os  # only needed in this cell, to create the output folder

# For every feature set: repeat the permutation-importance calculation 20 times with
# different XGBoost seeds and store one column of mean importances per repeat.
for folder in target_folder:
    print(folder)
    output = f"importance/xgb/date_{folder}.pickle"
    # make sure the result can be written before starting the long computation
    os.makedirs(os.path.dirname(output), exist_ok=True)

    # the inputs do not depend on the repeat index, so read them once per feature set
    file_filtered_feature = f"{main_folder}data/filtered_feature/{folder}/date.pickle"
    file_split = f"{main_folder}data/comp_split/date.pickle"
    filtered_feature = pd.read_pickle(file_filtered_feature)
    x, x_test, y, y_test = load(target, file_split=file_split, file_X=file_X, file_y=file_y, file_filtered_feature=file_filtered_feature, fillna=False)

    runs = []
    seed = 24771
    for i in range(20):
        # calc permutation importance
        xgb_params['seed'] = seed
        dat = featureselect()
        dat.perm_selection(x, y, x_test, y_test, n_repeat=25, params=xgb_params, threshold=0, kfold=4)
        # result: one column per repeat, indexed by the original feature names
        runs.append(pd.DataFrame(dat.perm_imp, index=filtered_feature, columns=[str(i)]))
        seed+=10
    # join all repeats in one step instead of growing the frame inside the loop
    df_res = pd.concat(runs, axis=1, join="outer")
    pd.to_pickle(df_res, output)

# no binding data
***

In [4]:
# feature set handled by the next cell; used there to build the filtered-feature
# path and the output file name
folder = "concat_bind"

In [None]:
import os  # only needed in this cell, to create the output folder

# Repeat the permutation-importance calculation 20 times with different XGBoost
# seeds for the feature set named by `folder`; one column of mean importances per repeat.
output = f"importance/xgb/date_{folder}.pickle"
# make sure the result can be written before starting the long computation
os.makedirs(os.path.dirname(output), exist_ok=True)

# the inputs do not depend on the repeat index, so read them once
file_filtered_feature = f"{main_folder}data/filtered_feature/{folder}/date.pickle"
file_split = f"{main_folder}data/comp_split/date.pickle"
filtered_feature = pd.read_pickle(file_filtered_feature)
x, x_test, y, y_test = load(target, file_split=file_split, file_X=file_X, file_y=file_y, file_filtered_feature=file_filtered_feature, fillna=False)

runs = []
seed = 24771
for i in range(20):
    # calc permutation importance
    xgb_params['seed'] = seed
    dat = featureselect()
    dat.perm_selection(x, y, x_test, y_test, n_repeat=25, params=xgb_params, threshold=0, kfold=4)
    # result: one column per repeat, indexed by the original feature names
    runs.append(pd.DataFrame(dat.perm_imp, index=filtered_feature, columns=[str(i)]))
    seed+=10
# join all repeats in one step instead of growing the frame inside the loop
df_res = pd.concat(runs, axis=1, join="outer")
pd.to_pickle(df_res, output)