# Prediction Performance Evaluation
* Some datasets are not allowed to be uploaded.
* Therefore, some datasets have been removed from the publication environment.
* The code locations that do not work without these datasets are commented out.
* The output locations that differ from the publication environment are also commented out.

In [1]:
# import libraries
from typing import Callable
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
import sklearn.metrics as metrics

# machine learning libraries
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import ElasticNet
import xgboost as xgb
import lightgbm as lgb

# tabnet
import torch
from pytorch_tabnet.tab_model import TabNetClassifier
from pytorch_tabnet.pretraining import TabNetPretrainer

import warnings
warnings.simplefilter("ignore")

In [2]:
# Machine Learning Methods
def tbn_pred(x_train:pd.core.frame.DataFrame, y_train:pd.core.frame.DataFrame, x_val:pd.core.frame.DataFrame, y_val:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame,params:dict, seed:int=24771):
    """Pretrain and fit TabNet on one fold, then score the validation and test rows.

    A TabNetPretrainer is fitted on the training features first; a
    TabNetClassifier is then fitted starting from it (``from_unsupervised``)
    and monitored on the validation fold with log loss.

    Args:
        x_train, y_train: training features / labels of the fold.
        x_val, y_val: validation features / labels, used as the eval set.
        x_test: held-out features to score.
        params: keyword arguments passed to both TabNetPretrainer and
            TabNetClassifier. The dict itself is not modified.
        seed: value placed under the ``'seed'`` key of those arguments.

    Returns:
        (oof_pred, y_pred): column 1 of ``predict_proba`` for ``x_val`` and
        ``x_test`` respectively.
    """
    batch_size = 128
    virtual_batch_size = 128
    max_epochs = 500
    patience = 10
    # Build a private copy: the caller's dict is shared between folds, repeats
    # and models, so writing the seed into it would leak state across calls.
    model_params = {**params, 'seed': seed}

    pretrainer = TabNetPretrainer(**model_params)
    # The pretrainer is evaluated on the training matrix itself.
    pretrainer.fit(X_train=x_train.values, eval_set=[x_train.values], max_epochs=max_epochs,
                   patience=patience, batch_size=batch_size, virtual_batch_size=virtual_batch_size,
                   drop_last=True)

    model = TabNetClassifier(**model_params)
    model.fit(X_train=x_train.values, y_train=y_train.values,
              eval_set=[(x_val.values, y_val.values)], eval_name=["valid"],
              eval_metric=["logloss"], max_epochs=max_epochs, patience=patience,
              batch_size=batch_size, virtual_batch_size=virtual_batch_size,
              drop_last=False, from_unsupervised=pretrainer)

    oof_pred = model.predict_proba(x_val.values)[:, 1]
    y_pred = model.predict_proba(x_test.values)[:, 1]
    return oof_pred, y_pred

def xgb_pred(x_train:pd.core.frame.DataFrame, y_train:pd.core.frame.DataFrame, x_val:pd.core.frame.DataFrame, y_val:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame, params:dict, seed:int=24771):
    """Train an XGBoost booster on one fold and score the validation and test rows.

    Args:
        x_train, y_train: training features / labels of the fold.
        x_val, y_val: validation features / labels, watched during training.
        x_test: held-out features to score.
        params: booster parameters for ``xgb.train``. The dict is not modified.
        seed: value placed under the ``'seed'`` key of the booster parameters.

    Returns:
        (oof_pred, y_pred): ``model.predict`` output for the validation and
        test matrices respectively.
    """
    # Build a private copy: the caller's dict is shared between folds, repeats
    # and models, so writing the seed into it would leak state across calls.
    booster_params = {**params, 'seed': seed}

    dtrain = xgb.DMatrix(x_train, label=y_train)
    dval = xgb.DMatrix(x_val, label=y_val)
    dtest = xgb.DMatrix(x_test)
    # Per the xgboost docs, the last entry of ``evals`` drives early stopping,
    # so the validation fold must stay last.
    evals = [(dtrain, 'train'), (dval, 'eval')]
    model = xgb.train(
        booster_params,
        dtrain,
        evals=evals,
        early_stopping_rounds=20,
        num_boost_round=10000,
        verbose_eval=0,
    )
    oof_pred = model.predict(dval)
    y_pred = model.predict(dtest)
    return oof_pred, y_pred

def lgb_pred(x_train:pd.core.frame.DataFrame, y_train:pd.core.frame.DataFrame, x_val:pd.core.frame.DataFrame, y_val:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame, params:dict, seed:int=24771):
    """Train a LightGBM booster on one fold and score the validation and test rows.

    Args:
        x_train, y_train: training features / labels of the fold.
        x_val, y_val: validation features / labels, watched during training.
        x_test: held-out features to score.
        params: parameters for ``lgb.train``. The dict is not modified.
        seed: value placed under the ``'seed'`` key of the parameters.

    Returns:
        (oof_pred, y_pred): ``model.predict`` output for ``x_val`` and
        ``x_test`` respectively.
    """
    # Build a private copy: the caller's dict is shared between folds, repeats
    # and models, so writing the seed into it would leak state across calls.
    train_params = {**params, 'seed': seed}

    train_dataset = lgb.Dataset(x_train, y_train, params={'verbose': -1})
    val_dataset = lgb.Dataset(x_val, y_val, params={'verbose': -1})
    # NOTE(review): ``early_stopping_rounds`` / ``verbose_eval`` are keyword
    # arguments of older ``lgb.train`` signatures; newer LightGBM releases
    # expect callbacks instead - confirm against the pinned version.
    model = lgb.train(
        params=train_params,
        train_set=train_dataset,
        valid_sets=[train_dataset, val_dataset],
        num_boost_round=10000,
        early_stopping_rounds=20,
        verbose_eval=False,
    )
    oof_pred = model.predict(x_val)
    y_pred = model.predict(x_test)
    return oof_pred, y_pred

def nb_pred(x_train:pd.core.frame.DataFrame, y_train:pd.core.frame.DataFrame, x_val:pd.core.frame.DataFrame, y_val:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame, params:dict, seed:int=24771):
    """Fit Gaussian naive Bayes on one fold and score the validation and test rows.

    ``y_val``, ``params`` and ``seed`` are accepted so the signature matches
    the other ``*_pred`` functions; they are not used here.

    Returns:
        (oof_pred, y_pred): column 1 of ``predict_proba`` for ``x_val`` and
        ``x_test`` respectively.
    """
    classifier = GaussianNB()
    classifier.fit(x_train, y_train)
    # Score the validation fold first, then the test set, keeping only the
    # second probability column.
    oof_pred, y_pred = (
        classifier.predict_proba(frame)[:, 1] for frame in (x_val, x_test)
    )
    return oof_pred, y_pred

def svm_pred(x_train:pd.core.frame.DataFrame, y_train:pd.core.frame.DataFrame, x_val:pd.core.frame.DataFrame, y_val:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame, params:dict, seed:int=24771):
    """Fit an SVC with probability estimates on one fold and score validation and test rows.

    Args:
        x_train, y_train: training features / labels of the fold.
        x_val: validation features to score (``y_val`` is unused).
        x_test: held-out features to score.
        params: keyword arguments for ``SVC``. The dict is not modified.
        seed: passed to ``SVC`` as ``random_state``.

    Returns:
        (oof_pred, y_pred): column 1 of ``predict_proba`` for ``x_val`` and
        ``x_test`` respectively.
    """
    # Build a private copy: the caller's dict is shared between folds, repeats
    # and models, so writing random_state into it would leak state across calls.
    svc_params = {**params, 'random_state': seed}
    model = SVC(**svc_params, probability=True)
    model.fit(x_train, y_train)
    oof_pred = model.predict_proba(x_val)[:, 1]
    y_pred = model.predict_proba(x_test)[:, 1]
    return oof_pred, y_pred

def els_pred(x_train:pd.core.frame.DataFrame, y_train:pd.core.frame.DataFrame, x_val:pd.core.frame.DataFrame, y_val:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame, params:dict, seed:int=24771):
    """Fit an elastic net on one fold and score the validation and test rows.

    ElasticNet is a regressor, so the returned values are the model's
    continuous outputs rather than class probabilities.

    Args:
        x_train, y_train: training features / labels of the fold.
        x_val: validation features to score (``y_val`` is unused).
        x_test: held-out features to score.
        params: keyword arguments for ``ElasticNet``. The dict is not modified.
        seed: passed to ``ElasticNet`` as ``random_state``.

    Returns:
        (oof_pred, y_pred): ``model.predict`` output for ``x_val`` and
        ``x_test`` respectively.
    """
    # Build a private copy: the caller's dict is shared between folds, repeats
    # and models, so writing random_state into it would leak state across calls.
    net_params = {**params, 'random_state': seed}
    model = ElasticNet(**net_params)
    model.fit(x_train, y_train)
    oof_pred = model.predict(x_val)
    y_pred = model.predict(x_test)
    return oof_pred, y_pred

# Parameters 
# LightGBM settings used by lgb_pred ('seed' is replaced per call by lgb_pred).
lgb_params = dict(
    num_leaves=32,
    min_data_in_leaf=5,
    min_child_weight=1,
    bagging_fraction=0.9,
    feature_fraction=0.7,
    lambda_l1=1e-5,
    lambda_l2=1e-5,
    boosting="gbdt",
    objective="binary",
    metric="binary_logloss",
    learning_rate=0.005,
    seed=24771,
    verbose=-1,
)

# XGBoost settings used by xgb_pred ('seed' is replaced per call by xgb_pred).
# A literal is required here because "lambda" is a Python keyword.
xgb_params = {
    "max_depth": 5,
    "min_child_weight": 3,
    "gamma": 1e-5,
    "max_delta_step": 5,
    "lambda": 1e-5,
    "alpha": 1e-5,
    "subsample": 0.9,
    "colsample_bytree": 0.7,
    "booster": "gbtree",
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.03,
    "seed": 24771,
    "n_jobs": -1,
}

# SVC settings used by svm_pred (random_state and probability are added there).
# NOTE(review): scikit-learn documents `degree` as used by the 'poly' kernel
# only, so it presumably has no effect with kernel='sigmoid' - confirm.
svm_params = dict(
    C=0.05,
    kernel="sigmoid",
    degree=5,
    gamma="auto",
    coef0=0.4,
)

# TabNet settings used by tbn_pred for both the pretrainer and the classifier.
# Each key appears exactly once ('n_shared' was previously listed twice with
# the same value, which silently keeps only the last occurrence).
tbn_params = {
    'n_d': 60,
    'n_a': 60,
    'n_independent': 2,
    'n_shared': 2,
    'mask_type': 'sparsemax',
    'n_steps': 1,
    'gamma': 1.2,
    'lambda_sparse': 1e-05,
    'optimizer_fn': torch.optim.Adam,
    'optimizer_params': dict(lr=2e-2),
    'scheduler_params': dict(mode="min",
                             patience=5,
                             min_lr=1e-7,
                             factor=0.9),
    'scheduler_fn': torch.optim.lr_scheduler.ReduceLROnPlateau,
    'verbose': 0,
    'device_name': "cpu",
}

# ElasticNet settings used by els_pred (random_state is added there).
els_params = dict(
    alpha=0.01,
    l1_ratio=0.2,
    max_iter=10000,
)


In [3]:
# Modules
def preprocess(x_train:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame):
    """Standardize both frames with training-set statistics, then fill NaN with 0.

    The scaler is fitted on ``x_train`` only and applied to both frames; each
    result keeps the index and column labels of its input.

    Returns:
        (x_train, x_test) as new DataFrames.
    """
    scaler = StandardScaler()
    scaler.fit(x_train)

    def _scaled(frame):
        # Rebuild a labelled frame from the scaler's array output.
        values = scaler.transform(frame)
        return pd.DataFrame(values, index=frame.index, columns=frame.columns).fillna(0)

    return _scaled(x_train), _scaled(x_test)

def create_model(x:pd.core.frame.DataFrame, x_test:pd.core.frame.DataFrame, y:pd.core.frame.DataFrame, params:dict, method:Callable="", seed:int=0, n_splits:int=5):
    """Run stratified k-fold cross validation with one prediction method.

    For every fold, ``method`` is trained on the training part and returns
    predictions for the validation part and for ``x_test``.

    Args:
        x: training features.
        x_test: held-out features scored by every fold model.
        y: labels aligned with ``x``; also used for stratification.
        params: parameter dict forwarded to ``method``.
        method: one of the ``*_pred`` functions, called as
            ``method(x_train, y_train, x_val, y_val, x_test, params, seed=seed)``.
        seed: random state of the fold split, also forwarded to ``method``.
        n_splits: number of folds (default 5).

    Returns:
        (oof_predictions, predictions): out-of-fold predictions, one per row of
        ``x``, and the mean of the fold models' predictions on ``x_test``.

    Raises:
        TypeError: if ``method`` is not callable (e.g. left at its default).
    """
    # Fail before any training starts instead of inside the first fold.
    if not callable(method):
        raise TypeError(f"method must be a callable *_pred function, got {method!r}")

    fold = StratifiedKFold(n_splits=n_splits, random_state=seed, shuffle=True)
    oof_predictions = np.zeros(x.shape[0])
    predictions = np.zeros(x_test.shape[0])

    for trn_ind, val_ind in fold.split(x, y):
        x_train, x_val = x.iloc[trn_ind], x.iloc[val_ind]
        y_train, y_val = y.iloc[trn_ind], y.iloc[val_ind]
        oof_pred, y_pred = method(x_train, y_train, x_val, y_val, x_test, params, seed=seed)
        # Same constant as the fold count, so the test predictions are a true mean.
        predictions += y_pred / n_splits
        oof_predictions[val_ind] = oof_pred
        gc.collect()
    return oof_predictions, predictions

def pred(method:Callable="", params:dict=dict(), random_state:int=0, target:str="", file_split:str="", file_X:str="", file_y:str="", file_filtered_feature:str="", fillna:bool=False):
    """Repeat the cross-validated prediction 5 times with consecutive seeds.

    Args:
        method: one of the ``*_pred`` functions, forwarded to ``create_model``.
        params: parameter dict for ``method``; a copy is used internally.
        random_state: seed of the first repetition; later repetitions use
            ``random_state + 1`` ... ``random_state + 4``.
        target: label column, forwarded to ``load``.
        file_split, file_X, file_y, file_filtered_feature: pickle paths
            forwarded to ``load``.
        fillna: forwarded to ``load``.

    Returns:
        (oof_predictions, predictions): two lists with one array per
        repetition, as returned by ``create_model``.
    """
    # Never hand the caller's dict (or the shared default) to code that may
    # write into it.
    params = dict(params)
    x, x_test, y, _ = load(target, file_split=file_split, file_X=file_X, file_y=file_y, file_filtered_feature=file_filtered_feature, fillna=fillna)

    oof_predictions = []
    predictions = []
    for seed in range(random_state, random_state + 5):
        # ``test_pred`` rather than ``pred`` so the function name is not shadowed.
        oof_pred, test_pred = create_model(x, x_test, y, params, method=method, seed=seed)
        oof_predictions.append(oof_pred)
        predictions.append(test_pred)
    return oof_predictions, predictions

def calc_statistics(y_true:np.ndarray, y_pred:np.ndarray, threshold:float):
    """Compute ranking scores and thresholded classification scores.

    Args:
        y_true: ground-truth labels; expected to be 0/1 because they are
            flipped with ``1 - value`` for the minus-AUPRC.
        y_pred: predicted scores, one per label.
        threshold: scores strictly greater than this are labelled 1.

    Returns:
        (auroc, auprc, minusauprc, acc, f1, mcc), where ``minusauprc`` is the
        precision-recall AUC computed after flipping both labels and scores.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_pred)

    # Threshold-free scores
    fpr, tpr, _ = metrics.roc_curve(labels, scores)
    auroc = metrics.auc(fpr, tpr)
    precision, recall, _ = metrics.precision_recall_curve(labels, scores)
    auprc = metrics.auc(recall, precision)
    # minus-AUPRC: same curve with both labels and scores flipped
    precision, recall, _ = metrics.precision_recall_curve(1 - labels, 1 - scores)
    minusauprc = metrics.auc(recall, precision)

    # Scores on hard labels
    hard_labels = (scores > threshold).astype(int)
    acc = metrics.accuracy_score(labels, hard_labels)
    # Each name now holds its own metric; the two calls were previously swapped,
    # which put MCC in the F1 slot of the result and vice versa.
    f1 = metrics.f1_score(y_true=labels, y_pred=hard_labels)
    mcc = metrics.matthews_corrcoef(labels, hard_labels)
    return auroc, auprc, minusauprc, acc, f1, mcc

def information(target:str="", file_split:str="", file_X:str="", file_y:str="", file_filtered_feature:str=""):
    """Count the compounds and the label sums of the train and test splits.

    The data is loaded through ``load`` with ``fillna=False``.

    Returns:
        (all_compounds, train_all, train_p, train_n, test_all, test_p, test_n),
        where ``*_p`` is the sum of the labels and ``*_n`` the remainder.
    """
    x_train, x_test, y_train, y_test = load(
        target,
        file_split=file_split,
        file_X=file_X,
        file_y=file_y,
        file_filtered_feature=file_filtered_feature,
        fillna=False,
    )
    n_train = len(x_train.index)
    n_train_pos = sum(y_train)
    n_test = len(x_test.index)
    n_test_pos = sum(y_test)
    return (
        n_train + n_test,
        n_train,
        n_train_pos,
        n_train - n_train_pos,
        n_test,
        n_test_pos,
        n_test - n_test_pos,
    )

def load(target:str, file_split:str="", file_X:str="", file_y:str="", file_filtered_feature:str="", fillna:bool=True):
    """Read the pickled tables and return the train/test features and labels.

    Args:
        target: column of the label table to use.
        file_split: pickle holding the (train, test) row labels.
        file_X: pickle of the feature table.
        file_y: pickle of the label table.
        file_filtered_feature: pickle holding the feature columns to keep.
        fillna: when true, both feature frames go through ``preprocess``.

    Returns:
        (X_train, X_test, y_train, y_test); the feature columns are renamed
        to "0", "1", ... in both frames.
    """
    features = pd.read_pickle(file_X)
    labels = pd.read_pickle(file_y)
    train_comp, test_comp = pd.read_pickle(file_split)
    filtered_feature = pd.read_pickle(file_filtered_feature)

    splits = (train_comp, test_comp)
    X_train, X_test = (features.loc[rows, filtered_feature] for rows in splits)
    y_train, y_test = (labels.loc[rows, target] for rows in splits)

    if fillna:
        X_train, X_test = preprocess(X_train, X_test)

    # Replace the original feature names by their position, as strings.
    X_train.columns = list(map(str, range(len(X_train.columns))))
    X_test.columns = X_train.columns.tolist()
    return X_train, X_test, y_train, y_test


In [4]:
# Main modules
# Columns of every evaluation result table, in the order the rows are built.
# "threhold" (sic) is kept verbatim: it is the column name stored in the
# result pickles.
_RESULT_COLUMNS = [
    "all_compounds",
    "train_all",
    "train_positive",
    "train_negative",
    "test_all",
    "test_positive",
    "test_negative",
    "feature_number",
    "model",
    "threhold",
    "auroc_oof",
    "aupr_oof",
    "minusauprc_oof",
    "accuracy_oof",
    "f1score_oof",
    "mcc_oof",
    "auroc_test",
    "aupr_test",
    "minusauprc_test",
    "accuracy_test",
    "f1score_test",
    "mcc_test",
]


def _run_evaluation(method_def, params, method, folder, fillna, target, split_kind, paths_for_run, n_runs=20):
    """Shared driver of main_date / main_random.

    Runs ``n_runs`` evaluations, each with its own seed (24771, 24781, ...),
    scores the averaged out-of-fold and test predictions with a 0.5 threshold
    and writes one row per run to
    ``evaluation/<split_kind>/<method>_<folder>.pickle``.

    Args:
        method_def, params, fillna, target: forwarded to ``pred``.
        method: model label stored in the "model" column and the file name.
        folder: feature-set folder name, used in the output file name.
        split_kind: sub-directory of ``evaluation/`` ("date" or "random").
        paths_for_run: callable mapping the run index to
            ``(file_filtered_feature, file_split)``.
        n_runs: number of runs / result rows.
    """
    file_X = "data/X.pickle"
    file_y = "data/y.pickle"
    thresh = 0.5
    random_state = 24771
    # Copy so the shared default dict of the public entry points is never
    # written to by the prediction functions.
    params = dict(params)
    df_res = pd.DataFrame(columns=_RESULT_COLUMNS)

    loaded_paths = None
    for i in tqdm(range(n_runs)):
        file_filtered_feature, file_split = paths_for_run(i)
        # Sample statistics and labels depend only on the files, so they are
        # reloaded only when the paths change (never for the date split).
        if (file_filtered_feature, file_split) != loaded_paths:
            informations = information(target, file_split=file_split, file_X=file_X, file_y=file_y, file_filtered_feature=file_filtered_feature)
            feature_numbers = len(pd.read_pickle(file_filtered_feature))
            _, _, y, y_test = load(target, file_split=file_split, file_X=file_X, file_y=file_y, file_filtered_feature=file_filtered_feature, fillna=False)
            loaded_paths = (file_filtered_feature, file_split)

        preds = pred(method=method_def, params=params, random_state=random_state, target=target, file_split=file_split, file_X=file_X, file_y=file_y, file_filtered_feature=file_filtered_feature, fillna=fillna)

        # Average the repetitions, then score.
        oof_preds = pd.DataFrame(preds[0]).mean().values
        res_oof = calc_statistics(y, oof_preds, thresh)
        y_preds = pd.DataFrame(preds[1]).mean().values
        res_test = calc_statistics(y_test, y_preds, thresh)

        res_all = [*informations, feature_numbers, method, thresh, *res_oof, *res_test]
        df_res.loc[str(i), :] = res_all
        random_state += 10

    # output
    df_res.to_pickle(f"evaluation/{split_kind}/{method}_{folder}.pickle")
    print(f"finished : {method} / {folder}")


def main_date(method_def:Callable, params:dict=dict(), method:str="", folder:str="", fillna:bool=False, target=""):
    """Evaluate prediction scores on the date-split compounds.

    The same split and feature files are used for all 20 runs; only the seed
    changes. Results go to ``evaluation/date/<method>_<folder>.pickle``.

    Args:
        method_def: one of the ``*_pred`` functions.
        params: parameter dict for ``method_def``.
        method: model label used in the result table and the file name.
        folder: feature-set folder under ``data/filtered_feature/``.
        fillna: forwarded to ``pred`` (standardize / fill NaN before training).
        target: label column to predict.
    """
    def _paths(i):
        # The date split has a single split file and a single feature file.
        return (
            f"data/filtered_feature/{folder}/date.pickle",
            "data/comp_split/date.pickle",
        )

    _run_evaluation(method_def, params, method, folder, fillna, target, "date", _paths)


def main_random(method_def:Callable, params:dict=dict(), method:str="", folder:str="", fillna:bool=False, target=""):
    """Evaluate prediction scores on the randomly split compounds.

    Run ``i`` uses the i-th random split and its matching feature file.
    Results go to ``evaluation/random/<method>_<folder>.pickle``.

    Args:
        method_def: one of the ``*_pred`` functions.
        params: parameter dict for ``method_def``.
        method: model label used in the result table and the file name.
        folder: feature-set folder under ``data/filtered_feature/``.
        fillna: forwarded to ``pred`` (standardize / fill NaN before training).
        target: label column to predict.
    """
    def _paths(i):
        return (
            f"data/filtered_feature/{folder}/random_{i}.pickle",  # filtered feature file path
            f"data/comp_split/random_{i}.pickle",  # random split file path
        )

    _run_evaluation(method_def, params, method, folder, fillna, target, "random", _paths)

# Date prediction
* Samples were split by date (by: 1_preparing.ipynb).
* Prediction scores will be calculated with 6 methods.
***

In [7]:
# Feature-set folders evaluated on the date split.
names = [
    "drugbank",
    "ctd",
    "semmed",
    "drugbank_inter",
    "ctd_inter",
    "semmed_inter",
    "l1000",
    "mold2",
    "mol2vec",
    "mordred",
    "pubchem",
    "admet",
    "concat_bind",
]  # not-working

In [None]:
# Gradient-boosting models on the date split, run with fillna=False.
# NOTE(review): `target` is not passed, so main_date falls back to its
# default target="" - confirm that this is the intended label column.
pred_lst = [
    [xgb_pred, xgb_params, "xgb"],
    [lgb_pred, lgb_params, "lgb"],
]

for model_fn, model_params, model_name in pred_lst:
    for folder in names:
        main_date(model_fn, params=model_params, method=model_name, folder=folder, fillna=False)

In [None]:
# Remaining models on the date split, run with fillna=True.
# NOTE(review): `target` is not passed, so main_date falls back to its
# default target="" - confirm that this is the intended label column.
pred_lst = [
    [els_pred, els_params, "els"],
    [nb_pred, {}, "nb"],
    [svm_pred, svm_params, "svm"],
    [tbn_pred, tbn_params, "tbn"],
]

for model_fn, model_params, model_name in pred_lst:
    for folder in names:
        main_date(model_fn, params=model_params, method=model_name, folder=folder, fillna=True)

# Random prediction
* Samples were split randomly (by: 1_preparing.ipynb).
* Prediction scores will be calculated with 6 methods.
***

In [None]:
# Feature-set folders evaluated on the random splits.
names = [
    "drugbank",
    "ctd",
    "semmed",
    "l1000",
    "drugbank_inter",
    "ctd_inter",
    "semmed_inter",
    "mold2",
    "mol2vec",
    "mordred",
    "pubchem",
    "admet",
]  # not-working

# Gradient-boosting models on the random splits, run with fillna=False.
# NOTE(review): `target` is not passed, so main_random falls back to its
# default target="" - confirm that this is the intended label column.
pred_lst = [
    [xgb_pred, xgb_params, "xgb"],
    [lgb_pred, lgb_params, "lgb"],
]

for model_fn, model_params, model_name in pred_lst:
    for folder in names:
        main_random(model_fn, params=model_params, method=model_name, folder=folder, fillna=False)

In [None]:
# Remaining models on the random splits, run with fillna=True.
# NOTE(review): `target` is not passed, so main_random falls back to its
# default target="" - confirm that this is the intended label column.
pred_lst = [
    [els_pred, els_params, "els"],
    [nb_pred, {}, "nb"],
    [svm_pred, svm_params, "svm"],
    [tbn_pred, tbn_params, "tbn"],
]

for model_fn, model_params, model_name in pred_lst:
    for folder in names:
        main_random(model_fn, params=model_params, method=model_name, folder=folder, fillna=True)