In [1]:
#Preprocessing Imports
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score, mean_squared_error, make_scorer
from scipy.stats import spearmanr

In [2]:
#Model Imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC, SVR

In [3]:
def get_formatted_regression_scores(scc_10, mse_10):
    mu = round(np.mean(scc_10),3)
    std = round(np.std(scc_10),3)
    scc = f'{mu}$\pm${std}'
    mu = round(np.mean(mse_10),3)
    std = round(np.std(mse_10),3)
    mse = f'{np.abs(mu)}$\pm${std}' 
    return scc, mse

In [4]:
def spearman_score(y_true, y_pred):
    spearman, _ = spearmanr(y_true, y_pred)
    return np.abs(spearman)

def run_regression_models(X, y):
    sccs = []
    mses = []
    models = [KNeighborsRegressor(n_neighbors=3), 
              LinearRegression(), 
              DecisionTreeRegressor(max_leaf_nodes=100), 
              RandomForestRegressor(n_estimators=100, max_depth=10),
              GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1), 
              SVR()]
    for model in models:
        clf = model 
        scc_10 = []
        mse_10 = []
        for _ in range(10):
            scores = cross_validate(clf, X, y, cv=3, scoring={'spearman': make_scorer(spearman_score), 'mean_squared_error': 'neg_mean_squared_error'})
            scc_10.extend(scores['test_spearman'])
            mse_10.extend(scores['test_mean_squared_error'])
        scc, mse = get_formatted_regression_scores(scc_10, mse_10)
        sccs.append(scc)
        mses.append(mse)  
    return sccs, mses

In [5]:
def get_formatted_classification_scores(auroc_10, auprc_10):
    mu = round(np.mean(auroc_10),3)
    std = round(np.std(auroc_10),3)
    auroc = f'{mu}$\pm${std}'
    mu = round(np.mean(auprc_10),3)
    std = round(np.std(auprc_10),3)
    auprc = f'{mu}$\pm${std}'
    return auroc, auprc

In [6]:
def run_classification_models(X, y):
    aurocs = []
    auprcs = []
    models = [KNeighborsClassifier(n_neighbors=3), 
              LinearRegression(), 
              LogisticRegression(), 
              GaussianNB(), 
              DecisionTreeClassifier(max_leaf_nodes=100), 
              RandomForestClassifier(n_estimators=100, max_depth=10),
              GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1), 
              SVC()]
    for model in models:
        clf = model
        auroc_10 = []
        auprc_10 = []
        for _ in range(10):
            scores = cross_validate(clf, X, y, cv=3, scoring=('roc_auc', 'average_precision'))
            auroc_10.extend(scores['test_roc_auc'])
            auprc_10.extend(scores['test_average_precision'])
        auroc, auprc = get_formatted_classification_scores(auroc_10, auprc_10)
        aurocs.append(auroc)
        auprcs.append(auprc)
    return aurocs, auprcs

In [7]:
def print_table_row(n, name1, metric1, name2, metric2):
    filenames = ["STITCH", "Drugbank", "Repur", "PDX", "GDSC", "CCLE", "SIDER", "OFFSIDE"]
    if n == 0:
        print("\\hline")
        print("Datasets & Metrics & KNN & Linear Regression & Logistic Regression & Naive Bayes & Decision Tree & Random Forest & Gradient Boosting & SVM \\\\")
        print("\hline")
        print("\hline")
    if n > 2 and n < 6:
        print("\\multirow{{2}}{{4em}}{{{}}} & {} & {} & {} & N/A & N/A & {} & {} & {} & {} \\\\".format(filenames[n], name1, *metric1))
        print("& {} & {} & {} & N/A & N/A & {} & {} & {} & {} \\\\".format(name2, *metric2))
        print("\\hline")        
    else:
        print("\\multirow{{2}}{{4em}}{{{}}} & {} & {} & {} & {} & {} & {} & {} & {} & {} \\\\".format(filenames[n], name1, *metric1))        
        print("& {} & {} & {} & {} & {} & {} & {} & {} & {} \\\\".format(name2, *metric2))
        print("\\hline")        
    if n in [2,5,7]:
        print("\\hline")

In [8]:
from sklearn.model_selection import train_test_split

filenames = ["STITCH", "Drugbank", "Repur", "PDX", "GDSC", "CCLE", "SIDER", "OFFSIDE"]

for i in range(len(filenames)):
    data = np.load(f'{filenames[i]}.npy')
    size = 500/len(data)
    X = data[:, :-1]
    y = data[:, -1:].flatten()
    _, X, _, y = train_test_split(X, y, test_size=size, random_state=42)
    if i > 2 and i < 6:
        sccs, mses = run_regression_models(X, y)
        print_table_row(i, "SCC", sccs, "MSE", mses)
    else:
        aurocs, auprcs = run_classification_models(X,y)
        print_table_row(i, "AUROC", aurocs, "AUPRC", auprcs)

\hline
Datasets & Metrics & KNN & Linear Regression & Logistic Regression & Naive Bayes & Decision Tree & Random Forest & Gradient Boosting & SVM \\
\hline
\hline
\multirow{2}{4em}{STITCH} & AUROC & 0.659$\pm$0.066 & 0.775$\pm$0.05 & 0.763$\pm$0.053 & 0.73$\pm$0.069 & 0.573$\pm$0.047 & 0.752$\pm$0.037 & 0.736$\pm$0.039 & 0.74$\pm$0.013 \\
& AUPRC & 0.214$\pm$0.056 & 0.34$\pm$0.132 & 0.334$\pm$0.108 & 0.303$\pm$0.119 & 0.146$\pm$0.028 & 0.295$\pm$0.087 & 0.311$\pm$0.087 & 0.295$\pm$0.059 \\
\hline
\multirow{2}{4em}{Drugbank} & AUROC & 0.688$\pm$0.024 & 0.689$\pm$0.062 & 0.665$\pm$0.078 & 0.716$\pm$0.103 & 0.583$\pm$0.058 & 0.727$\pm$0.076 & 0.652$\pm$0.057 & 0.784$\pm$0.049 \\
& AUPRC & 0.368$\pm$0.119 & 0.503$\pm$0.094 & 0.448$\pm$0.102 & 0.491$\pm$0.126 & 0.114$\pm$0.041 & 0.392$\pm$0.117 & 0.296$\pm$0.118 & 0.48$\pm$0.125 \\
\hline
\multirow{2}{4em}{Repur} & AUROC & 0.602$\pm$0.041 & 0.669$\pm$0.009 & 0.672$\pm$0.014 & 0.635$\pm$0.053 & 0.579$\pm$0.054 & 0.644$\pm$0.043 & 0.604$\pm$0

In [51]:
model_names = ["KNN", "LinReg", "DT", "RF", "GB", "SVM"]

regression_models = [KNeighborsRegressor(n_neighbors=3), 
              LinearRegression(),
              DecisionTreeRegressor(max_leaf_nodes=100), 
              RandomForestRegressor(n_estimators=100, max_depth=10),
              GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1),
              SVR()]

classification_models = [KNeighborsClassifier(n_neighbors=3), 
              LinearRegression(), 
              DecisionTreeClassifier(max_leaf_nodes=100), 
              RandomForestClassifier(n_estimators=100, max_depth=10),
              GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1), 
              SVC()]

file_names = ["STITCH", "Drugbank", "Repur", "PDX", "GDSC", "CCLE", "SIDER", "OFFSIDE"] 

task_groups = [["STITCH", "Drugbank", "Repur"], ["PDX", "GDSC", "CCLE"], ["SIDER", "OFFSIDE"]]

original_performance = {
    "KNN": {
        "STITCH": 0.659, 
        "Drugbank": 0.688, 
        "Repur": 0.602, 
        "PDX": 0.244, 
        "GDSC": 0.058, 
        "CCLE": 0.104, 
        "SIDER": 0.567, 
        "OFFSIDE": 0.544,
    },
    "LinReg": {
        "STITCH": 0.775, 
        "Drugbank": 0.689, 
        "Repur": 0.669, 
        "PDX": 0.231, 
        "GDSC": 0.061, 
        "CCLE": 0.177, 
        "SIDER": 0.541, 
        "OFFSIDE": 0.525,
    },
    "DT": {
        "STITCH": 0.573, 
        "Drugbank": 0.583, 
        "Repur": 0.579, 
        "PDX": 0.26, 
        "GDSC": 0.043, 
        "CCLE": 0.102, 
        "SIDER": 0.69, 
        "OFFSIDE": 0.535,
    },
    "RF": {
        "STITCH": 0.752, 
        "Drugbank": 0.727, 
        "Repur": 0.644, 
        "PDX": 0.284, 
        "GDSC": 0.069, 
        "CCLE": 0.063, 
        "SIDER": 0.69, 
        "OFFSIDE": 0.603,
    },
    "GB": {
        "STITCH": 0.736, 
        "Drugbank": 0.652, 
        "Repur": 0.604, 
        "PDX": 0.276, 
        "GDSC": 0.053, 
        "CCLE": 0.077, 
        "SIDER": 0.672, 
        "OFFSIDE": 0.544,
    },
    "SVM": {
        "STITCH": 0.74, 
        "Drugbank": 0.784, 
        "Repur": 0.66, 
        "PDX": 0.266, 
        "GDSC": 0.091, 
        "CCLE": 0.148, 
        "SIDER": 0.672, 
        "OFFSIDE": 0.59,
    },
}

In [24]:
def get_transfer_score(original_metric, tranfer_metric):
    transfer_score = ((tranfer_metric-original_metric)/original_metric)*100
    return transfer_score

In [54]:
from sklearn.model_selection import train_test_split

for n in range(len(classification_models)):
    for i in range(len(file_names)):
        train_data = np.load(f'{file_names[i]}.npy')
        train_size = (500*0.67)/len(train_data)
        X_train = train_data[:, :-1]
        y_train = train_data[:, -1:].flatten()
        _, X_train, _, y_train = train_test_split(X_train, y_train, test_size=train_size, random_state=30)
        transfer_scores = []
        for j in range(len(file_names)):
            if j == i:
                transfer_scores.append(0)
            else:
                test_data = np.load(f'{file_names[j]}.npy')
                test_size = (500*0.33)/len(test_data)
                X_test = test_data[:, :-1]
                y_test = test_data[:, -1:].flatten()
                _, X_test, _, y_test = train_test_split(X_test, y_test, test_size=test_size, random_state=30)
                if j > 2 and j < 6:
                    reg = regression_models[n]
                    sccs = []
                    for _ in range(10):
                        reg.fit(X_train, y_train)
                        y_pred = reg.predict(X_test)
                        scc, _ = spearmanr(y_test, y_pred)
                        sccs.append(scc)
                    transfer_score = get_transfer_score(original_performance[model_names[n]][file_names[j]], np.mean(sccs))
                    transfer_scores.append(round(transfer_score, 3))
                else:
                    clf = classification_models[n]
                    aurocs = []
                    for _ in range(10):
                        clf.fit(X_train, np.round(y_train))
                        y_pred = clf.predict(X_test)
                        auroc = roc_auc_score(np.sign(y_test), y_pred)
                        aurocs.append(auroc)
                    transfer_score = get_transfer_score(original_performance[model_names[n]][file_names[j]], np.mean(aurocs))
                    transfer_scores.append(round(transfer_score, 3))
        print_dataset_tranfer_table_row(n, i, transfer_scores)

\subsubsection{KNN}
\begin{table}[h!]
\centering
\begin{tabular}{l|*{8}{c}}
\hline
Datasets & STITCH & Drugbank & Repur & PDX & GDSC & CCLE & SIDER & OFFSIDE \\
STITCH & 0 & -27.817 & -16.944 & -130.489 & -193.516 & -186.817 & -9.042 & -7.458 \\
Drugbank & -20.654 & 0 & -16.944 & -101.697 & 261.581 & -74.514 & -12.433 & -6.704 \\
Repur & -24.127 & -27.326 & 0 & -135.984 & 84.426 & 5.691 & -11.817 & -7.58 \\
PDX & -23.608 & -26.835 & -16.382 & 0 & 138.82 & -164.631 & -27.738 & -30.357 \\
GDSC & -24.127 & -31.11 & -16.382 & -106.653 & 0 & -27.786 & -31.101 & -23.436 \\
CCLE & -20.298 & -20.855 & -15.755 & -22.845 & -86.559 & 0 & -0.997 & -11.732 \\
SIDER & -26.89 & -29.29 & -22.555 & -70.148 & -54.475 & -58.301 & 0 & -5.32 \\
OFFSIDE & -25.851 & -30.763 & -22.555 & -81.109 & 10.041 & -213.457 & 8.225 & 0 \\
\hline
\end{tabular}
\end{table}


\subsubsection{LinReg}
\begin{table}[h!]
\centering
\begin{tabular}{l|*{8}{c}}
\hline
Datasets & STITCH & Drugbank & Repur & PDX & GDSC & CCLE & SID

In [49]:
for n in range(len(classification_models)):
    for i in range(len(task_groups)):
        portion = 1/len(task_groups[i])
        X_train = []
        y_train = []
        transfer_scores = []
        for task in task_groups[i]:
            train_data = np.load(f'{task}.npy')
            train_size = (500*0.67*portion)/len(train_data)
            X_train_task = train_data[:, :-1]
            y_train_task = train_data[:, -1:].flatten()
            _, X_train_task, _, y_train_task = train_test_split(X_train_task, y_train_task, test_size=train_size, random_state=30)
            X_train.extend(X_train_task)
            y_train.extend(y_train_task)
        for j in range(len(task_groups)):
            if i == j:
                transfer_scores.append(0)
            else:
                portion = 1/len(task_groups[j])
                X_test = []
                y_test = []
                for task in task_groups[i]:
                    test_data = np.load(f'{task}.npy')
                    test_size = (500*0.33*portion)/len(test_data)
                    X_test_task = test_data[:, :-1]
                    y_test_task = test_data[:, -1:].flatten()
                    _, X_test_task, _, y_test_task = train_test_split(X_test_task, y_test_task, test_size=train_size, random_state=30)
                    X_test.extend(X_test_task)
                    y_test.extend(y_test_task)
                if j == 1:
                    reg = regression_models[n]
                    sccs = []
                    for _ in range(10):
                        reg.fit(X_train, y_train)
                        y_pred = reg.predict(X_test)
                        scc, _ = spearmanr(y_test, y_pred)
                        sccs.append(scc)
                    transfer_score = get_transfer_score(original_performance[model_names[n]][task], np.mean(sccs))
                    transfer_scores.append(round(transfer_score, 3))
                else:
                    clf = classification_models[n]
                    aurocs = []
                    for _ in range(10):
                        clf.fit(X_train, np.round(y_train))
                        y_pred = clf.predict(X_test)
                        auroc = roc_auc_score(np.sign(y_test), y_pred)
                        aurocs.append(auroc)
                    transfer_score = get_transfer_score(original_performance[model_names[n]][task], np.mean(aurocs))
                    transfer_scores.append(round(transfer_score, 3))
        print_task_tranfer_table_row(n, i, transfer_scores)

\subsubsection{KNN}
\begin{table}[h!]
\centering
\begin{tabular}{l|ccc}
\hline
Tasks & Drug Targets & Drug Response & Drug Side Effects \\
Drug Targets & 0 & -47.474 & -10.053 \\
Drug Response & 605.869 & 0 & 605.869 \\
Drug Side Effects & 17.101 & 21.316 & 0 \\
\hline
\end{tabular}
\end{table}



\subsubsection{LinReg}
\begin{table}[h!]
\centering
\begin{tabular}{l|ccc}
\hline
Tasks & Drug Targets & Drug Response & Drug Side Effects \\
Drug Targets & 0 & -64.854 & 9.104 \\
Drug Response & 274.14 & 0 & 274.14 \\
Drug Side Effects & 52.861 & -20.803 & 0 \\
\hline
\end{tabular}
\end{table}



\subsubsection{DT}
\begin{table}[h!]
\centering
\begin{tabular}{l|ccc}
\hline
Tasks & Drug Targets & Drug Response & Drug Side Effects \\
Drug Targets & 0 & -37.739 & 19.596 \\
Drug Response & 799.749 & 0 & 799.749 \\
Drug Side Effects & 86.916 & 86.916 & 0 \\
\hline
\end{tabular}
\end{table}



\subsubsection{RF}
\begin{table}[h!]
\centering
\begin{tabular}{l|ccc}
\hline
Tasks & Drug Targets & Drug

In [48]:
def print_task_tranfer_table_row(n, i, scores):
    tasks = ["Drug Targets", "Drug Response", "Drug Side Effects"]
    if i == 0:
        print(f"\\subsubsection{{{model_names[n]}}}")
        print("\\begin{table}[h!]")
        print("\\centering")
        print("\\begin{tabular}{l|ccc}")
        print("\\hline")
        print("Tasks & Drug Targets & Drug Response & Drug Side Effects \\\\")
    print("{} & {} & {} & {} \\\\".format(tasks[i], *scores))
    if i == 2:
        print("\\hline")
        print("\\end{tabular}")
        print("\\end{table}")
        print("\n")

In [50]:
def print_dataset_tranfer_table_row(n, i, scores):
    tasks = ["Drug Targets", "Drug Response", "Drug Side Effects"]
    if i == 0:
        print(f"\\subsubsection{{{model_names[n]}}}")
        print("\\begin{table}[h!]")
        print("\\centering")
        print("\\begin{tabular}{l|*{8}{c}}")
        print("\\hline")
        print("Datasets & {} & {} & {} & {} & {} & {} & {} & {} \\\\".format(*file_names))
    print("{} & {} & {} & {} & {} & {} & {} & {} & {} \\\\".format(file_names[i], *scores))
    if i == 7:
        print("\\hline")
        print("\\end{tabular}")
        print("\\end{table}")
        print("\n")