In [1]:
#Preprocessing Imports
import numpy as np
from sklearn.model_selection import cross_validate
from sklearn.metrics import roc_auc_score, average_precision_score, mean_squared_error, make_scorer
from scipy.stats import spearmanr

In [2]:
#Model Imports
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVC, SVR

In [3]:
def get_formatted_regression_scores(scores):
    """Format cross-validated regression scores as LaTeX-ready table cells.

    Parameters
    ----------
    scores : mapping
        Must provide array-likes under the keys ``'test_spearman'`` and
        ``'test_mean_squared_error'``.

    Returns
    -------
    tuple of (str, str)
        ``(scc, mse)``; each string is the mean and the standard deviation
        (``np.std``), both rounded to 3 decimals and separated by the LaTeX
        plus-minus macro. The MSE mean is reported as an absolute value.
    """
    def _mean_pm_std(values, absolute=False):
        # Round first, then (optionally) drop the sign, exactly in that order.
        mu = round(np.mean(values), 3)
        std = round(np.std(values), 3)
        if absolute:
            mu = np.abs(mu)
        # The backslash is escaped explicitly: a bare "\p" is an invalid
        # escape sequence and triggers a SyntaxWarning on recent Pythons.
        return f'{mu} \\pm {std}'

    scc = _mean_pm_std(scores['test_spearman'])
    # The caller in this notebook scores with 'neg_mean_squared_error', so
    # the mean arrives negated; abs() restores the conventional positive MSE.
    mse = _mean_pm_std(scores['test_mean_squared_error'], absolute=True)
    return scc, mse

In [4]:
def spearman_score(y_true, y_pred):
    """Return the magnitude of the Spearman rank correlation.

    The correlation between ``y_true`` and ``y_pred`` is computed with
    ``scipy.stats.spearmanr``; its sign is discarded, so a perfectly
    inverted ranking scores the same as a perfectly matching one.
    """
    # Element 0 of the result is the correlation; the p-value is not needed.
    rho = spearmanr(y_true, y_pred)[0]
    return np.abs(rho)

def run_regression_models(X, y, cv=3, random_state=42):
    """Cross-validate a fixed panel of regressors and format their scores.

    Parameters
    ----------
    X, y
        Features and targets, passed unchanged to ``cross_validate``.
    cv : int, default 3
        Forwarded to ``cross_validate`` as ``cv``.
    random_state : int or None, default 42
        Seed given to the decision-tree, random-forest and gradient-boosting
        regressors so repeated runs produce the same table. Pass ``None`` to
        get the previous unseeded behaviour.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(sccs, mses)`` with one formatted entry per model, in this order:
        k-NN, linear regression, decision tree, random forest, gradient
        boosting, SVR.
    """
    models = [
        KNeighborsRegressor(n_neighbors=3),
        LinearRegression(),
        DecisionTreeRegressor(max_leaf_nodes=100, random_state=random_state),
        RandomForestRegressor(n_estimators=100, max_depth=10, random_state=random_state),
        GradientBoostingRegressor(n_estimators=100, learning_rate=0.1, max_depth=1,
                                  random_state=random_state),
        SVR(),
    ]
    # The scoring spec is identical for every model, so build it once.
    scoring = {
        'spearman': make_scorer(spearman_score),
        'mean_squared_error': 'neg_mean_squared_error',
    }

    sccs = []
    mses = []
    for model in models:
        scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
        scc, mse = get_formatted_regression_scores(scores)
        sccs.append(scc)
        mses.append(mse)
    return sccs, mses

In [5]:
def get_formatted_classification_scores(scores):
    """Format cross-validated classification scores as LaTeX-ready table cells.

    Parameters
    ----------
    scores : mapping
        Must provide array-likes under the keys ``'test_roc_auc'`` and
        ``'test_average_precision'``.

    Returns
    -------
    tuple of (str, str)
        ``(auroc, auprc)``; each string is the mean and the standard
        deviation (``np.std``), both rounded to 3 decimals and separated by
        the LaTeX plus-minus macro.
    """
    def _mean_pm_std(values):
        mu = round(np.mean(values), 3)
        std = round(np.std(values), 3)
        # The backslash is escaped explicitly: a bare "\p" is an invalid
        # escape sequence and triggers a SyntaxWarning on recent Pythons.
        return f'{mu} \\pm {std}'

    auroc = _mean_pm_std(scores['test_roc_auc'])
    auprc = _mean_pm_std(scores['test_average_precision'])
    return auroc, auprc

In [6]:
def run_classification_models(X, y, cv=3, random_state=42):
    """Cross-validate a fixed panel of models on a classification task.

    Parameters
    ----------
    X, y
        Features and labels, passed unchanged to ``cross_validate``.
    cv : int, default 3
        Forwarded to ``cross_validate`` as ``cv``.
    random_state : int or None, default 42
        Seed given to the decision-tree, random-forest and gradient-boosting
        classifiers so repeated runs produce the same table. Pass ``None`` to
        get the previous unseeded behaviour.

    Returns
    -------
    tuple of (list of str, list of str)
        ``(aurocs, auprcs)`` with one formatted entry per model, in this
        order: k-NN, linear regression, logistic regression, Gaussian naive
        Bayes, decision tree, random forest, gradient boosting, SVC.
    """
    models = [
        KNeighborsClassifier(n_neighbors=3),
        # A plain regressor is part of the panel; it is scored with the same
        # ranking metrics as the classifiers.
        LinearRegression(),
        LogisticRegression(),
        GaussianNB(),
        DecisionTreeClassifier(max_leaf_nodes=100, random_state=random_state),
        RandomForestClassifier(n_estimators=100, max_depth=10, random_state=random_state),
        GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=1,
                                   random_state=random_state),
        SVC(),
    ]
    scoring = ('roc_auc', 'average_precision')

    aurocs = []
    auprcs = []
    for model in models:
        scores = cross_validate(model, X, y, cv=cv, scoring=scoring)
        auroc, auprc = get_formatted_classification_scores(scores)
        aurocs.append(auroc)
        auprcs.append(auprc)
    return aurocs, auprcs

In [7]:
def print_table_row(n, name1, metric1, name2, metric2):
    """Print two ``&``-separated table rows, one per metric.

    Each row starts with the metric name followed by its values. When
    ``n < 3`` the row takes six values and carries literal ``N/A`` cells in
    the fourth and fifth columns; otherwise the row takes eight values.
    """
    if n < 3:
        template = "{} & {} & {} & N/A & N/A & {} & {} & {} & {}"
    else:
        template = "{} & {} & {} & {} & {} & {} & {} & {} & {}"

    for label, values in ((name1, metric1), (name2, metric2)):
        print(template.format(label, *values))

In [8]:
from sklearn.model_selection import train_test_split

filenames = ["ccle", "gdsc", "pdx", "offside", "sider", "drugbank", "repurposing_hub", "stitch"] 

SAMPLE_SIZE = 500          # rows evaluated per dataset
N_REGRESSION_DATASETS = 3  # the first entries of `filenames` go through the regression panel

for i, filename in enumerate(filenames):
    print(filename)
    data = np.load(f'{filename}.npy')
    # The last column is used as the target, everything before it as features.
    X = data[:, :-1]
    y = data[:, -1]
    if len(data) > SAMPLE_SIZE:
        # An absolute integer test_size yields exactly SAMPLE_SIZE rows; the
        # fractional form (500 / len(data)) can round up to 501 and is
        # rejected outright when the dataset has 500 rows or fewer.
        _, X, _, y = train_test_split(X, y, test_size=SAMPLE_SIZE, random_state=42)
    # Otherwise the dataset is already small enough: use every row.
    if i < N_REGRESSION_DATASETS:
        sccs, mses = run_regression_models(X, y)
        print_table_row(i, "SCC", sccs, "MSE", mses)
    else:
        aurocs, auprcs = run_classification_models(X, y)
        print_table_row(i, "AUROC", aurocs, "AUPRC", auprcs)
    print("\n\n")

ccle
SCC & 0.104 \pm 0.022 & 0.177 \pm 0.051 & N/A & N/A & 0.069 \pm 0.043 & 0.047 \pm 0.035 & 0.077 \pm 0.05 & 0.148 \pm 0.05
MSE & 1.432 \pm 0.199 & 1.102 \pm 0.092 & N/A & N/A & 2.079 \pm 0.129 & 1.178 \pm 0.117 & 1.093 \pm 0.135 & 1.086 \pm 0.166



gdsc
SCC & 0.058 \pm 0.04 & 0.061 \pm 0.036 & N/A & N/A & 0.039 \pm 0.018 & 0.092 \pm 0.013 & 0.053 \pm 0.038 & 0.091 \pm 0.011
MSE & 1.761 \pm 0.37 & 1.441 \pm 0.182 & N/A & N/A & 2.721 \pm 0.319 & 1.491 \pm 0.187 & 1.324 \pm 0.208 & 1.31 \pm 0.2



pdx
SCC & 0.244 \pm 0.073 & 0.231 \pm 0.083 & N/A & N/A & 0.273 \pm 0.049 & 0.271 \pm 0.042 & 0.276 \pm 0.054 & 0.266 \pm 0.027
MSE & 1.049 \pm 0.138 & 1.152 \pm 0.072 & N/A & N/A & 1.559 \pm 0.201 & 1.04 \pm 0.045 & 0.958 \pm 0.131 & 0.989 \pm 0.164



offside
AUROC & 0.544 \pm 0.042 & 0.525 \pm 0.072 & 0.501 \pm 0.049 & 0.584 \pm 0.075 & 0.545 \pm 0.017 & 0.591 \pm 0.069 & 0.545 \pm 0.044 & 0.59 \pm 0.061
AUPRC & 0.328 \pm 0.031 & 0.369 \pm 0.058 & 0.343 \pm 0.037 & 0.419 \pm 0.09 & 0.316

In [24]:
def print_auroc(y_true, y_score):
    """Print the ROC AUC of ``y_score`` against ``y_true``, rounded to 3 decimals."""
    rounded = round(roc_auc_score(y_true, y_score), 3)
    print(f"AUROC: {rounded}")
    
def print_scc(y_true, y_pred):
    """Print the absolute Spearman correlation of predictions vs. targets.

    The coefficient is rounded to 3 decimals before its sign is dropped.
    """
    rho, _pvalue = spearmanr(y_true, y_pred)
    magnitude = np.abs(round(rho, 3))
    print(f"SCC: {magnitude}")

In [25]:
from sklearn.model_selection import train_test_split

filenames = ["ccle", "gdsc", "pdx", "offside", "sider", "drugbank", "repurposing_hub", "stitch"] 

TRAIN_ROWS = 500 * 0.67    # target number of training rows per dataset
TEST_ROWS = 500 * 0.33     # target number of test rows per dataset
SPLIT_SEED = 30
N_REGRESSION_DATASETS = 3  # pairs touching one of the first entries are scored with SCC


def _load_subsample(name, n_rows, seed):
    """Load ``<name>.npy`` and return a random ``(X, y)`` subsample.

    The last column is used as the target and the remaining columns as
    features. The subsample is the "test" side of ``train_test_split`` with
    ``test_size = n_rows / len(data)``.
    """
    data = np.load(f'{name}.npy')
    X_all = data[:, :-1]
    y_all = data[:, -1]
    _, X_sub, _, y_sub = train_test_split(
        X_all, y_all, test_size=n_rows / len(data), random_state=seed
    )
    return X_sub, y_sub


# The test subsample of a dataset does not depend on which dataset is used for
# training (same file, same fraction, same seed), so compute each one once
# instead of reloading and re-splitting it for every training dataset.
test_splits = {name: _load_subsample(name, TEST_ROWS, SPLIT_SEED) for name in filenames}

for i, train_name in enumerate(filenames):
    print("Training Data: ", train_name)
    X_train, y_train = _load_subsample(train_name, TRAIN_ROWS, SPLIT_SEED)

    # The fit only depends on the training dataset, so do it once per outer
    # iteration rather than once per (train, test) pair.
    clf = LinearRegression()
    clf.fit(X_train, y_train)

    for j, test_name in enumerate(filenames):
        if j == i:
            continue
        print("Testing Data: ", test_name)
        X_test, y_test = test_splits[test_name]
        y_pred = clf.predict(X_test)
        if i < N_REGRESSION_DATASETS or j < N_REGRESSION_DATASETS:
            print_scc(y_test, y_pred)
        else:
            print_auroc(y_test, y_pred)

    print("\n\n")

Training Data:  ccle
Testing Data:  gdsc
SCC: 0.056
Testing Data:  pdx
SCC: 0.005
Testing Data:  offside
SCC: 0.084
Testing Data:  sider
SCC: 0.109
Testing Data:  drugbank
SCC: 0.044
Testing Data:  repurposing_hub
SCC: 0.041
Testing Data:  stitch
SCC: 0.098



Training Data:  gdsc
Testing Data:  ccle
SCC: 0.044
Testing Data:  pdx
SCC: 0.134
Testing Data:  offside
SCC: 0.022
Testing Data:  sider
SCC: 0.142
Testing Data:  drugbank
SCC: 0.117
Testing Data:  repurposing_hub
SCC: 0.015
Testing Data:  stitch
SCC: 0.05



Training Data:  pdx
Testing Data:  ccle
SCC: 0.037
Testing Data:  gdsc
SCC: 0.141
Testing Data:  offside
SCC: 0.01
Testing Data:  sider
SCC: 0.043
Testing Data:  drugbank
SCC: 0.125
Testing Data:  repurposing_hub
SCC: 0.059
Testing Data:  stitch
SCC: 0.107



Training Data:  offside
Testing Data:  ccle
SCC: 0.106
Testing Data:  gdsc
SCC: 0.02
Testing Data:  pdx
SCC: 0.065
Testing Data:  sider
AUROC: 0.678
Testing Data:  drugbank
AUROC: 0.382
Testing Data:  repurposing_hub
AU