In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import AgglomerativeClustering
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, mean_squared_error
from scipy.stats import spearmanr

In [2]:
def print_auroc_auprc(y_true, y_score):
    auroc = roc_auc_score(y_true, y_score)
    print(f"AUROC: {auroc}")
    auprc = average_precision_score(y_true, y_score)
    print(f"AUPRC: {auprc}")

In [3]:
def print_scc_mse(y_true, y_pred):
    scc, _ = spearmanr(y_true, y_pred)
    print(f"SCC: {scc}")
    mse = mean_squared_error(y_true, y_pred)
    print(f"MSE: {mse}")

In [4]:
#replace with path to data_InSummary
path = '../PM 1/data_InSummary/Drug_{}/{}/drug_embedding.csv'

ccle = pd.read_csv(path.format("response", "ccle"))
gdsc = pd.read_csv(path.format("response", "gdsc"))
pdx = pd.read_csv(path.format("response", "pdx"))
offside = pd.read_csv(path.format("se", "offside"))
sider = pd.read_csv(path.format("se", "sider"))
drugbank = pd.read_csv(path.format("target", "drugbank"))
repurposing_hub = pd.read_csv(path.format("target", "repurposing_hub"))
stitch = pd.read_csv(path.format("target", "stitch")).drop(['Unnamed: 0'], axis=1)

dataframes = [ccle, gdsc, pdx, offside, sider, drugbank, repurposing_hub, stitch]
dataframe_names = ["ccle", "gdsc", "pdx", "offside", "sider", "drugbank", "repurposing_hub", "stitch"]

In [5]:
X = []
y = []

for dataframe in dataframes:
    X_i = dataframe.to_numpy()
    clustering = AgglomerativeClustering(n_clusters=2).fit(X_i)
    y_i = clustering.labels_
    X.append(X_i)
    y.append(y_i)

# KNN

In [6]:
from sklearn.neighbors import KNeighborsClassifier

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    knn = KNeighborsClassifier(n_neighbors=3)

    knn.fit(X_train, y_train)

    y_pred = knn.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: 0.30151134457776363
MSE: 0.17777777777777778



gdsc :
SCC: 0.284067925525857
MSE: 0.1111111111111111



pdx :
SCC: 0.510181728984944
MSE: 0.14444444444444443



offside :
AUROC: 0.5517241379310345
AUPRC: 0.7011494252873564



sider :
AUROC: 0.5576923076923077
AUPRC: 0.735632183908046



drugbank :
AUROC: 0.5625
AUPRC: 0.35833333333333334



repurposing_hub :
AUROC: 0.6071428571428571
AUPRC: 0.3365079365079365



stitch :
AUROC: 0.976766995282194
AUPRC: 0.9765372864138617





# Linear Regression

In [7]:
from sklearn.linear_model import LinearRegression

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    reg = LinearRegression().fit(X_train, y_train)

    y_pred = reg.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: -0.03207699507752053
MSE: 0.9333220441134743



gdsc :
SCC: 0.06594113429359107
MSE: 2.254195397638453



pdx :
SCC: 0.5507337213275678
MSE: 0.11773535941689309



offside :
AUROC: 0.6800452232899944
AUPRC: 0.8053648841573303



sider :
AUROC: 0.7043269230769231
AUPRC: 0.8308596572449429



drugbank :
AUROC: 0.6963383838383839
AUPRC: 0.5240823667294415



repurposing_hub :
AUROC: 0.7011278195488722
AUPRC: 0.3971646820267113



stitch :
AUROC: 0.9767773684101649
AUPRC: 0.9851045822541062





# Logistic Regression

In [8]:
from sklearn.linear_model import LogisticRegression

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    clf = LogisticRegression(random_state=0).fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: 0.5740740740740741
MSE: 0.12222222222222222



gdsc :
SCC: 0.6499691477663601
MSE: 0.06666666666666667



pdx :
SCC: 0.29142448364039664
MSE: 0.18888888888888888



offside :
AUROC: 0.8103448275862069
AUPRC: 0.8472222222222222



sider :
AUROC: 0.8846153846153846
AUPRC: 0.9142857142857143



drugbank :
AUROC: 0.831439393939394
AUPRC: 0.679861111111111



repurposing_hub :
AUROC: 0.8862781954887218
AUPRC: 0.7535714285714286



stitch :
AUROC: 0.9263175854744254
AUPRC: 0.9243644199254194





# Naive Bayes

In [9]:
from sklearn.naive_bayes import GaussianNB

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    clf = GaussianNB().fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: 0.5349558922183011
MSE: 0.16666666666666666



gdsc :
SCC: 0.6980986836772425
MSE: 0.1



pdx :
SCC: 0.7241819989167364
MSE: 0.08888888888888889



offside :
AUROC: 0.8227812323346524
AUPRC: 0.8642736009044658



sider :
AUROC: 0.8257211538461539
AUPRC: 0.8833568738229756



drugbank :
AUROC: 0.8674242424242423
AUPRC: 0.615359477124183



repurposing_hub :
AUROC: 0.8430451127819549
AUPRC: 0.4336507936507936



stitch :
AUROC: 0.796051009797181
AUPRC: 0.8082230445218817





# Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    clf = DecisionTreeClassifier(random_state=0).fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: 0.29814239699997197
MSE: 0.2111111111111111



gdsc :
SCC: 0.5986550176340047
MSE: 0.07777777777777778



pdx :
SCC: 0.47158983314854086
MSE: 0.16666666666666666



offside :
AUROC: 0.6938948558507632
AUPRC: 0.7773508652094717



sider :
AUROC: 0.7493990384615383
AUPRC: 0.8325644841269841



drugbank :
AUROC: 0.7064393939393939
AUPRC: 0.4232758620689655



repurposing_hub :
AUROC: 0.8270676691729323
AUPRC: 0.4448979591836735



stitch :
AUROC: 0.9593230902072842
AUPRC: 0.9580384411426083





# Random Forest

In [11]:
from sklearn.ensemble import RandomForestClassifier

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    clf = RandomForestClassifier(max_depth=10, random_state=0).fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: 0.5345224838248488
MSE: 0.13333333333333333



gdsc :
SCC: 0.6499691477663601
MSE: 0.06666666666666667



pdx :
SCC: 0.5968163622088023
MSE: 0.12222222222222222



offside :
AUROC: 0.7594686263425664
AUPRC: 0.814799635701275



sider :
AUROC: 0.7998798076923077
AUPRC: 0.8606402207001522



drugbank :
AUROC: 0.7481060606060606
AUPRC: 0.5623263888888889



repurposing_hub :
AUROC: 0.8082706766917294
AUPRC: 0.5815295815295817



stitch :
AUROC: 0.9514231782536792
AUPRC: 0.9449851015564242





# Gradient Boosting Tree

In [12]:
from sklearn.ensemble import GradientBoostingClassifier

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=10, random_state=0).fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: 0.3218981532296071
MSE: 0.2



gdsc :
SCC: 0.7162311170195085
MSE: 0.05555555555555555



pdx :
SCC: 0.49922103888303265
MSE: 0.15555555555555556



offside :
AUROC: 0.710288298473714
AUPRC: 0.7866037423414473



sider :
AUROC: 0.7301682692307692
AUPRC: 0.821620696400626



drugbank :
AUROC: 0.7765151515151515
AUPRC: 0.5021505376344086



repurposing_hub :
AUROC: 0.8204887218045113
AUPRC: 0.4261904761904762



stitch :
AUROC: 0.9825031263951931
AUPRC: 0.9814223244345187





# SVM

In [13]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

for i in range(len(X)):
    
    X_train, X_test, y_train, y_test = train_test_split(X[i], y[i], test_size=0.3, random_state=42)
    
    clf = make_pipeline(StandardScaler(), SVC(kernel='poly', gamma='auto'))
    clf.fit(X_train, y_train)
    
    y_pred = clf.predict(X_test)
    
    print(dataframe_names[i], ":")
    if i < 3:
        print_scc_mse(y_test, y_pred)
    else:
        print_auroc_auprc(y_test, y_pred)
    print("\n\n")

ccle :
SCC: 0.3713906763541037
MSE: 0.16666666666666666



gdsc :
SCC: 0.4976433490174631
MSE: 0.08888888888888889



pdx :
SCC: 0.5166403906326748
MSE: 0.14444444444444443



offside :
AUROC: 0.5689655172413793
AUPRC: 0.7093023255813954



sider :
AUROC: 0.5576923076923077
AUPRC: 0.735632183908046



drugbank :
AUROC: 0.5625
AUPRC: 0.35833333333333334



repurposing_hub :
AUROC: 0.6071428571428571
AUPRC: 0.3365079365079365



stitch :
AUROC: 0.9639644390096143
AUPRC: 0.9585213148012643



