# Phosphoproteomic aberrations associated with clinical signatures: Predicting Disease Stage

In [6]:
import pickle
import numpy as np
import pandas as pd
from scipy.stats import zscore
from sklearn.linear_model import LogisticRegressionCV   
from msresist.figures.figureM1 import filter_NaNpeptides
from msresist.figures.figureM1 import TransformCPTACdataForRegression
from msresist.logistic_regression import plotClusterCoefficients, plotPredictionProbabilities, plotConfusionMatrix, plotROC
from msresist.figures.figureM1 import SwapPatientIDs, AddTumorPerPatient

## Import and Preprocess Data

In [7]:
# Read the already-preprocessed CPTAC table from disk and discard its first CSV column.
# (Reference alternative: X = preprocessing(CPTAC=True, log2T=True))
cptac_csv = "msresist/data/MS/CPTAC/CPTAC-preprocessedMotfis.csv"
X = pd.read_csv(cptac_csv).iloc[:, 1:]

# filter peptides with excessive missingness, then renumber the rows 0..n-1
X_f = filter_NaNpeptides(X, cut=0.1)
X_f.index = np.arange(len(X_f))

# Split the filtered table by dtype: object columns stay as-is,
# float64 columns are transposed so that each original column becomes a row.
i_f = X_f.select_dtypes(include=['object'])
d_f = X_f.select_dtypes(include=['float64']).T

X_f.head()

Unnamed: 0,Sequence,Protein,Gene,Position,C3N.01799,C3N.01799.N,C3L.01890,C3L.01890.N,C3N.00572,C3N.00572.N,C3N.02423,C3N.02423.N,C3N.02729,C3N.02729.N,C3L.00263,...,C3N.00738,C3N.00738.N,C3N.00959,C3N.00959.N,C3N.02003,C3N.02003.N,C3N.02379.N,C3N.00175,C3N.00175.N,C3N.01823,C3N.01823.N,C3L.02549,C3L.02549.N,C3L.02365,C3L.02365.N
0,AAAAAsQQGSA,NP_001308122.1,TBL1XR1,S119-p,,,,,,,,,,,,...,0.071547,-0.600298,0.172339,-1.761406,0.421872,-0.372907,-0.979491,-0.083036,-0.361962,1.275506,-0.568015,0.25423,-0.378522,0.987749,-0.189486
1,AAAAGsASPRS,NP_064520.2,WRNIP1,S151-p,,,,,,,,,,,,...,,,,,,,,,,,,,,,
2,AAAAGsGPSPP,NP_060895.1,PI4K2A,S44-p,,,,,,,,,0.83783,-0.993838,0.317079,...,0.031572,-0.554645,-0.03364,-1.268642,0.26679,-0.0633,-0.6475,,,,,,,,
3,AAAAGsGPsPP,NP_060895.1,PI4K2A,S44-p;S47-p,0.477744,-1.231926,-1.600599,-1.349858,0.388473,-0.649326,0.394909,-1.041745,,,,...,,,,,,,,,,,,,,,
4,AAAAPsPGSAR,NP_036253.2,CDC42EP4,S292-p,,,,,,,,,,,,...,,,,,,,,,,,,,,,


## Load Clustering Model from Figure 2

In [8]:
ncl = 15

# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only open model files that come from a trusted source.
with open('CPTACmodel_PAM250_W0075_15CL', 'rb') as model_file:
    unpickled = pickle.load(model_file)

# The clustering model is the first element of the pickled object.
MSC = unpickled[0]

### Logistic Regression

In [26]:
# Clinical annotations: only the case identifier and the pathological stage are kept.
cd = pd.read_csv("msresist/data/MS/CPTAC/CPTAC_Clinical_Data_May2019.csv")
ts = cd[["case_id", "tumor_stage_pathological"]]

# Lookup built from the first two columns of IDs.csv (first column -> second column).
IDict = pd.read_csv("msresist/data/MS/CPTAC/IDs.csv", header=0)
IDict_ = {key: value for key, value in zip(IDict.iloc[:, 0], IDict.iloc[:, 1])}

# Swap identifiers with the project helper, drop the original case_id column
# and keep Patient_ID + stage in that order.
ts = SwapPatientIDs(ts, IDict_)
ts = ts.drop(columns="case_id")
ts = ts[["Patient_ID", "tumor_stage_pathological"]]

# AddTumorPerPatient is a project helper (presumably adds one row per sample of
# each patient -- confirm against its definition); rows are then ordered by patient.
ts = AddTumorPerPatient(ts).sort_values(by="Patient_ID")

In [43]:
# Transform the transposed float table with the clustering model and label each
# row with the sample column names of X (everything after the first four columns).
centers = MSC.transform(d_f)
centers["Patient_ID"] = X.columns[4:]

# Order by patient and remove the samples that are excluded from this analysis.
excluded_samples = ['C3N.02379.1', 'C3N.02587', 'C3N.02587.N']
centers = centers.sort_values(by="Patient_ID").set_index("Patient_ID")
centers = centers.drop(excluded_samples).reset_index()
assert list(ts["Patient_ID"]) == list(centers["Patient_ID"]), "Patients don't match"

# Integer code for each pathological stage label. Replacement is by exact value,
# so any label not listed here is left unchanged in `ts`.
stage_codes = {
    "Stage I": 0,
    "Stage IA": 1,
    "Stage IB": 2,
    "Stage IIA": 3,
    "Stage IIB": 4,
    "Stage III": 5,
    "Stage IIIA": 6,
    "Stage IV": 7,
}
for stage_label, stage_code in stage_codes.items():
    ts = ts.replace(stage_label, stage_code)

# Plain NumPy inputs for the classifiers: every column after Patient_ID as
# features, the second column of `ts` as the target.
# (DataFrame/Series alternative: centers.set_index("Patient_ID") and
#  ts.set_index("Patient_ID").iloc[:, 0])
x = np.array(centers.iloc[:, 1:])
y = np.array(ts.iloc[:, 1])

In [63]:
# NOTE(review): the recorded run of this cell failed during fitting with
# "ValueError: could not broadcast input array from shape (10,8,16) into shape (10)".
# This looks like the per-fold coefficient arrays having different class counts,
# which would happen if a stage class is too rare to appear in every training
# fold -- confirm by inspecting the class counts of `y` before relying on this cell.
cv_classifier = LogisticRegressionCV(cv=5, multi_class='multinomial')
lr = cv_classifier.fit(x, y)

# Predictions and accuracy are computed on the same data the model was fit on.
y_pred = lr.predict(x)
lr_score = lr.score(x, y)
print("LR prediction score: ", lr_score)

ValueError: could not broadcast input array from shape (10,8,16) into shape (10)

In [None]:
# NOTE(review): `plt` is not imported in the import cell at the top of this
# notebook -- confirm `matplotlib.pyplot` is imported as `plt` before this runs.
# Single-axes figure for the logistic regression coefficients.
fig, ax = plt.subplots(figsize=(8, 6))
plotClusterCoefficients(ax, lr)

In [None]:
# Per-sample prediction probabilities of the logistic regression model.
# The previous arguments `dd` / `yy` are never defined in this notebook and
# raise NameError on a fresh kernel; `x` / `y` are the data `lr` was fit on.
# NOTE(review): if plotPredictionProbabilities needs a DataFrame/Series rather
# than NumPy arrays, pass centers.set_index("Patient_ID") and
# ts.set_index("Patient_ID").iloc[:, 0] instead -- confirm against the helper.
fig, ax = plt.subplots(1, 1, figsize=(15, 7))
plotPredictionProbabilities(ax, lr, y_pred, x, y)

In [None]:
# Confusion matrix of the logistic regression model.
# The previous arguments `dd` / `yy` are never defined in this notebook and
# raise NameError on a fresh kernel; `x` / `y` are the data `lr` was fit on.
# NOTE(review): confirm plotConfusionMatrix accepts NumPy arrays for its data
# and label arguments.
fig, ax = plt.subplots(figsize=(5, 5))
plotConfusionMatrix(ax, lr, x, y)

### Support Vector Machine

In [None]:
from sklearn.svm import LinearSVC

# L1-penalised linear SVM; class label 1 is given a weight of 4.
# The previous inputs `dd` / `yy` are never defined in this notebook and raise
# NameError on a fresh kernel; `x` / `y` are the arrays built for the
# logistic regression above, so both models now see the same data.
clf = LinearSVC(penalty="l1", dual=False, class_weight={1:4}, C=2, random_state=100).fit(x, y)

# Predictions and accuracy are computed on the training data itself.
clf_pred = clf.predict(x)
print(clf_pred)

clf_score = clf.score(x, y)
print(clf_score)

In [None]:
# Single-axes figure showing the coefficients of the fitted SVC.
fig, ax = plt.subplots(figsize=(8, 6))
plotClusterCoefficients(ax, clf)
ax.set_title("SVC Cluster Coefficients")

In [None]:
# Cross-validated ROC for the SVC, using 10 folds.
# The previous data argument `d` is never defined in this notebook and raises
# NameError on a fresh kernel; `x` is the feature array `clf` is meant to use.
# NOTE(review): confirm plotROC accepts NumPy arrays (it may expect a DataFrame
# and a Series) and that it supports the multi-class stage labels in `y`.
fig, ax = plt.subplots(1, 1, figsize=(8, 6))
plotROC(ax, clf, x, y, cv_fold=10)

In [None]:
# Confusion matrix of the SVC.
# The previous arguments `dd` / `yy` are never defined in this notebook and
# raise NameError on a fresh kernel; `x` / `y` are the arrays built for the
# classifiers earlier in the notebook.
# NOTE(review): confirm plotConfusionMatrix accepts NumPy arrays for its data
# and label arguments.
fig, ax = plt.subplots(figsize=(5, 5))
plotConfusionMatrix(ax, clf, x, y)