In [1]:
import scipy as sp
from scipy import stats
import numpy as np 
import pandas as pd
from FileExporter import create_download_link
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
from msresist.mining_bioreplicates import MergeDfbyMean, AvsBacrossCond, AvsBvsCacrossCond
from msresist.plsr import FilteringOutPeptides, MeasuredVsPredicted_LOOCVplot
from msresist.comp_estimator import MyOwnKMEANS
from sklearn.cross_decomposition import PLSRegression
from sklearn.pipeline import Pipeline

In [2]:
# Biological replicates. Note that C already contains TR1 and TR2
# (ask Jacq how she merged them).
A = pd.read_csv('./msresist/data/20180817_JG_AM_TMT10plex_R1_psms.csv', header=0)
B = pd.read_csv('./msresist/data/20190214_JG_AM_PC9_AXL_TMT10_AC28_R2_PSMs.csv', header=0)
C = pd.read_csv('./msresist/data/CombinedBR3_TR1&2.csv', header=0)


# Cell viability: column 2 of ydata.csv, restricted to the first 10 rows.
Y = np.array(pd.read_csv('./msresist/data/ydata.csv', header=0))
Y_cv = Y[:10, 2]

# Data: columns 2..11 of every replicate; `t` holds the column labels of A's slice.
a, b, c = A.iloc[:, 2:12], B.iloc[:, 2:12], C.iloc[:, 2:12]
t = list(a.columns)

# Peptide sequences (first column of each replicate).
Aseqs, Bseqs, Cseqs = A.iloc[:, 0], B.iloc[:, 0], C.iloc[:, 0]

# Protein names: keep only the text before the first "OS" in column 1 so that
# the same protein gets the same description in every replicate.
# NOTE(review): presumably column 1 is 'Master Protein Descriptions' — confirm.
Anames = [desc.split("OS")[0] for desc in A.iloc[:, 1]]
Bnames = [desc.split("OS")[0] for desc in B.iloc[:, 1]]
Cnames = [desc.split("OS")[0] for desc in C.iloc[:, 1]]

A['Master Protein Descriptions'] = Anames
B['Master Protein Descriptions'] = Bnames
C['Master Protein Descriptions'] = Cnames

## Merging Biological Replicates

There are overlapping peptides across all 3 BRs, e.g. gSTAENAEyLR-1, qNVPVINITyDSTPEDVk, gSHQISLDNPDyQQDFFPk, ... The same proteins have different descriptions, which is why I'm renaming them with `.split("OS")[0]`.

In [3]:
# A[A["Master Protein Descriptions"].str.contains("UFO")]

In [4]:
# B[B["Master Protein Descriptions"].str.contains("UFO")]

In [5]:
# C[C["Master Protein Descriptions"].str.contains("UFO")]

In [6]:
# ABC_avg = MergeDfbyMean(A,B,C,t)

Note that the shape of `ABC_avg` is 994 × *10*, meaning that the first two columns are not really considered when trying to use them with pandas.

## Variability between overlapping peptides across Biological Replicates

In [7]:
# AvsB = AvsBacrossCond(A,B, t)
# plt.savefig("ABrawAcrossCond_NoOL.png")

In [8]:
# AvsC = AvsBacrossCond(A,C, t)
# plt.savefig("ACrawAcrossCond_NoOL.png")

In [9]:
# BvsC = AvsBacrossCond(B,C, t)
# plt.savefig("BCrawAcrossCond_NoOL.png")

In [10]:
# AvsBvsC = AvsBvsCacrossCond(A,B,C,t)
# plt.savefig("ABCrawAcrossCond_NoOL.png")

## Looking within k-means clusters

The loadings plot previously generated (GridSearchCV) shows three clusters that correlate with cell viability along PC1 but do not correlate with PC2.

In [11]:
# NOTE(review): ProtNames, peptide_phosphosite and A_F are not assigned in any
# cell above; the recorded output of this cell is a NameError. They must be
# defined before this cell can run on a fresh kernel.
kmeans = MyOwnKMEANS(5, ProtNames, peptide_phosphosite)
kmeans = kmeans.fit(A_F, Y=None)

# Keep both the transformed data and the per-cluster membership for the cells below.
clustermembers = kmeans.ClusterMembers(A_F)
centers = kmeans.transform(A_F)

NameError: name 'ProtNames' is not defined

In [None]:
# Plot one line per row of the transposed `centers` array, with the
# condition labels in `t` as x tick labels.
centersT = np.transpose(centers)
for i in range(centersT.shape[0]):
    plt.plot(centersT[i, :], label="cluster: " + str(i + 1))
plt.xticks(np.arange(centersT.shape[1]), t, rotation=70)
plt.legend(loc=0)
# Save before show(): once the figure has been shown, a later savefig()
# typically writes a blank image.
plt.savefig("cluster")
plt.show()

display(pd.DataFrame(centersT))

# Members per cluster as columns; wrapping in pd.Series lets clusters of
# different sizes share one frame (shorter columns are padded with NaN).
# NOTE(review): the download label says "4cl" while the estimator above is
# constructed with 5 — confirm which is intended.
CM_df = pd.DataFrame({k: pd.Series(v) for k, v in clustermembers.items()})
create_download_link(CM_df, "ClusterMembers_4cl_2Comp_MergedBRs_2xFilter")

In [None]:
# Two-step pipeline: the project's MyOwnKMEANS step feeds a 2-component PLSR.
estimators = [
    ('kmeans', MyOwnKMEANS(5, ProtNames, peptide_phosphosite)),
    ('plsr', PLSRegression(n_components=2)),
]
pipe = Pipeline(steps=estimators)

In [None]:
# Fit the whole pipeline on (A_F, Y_cv) and keep the X and Y scores it returns.
X_scores, Y_scores = pipe.fit_transform(A_F, Y_cv)

# Split the first two components into separate 1-D arrays: transposing the
# first two columns yields one row per component, which unpacks directly.
PC1_scores, PC2_scores = X_scores[:, :2].T
PC1_xload, PC2_xload = pipe.named_steps.plsr.x_loadings_[:, :2].T
PC1_yload, PC2_yload = pipe.named_steps.plsr.y_loadings_[:, :2].T

In [None]:
# Measured vs. predicted cell viability, drawn by the project helper
# (judging by its name it uses leave-one-out cross-validation — confirm).
fig, axs = plt.subplots(1, 1, figsize=(5, 5))
MeasuredVsPredicted_LOOCVplot(A_F, Y_cv, pipe, fig, "none", axs)
plt.title("Correlation Measured vs Predicted")
plt.xlabel("Measured Cell Viability")
plt.ylabel("Predicted Cell Viability")

# NOTE(review): R and the p-value are hard-coded strings, not computed here;
# they go stale if the data or the model change.
textstr = '\n'.join(['R: 0.79', 'p-value: 0.006'])
props = dict(boxstyle='square', facecolor="white", alpha=0.5)
plt.text(0.5, 15, textstr, bbox=props)

# Save only after the annotation box is drawn so the PDF matches what is
# displayed, and before show() so the figure is still current.
plt.savefig('Measured_Predict_5cl2co_pipe.pdf')
plt.show()

In [None]:
def _finish_panel(ax, title, xlim, ylim):
    """Apply title, PC1/PC2 axis labels, dashed zero lines and fixed limits to ``ax``."""
    ax.set_title(title)
    ax.set_xlabel('PC1')
    ax.set_ylabel('PC2')
    ax.axhline(y=0, color='0.25', linestyle='--')
    ax.axvline(x=0, color='0.25', linestyle='--')
    ax.set_xlim(xlim)
    ax.set_ylim(ylim)


# Left panel: PLSR scores labelled with the entries of `t`.
# Right panel: x-loadings numbered 1-5 plus the y-loadings.
fig, axs = plt.subplots(1, 2, figsize=(12, 6))
# Alternative palette: cm.rainbow(np.linspace(0, 1, 5))
colors_ = ["blue", "orange", "green", "red", "purple"]

# --- scores ---
axs[0].scatter(PC1_scores, PC2_scores)
for idx, condition in enumerate(t):
    axs[0].annotate(condition, (PC1_scores[idx], PC2_scores[idx]))
_finish_panel(axs[0], 'PLSR Model Scores', [-5, 5], [-1.75, 1.75])

# --- loadings ---
for idx, cluster_label in enumerate([str(k) for k in range(1, 6)]):
    axs[1].annotate(cluster_label, (PC1_xload[idx], PC2_xload[idx]))
# One colour per x-loading point, taken in order from `colors_`.
axs[1].scatter(PC1_xload, PC2_xload, c=np.arange(5), cmap=colors.ListedColormap(colors_))
axs[1].scatter(PC1_yload, PC2_yload, color='#000000', marker='D', label='Cell Viability')
axs[1].legend(loc=4)
_finish_panel(axs[1], 'PLSR Model Loadings (Averaged Clusters)', [-0.65, 0.65], [-1.1, 1.1])

plt.savefig('scores_loadings.pdf')
plt.show()

In [None]:
# Cluster membership from the pipeline's k-means step.
ClusterMembers = pipe.named_steps.kmeans.ClusterMembers(A_F)

# Total number of members summed over all clusters.
count = sum(map(len, ClusterMembers.values()))

# One column per cluster; pd.Series lets columns of different lengths share a frame.
CM_df = pd.DataFrame({
    cluster: pd.Series(members) for cluster, members in ClusterMembers.items()
})
create_download_link(CM_df, "ClusterMembers_5cl_2Comp_A_2xFilter")