In [1]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.cluster import KMeans
from msresist.comp_estimator import ComHyperPar, MyOwnKMEANS
from msresist.plsr import FilteringOutPeptides, ClusterAverages, GridSearch_CV, MeasuredVsPredicted_LOOCVplot
import scipy as sp, numpy as np, pandas as pd, math
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import matplotlib.colors as colors
import matplotlib.cm as cm
import warnings
warnings.simplefilter("ignore")

In [2]:
X = np.array(pd.read_csv('./msresist/data/ms-initial.csv', header=0))
Y = np.array(pd.read_csv('./msresist/data/ydata.csv', header=0))
treatments = np.array(pd.read_csv('./msresist/data/ms-initial.csv', header=None))[0,2:]
peptide_phosphosite = X[:,0]
protein_description = X[:,1]

ProtNames = []
for item in protein_description:
    ProtName = item.split("OS")
    ProtName = ProtName[0]
    ProtNames.append(ProtName)
    
PC9 = X[:,2]
Erl = X[:,3]
R428 = X[:,4]
Erl_R428 = X[:,5]
Erl_HGF = X[:,6]
Erl_FGF = X[:,7]
Erl_IGF = X[:,8]
KO_Erl = X[:,9]
KO_R428 = X[:,10]
KO_Erl_R428 = X[:,11]

# Variables: X phosphopeptides 
X = np.concatenate([PC9,Erl,R428,Erl_R428,Erl_HGF,Erl_FGF,Erl_IGF,KO_Erl,KO_R428,KO_Erl_R428])
X = np.reshape(X,(10,300))
X_F = FilteringOutPeptides(X)

# Observations: Y cell viability  (average between BR 3 and 4 at 72h)
Y_cv = Y[:,2]
Y_cv = Y_cv[:10]

## PLSR erroneous high performance computed by GridSearchCV r2_score

I've used GridSearch to do a hyperparameter search in both k-means (n_clusters) and PLSR (n_components) first separately, and then using the composite estimator. In every case where the r2_scores of the PLSR model alone are calculated by fitting either the raw data (300:10), the filtered data (96:10), or the clustered data (5:10), we always obtain erroneously high PLSR training (always close to ~0.85) and test scores (always 1.0). The latter are always 0. This may suggest overfitting, but I wouldn't expect it to be the case when fitting the clustered data, where m < n. Specially, since our R2Y/Q2Y values in the notebook "Analysis_2estimators" look reasonably good. I've tried by replacing GridSearchCV's default 'r2_score' by 'explained_variance but I got the same results.

#### PLSR GridSearch with raw data (10:96)

In [3]:
# plsr = PLSRegression()
# parameters = {'n_components': np.arange(1, 16)}
# CVresults = GridSearch_CV(plsr, X, Y_cv, parameters, cv = X.shape[0], scoring='neg_mean_squared_error')
# std_scores = {'#Components': CVresults['param_n_components'], 'mean_test_scores': CVresults["mean_test_score"], 'mean_train_scores': CVresults["mean_train_score"]}
# CVresults_min = pd.DataFrame(data=std_scores)
# display(CVresults_min)
# display(CVresults)

#### PLSR GridSearch with Filtered matrix (96:10)

In [4]:
# parameters = {'n_components': np.arange(1, 16)}
# CVresults = GridSearch_CV(plsr, X_F, Y_cv, parameters, cv=X.shape[0], scoring='neg_mean_squared_error')
# std_scores = {'#Components': CVresults['param_n_components'], 'mean_test_scores': CVresults["mean_test_score"], 'mean_train_scores': CVresults["mean_train_score"]}
# CVresults_min = pd.DataFrame(data=std_scores)
# display(CVresults_min)
# display(CVresults)

#### K-means GridSearch with Filtered matrix (96:10)

GridsearchCV's scoring method on k-means seems to work.

In [5]:
# kmeans = KMeans(init="k-means++")
# parameters = {'n_clusters': np.arange(2, 16)}
# CVresults = GridSearch_CV(kmeans, X_F.T, None, parameters, cv=X_F.T.shape[0])
# std_scores = {'#Clusters': CVresults['param_n_clusters'], 'std_test_scores': CVresults["std_test_score"], 'std_train_scores': CVresults["std_train_score"]}
# CVresults_min = pd.DataFrame(data=std_scores)
# display(CVresults_min)


#### PLSR GridSearch fitting k-means cluster averages (5:10)

In [6]:
# kmeans_ = KMeans(n_clusters = 10)
# kmeans_.fit(X_F.T)
# centers = kmeans_.cluster_centers_.T
# parameters = {'n_components': np.arange(1, centers.shape[1] + 1)}
# CVresults = GridSearch_CV(plsr, centers, Y_cv, parameters, cv=centers.shape[0], scoring='neg_mean_squared_error')
# std_scores = {'#Components': CVresults['param_n_components'], 'mean_test_scores': CVresults["mean_test_score"], 'mean_train_scores': CVresults["mean_train_score"]}
# CVresults_min = pd.DataFrame(data=std_scores)
# display(CVresults_min)
# # display(CVresults)

In [7]:
CVresults_max, CVresults_min, best_params = ComHyperPar(X_F, Y_cv, ProtNames, peptide_phosphosite)
print(Y_cv)

[16.91133681 5.79722216 6.2415253879999995 3.1356010260000002 9.511757952
 12.90119992 6.860104729 1.786757304 1.4600741469999998 0.6989652670000001]


In [8]:
display(CVresults_min)
print(best_params)
raise SystemExit

Unnamed: 0,#Clusters,#Components,mean_test_scores,mean_train_scores
0,2,1,-8.401849,-5.920621e+00
1,2,2,-10.634269,-5.846315e+00
2,3,1,-10.149642,-7.233242e+00
3,3,2,-13.035513,-6.579188e+00
4,3,3,-14.871172,-6.520703e+00
5,4,1,-9.580767,-6.650636e+00
6,4,2,-12.731597,-6.188392e+00
7,4,3,-16.106592,-5.605427e+00
8,4,4,-17.329703,-4.712185e+00
9,5,1,-9.160055,-5.795202e+00


{'kmeans__n_clusters': 2, 'plsr__n_components': 1}


SystemExit: 

In [None]:
# MSE_test = np.abs(CVresults_min["mean_test_scores"][:54])
# bpar_idx = np.argsort(MSE_test)[:3]
# print(bpar_idx)

In [None]:
# MSE_test = np.abs(CVresults_min["mean_test_scores"][:54])
# MSE_training = np.abs(CVresults_min["mean_train_scores"][:54])

# range_ = np.arange(1,MSE_test.shape[0]+1,1)

# fig, axs = plt.subplots(1,1,figsize=(30,15))
# # plt.setp(axs, xticks=nComp)
# plt.bar(range_+0.15, MSE_test,width=0.3,align='center',label='Q2Y', color = "darkred")
# plt.bar(range_-0.15, MSE_training,width=0.3,align='center',label='R2Y', color = "black")
# plt.title("R2Y/Q2Y Cell Viability")
# plt.xlabel("Number of Components")
# plt.legend(loc=4)
# plt.show()

In [None]:
# plt.plot(MSE_training)
# plt.plot(MSE_test)

In [None]:
estimators = [('kmeans', MyOwnKMEANS(5, ProtNames, peptide_phosphosite)), ('plsr', PLSRegression(2))]
pipe = Pipeline(estimators)

In [None]:
X_scores, Y_scores = pipe.fit_transform(X_F,Y_cv)
PC1_scores, PC2_scores = X_scores[:, 0], X_scores[:, 1]
PC1_xload, PC2_xload = pipe.named_steps.plsr.x_loadings_[:, 0], pipe.named_steps.plsr.x_loadings_[:, 1]
PC1_yload, PC2_yload = pipe.named_steps.plsr.y_loadings_[:, 0], pipe.named_steps.plsr.y_loadings_[:, 1]

In [None]:
fig, axs = plt.subplots(1,1,figsize=(5,5))
MeasuredVsPredicted_LOOCVplot(X_F,Y_cv, pipe, fig, "none", axs)
plt.title("Correlation Measured vs Predicted")
plt.xlabel("Measured Cell Viability")
plt.ylabel("Predicted Cell Viability")
plt.savefig('Measured_Predict_5cl2co_pipe.pdf')
plt.show()

In [None]:
fig, axs = plt.subplots(1, 2,figsize=(12,6))
colors_ = cm.rainbow(np.linspace(0, 1, 5))

axs[0].scatter(PC1_scores,PC2_scores)
for j, txt in enumerate(treatments):
    axs[0].annotate(txt, (PC1_scores[j], PC2_scores[j]))
axs[0].set_title('PLSR Model Scores')
axs[0].set_xlabel('PC1')
axs[0].set_ylabel('PC2')
axs[0].axhline(y=0, color='0.25', linestyle='--')
axs[0].axvline(x=0, color='0.25', linestyle='--')
axs[0].set_xlim([-5, 5])
axs[0].set_ylim([-1.75, 1.75])

for i, txt in enumerate(["1", "2", "3", "4", "5"]):
    axs[1].annotate(txt, (PC1_xload[i], PC2_xload[i]))
axs[1].scatter(PC1_xload, PC2_xload, c=np.arange(5), cmap=colors.ListedColormap(colors_))
axs[1].scatter(PC1_yload, PC2_yload, color='#000000', marker='D', label='Cell Viability')
axs[1].legend(loc=4)
axs[1].set_title('PLSR Model Loadings (Averaged Clusters)')
axs[1].set_xlabel('PC1')
axs[1].set_ylabel('PC2')
axs[1].axhline(y=0, color='0.25', linestyle='--')
axs[1].axvline(x=0, color='0.25', linestyle='--')
axs[1].set_xlim([-0.65, 0.65])
axs[1].set_ylim([-1.1, 1.1])
plt.savefig('scores_loadings.pdf')
plt.show()

In [None]:
print(treatments)

In [None]:
ClusterMembers = pipe.named_steps.kmeans.ClusterMembers(X_F)
count = sum(len(v) for v in ClusterMembers.values())
df = pd.DataFrame(dict([ (k, pd.Series(v)) for k,v in ClusterMembers.items() ]))

In [None]:
from IPython.display import HTML
import base64

def create_download_link(df, title = "Download CSV file", filename = "ClusterMembers_5cl_2Comp_MSR1.csv"):  
    csv = df.to_csv()
    b64 = base64.b64encode(csv.encode())
    payload = b64.decode()
    html = '<a download="{filename}" href="data:text/csv;base64,{payload}" target="_blank">{title}</a>'
    html = html.format(payload=payload,title=title,filename=filename)
    return HTML(html)

create_download_link(df)