In [1]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.cluster import KMeans
from msresist.estimator import kmeansPLSR, TunningHyperpar
from msresist.comp_estimator import MyOwnKMEANS, ComHyperPar
from msresist.plsr import FilteringOutPeptides, ClusterAverages, GridSearch_CV
import scipy as sp, numpy as np, pandas as pd, math
from sklearn.pipeline import Pipeline
import warnings
warnings.simplefilter("ignore")

In [2]:
X = np.array(pd.read_csv('./msresist/data/ms-initial.csv', header=0))
Y = np.array(pd.read_csv('./msresist/data/ydata.csv', header=0))
treatments = np.array(pd.read_csv('./msresist/data/ms-initial.csv', header=None))[0,2:]

PC9 = X[:,2]
Erl = X[:,3]
R428 = X[:,4]
Erl_R428 = X[:,5]
Erl_HGF = X[:,6]
Erl_FGF = X[:,7]
Erl_IGF = X[:,8]
KO_Erl = X[:,9]
KO_R428 = X[:,10]
KO_Erl_R428 = X[:,11]

# Variables: X phosphopeptides 
X = np.concatenate([PC9,Erl,R428,Erl_R428,Erl_HGF,Erl_FGF,Erl_IGF,KO_Erl,KO_R428,KO_Erl_R428])
X = np.reshape(X,(10,300))
X_F = FilteringOutPeptides(X)

# Observations: Y cell viability  (average between BR 3 and 4 at 72h)
Y_cv = Y[:,2]
Y_cv = Y_cv[:10]

resh = np.reshape(Y_cv[0], (1,1))

## PLSR erroneous high performance computed by GridSearchCV r2_score

I've used GridSearch to do a hyperparameter search in both k-means (n_clusters) and PLSR (n_components) first separately, and then using the composite estimator. In every case where the r2_scores of the PLSR model alone are calculated by fitting either the raw data (300:10), the filtered data (96:10), or the clustered data (5:10), we always obtain erroneously high PLSR training (always close to ~0.85) and test scores (always 1.0). The latter are always 0. This may suggest overfitting, but I wouldn't expect it to be the case when fitting the clustered data, where m < n. Specially, since our R2Y/Q2Y values in the notebook "Analysis_2estimators" look reasonably good. I've tried by replacing GridSearchCV's default 'r2_score' by 'explained_variance but I got the same results.

#### PLSR GridSearch with raw data (10:96)

In [3]:
plsr = PLSRegression()
parameters = {'n_components': np.arange(1, 16)}
CVresults = GridSearch_CV(plsr, X, Y_cv, parameters, cv = X.shape[0], scoring='neg_mean_squared_error')
std_scores = {'#Components': CVresults['param_n_components'], 'std_test_scores': CVresults["std_test_score"], 'std_train_scores': CVresults["std_train_score"]}
CVresults_min = pd.DataFrame(data=std_scores)
display(CVresults_min)
# display(CVresults)

Unnamed: 0,#Components,std_test_scores,std_train_scores
0,1,25.128401,1.030234
1,2,23.368728,0.5364224
2,3,23.072499,0.0838142
3,4,21.570116,0.01538869
4,5,22.259963,0.003414036
5,6,22.098466,7.953383e-05
6,7,22.140843,9.430996e-07
7,8,22.141103,7.817949999999999e-30
8,9,22.141103,7.817949999999999e-30
9,10,22.141103,7.817949999999999e-30


#### PLSR GridSearch with Filtered matrix (96:10)

In [4]:
parameters = {'n_components': np.arange(1, 16)}
CVresults = GridSearch_CV(plsr, X_F, Y_cv, parameters, cv=X.shape[0], scoring='neg_mean_squared_error')
std_scores = {'#Components': CVresults['param_n_components'], 'std_test_scores': CVresults["std_test_score"], 'std_train_scores': CVresults["std_train_score"]}
CVresults_min = pd.DataFrame(data=std_scores)
display(CVresults_min)
# display(CVresults)

Unnamed: 0,#Components,std_test_scores,std_train_scores
0,1,16.67271,1.0554
1,2,19.808584,0.6958398
2,3,20.915121,0.3388252
3,4,20.694413,0.03753603
4,5,21.605289,0.002465559
5,6,21.563976,0.0004705652
6,7,21.457897,3.796483e-05
7,8,21.47813,9.62772e-30
8,9,21.47813,9.62772e-30
9,10,21.47813,9.62772e-30


#### K-means GridSearch with Filtered matrix (96:10)

GridsearchCV's scoring method on k-means seems to work.

In [5]:
kmeans = KMeans(init="k-means++")
parameters = {'n_clusters': np.arange(2, 16)}
CVresults = GridSearch_CV(kmeans, X_F.T, None, parameters, cv=X_F.T.shape[0])
std_scores = {'#Clusters': CVresults['param_n_clusters'], 'std_test_scores': CVresults["std_test_score"], 'std_train_scores': CVresults["std_train_score"]}
CVresults_min = pd.DataFrame(data=std_scores)
display(CVresults_min)

Unnamed: 0,#Clusters,std_test_scores,std_train_scores
0,2,6.042494,5.886221
1,3,2.535983,2.412507
2,4,2.517069,2.018852
3,5,1.325996,1.880687
4,6,1.188564,1.222119
5,7,1.173334,1.078507
6,8,1.225113,1.082977
7,9,1.129125,1.116504
8,10,1.181283,1.253724
9,11,1.02509,0.940377


#### PLSR GridSearch fitting k-means cluster averages (5:10)

In [6]:
parameters = {'n_components': np.arange(1, centers.shape[1] + 1)}
CVresults = GridSearch_CV(plsr, centers, Y_cv, parameters, cv=centers.shape[0])
std_scores = {'#Components': CVresults['param_n_components'], 'std_test_scores': CVresults["std_test_score"], 'std_train_scores': CVresults["std_train_score"]}
CVresults_min = pd.DataFrame(data=std_scores)
display(CVresults_min)

NameError: name 'centers' is not defined