In [1]:
from sklearn.cross_decomposition import PLSRegression
from sklearn.cluster import KMeans
from msresist.estimator import kmeansPLSR, TunningHyperpar
from msresist.comp_estimator import MyOwnKMEANS, MyOwnRegressor, ComHyperPar
from msresist.plsr import FilteringOutPeptides, ClusterAverages, GridSearch_CV
import scipy as sp, numpy as np, pandas as pd, math
from sklearn.pipeline import Pipeline
import warnings
# Ignore every warning raised from here on.
# NOTE(review): this blanket filter also hides any warnings emitted while the
# cross-validation scores below are computed — consider narrowing it while
# those scores are being debugged.
warnings.simplefilter("ignore")

In [2]:
# Raw tables, converted to plain NumPy arrays.
ms_raw = np.array(pd.read_csv('./msresist/data/ms-initial.csv', header=0))
Y = np.array(pd.read_csv('./msresist/data/ydata.csv', header=0))
# Second read without a header row, so that row 0 holds the column labels;
# the labels from column 2 onwards are kept.
treatments = np.array(pd.read_csv('./msresist/data/ms-initial.csv', header=None))[0,2:]

# One column of the raw table per condition (columns 2-11).
# presumably columns 0-1 hold peptide identifiers — TODO confirm
PC9 = ms_raw[:,2]
Erl = ms_raw[:,3]
R428 = ms_raw[:,4]
Erl_R428 = ms_raw[:,5]
Erl_HGF = ms_raw[:,6]
Erl_FGF = ms_raw[:,7]
Erl_IGF = ms_raw[:,8]
KO_Erl = ms_raw[:,9]
KO_R428 = ms_raw[:,10]
KO_Erl_R428 = ms_raw[:,11]
conditions = [PC9,Erl,R428,Erl_R428,Erl_HGF,Erl_FGF,Erl_IGF,KO_Erl,KO_R428,KO_Erl_R428]

# Variables: X phosphopeptides
# Stack the conditions as rows. The row length is inferred from the data
# instead of being hard-coded to 300, so a table with a different number of
# rows no longer breaks the reshape.
X = np.reshape(np.concatenate(conditions), (len(conditions), -1))
X_F = FilteringOutPeptides(X)

# Observations: Y cell viability  (average between BR 3 and 4 at 72h)
# Keep one value per condition so that Y_cv lines up with the rows of X.
Y_cv = Y[:,2][:len(conditions)]

resh = np.reshape(Y_cv[0], (1,1))

## PLSR erroneous high performance computed by GridSearchCV r2_score

I've used GridSearchCV to run a hyperparameter search for k-means (n_clusters) and PLSR (n_components), first separately and then using the composite estimator. Whenever the r2_scores of the PLSR model alone are calculated, whether by fitting the raw data (300:10), the filtered data (96:10), or the clustered data (5:10), we always obtain erroneously high PLSR training scores (always close to ~0.85) and test scores (always 1.0). The standard deviations of the latter are always 0 (see the tables below). This may suggest overfitting, but I wouldn't expect that to be the case when fitting the clustered data, where m < n, especially since our R2Y/Q2Y values in the notebook "Analysis_2estimators" look reasonably good. I've tried replacing GridSearchCV's default 'r2_score' with 'explained_variance', but I got the same results.

#### PLSR GridSearch with raw data (300:10)

In [3]:
# Grid search over the number of PLSR components, fitted on the unfiltered matrix X.
plsr = PLSRegression()
parameters = {'n_components': np.arange(1, 16)}

# The fold count equals the number of rows of X.
# NOTE(review): with as many folds as rows, each validation fold presumably
# holds a single observation; R^2 is not well-defined on one sample, which
# could explain the constant test scores — confirm which scorer
# GridSearch_CV uses.
CVresults = GridSearch_CV(plsr, X, Y_cv, parameters, cv=len(X))

# Keep only the spread of the train/test scores for each candidate.
std_scores = {
    '#Components': CVresults['param_n_components'],
    'std_test_scores': CVresults["std_test_score"],
    'std_train_scores': CVresults["std_train_score"],
}
CVresults_min = pd.DataFrame(std_scores)
display(CVresults_min)

Unnamed: 0,#Components,std_test_scores,std_train_scores
0,1,0.0,0.0472705
1,2,0.0,0.02235558
2,3,0.0,0.003494854
3,4,0.0,0.0006359216
4,5,0.0,0.0001356768
5,6,0.0,3.181901e-06
6,7,0.0,3.728215e-08
7,8,0.0,0.0
8,9,0.0,0.0
9,10,0.0,0.0


#### PLSR GridSearch with Filtered matrix (96:10)

In [4]:
# Grid search over the number of PLSR components, fitted on the filtered matrix X_F.
parameters = {'n_components': np.arange(1, 16)}

# The fold count is taken from the matrix that is actually fitted (X_F), not
# from the unfiltered X, so the two cannot silently drift apart.
# NOTE(review): one fold per row presumably leaves a single observation in
# each validation fold; R^2 is not well-defined on one sample — confirm which
# scorer GridSearch_CV uses.
CVresults = GridSearch_CV(plsr, X_F, Y_cv, parameters, cv=X_F.shape[0])

# Keep only the spread of the train/test scores for each candidate.
std_scores = {
    '#Components': CVresults['param_n_components'],
    'std_test_scores': CVresults["std_test_score"],
    'std_train_scores': CVresults["std_train_score"],
}
CVresults_min = pd.DataFrame(data=std_scores)
display(CVresults_min)

Unnamed: 0,#Components,std_test_scores,std_train_scores
0,1,0.0,0.051682
1,2,0.0,0.031667
2,3,0.0,0.012404
3,4,0.0,0.001302
4,5,0.0,0.00011
5,6,0.0,2e-05
6,7,0.0,2e-06
7,8,0.0,0.0
8,9,0.0,0.0
9,10,0.0,0.0


#### K-means GridSearch with Filtered matrix (96:10)

GridSearchCV's scoring method on k-means seems to work.

In [5]:
# Fixed seed: the k-means++ initialisation is random, so without it the
# scores below change on every run of the notebook.
kmeans = KMeans(init="k-means++", random_state=0)
parameters = {'n_clusters': np.arange(2, 16)}

# The transposed filtered matrix is clustered (presumably one row per
# peptide — TODO confirm); no target is passed because k-means is unsupervised.
CVresults = GridSearch_CV(kmeans, X_F.T, None, parameters, cv = X_F.T.shape[0])

# Keep only the spread of the train/test scores for each candidate.
std_scores = {
    '#Clusters': CVresults['param_n_clusters'],
    'std_test_scores': CVresults["std_test_score"],
    'std_train_scores': CVresults["std_train_score"],
}
CVresults_min = pd.DataFrame(data=std_scores)
display(CVresults_min)

Unnamed: 0,#Clusters,std_test_scores,std_train_scores
0,2,6.042844,5.883783
1,3,2.535983,2.412507
2,4,2.516901,1.856221
3,5,1.402138,1.723209
4,6,1.203324,1.157395
5,7,1.133454,1.107191
6,8,1.151996,1.066457
7,9,1.124321,1.2772
8,10,1.057025,1.219085
9,11,1.059176,1.094011


This is just to check that my function for calculating the cluster averages gives the same result as sklearn's `cluster_centers_` attribute.

In [6]:
n_clusters = 5
nObs = Y_cv.shape[0]

# Fixed seed so that the cluster assignments, and therefore both matrices
# printed below, are the same on every run.
kmeans = KMeans(init="k-means++", n_clusters=n_clusters, random_state=0)

# Cluster the transposed filtered matrix, then average within each cluster
# using the project helper.
cluster_assignments = kmeans.fit_predict(X_F.T)
X_Filt_Clust_Avgs = ClusterAverages(X_F, cluster_assignments, n_clusters, nObs)
print(X_Filt_Clust_Avgs)

print("-----")

# sklearn's own centers, transposed for comparison with the averages above.
centers = np.array(kmeans.cluster_centers_).T
print(centers)

# Report the comparison explicitly instead of relying on reading the two
# printouts side by side.
print("Cluster averages match cluster_centers_:", np.allclose(X_Filt_Clust_Avgs, centers))

[[ 0.          0.          0.          0.          0.        ]
 [ 0.03525638 -0.44942916 -2.73893889  0.3544127  -2.51434885]
 [ 0.13624877 -0.4561696  -0.62417616  0.53954048 -2.1876568 ]
 [ 0.41861823 -0.57510891 -2.58774982  1.15208036 -2.56034078]
 [ 0.32327444 -0.34163907 -2.19004706  1.03801985  0.07572022]
 [ 0.26771766 -0.50068708 -2.64767914  0.88901887 -1.76350976]
 [ 0.52142227 -0.29766732 -2.14934946  1.3533732  -1.99748906]
 [ 0.38743029 -0.83276904 -2.5700243   1.35517992 -2.60485443]
 [ 0.77654911 -0.38194906 -0.1893095   1.63897356 -2.03653584]
 [ 0.6257898  -0.69178304 -2.13723293  1.50661312 -2.39186178]]
-----
[[ 0.          0.          0.          0.          0.        ]
 [ 0.03525638 -0.44942916 -2.73893889  0.3544127  -2.51434885]
 [ 0.13624877 -0.4561696  -0.62417616  0.53954048 -2.1876568 ]
 [ 0.41861823 -0.57510891 -2.58774982  1.15208036 -2.56034078]
 [ 0.32327444 -0.34163907 -2.19004706  1.03801985  0.07572022]
 [ 0.26771766 -0.50068708 -2.64767914  0.8890188

#### PLSR GridSearch fitting k-means cluster averages (5:10)

In [7]:
# Grid search over the number of PLSR components, fitted on the k-means
# cluster centers: candidates run from 1 up to the number of columns of `centers`.
parameters = {'n_components': np.arange(centers.shape[1]) + 1}

# The fold count equals the number of rows of `centers`.
# NOTE(review): one fold per row presumably leaves a single observation in
# each validation fold; R^2 is not well-defined on one sample — confirm which
# scorer GridSearch_CV uses.
CVresults = GridSearch_CV(plsr, centers, Y_cv, parameters, cv=len(centers))

# Keep only the spread of the train/test scores for each candidate.
std_scores = {
    '#Components': CVresults['param_n_components'],
    'std_test_scores': CVresults["std_test_score"],
    'std_train_scores': CVresults["std_train_score"],
}
CVresults_min = pd.DataFrame(std_scores)
display(CVresults_min)

Unnamed: 0,#Components,std_test_scores,std_train_scores
0,1,0.0,0.066191
1,2,0.0,0.066918
2,3,0.0,0.07031
3,4,0.0,0.071619
4,5,0.0,0.072349


## OwnEstimator simultaneous GridSearch of n_clusters and n_components

I haven't been able to get **'msresist.estimator'** to work. By implementing both k-means and PLSR in the same class, I'm not able to control cross-validation correctly. The raw held-out data is the input of the score() method, and if I found a way of iterating over the corresponding row of the cluster averages matrix (the output of k-means), I think I would be in the same situation as in **'msresist.comp_estimator'** (see below).

I have a better intuition of how GridSearchCV performs cross-validation when using sklearn's pipeline and chaining k-means with PLSR (**'msresist.comp_estimator'**). The reason I'm not pipelining sklearn's methods directly is that this way, for every CV iteration, I can rerun k-means, extract the cluster averages, and pass them to the PLSR estimator, which will fit the data and provide a (currently wrong...) score. If I'm not mistaken, this should cross-validate the way we discussed in our last meeting and, as shown above, the scoring error seems to apply to sklearn's PLSR method alone as well.

In [8]:
# Run the project's composite-estimator search on the filtered matrix.
# The helper returns three objects; only the second one is displayed here.
(
    CVresults_max,
    CVresults_min,
    bestparams,
) = ComHyperPar(X_F, Y_cv)
display(CVresults_min)

Unnamed: 0,#Clusters,#Components,std_test_scores,std_train_scores
0,2,1,0.0,0.085483
1,2,2,0.0,0.086282
2,3,1,0.0,0.099539
3,3,2,0.0,0.097338
4,3,3,0.0,0.098656
5,4,1,0.0,0.098320
6,4,2,0.0,0.100760
7,4,3,0.0,0.083138
8,4,4,0.0,0.077897
9,5,1,0.0,0.068348
