In [None]:
# default_exp encoding

In [None]:
#hide
%load_ext autoreload
%autoreload 2

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
#export
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold
from sklearn.linear_model import RidgeCV
import warnings
import copy

def product_moment_corr(x,y):
    '''Column-wise product-moment (Pearson) correlation for two ndarrays x, y

    Parameters

        x : ndarray of shape (samples, targets) or (samples,)
        y : ndarray with the same shape as x

    Returns
        ndarray of shape (targets,) holding the correlation between each column of x
        and the matching column of y (a scalar for 1-D inputs).
        Columns that are constant in x or y get a correlation of zero.

    Raises
        ValueError if the inputs contain no samples'''
    def _standardize(a):
        # z-score every column with the population standard deviation (ddof=0)
        a = np.asarray(a)
        if not np.issubdtype(a.dtype, np.floating):
            a = a.astype(float)
        mean = a.mean(axis=0)
        centered = a - mean
        std = np.sqrt(np.mean(centered ** 2, axis=0))
        # (numerically) constant columns keep a scale of 1, so they end up
        # as zeros and yield a correlation of zero instead of NaN or noise
        eps = np.finfo(a.dtype).eps
        std = np.where(std <= 10 * eps * np.abs(mean), 1.0, std)
        return centered / std

    x = _standardize(x)
    y = _standardize(y)
    n = x.shape[0]
    if n == 0:
        raise ValueError('product_moment_corr needs at least one sample')
    # the z-scores use ddof=0, so the matching normalisation is 1/n;
    # dividing by n-1 would inflate every correlation by n/(n-1)
    r = (x*y).sum(axis=0) / n
    return r

# Training and validating voxel-wise encoding models
> Functions for training independent Ridge regressions for a large number of voxels and validating their performance

In [None]:
#export

def get_model_plus_scores(X, y, estimator=None, alphas=None, n_splits=8, scorer=None,
                          voxel_selection=True, validation=True):
    '''Returns multiple estimator trained in a cross-validation on n_splits of the data and scores on the left-out folds

    Parameters

        X : ndarray of shape (samples, features)
        y : ndarray of shape (samples, targets)
        estimator : None or estimator object that implements fit and predict
                    if None, uses RidgeCV per default
        alphas : None or array-like, optional
                 regularization values handed to the default RidgeCV;
                 only used if estimator is None, None keeps RidgeCV's own default
        n_splits : int, optional, number of cross-validation splits
        scorer : None or any sci-kit learn compatible scoring function, optional
                 default uses product moment correlation
        voxel_selection : bool, optional, default True
                          Whether to only use voxels with variance larger than zero.
                          Scores of the excluded (zero-variance) voxels are set to zero.
        validation : bool, optional, default True
                     Whether to validate the model via cross-validation
                     or to just train the estimator - if False, scores will be computed on the training set
    Returns
        list of n_splits estimators trained on training folds or single estimator if validation is False
        and scores of shape (targets, n_splits) computed on the left-out folds,
        or of shape (targets,) computed on the training set if validation is False'''
    if scorer is None:
        scorer = product_moment_corr
    if estimator is None:
        # forward user-supplied alphas; otherwise leave RidgeCV's default untouched
        ridge_kwargs = {} if alphas is None else {'alphas': alphas}
        try:
            estimator = RidgeCV(alpha_per_target=True, **ridge_kwargs)
        except TypeError:
            # sklearn version below 0.24
            warnings.warn('scikit-learn version below 0.24. '
                          'Voxels will not have individual regularization parameters. '
                          'Update scikit-learn >= 0.24 to change.')
            estimator = RidgeCV(**ridge_kwargs)

    if voxel_selection:
        voxel_var = np.var(y, axis=0)
        selected = voxel_var > 0.
        y = y[:, selected]

    def _score_all_voxels(y_true, y_pred):
        # score the selected voxels and pad the excluded ones with zeros, so the
        # result always lines up with the columns of the y that was passed in
        voxel_scores = scorer(y_true, y_pred)
        if not voxel_selection:
            return voxel_scores
        padded = np.zeros_like(voxel_var)
        padded[selected] = voxel_scores
        return padded

    if validation:
        models = []
        score_list = []
        for train, test in KFold(n_splits=n_splits).split(X, y):
            # fit a fresh copy per fold so the passed-in estimator stays unfitted
            models.append(copy.deepcopy(estimator).fit(X[train], y[train]))
            # atleast_1d lets scorers that return a single number work as well
            scores = np.atleast_1d(_score_all_voxels(y[test], models[-1].predict(X[test])))
            score_list.append(scores[:, None])
        score_list = np.concatenate(score_list, axis=-1)
    else:
        # no copy here: the estimator that was passed in is fitted and returned
        models = estimator.fit(X, y)
        score_list = _score_all_voxels(y, estimator.predict(X))
    return models, score_list

`get_model_plus_scores` is a convenience function that trains `n_splits` Ridge regressions in a cross-validation scheme and evaluates their performance on the respective test set.

# Examples

First, we create some simulated `stimulus` and `fmri` data.

In [None]:
# Fixed seed so the simulated data (and every output derived from it) is reproducible
rng = np.random.RandomState(0)
stimulus = rng.randn(1000, 5)
fmri = rng.randn(1000, 10)

## Using the default Ridge regression

We can now use `get_model_plus_scores` to estimate multiple [RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html) regressions, one for each voxel (that maps the stimulus representation to this voxel) and one for each split (trained on a different training set and evaluated on the held-out set).
Since sklearn's `RidgeCV` estimator allows multi-output, we get one `RidgeCV` object per split.

In [None]:
# No estimator given, so the default RidgeCV is trained once per fold (3 folds here)
ridges, scores = get_model_plus_scores(X=stimulus, y=fmri, n_splits=3)
assert len(ridges) == 3
ridges

[RidgeCV(alpha_per_target=True, alphas=array([ 0.1,  1. , 10. ])),
 RidgeCV(alpha_per_target=True, alphas=array([ 0.1,  1. , 10. ])),
 RidgeCV(alpha_per_target=True, alphas=array([ 0.1,  1. , 10. ]))]

Each `RidgeCV` estimator maps from the feature space to each voxel.
In our example, that means it has 10 (the number of voxels) independently trained regression models with 5 coefficients each (the number of features).

In [None]:
# coef_ of the first fold's model: one row per voxel, one column per stimulus feature
assert ridges[0].coef_.shape == (10, 5)
print(ridges[0].coef_)

[[-0.03148284 -0.03085741 -0.03709923 -0.03938465  0.00740931]
 [ 0.05844587 -0.03454859 -0.00971743  0.04779117  0.04717684]
 [-0.06610763  0.02024515 -0.00519406  0.01604151  0.06232265]
 [-0.00847565  0.01357303  0.04579654 -0.03709936  0.03114387]
 [ 0.01942072  0.0222861  -0.02364503  0.00061965  0.10216772]
 [ 0.00359469 -0.04175064 -0.0503723  -0.00041977 -0.05407095]
 [ 0.03992998 -0.00829027 -0.03257733  0.02132598 -0.02921803]
 [ 0.07730107 -0.03277048 -0.0112798  -0.10295067 -0.00743558]
 [-0.01648862 -0.00174828 -0.06699278 -0.05327637  0.01911227]
 [-0.01000438  0.00611579  0.00706875 -0.05508025  0.01144064]]


We also get a set of scores (by default the [product moment correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient), but you can supply your own via the `scorer` argument) that specifies how well we predict left-out data (with the usual caveats of using a correlation coefficient for evaluating it). In our case it is of shape (10, 3) because we predict 10 voxels and use a 3-fold cross-validation, i.e. we split 3 times.

In [None]:
# Rows index voxels, columns index cross-validation folds
assert scores.shape == (10, 3)
scores

array([[-0.00489691, -0.04027173, -0.01674204],
       [-0.06069225, -0.01664182, -0.04291744],
       [ 0.04152938,  0.00532055, -0.01025946],
       [ 0.06519698, -0.02780779, -0.04574611],
       [ 0.02882572,  0.06493467, -0.00487824],
       [ 0.00728804, -0.02319843,  0.01526909],
       [-0.03650654, -0.04296925, -0.06727077],
       [ 0.00508916,  0.07476689,  0.04086158],
       [ 0.02566022,  0.06741255,  0.05066477],
       [ 0.05676817, -0.0125262 ,  0.08245368]])

## Using your own estimator

Of course we can also use our own estimator function.
For example, we use a `RidgeCV` object with pre-specified hyperparameters, like the values of the regularization parameter $\alpha$ we want to perform a gridsearch over or whether we want to normalize features.

In [None]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

alphas = [1, 10, 100]
# RidgeCV's `normalize` argument was removed in scikit-learn 1.2, so the
# features are scaled by an explicit preprocessing step in a pipeline instead
our_estimator = make_pipeline(StandardScaler(),
                              RidgeCV(alphas=alphas, alpha_per_target=True))

ridges, scores = get_model_plus_scores(stimulus, fmri, our_estimator,
                                       n_splits=3)
# the last pipeline step is the RidgeCV that carries our hyperparameters
assert list(ridges[0].steps[-1][1].alphas) == alphas

Additionally, we can use any other estimator that implements `fit` and `predict`.
For example, we can use [CCA](https://scikit-learn.org/stable/modules/generated/sklearn.cross_decomposition.CCA.html) as an encoding model.

In [None]:
from sklearn import cross_decomposition

our_estimator = cross_decomposition.CCA(n_components=2)

ccas, scores = get_model_plus_scores(stimulus, fmri, our_estimator,
                                     n_splits=3)
# Check against the public class; the private `_pls` module path is an
# implementation detail that differs between scikit-learn versions
assert isinstance(ccas[0], cross_decomposition.CCA)

If your favorite estimator does not work in the multioutput regime, i.e. it cannot predict multiple targets/voxels, you can wrap it into sklearn's [MultiOutputRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.multioutput.MultiOutputRegressor.html) before passing it to `get_model_plus_scores`, as shown below. However, for many voxels this can increase training time by a lot.

In [None]:
from sklearn.linear_model import Lasso
from sklearn.multioutput import MultiOutputRegressor

# Wrap Lasso so that it is handed to the function as a multi-output regressor
our_estimator = MultiOutputRegressor(estimator=Lasso())

lassos, scores = get_model_plus_scores(X=stimulus, y=fmri, estimator=our_estimator, n_splits=3)
lassos

[MultiOutputRegressor(estimator=Lasso()),
 MultiOutputRegressor(estimator=Lasso()),
 MultiOutputRegressor(estimator=Lasso())]

## Training without validation

We can also train an estimator without any validation if, for example, we want to test on a different dataset. In that case, the scores will be computed with the trained estimator on the training set, i.e. they will contain no information about the generalization performance of the estimator.

In [None]:
our_estimator = RidgeCV()

# validation=False: a single fitted estimator comes back, scored on its own training data
model, scores = get_model_plus_scores(X=stimulus, y=fmri, estimator=our_estimator, validation=False)
assert type(model) is RidgeCV
assert scores.shape == (10,)