In [None]:
# default_exp encoding

In [None]:
#hide
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
#export
import numpy as np
from sklearn.metrics import r2_score
from sklearn.model_selection import KFold

def product_moment_corr(x,y):
    '''Product-moment (Pearson) correlation between the columns of two ndarrays x, y

    Parameters

        x : ndarray of shape (samples, targets) or (samples,)
        y : ndarray of the same shape as x

    Returns
        ndarray of shape (targets,) with the correlation between x[:, i] and y[:, i]
        (a scalar for 1-d input). Columns that are constant in x or y get a
        correlation of zero instead of NaN.
    '''
    def _standardize(a):
        # Z-score every column using the population (ddof=0) standard deviation
        a = np.asarray(a)
        if not np.issubdtype(a.dtype, np.floating):
            a = a.astype(float)
        mean = a.mean(axis=0)
        var = a.var(axis=0)
        n = a.shape[0]
        eps = np.finfo(a.dtype).eps
        # Columns whose variance is zero up to rounding error are treated as constant;
        # their scale is set to 1 so they standardize to all zeros rather than
        # to NaN or to amplified rounding noise.
        constant = var <= n * eps * var + (n * mean * eps) ** 2
        scale = np.where(constant, 1., np.sqrt(var))
        return (a - mean) / scale

    # The columns are scaled by the population standard deviation, so the matching
    # normalisation of the product sum is 1/n (a plain mean). Dividing by n-1 here
    # would inflate every coefficient by n/(n-1) and allow values outside [-1, 1].
    r = (_standardize(x) * _standardize(y)).mean(axis=0)
    return r

# Training and validating voxel-wise encoding models
> Functions for training independent Ridge regressions for a large number of voxels and validating their performance

In [None]:
#export
def get_ridge_plus_scores(X, y, alphas=None, n_splits=8, scorer=None, voxel_selection=True, **kwargs):
    '''Returns ridge regressions trained in a cross-validation on n_splits of the data and scores on the left-out folds

    Parameters

        X : ndarray of shape (samples, features)
        y : ndarray of shape (samples, targets)
        alphas : None or list of floats, optional
                 Regularization parameters to be used for Ridge regression,
                 None uses the single value 1000
        n_splits : int, optional
                   Number of KFold splits of the outer cross-validation
        scorer : None or any sci-kit learn compatible scoring function, optional
                 Called as scorer(y_true, y_pred), default uses product moment correlation.
                 If the scorer returns a single number instead of one value per target,
                 that number is assigned to every (selected) target.
        voxel_selection : bool, optional, default True
                          Whether to only fit and score voxels with variance larger than zero.
                          The scores of the excluded (zero-variance) voxels are set to zero.
                          The returned estimators are then fitted on the selected voxels only.
        kwargs : additional arguments transferred to ridge_gridsearch_per_target

    Returns
        tuple of a list of n_splits Ridge estimators trained on the training folds
        and an ndarray of shape (targets, n_splits) with the scores obtained
        on the respective left-out fold'''
    if scorer is None:
        scorer = product_moment_corr
    if alphas is None:
        alphas = [1000]
    if voxel_selection:
        # The mask does not depend on the fold, so compute it once up front
        voxel_var = np.var(y, axis=0)
        keep = voxel_var > 0.
        # Boolean indexing copies y; skip the copy when no voxel is dropped
        if not keep.all():
            y = y[:, keep]
    ridges = []
    score_list = []
    for train, test in KFold(n_splits=n_splits).split(X, y):
        ridge = ridge_gridsearch_per_target(X[train], y[train], alphas, **kwargs)
        ridges.append(ridge)
        fold_scores = scorer(y[test], ridge.predict(X[test]))
        if voxel_selection:
            # Scatter the scores back to the full voxel axis, excluded voxels stay at zero
            scores = np.zeros_like(voxel_var)
            scores[keep] = fold_scores
        else:
            scores = np.asarray(fold_scores)
            if scores.ndim == 0:
                # Scalar scorers (e.g. r2_score with default averaging) would break
                # the column stacking below, so repeat the value for every target
                n_targets = 1 if y.ndim == 1 else y.shape[1]
                scores = np.full(n_targets, scores)
        score_list.append(scores[:, None])
    return ridges, np.concatenate(score_list, axis=-1)

`get_ridge_plus_scores` is a convenience function that trains `n_splits` Ridge regressions in a cross-validation scheme and evaluates their performance on the respective held-out set.

In [None]:
#export
def ridge_gridsearch_per_target(X, y, alphas, n_splits=5, **kwargs):
    '''Runs Ridge gridsearch across alphas for each target in y

    Parameters

        X : ndarray of shape (samples, features)
        y : ndarray of shape (samples, targets)
        alphas : list of floats
                 Candidate regularization parameters for Ridge regression,
                 a single float is treated as a one-element list
        n_splits : int, optional
                   Number of KFold splits used to evaluate every alpha
        kwargs : keyword parameters to be transferred to Ridge regression

    Returns
        Ridge regression trained on all of X, y with the alpha per target that
        gave the lowest mean squared error averaged over the KFold splits

    Raises
        TypeError if alphas is None
        ValueError if alphas is empty
    '''
    from sklearn.linear_model import Ridge
    from sklearn.metrics import mean_squared_error
    if alphas is None:
        raise TypeError('alphas must be a sequence of floats, not None')
    alpha_grid = np.atleast_1d(alphas)
    if alpha_grid.size == 0:
        raise ValueError('alphas must contain at least one value')
    if alpha_grid.size == 1:
        # With a single candidate the cross-validation cannot change the choice,
        # so skip the n_splits extra fits and train the final model directly
        n_targets = 1 if y.ndim == 1 else y.shape[1]
        return Ridge(alpha=np.full(n_targets, alpha_grid[0]), **kwargs).fit(X, y)
    # The folds are the same for every alpha, so compute them once
    folds = list(KFold(n_splits=n_splits).split(X, y))
    mse_per_alpha = []
    for alpha in alpha_grid.tolist():
        fold_mse = []
        for train, test in folds:
            ridge = Ridge(alpha=alpha, **kwargs).fit(X[train], y[train])
            # 'raw_values' keeps one error per target instead of averaging over targets
            fold_mse.append(mean_squared_error(y[test], ridge.predict(X[test]),
                                               multioutput='raw_values'))
        mse_per_alpha.append(np.vstack(fold_mse).mean(axis=0))
    # Rows index alphas, columns index targets: pick the best row for every column
    best_alphas = alpha_grid[np.argmin(np.vstack(mse_per_alpha), axis=0)]
    return Ridge(alpha=best_alphas, **kwargs).fit(X, y)

# Example

First, we create some simulated data of `stimulus` and `fmri`.

In [None]:
# Simulated data: 1000 samples of 5 stimulus features and 10 voxels, drawn independently
# from a standard normal distribution, so no real stimulus-voxel relationship exists.
# NOTE: no random seed is set, the numbers shown below change on every run.
stimulus = np.random.randn(1000, 5)
fmri = np.random.randn(1000, 10)

We can now use `get_ridge_plus_scores` to estimate multiple [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html) regressions, one for each voxel (that maps the stimulus representation to this voxel) and one for each split (trained on a different training set and evaluated on the held-out set).
Since sklearn's `Ridge` estimator allows multi-output, we only get one `Ridge` object per split.

In [None]:
# 3-fold outer cross-validation: one multi-output Ridge estimator per fold
ridges, scores = get_ridge_plus_scores(stimulus, fmri, n_splits=3)
print(len(ridges))
ridges

3


[Ridge(alpha=array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])),
 Ridge(alpha=array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000])),
 Ridge(alpha=array([1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]))]

We also get a set of scores (by default the [product moment correlation](https://en.wikipedia.org/wiki/Pearson_correlation_coefficient), but you can supply your own via the `scorer` argument) that specifies how well we predict left-out data (with the usual caveats of using a correlation coefficient for evaluating it). In our case it is of shape (10, 3) because we predict 10 voxels and use a 3-fold cross-validation, i.e. we split 3 times.

In [None]:
# One row per voxel, one column per cross-validation fold
print(scores.shape)
scores

(10, 3)


array([[ 0.06290033, -0.00997016, -0.00792117],
       [-0.01301689,  0.02528423, -0.00265887],
       [ 0.00782036, -0.01969563, -0.044481  ],
       [ 0.01435211, -0.01384505, -0.04758541],
       [ 0.03499081, -0.01899065, -0.06801739],
       [ 0.08777199,  0.01931685,  0.08959935],
       [ 0.01563399, -0.03672842, -0.05399268],
       [-0.05259006, -0.02230779, -0.02272438],
       [-0.07706426, -0.01217039, -0.07071412],
       [-0.0412908 , -0.10162923, -0.00832372]])