In [None]:
# default_exp preprocessing

In [None]:
%load_ext autoreload
%autoreload 2

# Preprocessing

> Functions for preprocessing fMRI data and preparing stimulus and fMRI data for training voxel-wise encoding models.

In [None]:
#hide
from nbdev.showdoc import *

In [None]:
#hide
#export
import os
import warnings
import numpy as np
import joblib
from nilearn.masking import unmask, apply_mask
from nibabel import save, load, Nifti1Image
from nilearn.signal import clean

## Preprocessing BOLD fMRI

In [None]:
#export
def preprocess_bold_fmri(bold, mask=None, detrend=True, standardize='zscore', **kwargs):
    '''Preprocesses BOLD data and returns ndarray of preprocessed data

    Parameters

        bold : path to bold nifti file or loaded bold nifti
        mask : path to mask nifti file or loaded mask nifti, optional
               if given, bold and mask are passed to nilearn's apply_mask
        detrend : bool, whether to linearly detrend the data, optional
        standardize : {'zscore', 'psc', False}, default is 'zscore'
        kwargs : further arguments for nilearn's clean function

    Returns
        ndarray of the preprocessed bold data in (samples, voxels)
    '''
    if mask:
        data = apply_mask(bold, mask)
    else:
        image = bold if isinstance(bold, Nifti1Image) else load(bold)
        # get_data() is deprecated in nibabel and raises an error in recent
        # releases, get_fdata() is the supported way to access the voxel data
        data = image.get_fdata()
        # flatten all leading axes and put the last (time) axis first
        data = np.reshape(data, (-1, data.shape[-1])).T
    return clean(data, detrend=detrend, standardize=standardize, **kwargs)

`preprocess_bold_fmri` preprocesses a BOLD Nifti and returns a numpy ndarray of the optionally masked and preprocessed fMRI data.

In [None]:
#hide
def test_preprocess_bold_fmri():
    '''Checks standardizing, detrending and masking on a constant 4D image'''
    constant_img = Nifti1Image(np.ones((2, 2, 2, 2)), affine=np.eye(4))
    # (standardize, detrend, value every element of the output should have)
    cases = [(True, False, 0.), (False, True, 0.), (False, False, 1.)]
    for standardize, detrend, expected in cases:
        result = preprocess_bold_fmri(constant_img, standardize=standardize, detrend=detrend)
        assert np.allclose(result, expected)
    # zero out half of the mask so that 4 of the 8 voxels remain
    mask_data = np.ones((2, 2, 2))
    mask_data[:, 0, :] = 0.
    mask_img = Nifti1Image(mask_data, affine=np.eye(4))
    assert preprocess_bold_fmri(constant_img, mask=mask_img).shape == (2, 4)

test_preprocess_bold_fmri()

In [None]:
#export
def get_remove_idx(lagged_stimulus, remove_nan=True):
    '''Returns indices of rows in lagged_stimulus to remove

    Parameters

        lagged_stimulus : ndarray, lagged stimulus of shape (samples, lagged features)
        remove_nan : bool or float 0<=remove_nan<=1, optional
                     True selects every row that contains at least one NaN,
                     False selects nothing (None is returned),
                     a float selects the rows whose proportion of NaNs
                     is larger than remove_nan

    Returns
        ndarray of row indices, or None if remove_nan is False

    Raises
        ValueError if remove_nan is a number outside of [0, 1]
    '''
    # numpy booleans are not the True/False singletons, so an identity check
    # would treat np.True_ as the proportion 1.0 and select no rows at all
    if isinstance(remove_nan, (bool, np.bool_)):
        if remove_nan:
            return np.where(np.any(np.isnan(lagged_stimulus), axis=1))[0]
        # This will raise an error if it is supplied to np.delete
        # which is what we want
        return None
    elif remove_nan <= 1. and remove_nan >= 0.:
        return np.where(np.isnan(lagged_stimulus).mean(axis=1) > remove_nan)[0]
    else:
        raise ValueError('remove_nan needs to be either True, False, or a float between 0 and 1.')

In [None]:
#hide
def test_get_remove_idx():
    '''Checks the selected rows for boolean and proportional remove_nan values'''
    lagged = np.ones((100, 10))
    lagged[:5, :] = np.nan       # rows 0-4 consist of NaNs only
    lagged[5:10, :3] = np.nan    # rows 5-9 contain 30% NaNs
    all_nan_rows = np.arange(5)
    any_nan_rows = np.arange(10)
    assert get_remove_idx(lagged, remove_nan=False) is None
    expectations = [(True, any_nan_rows), (0.5, all_nan_rows), (0.1, any_nan_rows)]
    for threshold, expected in expectations:
        assert np.all(get_remove_idx(lagged, remove_nan=threshold) == expected)

test_get_remove_idx()

## Aligning stimulus and fMRI

In [None]:
#export
def generate_lagged_stimulus(stimulus, fmri_samples, TR, stim_TR,
                             lag_time=6.0, start_time=0., offset_stim=0.,
                             fill_value=np.nan):
    '''Generates a lagged stimulus representation temporally aligned with the fMRI data

    Parameters

        stimulus : ndarray, stimulus representation of shape (samples, features)
        fmri_samples : int, samples of corresponding fmri run
        TR : int, float, repetition time of the fMRI data in seconds
        stim_TR : int, float, repetition time of the stimulus in seconds
        lag_time : int, float, or None, optional,
               lag to introduce for stimuli in seconds,
               needs to be a multiple of TR,
               if no lagging should be done set this to TR or None
        start_time :  int, float, optional, default 0.
                  starting time of the stimulus relative to fMRI recordings in seconds
                  prepends fill_value to stimulus representation to match fMRI and stimulus
        offset_stim : int, float, optional, default 0.
                  time to offset stimulus relative to fMRI in the lagged stimulus,
                  i.e. when predicting fmri at time t use only stimulus features
                  before t-offset_stim. This reduces the number of time points used
                  in the model. The offset is rounded to whole TRs.
        fill_value : int, float, or any valid numpy array element, optional, default np.nan
                 value used for all samples that lie outside of the stimulus
                 use np.nan here with remove_nans=True to remove fmri/stimulus samples where no stimulus was presented

    Returns:
        ndarray of the lagged stimulus of shape (samples, lagged features)

    Raises:
        ValueError if stim_TR is larger than TR, if lag_time is not a
        multiple of TR, or if lag_time is shorter than one TR after rounding
    '''
    # find out temporal alignment
    stim_samples_per_TR = TR / stim_TR
    if stim_samples_per_TR < 1:
        raise ValueError('Stimulus TR is larger than fMRI TR')
    # check if result is close to an integer
    if not np.isclose(stim_samples_per_TR, np.round(stim_samples_per_TR)):
        warnings.warn('Stimulus timing and fMRI timing do not align. '
        'Stimulus samples per fMRI samples: {0} for stimulus TR {1} and fMRI TR {2}. '
        'Proceeds by rounding stimulus samples '
        'per TR.'.format(stim_samples_per_TR, stim_TR, TR), RuntimeWarning)
    stim_samples_per_TR = int(np.round(stim_samples_per_TR))
    if lag_time is None:
        lag_time = TR
    if lag_time < TR:
        warnings.warn('lag_time ({}) should not be smaller than TR ({}).'.format(lag_time, TR))
    # check if lag time is multiple of TR
    if not np.isclose(lag_time / TR, np.round(lag_time / TR)):
        raise ValueError('lag_time should be a multiple of TR so '
                'that stimulus/fMRI alignment does not change.')
    if lag_time == TR:
        warnings.warn('lag_time is equal to TR, no stimulus lagging will be done.', RuntimeWarning)
    lag_TR = int(np.round(lag_time / TR))
    if lag_TR < 1:
        # a window without any TR cannot be built
        raise ValueError('lag_time needs to be at least one TR.')
    offset_TR = int(np.round(offset_stim / TR))
    # number of consecutive TRs that are combined into one row of the result
    window_size = lag_TR + offset_TR

    n_features = stimulus.shape[1]
    # check if the stimulus start time is moved w.r.t. fmri
    n_prepend = int(np.round(start_time / stim_TR))
    stimulus = np.vstack([np.full((n_prepend, n_features), fill_value), stimulus])

    # make reshapeable by appending filler
    n_incomplete = stimulus.shape[0] % stim_samples_per_TR
    if n_incomplete > 0:
        # either remove part of the stimulus (if it is longer than fmri) or append filler
        if stimulus.shape[0] / stim_samples_per_TR > fmri_samples:
            stimulus = stimulus[:-n_incomplete]
        else:
            n_append = stim_samples_per_TR - n_incomplete
            stimulus = np.vstack([stimulus, np.full((n_append, n_features), fill_value)])

    # now reshape so that every row holds all stimulus samples of one fMRI TR
    n_columns = stim_samples_per_TR * n_features
    stimulus = np.reshape(stimulus, (-1, n_columns))

    # check if stimulus is longer than fmri and remove part of the stimulus
    if stimulus.shape[0] > fmri_samples:
        warnings.warn('Stimulus ({0}) is longer than recorded fMRI '
                      '({1}). Removing last part of stimulus.'.format(stimulus.shape[0]*TR, fmri_samples*TR))
        stimulus = stimulus[:fmri_samples]

    # a window of a single TR is the unlagged stimulus itself, this is decided
    # on the rounded number of TRs so that an offset is also applied when
    # lag_time equals TR
    if window_size != 1:
        # account for lagging
        n_prepend_lag = window_size - 1
        # and add filler such that length is the same for fmri
        n_append_lag = fmri_samples - stimulus.shape[0]
        stimulus = np.vstack(
                             [np.full((n_prepend_lag, n_columns), fill_value),
                              stimulus,
                              np.full((n_append_lag, n_columns), fill_value)])
        # here we create a stimulus representation that incorporates a time window
        # i.e. we go from time X features to (time - window_size + 1) X window_size X features
        # row i of window_idx holds the indices i, ..., i + window_size - 1;
        # explicit indexing keeps all three axes even if one of them has length 1
        n_windows = stimulus.shape[0] - window_size + 1
        window_idx = np.arange(n_windows)[:, None] + np.arange(window_size)[None, :]
        stimulus = stimulus[window_idx]
        # and here we reshape into (time - window_size + 1) X (window_size * features)
        stimulus = np.reshape(stimulus, (n_windows, window_size * n_columns))

    # remove stimulus representations that are more recent than offset_stim,
    # an offset that rounds to zero TRs must not remove anything
    # (slicing with [:, :-0] would return no columns at all)
    if offset_TR > 0:
        stimulus = stimulus[:, :-(offset_TR * n_columns)]
    return stimulus

In [None]:
#export
def make_X_Y(stimuli, fmri, TR, stim_TR, lag_time=6.0, start_times=None, offset_stim=0., fill_value=np.nan, remove_nans=True):
    '''Creates (lagged) features and fMRI matrices concatenated along runs

    Parameters

        stimuli : list, list of stimulus representations
        fmri : list, list of fMRI ndarrays
        TR : int, float, repetition time of the fMRI data in seconds
        stim_TR : int, float, repetition time of the stimulus in seconds
        lag_time : int, float, optional,
                   lag to introduce for stimuli in seconds,
                   if no lagging should be done set this to TR
        start_times : list, list of int, float, optional,
                      starting time of the stimuli relative to fMRI recordings in seconds
                      prepends fill_value to stimulus representation to match fMRI and stimulus
        offset_stim : int, float, optional,
                      time to offset stimulus relative to fMRI in the lagged stimulus,
                      i.e. when predicting fmri at time t use only stimulus features
                      before t-offset_stim. This reduces the number of time points used
                      in the model.
        fill_value : int, float, or any valid numpy array element, optional,
                     value used for all samples that lie outside of the stimulus
                     use np.nan here with remove_nans=True to remove fmri/stimulus samples where no stimulus was presented
        remove_nans : bool, bool or float 0<=remove_nans<=1, optional
                      True/False indicate whether to remove all or none
                      stimulus/fmri samples that contain nans
                      a proportion removes all samples in the lagged stimulus whose
                      proportion of nans is larger than this value, a proportion
                      of 0 therefore behaves like True.
                      The nans of the remaining samples are not replaced here,
                      replace them (e.g. with zeros) before fitting a model.

    Returns:
    tuple of two ndarrays,
    the first element are the (lagged) stimuli,
    the second element is the aligned fMRI data

    Raises:
        ValueError if stimuli and fmri differ in their number of runs or if
        the runs in stimuli differ in their number of features
    '''
    if len(stimuli) != len(fmri):
        raise ValueError('Stimulus and fMRI need to have the same number of runs. '
        'Instead fMRI has {} and stimulus {} runs.'.format(len(fmri), len(stimuli)))
    n_features = stimuli[0].shape[1]
    if not np.all(np.array([stim.shape[1] for stim in stimuli]) == n_features):
        raise ValueError('Stimulus has different number of features per run.')

    # Only booleans (python or numpy) and None switch the removal on or off.
    # Every number is a proportion for get_remove_idx - a plain truth test
    # would silently treat the valid proportion 0. as "do not remove".
    if remove_nans is None:
        remove_nans = False
    elif isinstance(remove_nans, (bool, np.bool_)):
        remove_nans = bool(remove_nans)

    lagged_stimuli = []
    aligned_fmri = []
    for i, (stimulus, fmri_run) in enumerate(zip(stimuli, fmri)):
        stimulus = generate_lagged_stimulus(
            stimulus, fmri_run.shape[0], TR, stim_TR, lag_time=lag_time,
            start_time=start_times[i] if start_times else 0.,
            offset_stim=offset_stim, fill_value=fill_value)
        # remove nans in stim/fmri here
        if remove_nans is not False:
            remove_idx = get_remove_idx(stimulus, remove_nans)
            stimulus = np.delete(stimulus, remove_idx, axis=0)
            fmri_run = np.delete(fmri_run, remove_idx, axis=0)

        # remove fmri samples recorded after stimulus has ended
        if fmri_run.shape[0] != stimulus.shape[0]:
            warnings.warn('fMRI data and stimulus samples differ. '
            'Removing additional fMRI samples. This could mean that you recorded '
            'long after stimulus ended or that something went wrong in the '
            'preprocessing. fMRI: {}s stimulus: {}s'.format(
                TR*fmri_run.shape[0], TR*stimulus.shape[0]), RuntimeWarning)
            if fmri_run.shape[0] > stimulus.shape[0]:
                fmri_run = fmri_run[:stimulus.shape[0]]
        lagged_stimuli.append(stimulus)
        aligned_fmri.append(fmri_run)
    return np.vstack(lagged_stimuli), np.vstack(aligned_fmri)

### Example

`make_X_Y` allows you to align the (preprocessed) fMRI and stimulus data by specifying fMRI `TR` and stimulus `stim_TR`, as well as the `lag_time` (how long a stimulus window should be in seconds to predict a single fMRI TR) and potential stimulus offsets.
Since we potentially want to preprocess and concatenate multiple runs, both `fmri` and `stimuli` are supposed to be lists. To process only a single run, you can use a list of one element.

Let's look at an example, where the stimulus is sampled every 100 ms and fMRI every 2s, i.e. every fMRI sample corresponds to 20 stimulus samples.

In [None]:
TR = 2          # fMRI repetition time in seconds
stim_TR = 0.1   # stimulus repetition time in seconds

Now create a simulated `stimulus` object of 80 samples.

In [None]:
# 80 samples of a single feature, every value equals its sample index
stimulus = np.arange(80).reshape(-1, 1)
print(stimulus.shape)

(80, 1)


And a corresponding `fmri` object of 4 samples and one voxel (since the TRs differ).

In [None]:
# 4 samples of a single voxel, every value equals its sample index
fmri = np.arange(4).reshape(-1, 1)
print(fmri.shape)

(4, 1)


Let's first align `fMRI` and `stimulus` without any offset or lag:

In [None]:
# lag_time=None is replaced by TR, i.e. no lagging is done:
# every row of X holds the 20 stimulus samples of one fMRI TR
X, y = make_X_Y([stimulus], [fmri], TR, stim_TR, lag_time=None, offset_stim=0, start_times=[0])
assert X.shape == (4, 20)
assert y.shape == (4, 1)



We keep the original number of samples in fMRI, but represent stimulus (and hence X) by the number of samples per fmri TR: stimulus thus becomes a (4, 20) array.

### Lagging the stimulus

We can now call `make_X_Y` with the stimulus and fMRI TRs and a specified `lag_time`.
Here we want to use 4 seconds of the stimulus to predict fMRI, but do not want to shift `fmri` relative to `stimulus` (`offset_stim` is 0.).
This means that our encoding model can approximate a hemodynamic response function (HRF) by estimating a finite impulse response (FIR) that is 4 seconds long.

In [None]:
# a lag of 4s spans two TRs, so every row holds 2 * 20 stimulus samples;
# one sample is dropped because its window contains NaNs (remove_nans defaults to True)
X, y = make_X_Y([stimulus], [fmri], TR, stim_TR, lag_time=4, offset_stim=0, start_times=[0])
assert X.shape == (3, 40)
assert y.shape == (3, 1)

### Shifting the stimulus

We could also shift `fmri` relative to `stimulus`, to account for the delayed onset of the hemodynamic response - this is different than estimating the hemodynamic response from the window given by `lag_time`.
In practice this means we estimate a hemodynamic response function (HRF) by a FIR in the time period from -6s to -2s before each fMRI sample.

In [None]:
# additionally offsetting the stimulus by one TR (2s) leaves only two samples without NaNs
X, y = make_X_Y([stimulus], [fmri], TR, stim_TR, lag_time=4, offset_stim=2, start_times=[0])
assert X.shape == (2, 40)
assert y.shape == (2, 1)

### Handling out-of-recording data

Because of our shift we "lose" another sample: by default `fill_value` fills values that lie outside the recording interval with NaNs, and by default `remove_nans` specifies that all samples containing NaNs are dropped.

To check that behavior, we see what we get when we don't remove NaNs:

In [None]:
# remove_nans=False keeps the samples that contain NaNs, so all 4 samples remain
X, y = make_X_Y([stimulus], [fmri], TR, stim_TR, lag_time=4, offset_stim=2, start_times=[0], remove_nans=False)
assert X.shape == (4, 40)
assert y.shape == (4, 1)

We keep the original number of samples, but some are filled with NaNs now:

In [None]:
# 60 NaNs: the first row is completely NaN (40 values), the second row to one half (20 values)
assert np.isnan(X).sum() == 60
print(X)

[[nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan nan nan]
 [nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan nan
  nan nan  0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15.
  16. 17. 18. 19.]
 [ 0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16. 17.
  18. 19. 20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35.
  36. 37. 38. 39.]
 [20. 21. 22. 23. 24. 25. 26. 27. 28. 29. 30. 31. 32. 33. 34. 35. 36. 37.
  38. 39. 40. 41. 42. 43. 44. 45. 46. 47. 48. 49. 50. 51. 52. 53. 54. 55.
  56. 57. 58. 59.]]


We can see that the first sample consists entirely of NaNs, because by lagging and offsetting we assume that the fMRI sample at time point t can be predicted by the time period in the stimulus of t-6s to t-2s.
However, we don't have any stimulus presented in that time!
In the second sample we can see that the first half of the stimulus still consists of NaNs: that's because for t=2s, the time period in the stimulus from t-6s to t-2s has only data for t=0s but not t=4s.
Keep in mind that the stimulus at t=0s corresponds to the first 2s of the stimulus (because we reshaped the stimulus TR to correspond to the 2s fmri TR).

In [None]:
show_doc(generate_lagged_stimulus)

<h4 id="generate_lagged_stimulus" class="doc_header"><code>generate_lagged_stimulus</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>generate_lagged_stimulus</code>(**`stimulus`**, **`fmri_samples`**, **`TR`**, **`stim_TR`**, **`lag_time`**=*`6.0`*, **`start_time`**=*`0.0`*, **`offset_stim`**=*`0.0`*, **`fill_value`**=*`nan`*)

Generates a lagged stimulus representation temporally aligned with the fMRI data

Parameters:

    stimuli : ndarray, stimulus representation of shape (samples, features)
    fmri_samples : int, samples of corresponding fmri run
    TR : int, float, repetition time of the fMRI data in seconds
    stim_TR : int, float, repetition time of the stimulus in seconds
    lag_time : int, float, or None, optional,
           lag to introduce for stimuli in seconds,
           if no lagging should be done set this to TR or None
    start_time :  int, float, optional, default 0.
              starting time of the stimulus relative to fMRI recordings in seconds
              appends fill_value to stimulus representation to match fMRI and stimulus
    offset_stim : int, float, optional, default 0.
              time to offset stimulus relative to fMRI in the lagged stimulus,
              i.e. when predicting fmri at time t use only stimulus features
              before t-offset_stim. This reduces the number of time points used
              in the model.
    fill_value : int, float, or any valid numpy array element, optional, default np.nan
             appends fill_value to stimulus array to account for starting_time
             use np.nan here with remove_nans=True to remove fmri/stimulus samples where no stimulus was presented

Returns:
    ndarray of the lagged stimulus of shape (samples, lagged features)

`generate_lagged_stimulus` takes care of aligning fMRI and stimulus data, it is used internally by `make_X_Y`.

In [None]:
show_doc(get_remove_idx)

<h4 id="get_remove_idx" class="doc_header"><code>get_remove_idx</code><a href="__main__.py#L2" class="source_link" style="float:right">[source]</a></h4>

> <code>get_remove_idx</code>(**`lagged_stimulus`**, **`remove_nan`**=*`True`*)

Returns indices of rows in lagged_stimulus to remove