In [1]:
# import basic modules
import sys
import os
import time
import numpy as np
from tqdm import tqdm
import gc
import torch
import argparse
import skimage.transform

# import custom modules
code_dir = '/Users/margarethenderson/Box Sync/imStat/code/'
sys.path.append(code_dir)
from feature_extraction import texture_statistics_gabor, texture_statistics_pyramid, sketch_token_features
from utils import nsd_utils, roi_utils, default_paths

from model_fitting import initialize_fitting, arg_parser, merge_features, fwrf_fit, fwrf_predict

fpX = np.float32
device = 'cpu:0'
# device = initialize_fitting.init_cuda()

In [2]:
# Testing fitting code.
# Hard-coded options for stepping through the fitting pipeline interactively.
# One assignment per line (no ';' / '\' chaining) so each value is easy to find and change.

# Which model to fit, and on which data
fitting_type = 'pyramid_texture'
subject = 1
volume_space = True
up_to_sess = 1              # sessions 0 .. up_to_sess-1 are loaded

# Feature extraction options
n_ori = 4
n_sf = 4
nonlin_fn = False
padding_mode = 'circular'
group_all_hl_feats = True
do_avg_pool = True
downsample_factor = 1.0
do_nms = False
map_ind = -1

# pRF options
n_prf_sd_out = 2
mult_patch_by_prf = True

# Batch sizes
sample_batch_size = 50
voxel_batch_size = 100

# Regression options
zscore_features = True
ridge = True

# Control analyses
shuffle_images = False
random_images = False
random_voxel_data = False

# Which stages to run, and run bookkeeping
do_fitting = True
do_val = True
do_varpart = True
date_str = None
shuff_rnd_seed = 0          # 0 -> fit_fwrf_model picks a new seed from the current time
debug = True                # shortened run (only the first pRFs / voxels are processed)

# PCA options, one flag per feature space.
# The original cell assigned do_pca_st twice and never defined do_pca_bdcn,
# which the 'bdcn' model-name branch reads.
do_pca_pyr_hl = False
do_pca_st = False
do_pca_bdcn = False
min_pct_var = 99
max_pc_to_retain = 400

In [3]:
os.chdir(default_paths.nsd_path)

In [4]:
# Translate fitting_type into a model name (used to build the save path), the list of
# feature types to exclude, and the short name of the primary feature space.

# Default: exclude nothing. Only 'gabor_solo' overrides this. Setting it up front keeps
# the name defined for the 'bdcn' and 'sketch_tokens' branches too (it is read later when
# the texture feature extractor is built).
feature_types_exclude = []

if 'pyramid' in fitting_type:
    model_name = initialize_fitting.get_pyramid_model_name(ridge, n_ori, n_sf, do_pca_hl = do_pca_pyr_hl)
    name1 = 'pyramid_texture'

elif 'gabor_texture' in fitting_type:
    model_name = initialize_fitting.get_gabor_texture_model_name(ridge, n_ori, n_sf)
    name1 = 'gabor_texture'

elif 'gabor_solo' in fitting_type:
    model_name = initialize_fitting.get_gabor_solo_model_name(ridge, n_ori, n_sf)
    feature_types_exclude = ['pixel', 'simple_feature_means', 'autocorrs', 'crosscorrs']
    name1 = 'gabor_solo'

elif 'bdcn' in fitting_type:
    # requires do_pca_bdcn to be defined alongside the other do_pca_* flags
    model_name = initialize_fitting.get_bdcn_model_name(do_pca_bdcn, map_ind)
    name1 = 'bdcn'

elif 'sketch_tokens' in fitting_type:
    model_name = initialize_fitting.get_sketch_tokens_model_name(do_pca_st)
    name1 = 'sketch_tokens'

else:
    raise ValueError('your string for fitting_type was not recognized')

# Optional second feature space appended to the first.
# These branches previously read an undefined name `do_pca`; each now uses the PCA flag
# of the feature space it names, matching the single-model branches above.
if 'plus_sketch_tokens' in fitting_type:
    model_name2 = initialize_fitting.get_sketch_tokens_model_name(do_pca_st)
    model_name = model_name + '_plus_' + model_name2
elif 'plus_bdcn' in fitting_type:
    # requires do_pca_bdcn to be defined alongside the other do_pca_* flags
    model_name2 = initialize_fitting.get_bdcn_model_name(do_pca_bdcn, map_ind)
    model_name = model_name + '_plus_' + model_name2


output_dir, fn2save = initialize_fitting.get_save_path(
    subject, volume_space, model_name,
    shuffle_images, random_images, random_voxel_data,
    debug, date_str,
)

# decide what voxels to use
voxel_mask, voxel_index, voxel_roi, voxel_ncsnr, brain_nii_shape = roi_utils.get_voxel_roi_info(subject, volume_space)

sessions = np.arange(0, up_to_sess)
zscore_betas_within_sess = True
# get all data and corresponding images, in two splits. always fixed set that gets left out
(trn_stim_data, trn_voxel_data, val_stim_data, val_voxel_data,
 image_order, image_order_trn, image_order_val) = nsd_utils.get_data_splits(
    subject,
    sessions=sessions,
    voxel_mask=voxel_mask,
    volume_space=volume_space,
    zscore_betas_within_sess=zscore_betas_within_sess,
    shuffle_images=shuffle_images,
    random_images=random_images,
    random_voxel_data=random_voxel_data,
)

Time Stamp: Sep-24-2021_1914_28

Will save final output file to /Users/margarethenderson/Box Sync/imStat/model_fits/S01/texture_pyramid_ridge_4ori_4sf/Sep-24-2021_1914_28_DEBUG/


Volume space: ROI defs are located at: /Users/margarethenderson/Box Sync/nsd_betas_for_testing/nsddata/ppdata/subj01/func1pt8mm/roi

3794 voxels of overlap between kastner and prf definitions, using prf defs
unique values in retino labels:
[-1.  0.  1.  2.  3.  4.  5.  6.  7.  8.  9. 10. 11. 12. 13. 14. 15. 16.
 17. 18. 19. 20. 21. 22. 23. 24. 25.]
0 voxels of overlap between face and place definitions, using place defs
unique values in categ labels:
[-1.  0. 26. 27. 28. 30. 31. 32. 33.]
1535 voxels are defined (differently) in both retinotopic areas and category areas

14913 voxels are defined across all areas, and will be used for analysis

Loading numerical label/name mappings for all ROIs:
[1, 2, 3, 4, 5, 6, 7]
['V1v', 'V1d', 'V2v', 'V2d', 'V3v', 'V3d', 'hV4']
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 1

In [8]:
# Replace the stimulus arrays with the image-order vectors returned by get_data_splits.
# fit_fwrf_model documents that, for models whose features were pre-computed, `images`
# may be a list of indices into the feature array instead of the images themselves.
# NOTE(review): this overwrites the arrays loaded above; re-run the loading cell to get them back.
trn_stim_data = image_order_trn
val_stim_data = image_order_val

In [424]:
# Remaining fitting parameters: size of the held-out split and the candidate ridge lambdas.
holdout_size, lambdas = initialize_fitting.get_fitting_pars(
    trn_voxel_data, zscore_features, ridge=ridge
)

# Spatial part of the model: the set of candidate pRFs.
aperture_rf_range = 1.1
aperture, models = initialize_fitting.get_prf_models(
    aperture_rf_range=aperture_rf_range
)

group_all_hl_feats = True
# default_paths.pyramid_texture_feat_path = os.path.join(default_paths.root, 'features/pyramid_texture/')

# Steerable pyramid that provides the first-level feature maps.
compute_features = False
_fmaps_fn = texture_statistics_pyramid.steerable_pyramid_extractor(
    pyr_height=n_sf,
    n_ori=n_ori,
)

# "Texture" model built on top of the pyramid feature maps.
# Variance partition is switched off here; it is requested on the combined extractor below.
_feature_extractor = texture_statistics_pyramid.texture_feature_extractor(
    _fmaps_fn,
    sample_batch_size=sample_batch_size,
    subject=subject,
    feature_types_exclude=feature_types_exclude,
    n_prf_sd_out=n_prf_sd_out,
    aperture=aperture,
    do_varpart=False,
    group_all_hl_feats=group_all_hl_feats,
    compute_features=compute_features,
    do_pca_hl=do_pca_pyr_hl,
    min_pct_var=min_pct_var,
    max_pc_to_retain=max_pc_to_retain,
    device=device,
)
# Recorded from the texture extractor before it is wrapped in the combined extractor.
feature_info = [
    _feature_extractor.feature_column_labels,
    _feature_extractor.feature_types_include,
]

# Second feature space: sketch tokens.
map_resolution = 227
_feature_extractor2 = sketch_token_features.sketch_token_feature_extractor(
    subject,
    device,
    map_resolution=map_resolution,
    aperture=aperture,
    n_prf_sd_out=n_prf_sd_out,
    batch_size=sample_batch_size,
    mult_patch_by_prf=mult_patch_by_prf,
    do_avg_pool=do_avg_pool,
    do_pca=do_pca_st,
    min_pct_var=min_pct_var,
    max_pc_to_retain=max_pc_to_retain,
)

# Merge both feature spaces into one extractor; from here on `_feature_extractor`
# refers to the combined model.
_feature_extractor = merge_features.combined_feature_extractor(
    [_feature_extractor, _feature_extractor2],
    [name1, 'sketch_tokens'],
    do_varpart=do_varpart,
)




Possible lambda values are:
[1.0000000e+00 4.2169652e+00 1.7782795e+01 7.4989418e+01 3.1622775e+02
 1.3335215e+03 5.6234131e+03 2.3713736e+04 1.0000000e+05]
most extreme RF positions:
[-0.55 -0.55  0.04]
[0.55       0.55       0.40000001]
Feature types to exclude from the model:
[]


In [420]:
_feature_extractor.modules[0].get_partial_versions()

RuntimeError: need to run init_for_fitting first

In [386]:
_feature_extractor.modules

[texture_feature_extractor(
   (fmaps_fn): steerable_pyramid_extractor()
 ),
 sketch_token_feature_extractor()]

In [425]:
# Fit with an intercept term.
add_bias = True
# Shuffle trials before splitting off the nested held-out data used for
# lambda and parameter selection. Always True.
shuffle = True

best_losses, best_lambdas, best_params, best_train_preds = fit_fwrf_model(
    trn_stim_data,
    trn_voxel_data,
    _feature_extractor,
    models,
    lambdas,
    zscore=zscore_features,
    add_bias=add_bias,
    voxel_batch_size=voxel_batch_size,
    holdout_size=holdout_size,
    shuffle=shuffle,
    shuff_rnd_seed=shuff_rnd_seed,
    device=device,
    dtype=fpX,
    debug=debug,
)

# Trial-wise training-set predictions, kept under the name the stacking code expects.
trn_voxel_data_pred = best_train_preds

dtype = <class 'numpy.float32'>
device = cpu:0
trn_size = 619 (90.0%)
Seeding random number generator: seed is 291125
Initializing for fitting
Clearing precomputed features from memory.
Initializing for fitting
Clearing features from memory
---------------------------------------


Getting features for prf 0: [x,y,sigma] is [-0.55 -0.55 0.0400]
Loading pre-computed features for models [0 - 49] from /Users/margarethenderson/Box Sync/features/pyramid_texture/S1_features_each_prf_4ori_4sf.h5py
Took 20.41089 seconds to load file
Index into batch for prf 0: 0
Size of features array for this image set and prf is:
(688, 641)
Final size of features concatenated is [688 x 641]
Feature types included are:
['pixel_stats', 'mean_magnitudes', 'mean_realparts', 'marginal_stats_lowpass_recons', 'variance_highpass_resid', 'magnitude_feature_autocorrs', 'lowpass_recon_autocorrs', 'highpass_resid_autocorrs', 'magnitude_within_scale_crosscorrs', 'real_within_scale_crosscorrs', 'magnitude_across_scale_cro

In [426]:
# Evaluate the fitted model on the validation split.
val_cc, val_r2, val_voxel_data_pred = validate_fwrf_model(
    best_params,
    models,
    val_voxel_data,
    val_stim_data,
    _feature_extractor,
    sample_batch_size=sample_batch_size,
    voxel_batch_size=voxel_batch_size,
    debug=debug,
    dtype=fpX,
)


Clearing precomputed features from memory.
Clearing features from memory
Getting features for prf 0: [x,y,sigma] is [-0.55 -0.55 0.0400]
Loading pre-computed features for models [0 - 49] from /Users/margarethenderson/Box Sync/features/pyramid_texture/S1_features_each_prf_4ori_4sf.h5py
Took 17.59987 seconds to load file
Index into batch for prf 0: 0
Size of features array for this image set and prf is:
(62, 641)
Final size of features concatenated is [62 x 641]
Feature types included are:
['pixel_stats', 'mean_magnitudes', 'mean_realparts', 'marginal_stats_lowpass_recons', 'variance_highpass_resid', 'magnitude_feature_autocorrs', 'lowpass_recon_autocorrs', 'highpass_resid_autocorrs', 'magnitude_within_scale_crosscorrs', 'real_within_scale_crosscorrs', 'magnitude_across_scale_crosscorrs', 'real_imag_across_scale_crosscorrs', 'real_spatshift_within_scale_crosscorrs', 'real_spatshift_across_scale_crosscorrs']
Final size of features concatenated is [62 x 641]
Final size of features concaten

In [427]:
# Pick which partial models feed the stacking analysis: one entry per module of the
# combined feature extractor.
partial_masks, partial_version_names = _feature_extractor.get_partial_versions()
partial_versions_use = []
for mm in range(len(_feature_extractor.module_names)):
    module_name = _feature_extractor.module_names[mm]
    # Every partial version whose name mentions this module.
    this_module = np.where([(module_name in pp) for pp in partial_version_names])[0]
    if len(this_module) <= 1:
        partial_versions_use += list(this_module)
    else:
        # Several matches means the module has feature subsets of its own; keep only the
        # versions that isolate one subset and exclude the other modules.
        is_isolated_subset = [
            (module_name in pp and '_just' in pp and '_no_other_modules' in pp)
            for pp in partial_version_names
        ]
        partial_versions_use += list(np.where(is_isolated_subset)[0])
print('Subsets of features that are going into the stacking analysis:')
print([partial_version_names[pp] for pp in partial_versions_use])
 
 
 

Subsets of features that are going into the stacking analysis:
['just_pyramid_texture', 'just_sketch_tokens']


In [430]:
# Stack the partial-model predictions.
stack_result, stack_result_lo, partial_models_use = run_stacking(
    _feature_extractor,
    trn_voxel_data,
    val_voxel_data,
    trn_voxel_data_pred,
    val_voxel_data_pred,
)
    

Subsets of features that are going into the stacking analysis:
['just_pyramid_texture', 'just_sketch_tokens']
Running stacking, feat_use is:
[0 1]
Solving for stacking weights for voxel 0 of 14913
Stacking weights matrix is size:
(14913, 2)
Solving for stacking weights for voxel 1 of 14913
Computing performance of stacked models


  c /= stddev[:, None]
  c /= stddev[None, :]


In [444]:
partial_models_use

[1, 2]

In [429]:
def run_stacking(_feature_extractor, trn_voxel_data, val_voxel_data, trn_voxel_data_pred, val_voxel_data_pred, debug=None):
    """
    Run the stacking analysis on the predictions of the partial models.

    Inputs:
        _feature_extractor: combined feature extractor; must provide get_partial_versions()
            and a module_names attribute.
        trn_voxel_data: training voxel data, [n_trials x n_voxels]
        val_voxel_data: validation voxel data, passed through to stacked_core.
        trn_voxel_data_pred: training-set predictions, indexed [voxels, trials, partial version]
            (each slice is transposed to [n_trials x n_voxels] before use).
        val_voxel_data_pred: validation-set predictions, indexed [:, :, partial version];
            slices are used without transposing.
        debug: passed to stacked_core (shortened run). If None (the default), the
            notebook-level `debug` variable is used when it exists, otherwise False.
            This keeps calls that do not pass the argument behaving as before.
    Outputs:
        stack_result: tuple returned by stacked_core when stacking all feature groups.
        stack_result_lo: dict {left-out group index: stacked_core result}, or None when
            there are two or fewer feature groups.
        partial_models_use: indices (into the partial versions) of the models that were stacked.
    """
    if debug is None:
        debug = globals().get('debug', False)

    # To get the "features" to use for stacking - using the "partial models" that are defined
    # by the feature extractor. The first one is the full model, so we don't want to use that - just
    # want the ones that include just a subset of the full feature space.
    partial_masks, partial_version_names = _feature_extractor.get_partial_versions()
    partial_models_use = []
    for module_name in _feature_extractor.module_names:
        this_module = np.where([(module_name in pp) for pp in partial_version_names])[0]
        if len(this_module)>1:
            # this means there are 'subsets' of features within this module that we will want to
            # consider separately. so finding just the ones that we want here.
            partial_models_use += list(np.where([(module_name in pp and '_just' in pp and '_no_other_modules' in pp) \
                         for pp in partial_version_names])[0])
        else:
            partial_models_use += list(this_module)
    print('Subsets of features that are going into the stacking analysis:')
    print([partial_version_names[pp] for pp in partial_models_use])

    n_feature_groups = len(partial_models_use)

    # Creating a list where each element is predictions for one of the partial models - these will be
    # the 'features' elements input to stacking code.
    preds_train = [trn_voxel_data_pred[:,:,pp].T for pp in partial_models_use]
    preds_val = [val_voxel_data_pred[:,:,pp] for pp in partial_models_use]
    # Compute trial-wise training errors
    # each element of err is [ntrials x nvoxels]
    train_err = [trn_voxel_data - pred for pred in preds_train]

    # Also computing the performance of each of the partial versions on training set data.
    # this is sort of a sanity check that things are working, since the performance of the partial
    # models should roughly predict what the stacking weights will be.
    # NOTE: these two arrays are not returned at present.
    train_r2 = np.array([get_r2(trn_voxel_data, trn_voxel_data_pred[:,:,pp].T, axis=0) \
                         for pp in range(len(partial_version_names))]).T
    train_r2 = np.nan_to_num(train_r2)
    train_cc = np.array([get_corrcoef(trn_voxel_data, trn_voxel_data_pred[:,:,pp].T, axis=0) \
                         for pp in range(len(partial_version_names))]).T
    train_cc = np.nan_to_num(train_cc)

    # First running stacking w all features included
    feat_use = np.arange(0,n_feature_groups)
    # Stack result will be a tuple including the stacking weights, performance.
    stack_result = stacked_core(feat_use, train_err, train_data=trn_voxel_data,\
                     val_data = val_voxel_data, preds_train = preds_train, preds_val = preds_val,\
                     debug=debug)

    # Then going to repeat it leaving out one feature group at a time
    # This will only make sense to do there are more than 2 feature groups, otherwise it's just
    # single models.
    if n_feature_groups>2:
        stack_result_lo = dict()
        for leave_one in range(n_feature_groups):
            # Built with a comprehension so this function does not depend on `copy`
            # having been imported by some other cell.
            feat_use_lo = [ff for ff in feat_use if ff != leave_one]
            stack_result_lo[leave_one] = stacked_core(feat_use_lo, train_err, train_data=trn_voxel_data,\
                             val_data = val_voxel_data, preds_train = preds_train, preds_val = preds_val,\
                             debug=debug)
    else:
        stack_result_lo = None

    return stack_result, stack_result_lo, partial_models_use

In [372]:
np.nan_to_num(stack_result[4])

array([-0.1031163 , -0.08829378,  0.        , ...,  0.        ,
        0.        ,  0.        ], dtype=float32)

In [373]:
from __future__ import division
import numpy as np
import os
from scipy.stats import zscore
from numpy.linalg import inv, svd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import Ridge, RidgeCV
import time
from scipy.stats import zscore
from cvxopt import matrix, solvers

solvers.options["show_progress"] = False

def stacked_core(feat_use, train_err, train_data, val_data, preds_train, preds_val, debug=False):
    """
    Compute weights for stacking models (linearly combining predictions of multiple encoding models).
    Outputs weights and performance of the stacked model.
    Code from Ruogu Lin (modified slightly for this project).

    Inputs:
        feat_use: indices of the feature groups to stack; they index into train_err,
            preds_train and preds_val.
        train_err: list with one [n_trials x n_voxels] array of training errors per feature group.
        train_data: training voxel data, [n_trials x n_voxels].
        val_data: validation voxel data, compared against the stacked validation predictions.
        preds_train: list of training predictions per feature group, each [n_trials x n_voxels].
        preds_val: list of validation predictions per feature group, each [n_val_trials x n_voxels].
        debug: if True, only voxels 0 and 1 are solved; the stacking weights and stacked
            predictions of all other voxels stay at zero.
    Outputs (tuple):
        S: stacking weights, [n_voxels x n_feature_groups]
        stacked_r2_train, stacked_cc_train: performance of the stacked model on training data
        stacked_r2_val, stacked_cc_val: performance of the stacked model on validation data
        (NaNs in the performance measures are replaced by 0.)
    """
    print('Running stacking, feat_use is:')
    print(feat_use)
    n_voxels = train_data.shape[1]
    n_feature_groups = len(feat_use) # feat use is the sub-set of feature groups to stack.
    n_trials_train = train_data.shape[0]
    n_trials_val = preds_val[0].shape[0]

    dtype = train_data.dtype
    stacked_pred_train = np.zeros(shape=(n_trials_train, n_voxels), dtype=dtype)
    stacked_pred_val = np.zeros(shape=(n_trials_val, n_voxels), dtype=dtype)

    # calculate error matrix for stacking
    # err is the trialwise, voxelwise, error for each model.
    # P stores, per voxel, the mean over trials of the product of the errors for each pair
    # of models (if i=j, then it's the mean squared error).
    P = np.zeros((n_voxels, n_feature_groups, n_feature_groups))
    for row, i in enumerate(feat_use):
        for col, j in enumerate(feat_use):
            P[:, row, col] = np.mean(train_err[i] * train_err[j], 0)

    # Constant parts of the quadratic program, sized from the number of feature groups:
    # no linear term (q), weights >= 0 (G, h), weights sum to 1 (A, b).
    q = matrix(np.zeros((n_feature_groups)))
    G = matrix(-np.eye(n_feature_groups, n_feature_groups))
    h = matrix(np.zeros(n_feature_groups))
    A = matrix(np.ones((1, n_feature_groups)))
    b = matrix(np.ones(1))

    # Stacking weights will be stored here
    S = np.zeros((n_voxels, n_feature_groups))

    for vv in range(0, n_voxels):
        if debug and vv>1:
            # Stop outright: every later voxel would be skipped too, so there is no point
            # in walking the remaining iterations.
            break

        print('Solving for stacking weights for voxel %d of %d'%(vv, n_voxels))
        PP = matrix(P[vv])
        # solve for stacking weights for every voxel
        # This essentially is minimizing the quantity x.T @ PP @ x, subject to the constraint that
        # the elements of x have to be positive, and have to sum to 1.
        # x will be the weights for the stacking model.
        # Weights will be dependent on the error of each model individually (this is contained in PP).
        S[vv, :] = np.array(solvers.qp(PP, q, G, h, A, b)["x"]).reshape(n_feature_groups,)
        if vv==0:
            print('Stacking weights matrix is size:')
            print(S.shape)

        # Combine the predictions from the individual feature spaces for voxel vv
        z = np.array([preds_val[feature_j][:, vv] for feature_j in feat_use])
        # multiply the predictions by S[vv,:]
        stacked_pred_val[:, vv] = np.dot(S[vv, :], z)

        # Same thing for the training trials
        z = np.array([preds_train[feature_j][:, vv] for feature_j in feat_use])
        stacked_pred_train[:, vv] = np.dot(S[vv, :], z)

    print('Computing performance of stacked models')
    # R2 is not symmetric in its arguments. run_stacking in this notebook calls
    # get_r2(actual, predicted), so the same order is used here (this function previously
    # passed the prediction first).
    # NOTE(review): confirm against the definition of get_r2.
    stacked_r2_train = np.nan_to_num(get_r2(train_data, stacked_pred_train, axis=0))
    stacked_cc_train = np.nan_to_num(get_corrcoef(train_data, stacked_pred_train, axis=0))

    # And for validation data
    stacked_r2_val = np.nan_to_num(get_r2(val_data, stacked_pred_val, axis=0))
    stacked_cc_val = np.nan_to_num(get_corrcoef(val_data, stacked_pred_val, axis=0))

    return S, stacked_r2_train, stacked_cc_train, stacked_r2_val, stacked_cc_val


In [121]:
import sys
import os
import struct
import time
import copy
import numpy as np
import h5py
from tqdm import tqdm
import pickle
import math
import sklearn
from sklearn import decomposition

import torch
import torch.nn as nn
import torch.nn.init as I
import torch.nn.functional as F
import torch.optim as optim

from utils import numpy_utils, torch_utils, texture_utils

"""
General code for fitting a 'feature weighted receptive field' model to fmri data - looping over many candidate pRF 
models for each voxel, find a set of weights that best predict its responses based on feature space of interest.
Can work for many different types of feature spaces, feature extraction implemented with nn.Module.

Original source of some of this code is the github repository:
https://github.com/styvesg/nsd
It was modified by MH to work for this project.
"""


def _cofactor_fn_cpu(_x, lambdas):
    '''
    Generating a matrix needed to solve ridge regression model for each lambda value.
    Ridge regression (Tikhonov) solution is :
    w = (X^T*X + I*lambda)^-1 * X^T * Y
    This func will return (X^T*X + I*lambda)^-1 * X^T. 
    So once we have that, can just multiply by training data (Y) to get weights.
    returned size is [nLambdas x nFeatures x nTrials]
    This version makes sure that the torch inverse operation is done on the cpu, and in floating point-64 precision.
    Otherwise get bad results for small lambda values. This seems to be a torch-specific bug, noted around May 2021.
    
    '''
    device_orig = _x.device
    type_orig = _x.dtype
    # switch to this specific format which works with inverse
    _x = _x.to('cpu').to(torch.float64)
    _f = torch.stack([(torch.mm(torch.t(_x), _x) + torch.eye(_x.size()[1], device='cpu', dtype=torch.float64) * l).inverse() for l in lambdas], axis=0) 
    
    # [#lambdas, #feature, #feature] 
    cof = torch.tensordot(_f, _x, dims=[[2],[1]]) # [#lambdas, #feature, #sample]
    
    # put back to whatever way it was before, so that we can continue with other operations as usual
    return cof.to(device_orig).to(type_orig)



def _loss_fn(_cofactor, _vtrn, _xout, _vout):
    '''
    Calculate loss given "cofactor" from cofactor_fn, training data, held-out design matrix, held out data.
    returns weights (betas) based on equation
    w = (X^T*X + I*lambda)^-1 * X^T * Y
    also returns loss for these weights w the held out data. SSE is loss func here.
    '''

    _beta = torch.tensordot(_cofactor, _vtrn, dims=[[2], [0]]) # [#lambdas, #feature, #voxel]
    _pred = torch.tensordot(_xout, _beta, dims=[[1],[1]]) # [#samples, #lambdas, #voxels]
    _loss = torch.sum(torch.pow(_vout[:,None,:] - _pred, 2), dim=0) # [#lambdas, #voxels]
    return _beta, _loss



def fit_fwrf_model(images, voxel_data, _feature_extractor, prf_models, lambdas, \
                   zscore=False, add_bias=False, voxel_batch_size=100, holdout_size=100, \
                       shuffle=True, shuff_rnd_seed=0, device=None, dtype=np.float32, debug=False):
    
    """
    Solve for encoding model weights using ridge regression.
    Inputs:
        images: the training images, [n_trials x 1 x height x width]
            OR for models where features were pre-computed, this is a list of indices [n_trials,] into the 10,000 long feature array.
        voxel_data: the training voxel data, [n_trials x n_voxels]
        _feature_extractor_fn: module that maps from images to model features
        prf_models: the list of possible pRFs to test, columns are [x, y, sigma]
        lambdas: ridge lambda parameters to test
        zscore: want to zscore each column of feature matrix before fitting?
        add_bias: add a column of ones to feature matrix, for an additive bias?
        voxel_batch_size: how many voxels to use at a time for model fitting
        holdout_size: how many training trials to hold out for computing loss/lambda selection?
        shuffle: do we shuffle training data order before holding trials out?      
        shuff_rnd_seed: if we do shuffle training data (shuffle=True), what random seed to use? if zero, choose a new random seed in this code.
        device: what device to use? cpu/cuda
        debug: want to run a shortened version of this, to test it?
    Outputs:
        best_losses: loss value for each voxel (with best pRF and best lambda), eval on held out set
        best_lambdas: best lambda for each voxel (chosen based on loss w held out set)
        best_params: 
            [0] best pRF for each voxel [x,y,sigma]
            [1] best weights for each voxel/feature
            [2] if add_bias=True, best bias value for each voxel
            [3] if zscore=True, the mean of each feature before z-score
            [4] if zscore=True, the std of each feature before z-score
            [5] index of the best pRF for each voxel (i.e. index of row in "prf_models")
        
    """

    if device is None:
        device=torch.device('cpu:0')

    print ('dtype = %s' % dtype)
    print ('device = %s' % device)

    n_trials = len(images)
    n_prfs = len(prf_models)
    n_voxels = voxel_data.shape[1]   

    # Get train/holdout splits.
    # Held-out data here is used for lamdba selection.
    # This is the inner part of nested cross-validation; there is another portion of data ('val') which never enters this function.
    trn_size = n_trials - holdout_size
    assert trn_size>0, 'Training size needs to be greater than zero'
    print ('trn_size = %d (%.1f%%)' % (trn_size, float(trn_size)*100/len(voxel_data)))
    order = np.arange(len(voxel_data), dtype=int)
    if shuffle:
        if shuff_rnd_seed==0:
            print('Computing a new random seed')
            shuff_rnd_seed = int(time.strftime('%M%H%d', time.localtime()))
        print('Seeding random number generator: seed is %d'%shuff_rnd_seed)
        np.random.seed(shuff_rnd_seed)
        np.random.shuffle(order)
        
    images = images[order]
    voxel_data_shuff = copy.deepcopy(voxel_data)
    voxel_data_shuff = voxel_data_shuff[order]  
    trn_data = voxel_data_shuff[:trn_size]
    out_data = voxel_data_shuff[trn_size:]

    
    # Here is where any model-specific additional initialization steps are done
    # Includes initializing pca params arrays, if doing pca
    if len(images.shape)>1:
        image_size = images.shape[2:4]
    else:
        image_size = None
    _feature_extractor.init_for_fitting(image_size, prf_models, dtype)
    max_features = _feature_extractor.max_features

    # Decide whether to do any "partial" versions of the models (leaving out subsets of features)
    # Purpose is for variance partition
    masks, partial_version_names = _feature_extractor.get_partial_versions()
    n_partial_versions = len(partial_version_names) # will be one if skipping varpart
    if add_bias:
        masks = np.concatenate([masks, np.ones([masks.shape[0],1])], axis=1) # always include intercept 
    masks = np.transpose(masks)
    # masks is [n_features_total (including intercept) x n_partial_versions]

    # Initialize arrays to store model fitting params
    best_w_params = np.zeros(shape=(n_voxels, max_features ,n_partial_versions), dtype=dtype)
    best_prf_models = np.full(shape=(n_voxels,n_partial_versions), fill_value=-1, dtype=int)   
    best_lambdas = np.full(shape=(n_voxels,n_partial_versions), fill_value=-1, dtype=int)
    best_losses = np.full(fill_value=np.inf, shape=(n_voxels,n_partial_versions), dtype=dtype)

    # Initialize arrays to store the trial-wise predictions (need these for stacking)
    # Note that this is all training set trials - including the held out trials.
    best_train_preds = np.zeros(shape=(n_voxels, n_trials, n_partial_versions), dtype=dtype)

    # Additional params that are optional
    if add_bias:
        best_w_params = np.concatenate([best_w_params, np.zeros(shape=(n_voxels,1,n_partial_versions), dtype=dtype)], axis=1)

    if zscore:
        features_mean = np.zeros(shape=(n_voxels, max_features), dtype=dtype)
        features_std  = np.zeros(shape=(n_voxels, max_features), dtype=dtype)
    else:
        features_mean = None
        features_std = None

    start_time = time.time()
    vox_loop_time = 0

    print ('---------------------------------------\n')
    
    with torch.no_grad(): # make sure local gradients are off to save memory
        
        # Looping over prf_models (here prf_models are different spatial RF definitions)
        for m,(x,y,sigma) in enumerate(prf_models):
            if debug and m>1:
                break
                
            print('\nGetting features for prf %d: [x,y,sigma] is [%.2f %.2f %.4f]'%(m, prf_models[m,0],  prf_models[m,1],  prf_models[m,2]))

            t = time.time()            

            # Get features for the desired pRF, across all trn set image  
            # Features is size [ntrials x nfeatures]
            # nfeatures may be less than max_features, because max_features is the largest number possible for any pRF.
            # feature_inds_defined is length max_features, and tells which of the features in max_features are includes in features.
            features, feature_inds_defined = _feature_extractor(images, (x,y,sigma), m, fitting_mode=True)
            features = features.detach().cpu().numpy() 
            
            elapsed = time.time() - t

            n_features_actual = features.shape[1]
            
            if zscore:  
                features_m = np.mean(features, axis=0, keepdims=True) #[:trn_size]
                features_s = np.std(features, axis=0, keepdims=True) + 1e-6          
                features -= features_m
                features /= features_s    

            if add_bias:
                features = np.concatenate([features, np.ones(shape=(len(features), 1), dtype=dtype)], axis=1)
                feature_inds_defined = np.concatenate((feature_inds_defined, [True]), axis=0)
                
            trn_features = features[:trn_size,:]
            out_features = features[trn_size:,:]
            
            
            # Going to keep track of whether current prf is better than running best, for each voxel.
            # This is for the full model only.
            # Will use this to make sure for each partial model, we end up saving the params for the prf that was best w full model.
            full_model_improved = np.zeros((n_voxels,),dtype=bool)

            # Looping over versions of model w different features set to zero (variance partition)
            for pp in range(n_partial_versions):

                print('\nFitting version %d of %d: %s, '%(pp, n_partial_versions, partial_version_names[pp]))

                # nonzero_inds_full is length max_features (or max_features+1 if bias=True)
                # same size as the final params matrices will be.
                nonzero_inds_full = np.logical_and(masks[:,pp], feature_inds_defined)             
                # nonzero_inds_full is restricted to just indices that are defined for this prf - ie same size as features.
                nonzero_inds_short = masks[feature_inds_defined,pp]==1
        
                # Send matrices to gpu    
                _xtrn = torch_utils._to_torch(trn_features[:, nonzero_inds_short], device=device)
                _xout = torch_utils._to_torch(out_features[:, nonzero_inds_short], device=device)   

                # Do part of the matrix math involved in ridge regression optimization out of the loop, 
                # because this part will be same for all the voxels.
                _cof = _cofactor_fn_cpu(_xtrn, lambdas = lambdas) 

                # Now looping over batches of voxels (only reason is because can't store all in memory at same time)
                vox_start = time.time()
                vi=-1
                for rv,lv in numpy_utils.iterate_range(0, n_voxels, voxel_batch_size):
                    vi=vi+1
                    sys.stdout.write('\rfitting model %4d of %-4d, voxels [%6d:%-6d] of %d' % (m, n_prfs, rv[0], rv[-1], n_voxels))

                    # Send matrices to gpu
                    _vtrn = torch_utils._to_torch(trn_data[:,rv], device=device)
                    _vout = torch_utils._to_torch(out_data[:,rv], device=device)

                    # Here is where optimization happens - relatively simple matrix math inside loss fn.
                    _betas, _loss = _loss_fn(_cof, _vtrn, _xout, _vout) #   [#lambda, #feature, #voxel, ], [#lambda, #voxel]
                    
                    # Get trial-by-trial predictions for each training set trial (need for stacking)
                    _pred_train = torch.tensordot(_xtrn, _betas, dims=[[1],[1]]) # [#samples, #lambdas, #voxels]
                    _pred_out = torch.tensordot(_xout, _betas, dims=[[1],[1]]) # [#samples, #lambdas, #voxels]
                    pred_train = torch_utils.get_value(_pred_train)
                    pred_out = torch_utils.get_value(_pred_out)
                    # Going to combine the training and held out trials and re-create their original order here.
                    preds_all_shuffled = np.concatenate((pred_train, pred_out), axis=0)
                    preds_all_origorder = unshuffle(preds_all_shuffled, order) # [#samples x lambdas x voxels]
    
                    # Now have a set of weights (in betas) and a loss value for every voxel and every lambda. 
                    # goal is then to choose for each voxel, what is the best lambda and what weights went with that lambda.

                    # choose best lambda value and the loss that went with it.
                    _loss_values, _lambda_index = torch.min(_loss, dim=0)
                    loss_values, lambda_index = torch_utils.get_value(_loss_values), torch_utils.get_value(_lambda_index)
                    betas = torch_utils.get_value(_betas)
                    pred = torch_utils.get_value(_pred)

                    if pp==0:

                        # comparing this loss to the other prf_models for each voxel (e.g. the other RF position/sizes)
                        assert(partial_version_names[pp]=='full_model' or partial_version_names[pp]=='full_combined_model')               
                        imp = loss_values<best_losses[rv,pp]
                        full_model_improved[rv] = imp

                    else:

                        # for the partial models we don't actually care which was best for the partial model itself,
                        # just care what was best for the full model
                        imp = full_model_improved[rv]


                    if np.sum(imp)>0:

                        # for whichever voxels had improvement relative to previous prf_models, save parameters now
                        # this means we won't have to save all params for all prf_models, just best.
                        arv = np.array(rv)[imp]

                        lambda_inds = lambda_index[imp]
                        best_lambdas[arv,pp] = lambda_inds
                        best_losses[arv,pp] = loss_values[imp]                        
                        best_prf_models[arv,pp] = m
                        if zscore and pp==0:
                            
                            # only need to update the mean/std if we're working with the full model, because those will be same for all partial versions.
                            fmean_tmp = copy.deepcopy(features_mean[arv,:])
                            fstd_tmp = copy.deepcopy(features_std[arv,:])
                            fmean_tmp[:,nonzero_inds_full[0:-1]] = features_m[0,nonzero_inds_short[0:-1]] # broadcast over updated voxels
                            fmean_tmp[:,~nonzero_inds_full[0:-1]] = 0.0
                            fstd_tmp[:,nonzero_inds_full[0:-1]] = features_s[0,nonzero_inds_short[0:-1]] # broadcast over updated voxels
                            fstd_tmp[:,~nonzero_inds_full[0:-1]] = 0.0
                            features_mean[arv,:] = fmean_tmp
                            features_std[arv,:] = fstd_tmp
                            
                        # taking the weights associated with the best lambda value
                        # remember that they won't fill entire matrix, rest of values stay at zero
                        best_w_tmp = copy.deepcopy(best_w_params[arv,:,pp])
                        best_w_tmp[:,nonzero_inds_full] = numpy_utils.select_along_axis(betas[:,:,imp], lambda_inds, run_axis=2, choice_axis=0).T
                        best_w_tmp[:,~nonzero_inds_full] = 0.0 # make sure to fill zeros here

                        best_w_params[arv,:,pp] = best_w_tmp
                        
                        # Save the trialwise predictions for all trials in their original order.
                        # Choosing predictions from whichever lambda was best.
                        best_train_preds[arv,:,pp] = numpy_utils.select_along_axis(preds_all_origorder[:,:,imp], \
                                                                               lambda_inds, run_axis=2, choice_axis=1).T;

#                         best_pred_tmp. =
                
                vox_loop_time += (time.time() - vox_start)
                elapsed = (time.time() - vox_start)
                sys.stdout.flush()

    # Print information about how fitting went...
    total_time = time.time() - start_time
    inv_time = total_time - vox_loop_time
    return_params = [best_w_params[:,0:max_features,:],]
    if add_bias:
        return_params += [best_w_params[:,-1,:],]
    else: 
        return_params += [None,]
    print ('\n---------------------------------------')
    print ('total time = %fs' % total_time)
    print ('total throughput = %fs/voxel' % (total_time / n_voxels))
    print ('voxel throughput = %fs/voxel' % (vox_loop_time / n_voxels))
    print ('setup throughput = %fs/model' % (inv_time / n_prfs))
    
    # This step clears the big feature maps for training data from feature extractor (no longer needed)
    _feature_extractor.clear_big_features()
    
    best_params = [prf_models[best_prf_models],]+return_params+[features_mean, features_std]+[best_prf_models]
    sys.stdout.flush()

    return best_losses, best_lambdas, best_params, best_train_preds

In [321]:
import sys
import os
import struct
import time
import numpy as np
import tqdm
import copy

import torch

from utils import numpy_utils, torch_utils


def validate_fwrf_model(best_params, prf_models, voxel_data, images, _feature_extractor, \
                                   sample_batch_size=100, voxel_batch_size=100, debug=False, dtype=np.float32):
    
    """ 
    Evaluate trained model, leaving out a subset of features at a time.

    Features are first gathered from _feature_extractor for every pRF. Then, for each
    batch of voxels and each partial version of the model, the features of each voxel's
    selected pRF are optionally z-scored with the stored mean/std, multiplied by the
    fitted weights and offset by the bias (when there is one). Predictions are compared
    against voxel_data with get_corrcoef and get_r2.

    Parameters
    ----------
    best_params : sequence of 6 items
        [best_models, weights, bias, features_mt, features_st, best_model_inds].
        weights is indexed as [voxel, feature, partial version].
        bias is indexed as [voxel, partial version], or is None when the model was
        fit without an intercept.
        features_mt / features_st are indexed as [voxel, feature], or are None when
        no z-scoring should be applied.
        best_model_inds is indexed as [voxel, partial version] and holds indices
        into prf_models.
    prf_models : array, one row per pRF; the first three columns are printed as [x, y, sigma].
    voxel_data : array [n_trials x n_voxels] of measured responses.
    images : passed through to _feature_extractor; len(images) defines n_trials.
    _feature_extractor : callable object providing .device, .max_features,
        .get_partial_versions() and .clear_big_features().
    sample_batch_size : number of trials processed at once.
    voxel_batch_size : number of voxels processed at once.
    debug : if True, only the first two pRFs and first two voxel batches are processed.
    dtype : numpy dtype of the output arrays.

    Returns
    -------
    val_cc : [n_voxels x n_partial_versions] correlation between real and predicted responses.
    val_r2 : [n_voxels x n_partial_versions] coefficient of determination.
    pred_voxel_data : [n_trials x n_voxels x n_partial_versions] trial-wise predictions.
    """
    
    best_models, weights, bias, features_mt, features_st, best_model_inds = best_params
    device = _feature_extractor.device
    
    n_trials = len(images)
    n_voxels = np.shape(voxel_data)[1]
    n_prfs = prf_models.shape[0]

    masks, partial_version_names = _feature_extractor.get_partial_versions()
    masks = np.transpose(masks)    
    n_features_max = _feature_extractor.max_features
    n_partial_versions = len(partial_version_names)
    
    # val_cc is the correlation coefficient bw real and predicted responses across trials, for each voxel.
    val_cc  = np.zeros(shape=(n_voxels, n_partial_versions), dtype=dtype)
    val_r2 = np.zeros(shape=(n_voxels, n_partial_versions), dtype=dtype)

    pred_models = np.full(fill_value=0, shape=(n_trials, n_features_max, n_prfs), dtype=dtype)
    feature_inds_defined_each_prf = np.full(fill_value=0, shape=(n_features_max, n_prfs), dtype=bool)
    
    # Saving full trial-by-trial predictions for each voxel, each partial model.
    # Need these for stacking.
    pred_voxel_data = np.full(fill_value=0, shape=(n_trials, n_voxels, n_partial_versions), dtype=dtype)
    
    with torch.no_grad(): # make sure local gradients are off to save memory
        
        # First gather texture features for all pRFs.
        
        _feature_extractor.clear_big_features()
        
        for mm in range(n_prfs):
            if mm>1 and debug:
                break
            print('Getting features for prf %d: [x,y,sigma] is [%.2f %.2f %.4f]'%(mm, prf_models[mm,0],  prf_models[mm,1],  prf_models[mm,2] ))
            # all_feat_concat is size [ntrials x nfeatures]
            # nfeatures may be less than n_features_max, because n_features_max is the largest number possible for any pRF.
            # feature_inds_defined is length max_features, and tells which of the features in max_features are included in features.
            all_feat_concat, feature_inds_defined = _feature_extractor(images, prf_models[mm,:], mm, fitting_mode=False)
            
            pred_models[:,feature_inds_defined,mm] = torch_utils.get_value(all_feat_concat)
            feature_inds_defined_each_prf[:,mm] = feature_inds_defined
            
        _feature_extractor.clear_big_features()
        
        ## Looping over voxels here in batches, will eventually go through all.
        for vv, (rv, lv) in enumerate(numpy_utils.iterate_range(0, n_voxels, voxel_batch_size)):
            print('Getting predictions for voxels [%d-%d] of %d'%(rv[0],rv[-1],n_voxels))

            if vv>1 and debug:
                break

            # Looping over versions of model w different features set to zero (variance partition)
            for pp in range(n_partial_versions):

                print('\nEvaluating version %d of %d: %s'%(pp, n_partial_versions, partial_version_names[pp]))

                # masks describes the indices of the features that are included in this partial model
                # n_features_max in length
                features_to_use = masks[:,pp]==1
                print('Includes %d features'%np.sum(features_to_use))

                # [trials x features x voxels]
                features_full = pred_models[:,:,best_model_inds[rv,pp]]
                # Take out the relevant features now
                features_full = features_full[:,features_to_use,:]
                # Note there may be some zeros in this matrix, if we used fewer than the max number of features.
                # But they are zero in weight matrix too, so turns out ok.

                _weights = torch_utils._to_torch(weights[rv,:,pp], device=device)   
                _weights = _weights[:, features_to_use]
                # bias is None when the model was fit without an intercept; indexing it
                # unconditionally would raise before the "is not None" check further down.
                if bias is not None:
                    _bias = torch_utils._to_torch(bias[rv,pp], device=device)
                else:
                    _bias = None

                print('number of zeros:')
                print(np.sum(features_full[0,:,0]==0))

                print('size of weights is:')
                print(_weights.shape)

                # Stored mean/std are [voxels x features]; reshape once per voxel batch to
                # [1 x features x voxels] so they broadcast against [trials x features x voxels].
                if features_mt is not None:
                    _features_m = torch_utils._to_torch(features_mt[rv,:], device=device)
                    _features_m = torch.unsqueeze(_features_m[:,features_to_use].t(), dim=0)
                if features_st is not None:
                    _features_s = torch_utils._to_torch(features_st[rv,:], device=device)
                    _features_s = torch.unsqueeze(_features_s[:,features_to_use].t(), dim=0)

                pred_block = np.full(fill_value=0, shape=(n_trials, lv), dtype=dtype)

                # Now looping over validation set trials in batches
                for rt, lt in numpy_utils.iterate_range(0, n_trials, sample_batch_size):

                    # [trials in this batch x features x voxels]
                    _features = torch_utils._to_torch(features_full[rt,:], device=device)
                    if features_mt is not None:    
                        _features = _features - _features_m

                    if features_st is not None:
                        _features = _features/_features_s
                        # if any entries in std are zero or nan, this gives bad result - fix these now.
                        # these bad entries will also be zero in weights, so doesn't matter. just want to avoid nans.
                        _features[torch.isnan(_features)] = 0.0 
                        _features[torch.isinf(_features)] = 0.0
                        
                    # features is [#samples, #features, #voxels] - swap dims to [#voxels, #samples, features]
                    _features = torch.transpose(torch.transpose(_features, 0, 2), 1, 2)
                    # weights is [#voxels, #features]
                    # _r will be [#voxels, #samples, 1] - then [#samples, #voxels]

                    _r = torch.squeeze(torch.bmm(_features, torch.unsqueeze(_weights, 2)), dim=2).t() 

                    if _bias is not None:
                        # _bias is [#voxels]; broadcast it over the samples dimension.
                        _r = _r + torch.unsqueeze(_bias, 0)

                    pred_block[rt] = torch_utils.get_value(_r) 
                
                # Making sure to save these so that we can get stacking performance later.
                pred_voxel_data[:,rv,pp] = pred_block
                
                # Now for this batch of voxels and this partial version of the model, measure performance.
                val_cc[rv,pp] = get_corrcoef(voxel_data[:,rv], pred_block, axis=0)
                val_r2[rv,pp] = get_r2(voxel_data[:,rv], pred_block, axis=0)

                sys.stdout.flush()

    # any nans become zeros here.
    val_cc = np.nan_to_num(val_cc)
    val_r2 = np.nan_to_num(val_r2) 
    
    return val_cc, val_r2, pred_voxel_data


In [285]:
# Scratch cell: run the stacking routine on the train/validation split.
# Every name on the right-hand side comes from earlier kernel state (not defined in this cell).
train_data=trn_voxel_data; val_data = val_voxel_data;
# One [trials x voxels] prediction array per partial model selected in partial_models_use.
preds_val = [val_voxel_data_pred[:,:,pp] for pp in partial_models_use]
# The result is kept as a single object; a later cell checks that it is a tuple.
a = stacked_core(feat_use, err, train_data, val_data, preds_train, preds_val, debug=True)

Solving for stacking weights for voxel 0 of 14913
Stacking weights matrix is size:
(14913, 2)
Solving for stacking weights for voxel 1 of 14913
Computing performance of stacked models


In [288]:
isinstance(a, tuple)

True

In [216]:
# Scratch cell: call stacked_core with twelve arguments and unpack four return values.
# NOTE(review): the other stacked_core call in this notebook passes a different, shorter
# argument list, so the two cells presumably target different versions of the function - confirm
# which one is current before re-running.
stacked_pred, stacked_train_r2s_fold, S_average, S = stacked_core(
    n_voxels,
    feat_use,
    err,
    train_data,
    preds_test,
    preds_train,
    test_ind,
    ind_num,
    stacked_pred,
    stacked_train_r2s_fold,
    S_average,
    debug=True,
)

In [231]:
S[0,:]

array([1.04497046e-06, 9.99998955e-01])

In [232]:
S_average[0,:]

array([1.04497046e-06, 9.99998955e-01])

In [173]:
n_features = len(feat_use)

# Build the error matrix used for stacking: one [n_features x n_features] matrix per voxel.
P = np.zeros((n_voxels, n_features, n_features))
for idI, i in enumerate(feat_use):
    for idJ, j in enumerate(feat_use):
        # err holds the trialwise, voxelwise error of each model.
        # Entry (idI, idJ) is the mean over the first axis of the product of the errors
        # of models i and j (when i == j this is the mean squared error).
        P[:, idI, idJ] = np.mean(err[i] * err[j], 0)

# Reset the position counters, as the original cell did.
idI = 0
idJ = 0

# Inputs for the quadratic program; every size is derived from n_features.
q = matrix(np.zeros((n_features)))
G = matrix(-np.eye(n_features, n_features))
h = matrix(np.zeros(n_features))
A = matrix(np.ones((1, n_features)))
b = matrix(np.ones(1))

# Stacking weights, one row per voxel.
S = np.zeros((n_voxels, n_features))

# Stacked predictions for the training trials, same shape as train_data.
stacked_pred_train = np.zeros_like(train_data)


In [186]:
# Scratch cell: solve the stacking problem for a single voxel (index i) only.
i=0
PP = matrix(P[i])
# Solve for the stacking weights of voxel i.
# This essentially is minimizing the quantity x.T @ PP @ x, subject to the constraints
# encoded in G, h, A and b, which are defined in another cell (presumably: elements of x
# are non-negative and sum to 1 - confirm against that cell).
# x will be the weights for the stacking model.
# Weights will be dependent on the error of each model individually (this is contained in PP).
S[i, :] = np.array(solvers.qp(PP, q, G, h, A, b)["x"]).reshape(
    n_features,
)
# Combine the predictions from the individual feature spaces for voxel i
z = np.array([preds_test[feature_j][test_ind, i] for feature_j in feat_use])
# multiply the predictions by S[i,:]
stacked_pred[test_ind, i] = np.dot(S[i, :], z)
# combine the training predictions from the individual feature spaces for voxel i
z = np.array([preds_train[feature_j][:, i] for feature_j in feat_use])
stacked_pred_train[:, i] = np.dot(S[i, :], z)


In [200]:
S[0,:]

array([1.04497046e-06, 9.99998955e-01])

In [189]:
preds_test[0].shape

(62, 14913)

In [185]:
preds_test.shape

AttributeError: 'list' object has no attribute 'shape'

In [236]:
# Compute r2 of the stacked model for training data
stacked_r2_train = score_f(stacked_pred_train, train_data)
stacked_cc_train = np.zeros_like(stacked_r2_train)
for vv in range(n_voxels):
    stacked_cc_train[vv] = np.corrcoef(stacked_pred_train[:,vv], trn_voxel_data[:,vv])[0,1]  


  c /= stddev[:, None]
  c /= stddev[None, :]


In [264]:
# Scratch cell: random [10 x 3] arrays to use as stand-in predicted/actual data.
# No seed is set in this cell, so the values differ between runs.
predicted = np.random.normal(0,1,[3,10]).T
actual = np.random.normal(0,1,[3,10]).T

In [252]:
# calculate r2 for this fit.
axis=0

ssres = np.sum(np.power((predicted - actual),2), axis=axis);
sstot = np.sum(np.power((actual - np.mean(actual)),2), axis=axis);
r2 = 1-(ssres/sstot)
r2

array([-0.53964502])

In [322]:


def get_r2(actual,predicted,axis=0):
    """
    This computes the coefficient of determination (R2).
    Specify which axis to compute along (i.e. the trials/samples dimension)

    actual and predicted must have the same shape. The result has that shape with
    `axis` removed, i.e. one R2 value per voxel/column.

    The total sum of squares is taken around the mean of `actual` along `axis`, so
    each voxel is compared against its own mean rather than the grand mean of all
    voxels. If a voxel has zero variance the result is inf or nan for that voxel.
    """
    ssres = np.sum(np.power((predicted - actual),2), axis=axis)
    # keepdims=True lets the per-voxel mean broadcast back against actual for either axis.
    actual_mean = np.mean(actual, axis=axis, keepdims=True)
    sstot = np.sum(np.power((actual - actual_mean),2), axis=axis)
    r2 = 1-(ssres/sstot)
    
    return r2

def get_corrcoef(actual,predicted,axis=0,dtype=np.float32):
    """
    Linear (Pearson) correlation coefficient between actual and predicted values.

    `axis` names the trials/samples dimension of the 2D inputs; one coefficient is
    returned for every entry along the other dimension, as a 1D array of `dtype`.
    """
    assert(len(actual.shape)==2)
    # Put the samples on the first axis so each column is one voxel.
    if axis==1:
        actual, predicted = actual.T, predicted.T
    n_columns = actual.shape[1]
    vals_cc = np.zeros((n_columns,), dtype=dtype)
    for col in range(n_columns):
        # corrcoef returns the 2x2 correlation matrix; the off-diagonal is the coefficient.
        corr_matrix = np.corrcoef(actual[:,col], predicted[:,col])
        vals_cc[col] = corr_matrix[0,1]
    return vals_cc

In [None]:
# Scratch cell: bind the names used inside the fitting code to the training-set
# variables already present in the kernel, so the function body can be run cell by cell.
images=trn_stim_data;
voxel_data = trn_voxel_data;
prf_models = models;
# NOTE(review): add_bias=add_bias only re-binds an existing name, so add_bias must
# already be defined elsewhere in the kernel for this line to run.
zscore=zscore_features; add_bias=add_bias;
dtype=fpX;

In [82]:
# Scratch cell: the setup portion of the fitting routine, run at top level.
# Relies on names not defined in this cell (device, holdout_size, shuffle, shuff_rnd_seed,
# _feature_extractor, and the inputs aliased in the previous cell).
if device is None:
    device=torch.device('cpu:0')

print ('dtype = %s' % dtype)
print ('device = %s' % device)

n_trials = len(images)
n_prfs = len(prf_models)
n_voxels = voxel_data.shape[1]   

# Get train/holdout splits.
# Held-out data here is used for lambda selection.
# This is the inner part of nested cross-validation; there is another portion of data ('val') which never enters this function.
trn_size = n_trials - holdout_size
assert trn_size>0, 'Training size needs to be greater than zero'
print ('trn_size = %d (%.1f%%)' % (trn_size, float(trn_size)*100/len(voxel_data)))
# order starts as the identity and is shuffled in place below; it is what later
# allows predictions to be put back into the original trial order.
order = np.arange(len(voxel_data), dtype=int)
if shuffle:
    if shuff_rnd_seed==0:
        # A seed of 0 means "make a new one": built from the current minute, hour and day of month.
        print('Computing a new random seed')
        shuff_rnd_seed = int(time.strftime('%M%H%d', time.localtime()))
    print('Seeding random number generator: seed is %d'%shuff_rnd_seed)
    np.random.seed(shuff_rnd_seed)
    np.random.shuffle(order)
# NOTE(review): images and voxel_data are overwritten with their reordered versions, so
# re-running this cell without re-running the aliasing cell reorders already-reordered data.
images = images[order]
voxel_data = voxel_data[order]  
# First trn_size trials (after reordering) are for training, the remainder are held out.
trn_data = voxel_data[:trn_size]
out_data = voxel_data[trn_size:]


# Here is where any model-specific additional initialization steps are done
# Includes initializing pca params arrays, if doing pca
if len(images.shape)>1:
    image_size = images.shape[2:4]
else:
    image_size = None
_feature_extractor.init_for_fitting(image_size, prf_models, dtype)
max_features = _feature_extractor.max_features

# Decide whether to do any "partial" versions of the models (leaving out subsets of features)
# Purpose is for variance partition
masks, partial_version_names = _feature_extractor.get_partial_versions()
n_partial_versions = len(partial_version_names) # will be one if skipping varpart
if add_bias:
    masks = np.concatenate([masks, np.ones([masks.shape[0],1])], axis=1) # always include intercept 
masks = np.transpose(masks)
# masks is [n_features_total (including intercept) x n_partial_versions]

# Initialize arrays to store model fitting params
best_w_params = np.zeros(shape=(n_voxels, max_features ,n_partial_versions), dtype=dtype)
# -1 marks "nothing selected yet" for the pRF and lambda indices; losses start at infinity
# so that any finite loss counts as an improvement.
best_prf_models = np.full(shape=(n_voxels,n_partial_versions), fill_value=-1, dtype=int)   
best_lambdas = np.full(shape=(n_voxels,n_partial_versions), fill_value=-1, dtype=int)
best_losses = np.full(fill_value=np.inf, shape=(n_voxels,n_partial_versions), dtype=dtype)

# Initialize arrays to store the trial-wise predictions (need these for stacking)
# Note that this is all training set trials - including the held out trials.
best_train_preds = np.zeros(shape=(n_voxels, n_trials, n_partial_versions), dtype=dtype)

# Additional params that are optional
if add_bias:
    # One extra slot at the end of the feature axis holds the intercept weight.
    best_w_params = np.concatenate([best_w_params, np.zeros(shape=(n_voxels,1,n_partial_versions), dtype=dtype)], axis=1)

if zscore:
    # Per-voxel copies of the feature mean/std that belong to that voxel's selected pRF.
    features_mean = np.zeros(shape=(n_voxels, max_features), dtype=dtype)
    features_std  = np.zeros(shape=(n_voxels, max_features), dtype=dtype)
else:
    features_mean = None
    features_std = None

# Timers used for the throughput summary.
start_time = time.time()
vox_loop_time = 0

print ('---------------------------------------\n')

dtype = <class 'numpy.float32'>
device = cpu:0
trn_size = 619 (90.0%)
Seeding random number generator: seed is 291125
Initializing for fitting
Clearing precomputed features from memory.
---------------------------------------



In [83]:
# Looping over prf_models (here prf_models are different spatial RF definitions)
m=0
(x,y,sigma) = prf_models[m]
 
print('\nGetting features for prf %d: [x,y,sigma] is [%.2f %.2f %.4f]'%(m, prf_models[m,0],  prf_models[m,1],  prf_models[m,2]))

t = time.time()            

# Get features for the desired pRF, across all trn set image  
# Features is size [ntrials x nfeatures]
# nfeatures may be less than max_features, because max_features is the largest number possible for any pRF.
# feature_inds_defined is length max_features, and tells which of the features in max_features are includes in features.
features, feature_inds_defined = _feature_extractor(images, (x,y,sigma), m, fitting_mode=True)
features = features.detach().cpu().numpy() 

elapsed = time.time() - t

n_features_actual = features.shape[1]

if zscore:  
    features_m = np.mean(features, axis=0, keepdims=True) #[:trn_size]
    features_s = np.std(features, axis=0, keepdims=True) + 1e-6          
    features -= features_m
    features /= features_s    

if add_bias:
    features = np.concatenate([features, np.ones(shape=(len(features), 1), dtype=dtype)], axis=1)
    feature_inds_defined = np.concatenate((feature_inds_defined, [True]), axis=0)

trn_features = features[:trn_size,:]
out_features = features[trn_size:,:]


# Going to keep track of whether current prf is better than running best, for each voxel.
# This is for the full model only.
# Will use this to make sure for each partial model, we end up saving the params for the prf that was best w full model.
full_model_improved = np.zeros((n_voxels,),dtype=bool)


Getting features for prf 0: [x,y,sigma] is [-0.55 -0.55 0.0400]
Loading pre-computed features for models [0 - 49] from /Users/margarethenderson/Box Sync/features/pyramid_texture/S1_features_each_prf_4ori_4sf.h5py
Took 19.84153 seconds to load file
Index into batch for prf 0: 0
Size of features array for this image set and prf is:
(688, 641)
Final size of features concatenated is [688 x 641]
Feature types included are:
['pixel_stats', 'mean_magnitudes', 'mean_realparts', 'marginal_stats_lowpass_recons', 'variance_highpass_resid', 'magnitude_feature_autocorrs', 'lowpass_recon_autocorrs', 'highpass_resid_autocorrs', 'magnitude_within_scale_crosscorrs', 'real_within_scale_crosscorrs', 'magnitude_across_scale_crosscorrs', 'real_imag_across_scale_crosscorrs', 'real_spatshift_within_scale_crosscorrs', 'real_spatshift_across_scale_crosscorrs']
Final size of features concatenated is [688 x 641]
Final size of features concatenated is [688 x 641]


In [101]:
# Scratch cell: one partial version (pp=0) and one voxel batch of the fitting loop,
# unrolled so intermediate values can be inspected.
pp=0
print('\nFitting version %d of %d: %s, '%(pp, n_partial_versions, partial_version_names[pp]))

# nonzero_inds_full is length max_features (or max_features+1 if bias=True)
# same size as the final params matrices will be.
nonzero_inds_full = np.logical_and(masks[:,pp], feature_inds_defined)             
# nonzero_inds_short is restricted to just indices that are defined for this prf - ie same size as features.
nonzero_inds_short = masks[feature_inds_defined,pp]==1

# Send matrices to gpu    
_xtrn = torch_utils._to_torch(trn_features[:, nonzero_inds_short], device=device)
_xout = torch_utils._to_torch(out_features[:, nonzero_inds_short], device=device)   

# Do part of the matrix math involved in ridge regression optimization out of the loop, 
# because this part will be same for all the voxels.
_cof = _cofactor_fn_cpu(_xtrn, lambdas = lambdas) 

# Now looping over batches of voxels (only reason is because can't store all in memory at same time)
vox_start = time.time()
vi=-1
# Stand-in for the first iteration of the voxel-batch loop: voxels [0, voxel_batch_size).
rv = np.arange(0,voxel_batch_size)
lv = len(rv)
# The loop this replaces:
#                 for rv,lv in numpy_utils.iterate_range(0, n_voxels, voxel_batch_size):
vi=vi+1
sys.stdout.write('\rfitting model %4d of %-4d, voxels [%6d:%-6d] of %d' % (m, n_prfs, rv[0], rv[-1], n_voxels))

# Send matrices to gpu
_vtrn = torch_utils._to_torch(trn_data[:,rv], device=device)
_vout = torch_utils._to_torch(out_data[:,rv], device=device)

# Here is where optimization happens - relatively simple matrix math inside loss fn.
_betas, _loss = _loss_fn(_cof, _vtrn, _xout, _vout) #[#lambda, #feature, #voxel], [#lambda, #voxel]
    
# Now have a set of weights (in betas) and a loss value for every voxel and every lambda. 
# goal is then to choose for each voxel, what is the best lambda and what weights went with that lambda.
# choose best lambda value and the loss that went with it.
_loss_values, _lambda_index = torch.min(_loss, dim=0)
loss_values, lambda_index = torch_utils.get_value(_loss_values), torch_utils.get_value(_lambda_index)
betas = torch_utils.get_value(_betas)

# Get trial-by-trial predictions for each training set trial (need for stacking)
_pred_train = torch.tensordot(_xtrn, _betas, dims=[[1],[1]]) # [#samples, #lambdas, #voxels]
_pred_test = torch.tensordot(_xout, _betas, dims=[[1],[1]]) # [#samples, #lambdas, #voxels]
pred_train = torch_utils.get_value(_pred_train)
pred_test = torch_utils.get_value(_pred_test)
# Going to combine the training and held out trials and re-create their original order here.
preds_all_shuffled = np.concatenate((pred_train, pred_test), axis=0)
preds_all_origorder = unshuffle(preds_all_shuffled, order) # [#samples x lambdas x voxels]
 
if pp==0:

    # comparing this loss to the other prf_models for each voxel (e.g. the other RF position/sizes)
    assert(partial_version_names[pp]=='full_model' or partial_version_names[pp]=='full_combined_model')               
    # imp is a boolean mask over the voxels of this batch whose loss beat the running best.
    imp = loss_values<best_losses[rv,pp]
    full_model_improved[rv] = imp

else:

    # for the partial models we don't actually care which was best for the partial model itself,
    # just care what was best for the full model
    imp = full_model_improved[rv]


Fitting version 0 of 3: full_model, 
fitting model    0 of 875 , voxels [     0:99    ] of 14913

In [103]:
# Scratch cell: store the parameters of the current pRF for the voxels flagged in imp.
if np.sum(imp)>0:

    # for whichever voxels had improvement relative to previous prf_models, save parameters now
    # this means we won't have to save all params for all prf_models, just best.
    # arv holds the absolute voxel indices of the improved voxels in this batch.
    arv = np.array(rv)[imp]

    lambda_inds = lambda_index[imp]
    best_lambdas[arv,pp] = lambda_inds
    best_losses[arv,pp] = loss_values[imp]                        
    best_prf_models[arv,pp] = m
    if zscore and pp==0:

        # only need to update the mean/std if we're working with the full model, because those will be same for all partial versions.
        # The [0:-1] slices drop the last entry of each index vector.
        # NOTE(review): that last entry is the intercept slot only when add_bias is True - confirm
        # this cell is never run with add_bias False.
        fmean_tmp = copy.deepcopy(features_mean[arv,:])
        fstd_tmp = copy.deepcopy(features_std[arv,:])
        fmean_tmp[:,nonzero_inds_full[0:-1]] = features_m[0,nonzero_inds_short[0:-1]] # broadcast over updated voxels
        fmean_tmp[:,~nonzero_inds_full[0:-1]] = 0.0
        fstd_tmp[:,nonzero_inds_full[0:-1]] = features_s[0,nonzero_inds_short[0:-1]] # broadcast over updated voxels
        fstd_tmp[:,~nonzero_inds_full[0:-1]] = 0.0
        features_mean[arv,:] = fmean_tmp
        features_std[arv,:] = fstd_tmp

    # taking the weights associated with the best lambda value
    # remember that they won't fill entire matrix, rest of values stay at zero
    # Work on a copy and write it back, because indexing with arv returns a copy, not a view.
    best_w_tmp = copy.deepcopy(best_w_params[arv,:,pp])
    best_w_tmp[:,nonzero_inds_full] = numpy_utils.select_along_axis(betas[:,:,imp], lambda_inds, run_axis=2, choice_axis=0).T
    best_w_tmp[:,~nonzero_inds_full] = 0.0 # make sure to fill zeros here

    best_w_params[arv,:,pp] = best_w_tmp
    
    # Save the trialwise predictions for all trials in their original order.
    # Choosing predictions from whichever lambda was best.
    best_train_preds[arv,:,pp] = numpy_utils.select_along_axis(preds_all_origorder[:,:,imp], \
                                                           lambda_inds, run_axis=2, choice_axis=1).T;


In [112]:
betas.shape

(9, 642, 100)

In [114]:
preds_all_origorder.shape

(688, 9, 100)

In [115]:
# Scratch cell: same selection as the assignment into best_train_preds, but written into a
# temporary copy so its shape can be inspected without touching the stored predictions.
best_preds_tmp = copy.deepcopy(best_train_preds[arv,:,pp])
best_preds_tmp[:,:] = numpy_utils.select_along_axis(preds_all_origorder[:,:,imp], lambda_inds, run_axis=2, choice_axis=1).T;




In [118]:
# Scratch cell: for the improved voxels, keep the predictions belonging to each voxel's
# selected lambda, transposed to [voxels x trials] to match best_train_preds.
best_train_preds[arv,:,pp] = numpy_utils.select_along_axis(preds_all_origorder[:,:,imp], \
                                                           lambda_inds, run_axis=2, choice_axis=1).T;


In [116]:
numpy_utils.select_along_axis(preds_all_origorder[:,:,imp], lambda_inds, run_axis=2, choice_axis=1).T.shape

(100, 688)

In [111]:
best_preds_tmp.shape

(100, 688)

In [106]:
best_w_params.shape

(14913, 642, 3)

In [108]:
best_train_preds.shape

(14913, 688, 3)

In [102]:
preds_all_origorder.shape

(688, 9, 100)

In [117]:
best_preds_tmp.shape

(100, 688)

In [40]:
pred[40,:,0]

array([0.2669742 , 0.26461464, 0.27132496, 0.24727994, 0.16866647,
       0.08729514, 0.04442686, 0.02763834, 0.01381715], dtype=float32)

In [52]:
# Scratch cell: round-trip check of unshuffle using trial indices as the data.
# After this cell, unshuffled_data should equal original_data if unshuffle inverts the ordering.
shuf_order = order;
original_data = np.arange(0,n_trials);
shuffled_data = original_data[shuf_order] # Shuffle the original data
unshuffled_data = unshuffle(shuffled_data, order)


In [51]:
def unshuffle(shuffled_data, shuffle_order):
    """
    Undo a permutation that was applied along the first axis of an array.

    shuffled_data: array that was produced as original[shuffle_order].
    shuffle_order: the index permutation that was used to shuffle.
    Returns the rows of shuffled_data rearranged back into their original order.
    """
    # Inverse permutation: the row that was moved to position k came from
    # position shuffle_order[k], so slot shuffle_order[k] must read from row k.
    inverse_order = np.zeros_like(shuffle_order)
    n_items = shuffled_data.shape[0]
    inverse_order[shuffle_order] = np.arange(n_items)

    return shuffled_data[inverse_order]

In [54]:
# Get trial-by-trial predictions for the training trials and the held-out trials.
# tensordot contracts axis 1 of each design matrix with axis 1 of _betas.
_pred_train = torch.tensordot(_xtrn, _betas, dims=[[1],[1]]) # [#samples, #lambdas, #voxels]
_pred_test = torch.tensordot(_xout, _betas, dims=[[1],[1]]) # [#samples, #lambdas, #voxels]
# Pull the values out of the torch tensors (presumably as numpy arrays -- get_value is defined elsewhere).
pred_train = torch_utils.get_value(_pred_train)
pred_test = torch_utils.get_value(_pred_test)

# Going to combine the training and held out trials and re-create their original order here.
# NOTE(review): the next two lines replace the model predictions computed above with the raw
# training / held-out data, so everything below operates on trn_data and out_data instead of
# predictions. This looks like a temporary check that unshuffling restores the original trial
# order -- confirm, and remove these two lines to get real predictions back.
pred_train = trn_data
pred_test = out_data
# Training trials first, then held-out trials, still in shuffled order.
preds_all_shuffled = np.concatenate((pred_train, pred_test), axis=0)
# Presumably reverses the permutation in `order` along the first axis -- verify against numpy_utils.
preds_all_origorder = numpy_utils.unshuffle(preds_all_shuffled, order)



In [73]:
# Synthetic stand-in data for exercising the split logic: 80 trials x 4 columns of
# "voxel" responses and 80 trials x 6 columns of "image" features.
# NOTE(review): these draws happen before the random number generator is seeded below,
# so the data themselves are not reproducible between runs.
voxel_data = np.random.normal(0,1,(80,4))
# Untouched copy in the original trial order, for comparison after unshuffling.
voxel_data_orig = copy.deepcopy(voxel_data)
images = np.random.normal(0,1,(80,6))
n_trials = len(images)
n_voxels = voxel_data.shape[1]   

# Get train/holdout splits.
# Held-out data here is used for lambda selection.
# This is the inner part of nested cross-validation; there is another portion of data ('val') which never enters this function.
# NOTE(review): holdout_size, shuffle and shuff_rnd_seed are not set in this cell; they are
# taken from whatever the kernel currently holds.
trn_size = n_trials - holdout_size
assert trn_size>0, 'Training size needs to be greater than zero'
print ('trn_size = %d (%.1f%%)' % (trn_size, float(trn_size)*100/len(voxel_data)))
# Identity ordering; only permuted when shuffle is truthy.
order = np.arange(len(voxel_data), dtype=int)
if shuffle:
    if shuff_rnd_seed==0:
        # A seed of 0 means "make one up": built from the current minute, hour and day of month.
        # The new value is stored back in shuff_rnd_seed, so re-running this cell reuses it.
        print('Computing a new random seed')
        shuff_rnd_seed = int(time.strftime('%M%H%d', time.localtime()))
    print('Seeding random number generator: seed is %d'%shuff_rnd_seed)
    np.random.seed(shuff_rnd_seed)
    np.random.shuffle(order)
# Apply the same ordering to images and responses (this overwrites both variables), then
# split the responses: the first trn_size trials are for training, the remainder are held out.
images = images[order]
voxel_data = voxel_data[order]  
trn_data = voxel_data[:trn_size]
out_data = voxel_data[trn_size:]



trn_size = 11 (13.8%)
Seeding random number generator: seed is 291125


In [78]:
# Column 3 of the unshuffled array; compare with voxel_data_orig[:,3] to see whether
# the original trial order was restored.
preds_all_origorder[:,3]

array([ 1.54634801,  2.22595881,  0.19889429, -0.79778817,  1.53375134,
       -0.0949584 ,  0.06196644,  0.09437636, -1.414537  , -1.56983386,
        0.0660035 ,  0.35398885,  2.34682547,  0.2089229 ,  0.4671567 ,
        0.53173037,  1.55089489,  0.30510638,  1.84373332, -0.51093351,
       -0.08730406, -0.45746689,  0.7193505 ,  0.44867053,  0.71251256,
       -1.10815184,  0.4767032 , -0.13708607, -0.88613917,  0.89895124,
        1.73636182, -1.3365064 ,  1.16923248,  0.43083788,  0.74175572,
       -0.11780459, -1.91741005,  1.78147399,  0.64809841, -0.91499099,
        1.59393034,  0.12575663, -1.18292914,  0.31120017, -3.43065348,
        0.1224519 ,  0.48308776, -0.8014592 , -2.84558417,  0.54096982,
        1.29339576,  0.32470974,  0.33711501,  0.69027089, -1.63324072,
       -0.30328126,  2.50247367, -0.15472073, -1.07030358,  0.14638405,
        0.1804376 , -0.35323568, -0.87883493, -1.52475246,  0.02008066,
        0.56855626,  0.86329133, -1.25374837,  0.51984632, -0.62

In [79]:
# Column 3 of the copy that was saved before shuffling; reference for the comparison
# with preds_all_origorder[:,3].
voxel_data_orig[:,3]

array([ 1.54634801,  2.22595881,  0.19889429, -0.79778817,  1.53375134,
       -0.0949584 ,  0.06196644,  0.09437636, -1.414537  , -1.56983386,
        0.0660035 ,  0.35398885,  2.34682547,  0.2089229 ,  0.4671567 ,
        0.53173037,  1.55089489,  0.30510638,  1.84373332, -0.51093351,
       -0.08730406, -0.45746689,  0.7193505 ,  0.44867053,  0.71251256,
       -1.10815184,  0.4767032 , -0.13708607, -0.88613917,  0.89895124,
        1.73636182, -1.3365064 ,  1.16923248,  0.43083788,  0.74175572,
       -0.11780459, -1.91741005,  1.78147399,  0.64809841, -0.91499099,
        1.59393034,  0.12575663, -1.18292914,  0.31120017, -3.43065348,
        0.1224519 ,  0.48308776, -0.8014592 , -2.84558417,  0.54096982,
        1.29339576,  0.32470974,  0.33711501,  0.69027089, -1.63324072,
       -0.30328126,  2.50247367, -0.15472073, -1.07030358,  0.14638405,
        0.1804376 , -0.35323568, -0.87883493, -1.52475246,  0.02008066,
        0.56855626,  0.86329133, -1.25374837,  0.51984632, -0.62

In [55]:
# Stack the training block followed by the held-out block along the first axis;
# the rows are still in shuffled order at this point.
preds_all_shuffled = np.concatenate((pred_train, pred_test), axis=0)

In [56]:
# Reverse the permutation stored in `order` using the unshuffle() defined in this notebook.
preds_all_origorder = unshuffle(preds_all_shuffled, order)

In [58]:
# Confirm that unshuffling left the array dimensions unchanged.
preds_all_origorder.shape

(688, 9, 100)