space_net.py
"""
sklearn-compatible implementation of spatially structured learners (
TV-L1, Graph-Net, etc.)
"""
# Author: DOHMATOB Elvis Dopgima,
# PIZARRO Gaspar,
# VAROQUAUX Gael,
# GRAMFORT Alexandre,
# EICKENBERG Michael,
# THIRION Bertrand
# License: simplified BSD
import warnings
import collections
import time
import sys
from functools import partial
import numpy as np
from scipy import stats
from scipy.ndimage import (
gaussian_filter,
binary_dilation,
binary_erosion,
)
from sklearn.utils.extmath import safe_sparse_dot
from sklearn.utils import check_array
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import (SelectPercentile, f_regression,
f_classif)
from joblib import Memory, Parallel, delayed
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import accuracy_score
from nilearn.maskers._masker_validation import _check_embedded_nifti_masker
from .._utils.param_validation import _adjust_screening_percentile
from .._utils import fill_doc
from sklearn.utils import check_X_y
from sklearn.model_selection import check_cv
try:
from sklearn.linear_model._base import _preprocess_data as center_data
except ImportError:
# Sklearn < 0.23
from sklearn.linear_model.base import _preprocess_data as center_data
from .._utils.cache_mixin import CacheMixin
from nilearn.masking import _unmask_from_to_3d_array
from .space_net_solvers import (tvl1_solver, _graph_net_logistic,
_graph_net_squared_loss)
from nilearn.image import get_data
def _crop_mask(mask):
"""Crops input mask to produce tighter (i.e smaller) bounding box with
the same support (active voxels)"""
idx = np.where(mask)
if idx[0].size == 0:
raise ValueError("Empty mask: if you have given a mask, it is "
"empty, and if you have not given a mask, the "
"mask-extraction routines have failed. Please "
"provide an appropriate mask.")
i_min = max(idx[0].min() - 1, 0)
i_max = idx[0].max()
j_min = max(idx[1].min() - 1, 0)
j_max = idx[1].max()
k_min = max(idx[2].min() - 1, 0)
k_max = idx[2].max()
return mask[i_min:i_max + 1, j_min:j_max + 1, k_min:k_max + 1]
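# Illustrative sketch (not part of the module's API): _crop_mask keeps one
# voxel of margin on the low side of each axis and none on the high side, so
# a single active voxel at (2, 2, 2) in a 5x5x5 volume crops to a 2x2x2 box:
#
#     toy_mask = np.zeros((5, 5, 5), dtype=bool)
#     toy_mask[2, 2, 2] = True
#     _crop_mask(toy_mask).shape  # -> (2, 2, 2)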
@fill_doc
def _univariate_feature_screening(
X, y, mask, is_classif, screening_percentile, smoothing_fwhm=2.):
"""
    Selects the most important features via a univariate test.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
Design matrix.
y : ndarray, shape (n_samples,)
Response Vector.
    mask : ndarray of booleans, shape (nx, ny, nz)
        Mask defining brain ROIs.
    is_classif : bool
        Flag telling whether the learning task is classification or regression.
    screening_percentile : float in the closed interval [0., 100.]
        Only the `screening_percentile` percent most important voxels will
        be retained.
%(smoothing_fwhm)s
Default=2.
Returns
-------
    X_ : ndarray, shape (n_samples, n_features_)
Reduced design matrix with only columns corresponding to the voxels
retained after screening.
mask_ : ndarray of booleans, shape (nx, ny, nz)
Mask with support reduced to only contain voxels retained after
screening.
support : ndarray of ints, shape (n_features_,)
Support of the screened mask, as a subset of the support of the
original mask.
"""
# smooth the data (with isotropic Gaussian kernel) before screening
    if smoothing_fwhm > 0.:
        sX = np.empty(X.shape)
        for sample in range(sX.shape[0]):
            sX[sample] = gaussian_filter(
                _unmask_from_to_3d_array(X[sample].copy(),  # avoid modifying X
                                         mask),
                (smoothing_fwhm, smoothing_fwhm, smoothing_fwhm))[mask]
else:
sX = X
# do feature screening proper
selector = SelectPercentile(f_classif if is_classif else f_regression,
percentile=screening_percentile).fit(sX, y)
support = selector.get_support()
# erode and then dilate mask, thus obtaining a "cleaner" version of
# the mask on which a spatial prior actually makes sense
mask_ = mask.copy()
mask_[mask] = (support > 0)
mask_ = binary_dilation(binary_erosion(
mask_)).astype(bool)
mask_[np.logical_not(mask)] = 0
support = mask_[mask]
X = X[:, support]
return X, mask_, support
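# Illustrative sketch (not part of the module's API): screening synthetic
# data. The inputs are hypothetical; `mask` must be a 3D boolean array whose
# number of True voxels equals X.shape[1]:
#
#     rng = np.random.RandomState(0)
#     mask = np.zeros((6, 6, 6), dtype=bool)
#     mask[1:5, 1:5, 1:5] = True                 # 64 active voxels
#     X = rng.randn(20, mask.sum())
#     y = rng.randn(20)
#     X_, mask_, support = _univariate_feature_screening(
#         X, y, mask, is_classif=False, screening_percentile=50.)
#     # X_ keeps only the columns of X whose voxels survive screening,
#     # i.e. X_.shape == (20, support.sum())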
def _space_net_alpha_grid(X, y, eps=1e-3, n_alphas=10, l1_ratio=1.,
logistic=False):
"""Compute the grid of alpha values for TV-L1 and Graph-Net.
Parameters
----------
X : ndarray, shape (n_samples, n_features)
Training data (design matrix).
y : ndarray, shape (n_samples,)
Target / response vector.
    l1_ratio : float, optional
        The ElasticNet mixing parameter, with ``0 <= l1_ratio <= 1``.
        For ``l1_ratio = 0`` the penalty is purely a spatial prior
        (Graph-Net, TV, etc.). For ``l1_ratio = 1`` it is an L1 penalty.
        For ``0 < l1_ratio < 1``, the penalty is a combination of L1
        and a spatial prior.
        Default=1.
eps : float, optional
Length of the path. ``eps=1e-3`` means that
``alpha_min / alpha_max = 1e-3``.
Default=1e-3.
n_alphas : int, optional
Number of alphas along the regularization path.
Default=10.
    logistic : bool, optional
        Indicates whether the underlying loss function is logistic.
        Default=False.
"""
if logistic:
# Computes the theoretical upper bound for the overall
# regularization, as derived in "An Interior-Point Method for
# Large-Scale l1-Regularized Logistic Regression", by Koh, Kim,
# Boyd, in Journal of Machine Learning Research, 8:1519-1555,
# July 2007.
# url: http://www.stanford.edu/~boyd/papers/pdf/l1_logistic_reg.pdf
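        # In symbols: alpha_max = ||X.T b||_inf, where b_i = m_minus / m if
        # y_i == 1 and b_i = -m_plus / m if y_i == -1 (m = n_samples; m_plus,
        # m_minus = number of positive / negative samples), as computed below.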
m = float(y.size)
m_plus = float(y[y == 1].size)
m_minus = float(y[y == -1].size)
b = np.zeros_like(y)
b[y == 1] = m_minus / m
b[y == -1] = - m_plus / m
alpha_max = np.max(np.abs(X.T.dot(b)))
        # It may happen that b is in the kernel of X.T!
if alpha_max == 0.:
alpha_max = np.abs(np.dot(X.T, y)).max()
else:
alpha_max = np.abs(np.dot(X.T, y)).max()
# prevent alpha_max from exploding when l1_ratio = 0
if l1_ratio == 0.:
l1_ratio = 1e-3
alpha_max /= l1_ratio
if n_alphas == 1:
return np.array([alpha_max])
alpha_min = alpha_max * eps
return np.logspace(np.log10(alpha_min), np.log10(alpha_max),
num=n_alphas)[::-1]
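# Illustrative sketch (not part of the module's API): the grid is returned in
# decreasing order, log-spaced from alpha_max = max|X.T y| / l1_ratio down to
# alpha_max * eps:
#
#     X = np.eye(4)
#     y = np.array([2., 0., 0., 0.])
#     _space_net_alpha_grid(X, y, n_alphas=3, eps=1e-2)
#     # -> array([2.  , 0.2 , 0.02])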
class _EarlyStoppingCallback:
"""Out-of-bag early stopping
A callable that returns True when the test error starts
rising. We use a Spearman correlation (between X_test.w and y_test)
for scoring.
"""
def __init__(self, X_test, y_test, is_classif, debias=False, verbose=0):
self.X_test = X_test
self.y_test = y_test
self.is_classif = is_classif
self.debias = debias
self.verbose = verbose
self.tol = -1e-4 if self.is_classif else -1e-2
self.test_scores = []
self.counter = 0.
def __call__(self, variables):
"""The callback proper """
# misc
if not isinstance(variables, dict):
variables = dict(w=variables)
self.counter += 1
w = variables['w']
# use Spearman score as stopping criterion
score = self.test_score(w)[0]
self.test_scores.append(score)
if not (self.counter > 20 and (self.counter % 10) == 2):
return
# check whether score increased on average over last 5 iterations
if len(self.test_scores) > 4:
if np.mean(np.diff(self.test_scores[-5:][::-1])) >= self.tol:
if self.verbose:
if self.verbose > 1:
print('Early stopping. Test score: %.8f %s' % (
score, 40 * '-'))
else:
sys.stderr.write('.')
return True
if self.verbose > 1:
print('Test score: %.8f' % score)
return False
    def _debias(self, w):
        """Debias w by rescaling the coefficients by a fixed factor.
        Precisely, the scaling factor is: <y_pred, y_test> / ||y_pred||^2.
        """
y_pred = np.dot(self.X_test, w)
scaling = np.dot(y_pred, y_pred)
if scaling > 0.:
scaling = np.dot(y_pred, self.y_test) / scaling
w *= scaling
return w
def test_score(self, w):
"""Compute test score for model, given weights map `w`.
We use correlations between linear prediction and
ground truth (y_test).
We return 2 scores for model selection: one is the Spearman
correlation, which captures ordering between input and
output, but tends to have 'flat' regions. The other
is the Pearson correlation, that we can use to disambiguate
between regions with equivalent Spearman correlation.
"""
if self.is_classif:
w = w[:-1]
        if w.ptp() == 0:
            # constant map: nothing to score
            return (-np.inf, -np.inf)
y_pred = np.dot(self.X_test, w)
spearman_score = stats.spearmanr(y_pred, self.y_test)[0]
pearson_score = np.corrcoef(y_pred, self.y_test)[1, 0]
if self.is_classif:
return spearman_score, pearson_score
else:
return pearson_score, spearman_score
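# Illustrative sketch (not part of the module's API): the solver invokes the
# callback with a dict of optimization variables, and it returns True to
# request early stopping. It can also be exercised directly:
#
#     rng = np.random.RandomState(0)
#     X_test, y_test = rng.randn(10, 4), rng.randn(10)
#     stopper = _EarlyStoppingCallback(X_test, y_test, is_classif=False)
#     stopper({'w': rng.randn(4)})   # records one test score, returns None
#     len(stopper.test_scores)       # -> 1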
@fill_doc
def path_scores(solver, X, y, mask, alphas, l1_ratios, train, test,
solver_params, is_classif=False, n_alphas=10, eps=1E-3,
key=None, debias=False, Xmean=None,
screening_percentile=20., verbose=1):
"""Function to compute scores of different alphas in regression and
classification used by CV objects
Parameters
----------
X : 2D array of shape (n_samples, n_features)
Design matrix, one row per sample point.
y : 1D array of length n_samples
Response vector; one value per sample.
    mask : 3D array of booleans
        Mask defining brain regions that we work on.
alphas : list of floats
List of regularization parameters being considered.
train : array or list of integers
List of indices for the train samples.
test : array or list of integers
List of indices for the test samples.
l1_ratios : float or list of floats in the interval [0, 1];
optional (default .5)
Constant that mixes L1 and TV (resp. Graph-Net) penalization.
l1_ratios == 0: just smooth. l1_ratios == 1: just lasso.
eps : float, optional (default 1e-3)
Length of the path. For example, ``eps=1e-3`` means that
``alpha_min / alpha_max = 1e-3``.
n_alphas : int, optional (default 10).
Generate this number of alphas per regularization path.
This parameter is mutually exclusive with the `alphas` parameter.
solver : function handle
See for example tv.TVl1Classifier documentation.
solver_params : dict
Dictionary of param-value pairs to be passed to solver.
is_classif : bool, optional
Indicates whether the loss is a classification loss or a
regression loss. Default=False.
    Xmean : unused. TODO: Add description.
    key : tuple (class index, fold index), optional
        Identifier passed through untouched and returned with the results,
        so the caller can tell which (class, fold) problem they belong to.
debias : bool, optional
If set, then the estimated weights maps will be debiased.
Default=False.
screening_percentile : float in the interval [0, 100], optional
Percentile value for ANOVA univariate feature selection. A value of
100 means 'keep all features'. This percentile is expressed
w.r.t the volume of a standard (MNI152) brain, and so is corrected
at runtime to correspond to the volume of the user-supplied mask
(which is typically smaller). If '100' is given, all the features
are used, regardless of the number of voxels.
Default=20.
%(verbose)s
"""
if l1_ratios is None:
raise ValueError("l1_ratios must be specified!")
# misc
_, n_features = X.shape
verbose = int(verbose if verbose is not None else 0)
    # Univariate feature screening. Note that if we have only as few as 100
    # features in the mask's support, then we should use all of them to
    # learn the model (i.e. disable this screening).
do_screening = (n_features > 100) and screening_percentile < 100.
if do_screening:
X, mask, support = _univariate_feature_screening(
X, y, mask, is_classif, screening_percentile)
# crop the mask to have a tighter bounding box
mask = _crop_mask(mask)
# get train and test data
X_train, y_train = X[train].copy(), y[train].copy()
X_test, y_test = X[test].copy(), y[test].copy()
# it is essential to center the data in regression
X_train, y_train, _, y_train_mean, _ = center_data(
X_train, y_train, fit_intercept=True, normalize=False,
copy=False)
# misc
if not isinstance(l1_ratios, collections.abc.Iterable):
l1_ratios = [l1_ratios]
l1_ratios = sorted(l1_ratios)[::-1] # from large to small l1_ratios
best_score = -np.inf
best_secondary_score = -np.inf
best_l1_ratio = l1_ratios[0]
best_alpha = None
best_init = None
all_test_scores = []
    if len(test) > 0:
# do l1_ratio path
for l1_ratio in l1_ratios:
this_test_scores = []
# make alpha grid
if alphas is None:
alphas_ = _space_net_alpha_grid(
X_train, y_train, l1_ratio=l1_ratio, eps=eps,
n_alphas=n_alphas, logistic=is_classif)
else:
alphas_ = alphas
            alphas_ = sorted(alphas_)[::-1]  # from large to small alphas
# do alpha path
if best_alpha is None:
best_alpha = alphas_[0]
init = None
path_solver_params = solver_params.copy()
# Use a lighter tol during the path
path_solver_params['tol'] = 2 * path_solver_params.get('tol', 1e-4)
for alpha in alphas_:
# setup callback mechanism for early stopping
early_stopper = _EarlyStoppingCallback(
X_test, y_test, is_classif=is_classif, debias=debias,
verbose=verbose)
w, _, init = solver(
X_train, y_train, alpha, l1_ratio, mask=mask, init=init,
callback=early_stopper, verbose=max(verbose - 1, 0.),
**path_solver_params)
# We use 2 scores for model selection: the second one is to
# disambiguate between regions of equivalent Spearman
# correlations
score, secondary_score = early_stopper.test_score(w)
this_test_scores.append(score)
if (np.isfinite(score) and
(score > best_score
or (score == best_score and
secondary_score > best_secondary_score))):
best_secondary_score = secondary_score
best_score = score
best_l1_ratio = l1_ratio
best_alpha = alpha
best_init = init.copy()
all_test_scores.append(this_test_scores)
else:
if alphas is None:
alphas_ = _space_net_alpha_grid(
X_train, y_train, l1_ratio=best_l1_ratio, eps=eps,
n_alphas=n_alphas, logistic=is_classif)
else:
alphas_ = alphas
best_alpha = alphas_[0]
    # re-fit best model to high precision (i.e. without early stopping, etc.)
best_w, _, init = solver(X_train, y_train, best_alpha, best_l1_ratio,
mask=mask, init=best_init,
verbose=max(verbose - 1, 0), **solver_params)
if debias:
best_w = _EarlyStoppingCallback(
X_test, y_test, is_classif=is_classif, debias=debias,
verbose=verbose)._debias(best_w)
    if len(test) == 0:
all_test_scores.append(np.nan)
# unmask univariate screening
if do_screening:
w_ = np.zeros(len(support))
if is_classif:
w_ = np.append(w_, best_w[-1])
w_[:-1][support] = best_w[:-1]
else:
w_[support] = best_w
best_w = w_
if len(best_w) == n_features:
# TODO: do something with Xmean
best_w = np.append(best_w, 0.)
all_test_scores = np.array(all_test_scores)
return (all_test_scores, best_w, best_alpha, best_l1_ratio, alphas_,
y_train_mean, key)
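# Note (illustrative, not part of the module): BaseSpaceNet.fit below calls
# path_scores once per (class, fold) pair via joblib.Parallel, and uses the
# returned `key` to route each result back to its class/fold slot.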
@fill_doc
class BaseSpaceNet(LinearRegression, CacheMixin):
"""
Regression and classification learners with sparsity and spatial priors
`SpaceNet` implements Graph-Net and TV-L1 priors /
penalties. Thus, the penalty is a sum of an L1 term and a spatial term. The
aim of such a hybrid prior is to obtain weights maps which are structured
(due to the spatial prior) and sparse (enforced by L1 norm).
Parameters
----------
    penalty : string, optional (default 'graph-net')
        Penalty to be used in the model. Can be 'graph-net' or 'tv-l1'.
    loss : string, optional (default None)
        Loss to be used in the model. Must be one of "mse" or "logistic".
is_classif : bool, optional (default False)
Flag telling whether the learning task is classification or regression.
l1_ratios : float or list of floats in the interval [0, 1];
optional (default .5)
Constant that mixes L1 and spatial prior terms in penalization.
l1_ratios == 1 corresponds to pure LASSO. The larger the value of this
parameter, the sparser the estimated weights map. If list is provided,
then the best value will be selected by cross-validation.
alphas : float or list of floats, optional (default None)
Choices for the constant that scales the overall regularization term.
This parameter is mutually exclusive with the `n_alphas` parameter.
If None or list of floats is provided, then the best value will be
selected by cross-validation.
n_alphas : int, optional (default 10).
Generate this number of alphas per regularization path.
This parameter is mutually exclusive with the `alphas` parameter.
eps : float, optional (default 1e-3)
Length of the path. For example, ``eps=1e-3`` means that
``alpha_min / alpha_max = 1e-3``
    mask : filename, niimg, NiftiMasker instance, optional (default None)
        Mask to be used on data. If an instance of masker is passed,
        then its mask will be used. If no mask is given, it will be computed
        automatically by a NiftiMasker.
    %(target_affine)s
        An important use-case of this parameter is to downsample the
        input data to a coarser resolution (to speed up model fitting).
%(target_shape)s
%(low_pass)s
%(high_pass)s
%(t_r)s
    screening_percentile : float in the interval [0, 100], optional (default 20)
Percentile value for ANOVA univariate feature selection. A value of
100 means 'keep all features'. This percentile is expressed
w.r.t the volume of a standard (MNI152) brain, and so is corrected
at runtime to correspond to the volume of the user-supplied mask
(which is typically smaller). If '100' is given, all the features
are used, regardless of the number of voxels.
    standardize : bool, optional (default True)
        If set, then the data (X, y) are centered to have mean zero along
        axis 0. This is here because nearly all linear models will want
        their data to be centered.
    fit_intercept : bool, optional (default True)
        Whether to fit an intercept.
    max_iter : int, optional (default 200)
        Maximum number of iterations for the solver.
tol : float, optional (default 5e-4)
Defines the tolerance for convergence for the backend FISTA solver.
%(verbose)s
%(n_jobs)s
%(memory)s
%(memory_level1)s
cv : int, a cv generator instance, or None (default 8)
The input specifying which cross-validation generator to use.
It can be an integer, in which case it is the number of folds in a
KFold, None, in which case 3 fold is used, or another object, that
will then be used as a cv generator.
debias : bool, optional (default False)
If set, then the estimated weights maps will be debiased.
Attributes
----------
`all_coef_` : ndarray, shape (n_l1_ratios, n_folds, n_features)
Coefficients for all folds and features.
`alpha_grids_` : ndarray, shape (n_folds, n_alphas)
Alpha values considered for selection of the best ones
(saved in `best_model_params_`)
`best_model_params_` : ndarray, shape (n_folds, n_parameter)
Best model parameters (alpha, l1_ratio) saved for the different
cross-validation folds.
`classes_` : ndarray of labels (`n_classes_`)
Labels of the classes (for classification problems)
`n_classes_` : int
Number of classes (for classification problems)
    `coef_` : ndarray, shape
        (1, n_features) for 2-class classification problems (i.e. n_classes = 2),
        (n_classes, n_features) for n_classes > 2
        Coefficient of the features in the decision function.
`coef_img_` : nifti image
Masked model coefficients
    `mask_` : 3D ndarray
        An array containing the values of the mask image.
`masker_` : instance of NiftiMasker
The nifti masker used to mask the data.
`mask_img_` : Nifti like image
The mask of the data. If no mask was supplied by the user,
this attribute is the mask image computed automatically from the
data `X`.
`memory_` : joblib memory cache
    `intercept_` : ndarray, shape
        (1,) for 2-class classification problems (i.e. n_classes = 2),
        (n_classes,) for n_classes > 2
Intercept (a.k.a. bias) added to the decision function.
It is available only when parameter intercept is set to True.
`cv_` : list of pairs of lists
Each pair is the list of indices for the train and test samples
for the corresponding fold.
`cv_scores_` : ndarray, shape (n_folds, n_alphas) or (n_l1_ratios, n_folds, n_alphas)
Scores (misclassification) for each alpha, and on each fold
`screening_percentile_` : float
Screening percentile corrected according to volume of mask,
relative to the volume of standard brain.
    `w_` : ndarray, shape
        (1, n_features + 1) for 2-class classification problems (i.e. n_classes = 2),
        (n_classes, n_features + 1) for n_classes > 2, and (n_features,) for
        regression
Model weights
`ymean_` : array, shape (n_samples,)
Mean of prediction targets
`Xmean_` : array, shape (n_features,)
Mean of X across samples
`Xstd_` : array, shape (n_features,)
Standard deviation of X across samples
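
    Examples
    --------
    A minimal usage sketch (illustrative; ``func_imgs`` and ``age`` are
    hypothetical inputs: a list of Niimg-like objects and a matching 1D
    target)::

        from nilearn.decoding import SpaceNetRegressor
        model = SpaceNetRegressor(penalty="graph-net", l1_ratios=.5)
        model.fit(func_imgs, age)        # cross-validated fit
        y_pred = model.predict(func_imgs)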
"""
SUPPORTED_PENALTIES = ["graph-net", "tv-l1"]
SUPPORTED_LOSSES = ["mse", "logistic"]
def __init__(self, penalty="graph-net", is_classif=False, loss=None,
l1_ratios=.5, alphas=None, n_alphas=10, mask=None,
target_affine=None, target_shape=None, low_pass=None,
high_pass=None, t_r=None, max_iter=200, tol=5e-4,
memory=None, memory_level=1, standardize=True, verbose=1,
mask_args=None,
n_jobs=1, eps=1e-3, cv=8, fit_intercept=True,
screening_percentile=20., debias=False):
self.penalty = penalty
self.is_classif = is_classif
self.loss = loss
self.n_alphas = n_alphas
self.eps = eps
self.l1_ratios = l1_ratios
self.alphas = alphas
self.mask = mask
self.fit_intercept = fit_intercept
self.memory = memory
self.memory_level = memory_level
self.max_iter = max_iter
self.tol = tol
self.verbose = verbose
self.standardize = standardize
self.n_jobs = n_jobs
self.cv = cv
self.screening_percentile = screening_percentile
self.debias = debias
self.low_pass = low_pass
self.high_pass = high_pass
self.t_r = t_r
self.target_affine = target_affine
self.target_shape = target_shape
self.mask_args = mask_args
# sanity check on params
self.check_params()
def check_params(self):
"""Makes sure parameters are sane"""
if self.l1_ratios is not None:
l1_ratios = self.l1_ratios
if not isinstance(l1_ratios, collections.abc.Iterable):
l1_ratios = [l1_ratios]
for l1_ratio in l1_ratios:
if not 0 <= l1_ratio <= 1.:
raise ValueError(
"l1_ratio must be in the interval [0, 1]; got %g" % (
l1_ratio))
elif l1_ratio in (0., 1.):
warnings.warn(
("Specified l1_ratio = %g. It's advised to only "
"specify values of l1_ratio strictly between 0 "
"and 1." % l1_ratio))
if not (0. <= self.screening_percentile <= 100.):
raise ValueError(
("screening_percentile should be in the interval"
" [0, 100], got %g" % self.screening_percentile))
if self.penalty not in self.SUPPORTED_PENALTIES:
raise ValueError(
"'penalty' parameter must be one of %s%s or %s; got %s" % (
",".join(self.SUPPORTED_PENALTIES[:-1]), "," if len(
self.SUPPORTED_PENALTIES) > 2 else "",
self.SUPPORTED_PENALTIES[-1], self.penalty))
if not (self.loss is None or self.loss in self.SUPPORTED_LOSSES):
raise ValueError(
"'loss' parameter must be one of %s%s or %s; got %s" % (
",".join(self.SUPPORTED_LOSSES[:-1]), "," if len(
self.SUPPORTED_LOSSES) > 2 else "",
self.SUPPORTED_LOSSES[-1], self.loss))
if self.loss is not None and not self.is_classif and (
self.loss == "logistic"):
raise ValueError(
("'logistic' loss is only available for classification "
"problems."))
def _set_coef_and_intercept(self, w):
"""Sets the loadings vector (coef) and the intercept of the fitted
model."""
self.w_ = np.array(w)
if self.w_.ndim == 1:
self.w_ = self.w_[np.newaxis, :]
self.coef_ = self.w_[:, :-1]
if self.is_classif:
self.intercept_ = self.w_[:, -1]
else:
self._set_intercept(self.Xmean_, self.ymean_, self.Xstd_)
def fit(self, X, y):
"""Fit the learner
Parameters
----------
X : list of Niimg-like objects
See :ref:`extracting_data`.
Data on which model is to be fitted. If this is a list,
the affine is considered the same for all.
y : array or list of length n_samples
The dependent variable (age, sex, QI, etc.).
        Returns
        -------
        self : `SpaceNet` object
            Model selection is via cross-validation with bagging.
        """
# misc
self.check_params()
if self.memory is None or isinstance(self.memory, str):
self.memory_ = Memory(self.memory,
verbose=max(0, self.verbose - 1))
else:
self.memory_ = self.memory
if self.verbose:
tic = time.time()
# nifti masking
self.masker_ = _check_embedded_nifti_masker(self, multi_subject=False)
X = self.masker_.fit_transform(X)
X, y = check_X_y(X, y, ['csr', 'csc', 'coo'], dtype=float,
multi_output=True, y_numeric=not self.is_classif)
        if not self.is_classif and np.all(np.diff(y) == 0.):
            raise ValueError(
                "The given input y must have at least 2 distinct values"
                " to perform a regression analysis. You provided only"
                " one distinct value: {0}".format(np.unique(y)))
# misc
self.Xmean_ = X.mean(axis=0)
self.Xstd_ = X.std(axis=0)
self.Xstd_[self.Xstd_ < 1e-8] = 1
self.mask_img_ = self.masker_.mask_img_
self.mask_ = get_data(self.mask_img_).astype(bool)
n_samples, _ = X.shape
y = np.array(y).copy()
l1_ratios = self.l1_ratios
if not isinstance(l1_ratios, collections.abc.Iterable):
l1_ratios = [l1_ratios]
alphas = self.alphas
if alphas is not None:
if not isinstance(alphas, collections.abc.Iterable):
alphas = [alphas]
if self.loss is not None:
loss = self.loss
elif self.is_classif:
loss = "logistic"
else:
loss = "mse"
# set backend solver
if self.penalty.lower() == "graph-net":
if not self.is_classif or loss == "mse":
solver = _graph_net_squared_loss
else:
solver = _graph_net_logistic
else:
if not self.is_classif or loss == "mse":
solver = partial(tvl1_solver, loss="mse")
else:
solver = partial(tvl1_solver, loss="logistic")
# generate fold indices
case1 = (None in [alphas, l1_ratios]) and self.n_alphas > 1
case2 = (alphas is not None) and min(len(l1_ratios), len(alphas)) > 1
if case1 or case2:
self.cv_ = list(check_cv(
self.cv, y=y, classifier=self.is_classif).split(X, y))
else:
# no cross-validation needed, user supplied all params
self.cv_ = [(np.arange(n_samples), [])]
n_folds = len(self.cv_)
# number of problems to solve
if self.is_classif:
y = self._binarize_y(y)
else:
y = y[:, np.newaxis]
if self.is_classif and self.n_classes_ > 2:
n_problems = self.n_classes_
else:
n_problems = 1
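        # e.g. a 3-class task is solved as 3 one-vs-rest binary problems,
        # while binary classification and regression each solve a single one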
# standardize y
self.ymean_ = np.zeros(y.shape[0])
if n_problems == 1:
y = y[:, 0]
# scores & mean weights map over all folds
self.cv_scores_ = [[] for i in range(n_problems)]
w = np.zeros((n_problems, X.shape[1] + 1))
self.all_coef_ = np.ndarray((n_problems, n_folds, X.shape[1]))
self.screening_percentile_ = _adjust_screening_percentile(
self.screening_percentile, self.mask_img_, verbose=self.verbose)
# main loop: loop on classes and folds
solver_params = dict(tol=self.tol, max_iter=self.max_iter)
self.best_model_params_ = []
self.alpha_grids_ = []
for (test_scores, best_w, best_alpha, best_l1_ratio, alphas,
y_train_mean, (cls, fold)) in Parallel(
n_jobs=self.n_jobs, verbose=2 * self.verbose)(
delayed(self._cache(path_scores, func_memory_level=2))(
solver, X, y[:, cls] if n_problems > 1 else y,
self.mask_, alphas, l1_ratios, self.cv_[fold][0],
self.cv_[fold][1], solver_params, n_alphas=self.n_alphas,
eps=self.eps, is_classif=self.loss == "logistic",
key=(cls, fold), debias=self.debias,
verbose=self.verbose,
screening_percentile=self.screening_percentile_,
) for cls in range(n_problems) for fold in range(n_folds)):
self.best_model_params_.append((best_alpha, best_l1_ratio))
self.alpha_grids_.append(alphas)
self.ymean_[cls] += y_train_mean
self.all_coef_[cls, fold] = best_w[:-1]
if len(np.atleast_1d(l1_ratios)) == 1:
test_scores = test_scores[0]
self.cv_scores_[cls].append(test_scores)
w[cls] += best_w
# misc
self.cv_scores_ = np.array(self.cv_scores_)
self.best_model_params_ = np.array(self.best_model_params_)
self.alpha_grids_ = np.array(self.alpha_grids_)
self.ymean_ /= n_folds
if not self.is_classif:
self.all_coef_ = np.array(self.all_coef_)
w = w[0]
self.ymean_ = self.ymean_[0]
# bagging: average best weights maps over folds
w /= n_folds
# set coefs and intercepts
self._set_coef_and_intercept(w)
# unmask weights map as a niimg
self.coef_img_ = self.masker_.inverse_transform(self.coef_)
# report time elapsed
if self.verbose:
duration = time.time() - tic
print("Time Elapsed: %g seconds, %i minutes." % (
duration, duration / 60.))
return self
def decision_function(self, X):
"""Predict confidence scores for samples
The confidence score for a sample is the signed distance of that
sample to the hyperplane.
Parameters
----------
X : {array-like, sparse matrix}, shape = (n_samples, n_features)
Samples.
Returns
-------
array, shape=(n_samples,) if n_classes == 2 else (n_samples, n_classes)
Confidence scores per (sample, class) combination. In the binary
case, confidence score for `self.classes_[1]` where >0 means this
class would be predicted.
"""
        # decision_function only makes sense for classification
        if not self.is_classif:
            raise ValueError(
                "decision_function is only available for classification"
                " problems.")
X = check_array(X)
n_features = self.coef_.shape[1]
if X.shape[1] != n_features:
raise ValueError("X has %d features per sample; expecting %d"
% (X.shape[1], n_features))
scores = safe_sparse_dot(X, self.coef_.T,
dense_output=True) + self.intercept_
return scores.ravel() if scores.shape[1] == 1 else scores
def predict(self, X):
"""Predict class labels for samples in X.
Parameters
----------
X : list of Niimg-like objects
See :ref:`extracting_data`.
Data on prediction is to be made. If this is a list,
the affine is considered the same for all.
Returns
-------
y_pred : ndarray, shape (n_samples,)
Predicted class label per sample.
"""
# cast X into usual 2D array
if not hasattr(self, "masker_"):
raise RuntimeError("This %s instance is not fitted yet!" % (
self.__class__.__name__))
X = self.masker_.transform(X)
# handle regression (least-squared loss)
if not self.is_classif:
return LinearRegression.predict(self, X)
# prediction proper
scores = self.decision_function(X)
if len(scores.shape) == 1:
indices = (scores > 0).astype(int)
else:
indices = scores.argmax(axis=1)
return self.classes_[indices]
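# Illustrative note (not part of the module): for a fitted binary classifier,
# predict() thresholds the 1D decision scores at 0 (score > 0 predicts
# classes_[1]); for n_classes > 2 it takes the row-wise argmax of the
# (n_samples, n_classes) score matrix.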
@fill_doc
class SpaceNetClassifier(BaseSpaceNet):
"""Classification learners with sparsity and spatial priors.
`SpaceNetClassifier` implements Graph-Net and TV-L1
priors / penalties for classification problems. Thus, the penalty
    is a sum of an L1 term and a spatial term. The aim of such a hybrid prior
is to obtain weights maps which are structured (due to the spatial
prior) and sparse (enforced by L1 norm).
Parameters
----------
    penalty : string, optional (default 'graph-net')
        Penalty to be used in the model. Can be 'graph-net' or 'tv-l1'.
    loss : string, optional (default "logistic")
        Loss to be used in the classifier. Must be one of "mse" or "logistic".
l1_ratios : float or list of floats in the interval [0, 1];
optional (default .5)
Constant that mixes L1 and spatial prior terms in penalization.
l1_ratios == 1 corresponds to pure LASSO. The larger the value of this
parameter, the sparser the estimated weights map. If list is provided,
then the best value will be selected by cross-validation.
alphas : float or list of floats, optional (default None)
Choices for the constant that scales the overall regularization term.
This parameter is mutually exclusive with the `n_alphas` parameter.
If None or list of floats is provided, then the best value will be
selected by cross-validation.
n_alphas : int, optional (default 10).
Generate this number of alphas per regularization path.
This parameter is mutually exclusive with the `alphas` parameter.
eps : float, optional (default 1e-3)
Length of the path. For example, ``eps=1e-3`` means that
``alpha_min / alpha_max = 1e-3``.