Remove whiteline

lukassnoek · Dec 29, 2016 · 4468fcf · 4468fcf
1 parent 30da99a
commit 4468fcf
Show file tree

Hide file tree

Showing 5 changed files with 186 additions and 91 deletions.
diff --git a/skbold/core/mvp_between.py b/skbold/core/mvp_between.py
@@ -20,7 +20,7 @@
     print("Skbold's searchlight functionality not available.")
 
 from sklearn.preprocessing import StandardScaler, LabelEncoder
-from ..preproc import MajorityUndersampler
+from ..preproc import MajorityUndersampler, LabelBinarizer
 
 
 class MvpBetween(Mvp):
@@ -230,7 +230,7 @@ def calculate_confound_weighting(self, file_path, col_name, sep='\t',
         file_path : str
             Absolute path to spreadsheet-like file including the confounding
             variable.
-        col_name : str
+        col_name : str or List[str]
             Column name in spreadsheet containing the confouding variable
         sep : str
             Separator to parse the spreadsheet-like file.
@@ -266,6 +266,10 @@ def calculate_confound_weighting(self, file_path, col_name, sep='\t',
         confound = np.array(df.loc[common_idx, col_name])
 
         # Fit confounds to y
+
+        if confound.ndim == 1:
+            confound = confound[:, np.newaxis]
+
         estimator.fit(confound, self.y)
 
         # Calculate p(y=1 | confounds)
@@ -284,7 +288,7 @@ def regress_out_confounds(self, file_path, col_name, backend='numpy',
         file_path : str
             Absolute path to spreadsheet-like file including the confounding
             variable.
-        col_name : str
+        col_name : str or List[str]
             Column name in spreadsheet containing the confouding variable
         backend : str
             Which algorithm to use to regress out the confound. The option
@@ -365,10 +369,7 @@ def _update_common_subjects(self, idx):
                                 enumerate(self.common_subjects) if idx[i]]
 
     def add_y(self, file_path, col_name, sep='\t', index_col=0,
-              normalize=False, binarize=None, remove=None,
-              save_binarization_params=None,
-              apply_binarization_params=None,
-              ensure_balanced=False):
+              normalize=False, remove=None, ensure_balanced=False):
         """ Sets ``y`` attribute to an outcome-variable (target).
 
         Parameters
@@ -383,21 +384,6 @@ def add_y(self, file_path, col_name, sep='\t', index_col=0,
             Which column to use as index (should correspond to subject-name).
         normalize : bool
             Whether to normalize (0 mean, unit std) the outcome variable.
-        binarize : dict
-            If not None, the outcome variable will be binarized along the
-            key-value pairs in the binarize-argument. Options:
-
-            >>> binarize = {'type': 'percentile', 'high': .75, 'low': .25}
-            >>> binarize = {'type': 'zscore', 'std': 1}
-            >>> binarize = {'type': 'constant', 'cutoff': 10}
-            >>> binarize = {'type': 'median'}
-
-        save_binarization_params : str
-            If not none, it refers to the path to save the binarization params
-            to.
-        apply_binarization_params : str
-            If not none, it refers to the path to load the binarization params
-            from and apply them to the loaded target variable.
         remove : int or float or str
             Removes instances in which y == remove from MvpBetween object.
         ensure_balanced : bool
@@ -428,76 +414,22 @@ def add_y(self, file_path, col_name, sep='\t', index_col=0,
         if normalize:
             self.y = (self.y - self.y.mean()) / self.y.std()
 
-        if apply_binarization_params is not None:
-
-            with open(apply_binarization_params) as fin:
-                params = json.load(fin)
-
-            if params['type'] == 'zscore':
-                y_norm = (self.y - params['mean']) / params['std']
-                idx = np.abs(y_norm) > params['n_std']
-                y = (y_norm[idx] > 0).astype(int)
-            else:
-                msg = ("Apply binarization params other than 'zscore is "
-                       "not yet implemented.")
-                raise ValueError(msg)
-
-            self.y = y
-
-            if idx is not None:
-                self._update_common_subjects(idx)
-                self.X = self.X[idx, :]
+        if ensure_balanced:
+            self._undersample_majority()
 
-            if ensure_balanced:
-                self._undersample_majority()
+    def apply_binarization_params(self, param_file, ensure_balanced=False):
 
-            return 0
+        with open(param_file) as fin:
+            params = json.load(fin)
 
-        if binarize is None:
-
-            if ensure_balanced:
-                self._undersample_majority()
-            return 0
-        else:
-            y = self.y
-
-        if binarize['type'] == 'percentile':
-            y_rank = [stat.percentileofscore(y, a, 'rank') for a in y]
-            y_rank = np.array(y_rank)
-            idx = (y_rank < binarize['low']) | (y_rank > binarize['high'])
-            low = stat.scoreatpercentile(y, binarize['low'])
-            high = stat.scoreatpercentile(y, binarize['high'])
-            self.binarize_params = {'type': 'percentile',
-                                    'low': low,
-                                    'high': high}
-            y = (y_rank[idx] > 50).astype(int)
-
-        elif binarize['type'] == 'zscore':
-            y_norm = (y - y.mean()) / y.std()  # just to be sure
-            idx = np.abs(y_norm) > binarize['std']
-            self.binarize_params = {'type': binarize['type'],
-                                    'mean': y.mean(),
-                                    'std': y.std(),
-                                    'n_std': binarize['std']}
+        if params['type'] == 'zscore':
+            y_norm = (self.y - params['mean']) / params['std']
+            idx = np.abs(y_norm) > params['n_std']
             y = (y_norm[idx] > 0).astype(int)
-
-        elif binarize['type'] == 'constant':
-            y = (y > binarize['cutoff']).astype(int)
-            idx = None
-            self.binarize_params = {'type': binarize['type'],
-                                    'cutoff': binarize['cutoff']}
-        elif binarize['type'] == 'median':  # median-split
-            median = np.median(y)
-            y = (y > median).astype(int)
-            idx = None
-            self.binarize_params = {'type': binarize['type'],
-                                    'median': median}
-
-        if save_binarization_params is not None:
-
-            with open(op.join(save_binarization_params,
-                              'binarization_params.json'), 'w') as fout:
-                json.dump(self.binarize_params, fout)
+        else:
+            msg = ("Apply binarization params other than 'zscore is "
+                   "not yet implemented.")
+            raise ValueError(msg)
 
         self.y = y
 
@@ -508,6 +440,41 @@ def add_y(self, file_path, col_name, sep='\t', index_col=0,
         if ensure_balanced:
             self._undersample_majority()
 
+    def binarize_y(self, params, save_path=None, ensure_balanced=False):
+        """ Binarizes mvp's y-attribute using a specified method.
+
+        Parameters
+        ----------
+        params : dict
+            The outcome variable (y) will be binarized along the
+            key-value pairs in the params-argument. Options:
+
+            >>> params = {'type': 'percentile', 'high': 75, 'low': 25}
+            >>> params = {'type': 'zscore', 'std': 1}
+            >>> params = {'type': 'constant', 'cutoff': 10}
+            >>> params = {'type': 'median'}
+        save_path : str
+            Absolute path where the binarization-params should be saved
+            (default: None). Saving is not implemented yet; ignored for now.
+        ensure_balanced : bool
+            Whether to ensure balanced classes (if True, done by undersampling
+            the majority class).
+        """
+        options = ['percentile', 'zscore', 'constant', 'median']
+
+        labb = LabelBinarizer(params)
+        self.X, y = labb.fit_transform(self.X, self.y)
+
+        if labb.idx_ is not None:
+            self._update_common_subjects(labb.idx_)
+
+        if ensure_balanced:
+            self._undersample_majority()
+
+        if save_path is not None:
+            # to do: save params as json
+            pass
+
     def split(self, file_path, col_name, target, sep='\t', index_col=0):
         """ Splits an MvpBetween object based on some external index.
 

diff --git a/skbold/core/tests/test_mvp_between.py b/skbold/core/tests/test_mvp_between.py
@@ -78,6 +78,59 @@ def test_mvp_between_split():
     mvp.create()
     fpath = op.join(testdata_path, 'sample_behav.tsv')
     mvp.split(fpath, col_name='group', target='train')
+
+
+def test_mvp_between_calculate_confound_weighting():
+
+    source = dict()
+    source['Contrast1'] = {'path': op.join(testdata_path, 'mock_subjects',
+                                           'sub*', 'run1.feat', 'stats',
+                                           'cope1.nii.gz')}
+
+    mvp = MvpBetween(source=source, subject_idf='sub???', mask=mask)
+    mvp.create()
+    fpath = op.join(testdata_path, 'sample_behav.tsv')
+    mvp.add_y(fpath, col_name='var_categorical', index_col=0,
+              remove=999)
+    mvp.calculate_confound_weighting(fpath, 'confound_categorical')
+
+    assert(len(mvp.ipw) == mvp.X.shape[0])
+
+
+def test_mvp_between_calculate_confound_weighting_two_vars():
+
+    source = dict()
+    source['Contrast1'] = {'path': op.join(testdata_path, 'mock_subjects',
+                                           'sub*', 'run1.feat', 'stats',
+                                           'cope1.nii.gz')}
+
+    mvp = MvpBetween(source=source, subject_idf='sub???', mask=mask)
+    mvp.create()
+    fpath = op.join(testdata_path, 'sample_behav.tsv')
+    mvp.add_y(fpath, col_name='var_categorical', index_col=0,
+              remove=999)
+    mvp.calculate_confound_weighting(fpath, ['confound_categorical',
+                                             'confound_continuous'])
+
+    assert(len(mvp.ipw) == mvp.X.shape[0])
+
+
+def test_mvp_between_regress_out_confounds():
+
+    source = dict()
+    source['Contrast1'] = {'path': op.join(testdata_path, 'mock_subjects',
+                                           'sub*', 'run1.feat', 'stats',
+                                           'cope1.nii.gz')}
+
+    mvp = MvpBetween(source=source, subject_idf='sub???', mask=mask)
+    mvp.create()
+    fpath = op.join(testdata_path, 'sample_behav.tsv')
+    mvp.add_y(fpath, col_name='var_categorical', index_col=0,
+              remove=999)
+    mvp.regress_out_confounds(fpath, ['confound_categorical',
+                                      'confound_continuous'])
+
+    # assert(len(mvp.ipw) == mvp.X.shape[0])
     spaths = glob(op.join(testdata_path, 'mock_subjects',
                           'sub*', 'run1.feat'))
     _ = [shutil.rmtree(s) for s in spaths]
diff --git a/skbold/data/test_data/sample_behav.tsv b/skbold/data/test_data/sample_behav.tsv
@@ -1,4 +1,4 @@
-	var_categorical	var_continuous	var_multinomial	confound_categoric	confound_continuous	group
+	var_categorical	var_continuous	var_multinomial	confound_categorical	confound_continuous	group
 sub001	1	23.123	2	0	384.123	train
 sub002	1	24.129	2	0	819.32	train
 sub003	1	20.001	2	1	381.33	train

diff --git a/skbold/preproc/__init__.py b/skbold/preproc/__init__.py
@@ -1,3 +1,4 @@
-from .label_preproc import MajorityUndersampler, LabelFactorizer
+from .label_preproc import MajorityUndersampler, LabelFactorizer,\
+    LabelBinarizer
 
-__all__ = ['LabelFactorizer', 'MajorityUndersampler']
+__all__ = ['LabelFactorizer', 'MajorityUndersampler', 'LabelBinarizer']
diff --git a/skbold/preproc/label_preproc.py b/skbold/preproc/label_preproc.py
@@ -7,6 +7,7 @@
 from __future__ import print_function, division, absolute_import
 import numpy as np
 from sklearn.base import BaseEstimator, TransformerMixin
+import scipy.stats as stat
 
 
 class LabelFactorizer(BaseEstimator, TransformerMixin):
@@ -143,3 +144,76 @@ def transform(self, X, y):
         self.idx_ = all_idx
 
         return X[all_idx, :], y[all_idx]
+
+
+class LabelBinarizer(BaseEstimator, TransformerMixin):
+
+    def __init__(self, params):
+        """ Initializes LabelBinarizer object. """
+        self.params = params
+        self.idx_ = None
+        self.binarize_params = None
+
+    def fit(self, X=None, y=None):
+        """ Does nothing, but included for scikit-learn pipelines. """
+        return self
+
+    def transform(self, X, y):
+        """ Binarizes y-attribute.
+
+        Parameters
+        ----------
+        X, y : ndarray
+            Numeric (float) array of shape = [n_samples, n_features] and
+            the target values to binarize.
+
+        Returns
+        -------
+        X, y : ndarray
+            Samples kept by the binarization and their binarized labels.
+        """
+
+        options = ['percentile', 'zscore', 'constant', 'median']
+        params = self.params
+
+        if params['type'] == 'percentile':
+            y_rank = [stat.percentileofscore(y, a, 'rank') for a in y]
+            y_rank = np.array(y_rank)
+            idx = (y_rank < params['low']) | (y_rank > params['high'])
+            low = stat.scoreatpercentile(y, params['low'])
+            high = stat.scoreatpercentile(y, params['high'])
+            self.binarize_params = {'type': 'percentile',
+                                    'low': low,
+                                    'high': high}
+            y = (y_rank[idx] > 50).astype(int)
+
+        elif params['type'] == 'zscore':
+            y_norm = (y - y.mean()) / y.std()  # just to be sure
+            idx = np.abs(y_norm) > params['std']
+            self.binarize_params = {'type': params['type'],
+                                    'mean': y.mean(),
+                                    'std': y.std(),
+                                    'n_std': params['std']}
+            y = (y_norm[idx] > 0).astype(int)
+
+        elif params['type'] == 'constant':
+            y = (y > params['cutoff']).astype(int)
+            idx = None
+            self.binarize_params = {'type': params['type'],
+                                    'cutoff': params['cutoff']}
+        elif params['type'] == 'median':  # median-split
+            median = np.median(y)
+            y = (y > median).astype(int)
+            idx = None
+            self.binarize_params = {'type': params['type'],
+                                    'median': median}
+        else:
+            msg = 'Unknown type; please choose from: %r' % options
+            raise KeyError(msg)
+
+        if idx is not None:
+            X = X[idx, :]
+
+        self.idx_ = idx
+
+        return X, y