Skip to content
This repository has been archived by the owner on Mar 7, 2022. It is now read-only.

Commit

Permalink
Remove whiteline
Browse files Browse the repository at this point in the history
  • Loading branch information
lukassnoek committed Dec 29, 2016
1 parent 30da99a commit 4468fcf
Show file tree
Hide file tree
Showing 5 changed files with 186 additions and 91 deletions.
143 changes: 55 additions & 88 deletions skbold/core/mvp_between.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@
print("Skbold's searchlight functionality not available.")

from sklearn.preprocessing import StandardScaler, LabelEncoder
from ..preproc import MajorityUndersampler
from ..preproc import MajorityUndersampler, LabelBinarizer


class MvpBetween(Mvp):
Expand Down Expand Up @@ -230,7 +230,7 @@ def calculate_confound_weighting(self, file_path, col_name, sep='\t',
file_path : str
Absolute path to spreadsheet-like file including the confounding
variable.
col_name : str
col_name : str or List[str]
Column name in spreadsheet containing the confouding variable
sep : str
Separator to parse the spreadsheet-like file.
Expand Down Expand Up @@ -266,6 +266,10 @@ def calculate_confound_weighting(self, file_path, col_name, sep='\t',
confound = np.array(df.loc[common_idx, col_name])

# Fit confounds to y

if confound.ndim == 1:
confound = confound[:, np.newaxis]

estimator.fit(confound, self.y)

# Calculate p(y=1 | confounds)
Expand All @@ -284,7 +288,7 @@ def regress_out_confounds(self, file_path, col_name, backend='numpy',
file_path : str
Absolute path to spreadsheet-like file including the confounding
variable.
col_name : str
col_name : str or List[str]
Column name in spreadsheet containing the confouding variable
backend : str
Which algorithm to use to regress out the confound. The option
Expand Down Expand Up @@ -365,10 +369,7 @@ def _update_common_subjects(self, idx):
enumerate(self.common_subjects) if idx[i]]

def add_y(self, file_path, col_name, sep='\t', index_col=0,
normalize=False, binarize=None, remove=None,
save_binarization_params=None,
apply_binarization_params=None,
ensure_balanced=False):
normalize=False, remove=None, ensure_balanced=False):
""" Sets ``y`` attribute to an outcome-variable (target).
Parameters
Expand All @@ -383,21 +384,6 @@ def add_y(self, file_path, col_name, sep='\t', index_col=0,
Which column to use as index (should correspond to subject-name).
normalize : bool
Whether to normalize (0 mean, unit std) the outcome variable.
binarize : dict
If not None, the outcome variable will be binarized along the
key-value pairs in the binarize-argument. Options:
>>> binarize = {'type': 'percentile', 'high': .75, 'low': .25}
>>> binarize = {'type': 'zscore', 'std': 1}
>>> binarize = {'type': 'constant', 'cutoff': 10}
>>> binarize = {'type': 'median'}
save_binarization_params : str
If not none, it refers to the path to save the binarization params
to.
apply_binarization_params : str
If not none, it refers to the path to load the binarization params
from and apply them to the loaded target variable.
remove : int or float or str
Removes instances in which y == remove from MvpBetween object.
ensure_balanced : bool
Expand Down Expand Up @@ -428,76 +414,22 @@ def add_y(self, file_path, col_name, sep='\t', index_col=0,
if normalize:
self.y = (self.y - self.y.mean()) / self.y.std()

if apply_binarization_params is not None:

with open(apply_binarization_params) as fin:
params = json.load(fin)

if params['type'] == 'zscore':
y_norm = (self.y - params['mean']) / params['std']
idx = np.abs(y_norm) > params['n_std']
y = (y_norm[idx] > 0).astype(int)
else:
msg = ("Apply binarization params other than 'zscore is "
"not yet implemented.")
raise ValueError(msg)

self.y = y

if idx is not None:
self._update_common_subjects(idx)
self.X = self.X[idx, :]
if ensure_balanced:
self._undersample_majority()

if ensure_balanced:
self._undersample_majority()
def apply_binarization_params(self, param_file, ensure_balanced=False):

return 0
with open(param_file) as fin:
params = json.load(fin)

if binarize is None:

if ensure_balanced:
self._undersample_majority()
return 0
else:
y = self.y

if binarize['type'] == 'percentile':
y_rank = [stat.percentileofscore(y, a, 'rank') for a in y]
y_rank = np.array(y_rank)
idx = (y_rank < binarize['low']) | (y_rank > binarize['high'])
low = stat.scoreatpercentile(y, binarize['low'])
high = stat.scoreatpercentile(y, binarize['high'])
self.binarize_params = {'type': 'percentile',
'low': low,
'high': high}
y = (y_rank[idx] > 50).astype(int)

elif binarize['type'] == 'zscore':
y_norm = (y - y.mean()) / y.std() # just to be sure
idx = np.abs(y_norm) > binarize['std']
self.binarize_params = {'type': binarize['type'],
'mean': y.mean(),
'std': y.std(),
'n_std': binarize['std']}
if params['type'] == 'zscore':
y_norm = (self.y - params['mean']) / params['std']
idx = np.abs(y_norm) > params['n_std']
y = (y_norm[idx] > 0).astype(int)

elif binarize['type'] == 'constant':
y = (y > binarize['cutoff']).astype(int)
idx = None
self.binarize_params = {'type': binarize['type'],
'cutoff': binarize['cutoff']}
elif binarize['type'] == 'median': # median-split
median = np.median(y)
y = (y > median).astype(int)
idx = None
self.binarize_params = {'type': binarize['type'],
'median': median}

if save_binarization_params is not None:

with open(op.join(save_binarization_params,
'binarization_params.json'), 'w') as fout:
json.dump(self.binarize_params, fout)
else:
msg = ("Apply binarization params other than 'zscore is "
"not yet implemented.")
raise ValueError(msg)

self.y = y

Expand All @@ -508,6 +440,41 @@ def add_y(self, file_path, col_name, sep='\t', index_col=0,
if ensure_balanced:
self._undersample_majority()

def binarize_y(self, params, save_path=None, ensure_balanced=False):
    """ Binarizes mvp's y-attribute using a specified method.

    The binarized labels replace ``self.y``; for methods that discard
    samples ('percentile', 'zscore'), ``self.X`` and the list of common
    subjects are reduced accordingly.

    Parameters
    ----------
    params : dict
        The outcome variable (y) will be binarized along the
        key-value pairs in the params-argument. Options:

        >>> params = {'type': 'percentile', 'high': 75, 'low': 25}
        >>> params = {'type': 'zscore', 'std': 1}
        >>> params = {'type': 'constant', 'cutoff': 10}
        >>> params = {'type': 'median'}

        For 'percentile', 'low' and 'high' are percentile ranks on a
        0-100 scale (not fractions).
    save_path : str
        If not None (default), this should be an absolute path referring
        to where the binarization-params should be saved (as JSON). If it
        is an existing directory, the file 'binarization_params.json' is
        written inside it; otherwise it is used as the file path itself.
    ensure_balanced : bool
        Whether to ensure balanced classes (if True, done by undersampling
        the majority class).

    Raises
    ------
    KeyError
        If ``params['type']`` is missing or not one of the options above.
    """
    # Local imports keep this method self-contained.
    import json
    import os.path as op

    labb = LabelBinarizer(params)
    # Call fit/transform explicitly: transform() needs y, and the generic
    # scikit-learn fit_transform() only forwards X to transform().
    labb.fit(self.X, self.y)
    self.X, self.y = labb.transform(self.X, self.y)

    # Keep the fitted parameters around so they can be saved/re-applied.
    self.binarize_params = labb.binarize_params

    if labb.idx_ is not None:
        self._update_common_subjects(labb.idx_)

    if ensure_balanced:
        self._undersample_majority()

    if save_path is not None:
        if op.isdir(save_path):
            save_path = op.join(save_path, 'binarization_params.json')

        with open(save_path, 'w') as fout:
            # default=float converts numpy scalars (e.g. float32) that the
            # json module cannot serialize natively.
            json.dump(self.binarize_params, fout, default=float)

def split(self, file_path, col_name, target, sep='\t', index_col=0):
""" Splits an MvpBetween object based on some external index.
Expand Down
53 changes: 53 additions & 0 deletions skbold/core/tests/test_mvp_between.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,6 +78,59 @@ def test_mvp_between_split():
mvp.create()
fpath = op.join(testdata_path, 'sample_behav.tsv')
mvp.split(fpath, col_name='group', target='train')


def test_mvp_between_calculate_confound_weighting():
    """ Checks that mvp.ipw has one entry per row of mvp.X when a single
    confound column is passed to calculate_confound_weighting. """
    cope_pattern = op.join(testdata_path, 'mock_subjects', 'sub*',
                           'run1.feat', 'stats', 'cope1.nii.gz')
    behav_file = op.join(testdata_path, 'sample_behav.tsv')

    mvp = MvpBetween(source={'Contrast1': {'path': cope_pattern}},
                     subject_idf='sub???', mask=mask)
    mvp.create()
    mvp.add_y(behav_file, col_name='var_categorical', index_col=0,
              remove=999)
    mvp.calculate_confound_weighting(behav_file, 'confound_categorical')

    n_samples = mvp.X.shape[0]
    assert len(mvp.ipw) == n_samples


def test_mvp_between_calculate_confound_weighting_two_vars():
    """ Checks that mvp.ipw has one entry per row of mvp.X when a list of
    two confound columns is passed to calculate_confound_weighting. """
    cope_pattern = op.join(testdata_path, 'mock_subjects', 'sub*',
                           'run1.feat', 'stats', 'cope1.nii.gz')
    behav_file = op.join(testdata_path, 'sample_behav.tsv')
    confound_cols = ['confound_categorical', 'confound_continuous']

    mvp = MvpBetween(source={'Contrast1': {'path': cope_pattern}},
                     subject_idf='sub???', mask=mask)
    mvp.create()
    mvp.add_y(behav_file, col_name='var_categorical', index_col=0,
              remove=999)
    mvp.calculate_confound_weighting(behav_file, confound_cols)

    n_samples = mvp.X.shape[0]
    assert len(mvp.ipw) == n_samples


def test_mvp_between_regress_out_confounds():
    """ Runs regress_out_confounds with a list of two confound columns. """
    cope_pattern = op.join(testdata_path, 'mock_subjects', 'sub*',
                           'run1.feat', 'stats', 'cope1.nii.gz')
    behav_file = op.join(testdata_path, 'sample_behav.tsv')
    confound_cols = ['confound_categorical', 'confound_continuous']

    mvp = MvpBetween(source={'Contrast1': {'path': cope_pattern}},
                     subject_idf='sub???', mask=mask)
    mvp.create()
    mvp.add_y(behav_file, col_name='var_categorical', index_col=0,
              remove=999)
    mvp.regress_out_confounds(behav_file, confound_cols)

    # NOTE: no assertion on the result here; the original check is disabled:
    # assert(len(mvp.ipw) == mvp.X.shape[0])
spaths = glob(op.join(testdata_path, 'mock_subjects',
'sub*', 'run1.feat'))
_ = [shutil.rmtree(s) for s in spaths]
2 changes: 1 addition & 1 deletion skbold/data/test_data/sample_behav.tsv
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
var_categorical var_continuous var_multinomial confound_categoric confound_continuous group
var_categorical var_continuous var_multinomial confound_categorical confound_continuous group
sub001 1 23.123 2 0 384.123 train
sub002 1 24.129 2 0 819.32 train
sub003 1 20.001 2 1 381.33 train
Expand Down
5 changes: 3 additions & 2 deletions skbold/preproc/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .label_preproc import MajorityUndersampler, LabelFactorizer
from .label_preproc import MajorityUndersampler, LabelFactorizer,\
LabelBinarizer

__all__ = ['LabelFactorizer', 'MajorityUndersampler']
__all__ = ['LabelFactorizer', 'MajorityUndersampler', 'LabelBinarizer']
74 changes: 74 additions & 0 deletions skbold/preproc/label_preproc.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from __future__ import print_function, division, absolute_import
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
import scipy.stats as stat


class LabelFactorizer(BaseEstimator, TransformerMixin):
Expand Down Expand Up @@ -143,3 +144,76 @@ def transform(self, X, y):
self.idx_ = all_idx

return X[all_idx, :], y[all_idx]


class LabelBinarizer(BaseEstimator, TransformerMixin):
    """ Binarizes a target variable (y) and subsets X accordingly.

    Parameters
    ----------
    params : dict
        Specifies how y is binarized; the 'type' key selects the method:

        >>> params = {'type': 'percentile', 'high': 75, 'low': 25}
        >>> params = {'type': 'zscore', 'std': 1}
        >>> params = {'type': 'constant', 'cutoff': 10}
        >>> params = {'type': 'median'}

        For 'percentile', 'low' and 'high' are percentile ranks on a
        0-100 scale.

    Attributes
    ----------
    idx_ : ndarray or None
        Boolean index of the samples kept by the last call to transform();
        None if the method keeps all samples ('constant', 'median').
    binarize_params : dict or None
        Parameters computed during the last call to transform().
    """

    def __init__(self, params):
        """ Initializes LabelBinarizer object. """
        self.params = params
        self.idx_ = None
        self.binarize_params = None

    def fit(self, X=None, y=None):
        """ Does nothing, but included for scikit-learn pipelines. """
        return self

    def fit_transform(self, X, y):
        """ Fits and transforms in one call.

        Overrides the scikit-learn mixin version, which only forwards X
        to transform() and therefore cannot be used here (transform()
        needs y).

        Parameters
        ----------
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]
        y : ndarray
            Target variable of shape = [n_samples]

        Returns
        -------
        X : ndarray
            See transform().
        y : ndarray
            See transform().
        """
        return self.fit(X, y).transform(X, y)

    def transform(self, X, y):
        """ Binarizes y and removes the discarded samples from X.

        Parameters
        ----------
        X : ndarray
            Numeric (float) array of shape = [n_samples, n_features]
        y : ndarray
            Target variable of shape = [n_samples]

        Returns
        -------
        X : ndarray
            X restricted to the samples that were kept (unchanged for
            'constant' and 'median').
        y : ndarray
            Binarized (0/1, int) target for the samples that were kept.

        Raises
        ------
        KeyError
            If params['type'] is missing or not a known method.
        """

        options = ['percentile', 'zscore', 'constant', 'median']
        params = self.params
        method = params['type']

        if method == 'percentile':
            # Percentile rank (0-100) of every sample within y itself.
            y_rank = np.array([stat.percentileofscore(y, a, 'rank')
                               for a in y])
            # Keep only the tails: below 'low' or above 'high'.
            idx = (y_rank < params['low']) | (y_rank > params['high'])
            low = stat.scoreatpercentile(y, params['low'])
            high = stat.scoreatpercentile(y, params['high'])
            self.binarize_params = {'type': 'percentile',
                                    'low': low,
                                    'high': high}
            # Upper tail becomes 1, lower tail becomes 0.
            y = (y_rank[idx] > 50).astype(int)

        elif method == 'zscore':
            y_norm = (y - y.mean()) / y.std()  # just to be sure
            # Keep only samples further than 'std' z-units from the mean.
            idx = np.abs(y_norm) > params['std']
            self.binarize_params = {'type': method,
                                    'mean': y.mean(),
                                    'std': y.std(),
                                    'n_std': params['std']}
            y = (y_norm[idx] > 0).astype(int)

        elif method == 'constant':
            y = (y > params['cutoff']).astype(int)
            idx = None  # all samples are kept
            self.binarize_params = {'type': method,
                                    'cutoff': params['cutoff']}

        elif method == 'median':  # median-split
            median = np.median(y)
            y = (y > median).astype(int)
            idx = None  # all samples are kept
            self.binarize_params = {'type': method,
                                    'median': median}
        else:
            msg = 'Unknown type; please choose from: %r' % options
            raise KeyError(msg)

        if idx is not None:
            X = X[idx, :]

        self.idx_ = idx

        return X, y

0 comments on commit 4468fcf

Please sign in to comment.