Skip to content

Commit

Permalink
[MRG] FIX Validate and convert X, y and groups to ndarray before split
Browse files Browse the repository at this point in the history
  • Loading branch information
raghavrv authored and paulha committed Aug 19, 2017
1 parent c6214e5 commit 547ccb3
Show file tree
Hide file tree
Showing 2 changed files with 215 additions and 37 deletions.
57 changes: 37 additions & 20 deletions sklearn/model_selection/_split.py
Expand Up @@ -25,6 +25,7 @@
from scipy.misc import comb
from ..utils import indexable, check_random_state, safe_indexing
from ..utils.validation import _num_samples, column_or_1d
from ..utils.validation import check_array
from ..utils.multiclass import type_of_target
from ..externals.six import with_metaclass
from ..externals.six.moves import zip
Expand Down Expand Up @@ -472,6 +473,7 @@ def __init__(self, n_splits=3):
def _iter_test_indices(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = check_array(groups, ensure_2d=False, dtype=None)

unique_groups, groups = np.unique(groups, return_inverse=True)
n_groups = len(unique_groups)
Expand Down Expand Up @@ -618,12 +620,16 @@ def split(self, X, y, groups=None):
Training data, where n_samples is the number of samples
and n_features is the number of features.
Note that providing ``y`` is sufficient to generate the splits and
hence ``np.zeros(n_samples)`` may be used as a placeholder for
``X`` instead of actual training data.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems.
Stratification is done based on the y labels.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.
groups : object
Always ignored, exists for compatibility.
Returns
-------
Expand All @@ -633,6 +639,7 @@ def split(self, X, y, groups=None):
test : ndarray
The testing set indices for that split.
"""
y = check_array(y, ensure_2d=False, dtype=None)
return super(StratifiedKFold, self).split(X, y, groups)


Expand Down Expand Up @@ -696,11 +703,10 @@ def split(self, X, y=None, groups=None):
and n_features is the number of features.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems.
Always ignored, exists for compatibility.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.
Always ignored, exists for compatibility.
Returns
-------
Expand Down Expand Up @@ -746,12 +752,12 @@ class LeaveOneGroupOut(BaseCrossValidator):
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
>>> y = np.array([1, 2, 1, 2])
>>> groups = np.array([1, 1, 2, 2])
>>> lol = LeaveOneGroupOut()
>>> lol.get_n_splits(X, y, groups)
>>> logo = LeaveOneGroupOut()
>>> logo.get_n_splits(X, y, groups)
2
>>> print(lol)
>>> print(logo)
LeaveOneGroupOut()
>>> for train_index, test_index in lol.split(X, y, groups):
>>> for train_index, test_index in logo.split(X, y, groups):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
Expand All @@ -771,7 +777,7 @@ def _iter_test_masks(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
# We make a copy of groups to avoid side-effects during iteration
groups = np.array(groups, copy=True)
groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)
unique_groups = np.unique(groups)
if len(unique_groups) <= 1:
raise ValueError(
Expand Down Expand Up @@ -833,12 +839,12 @@ class LeavePGroupsOut(BaseCrossValidator):
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
>>> y = np.array([1, 2, 1])
>>> groups = np.array([1, 2, 3])
>>> lpl = LeavePGroupsOut(n_groups=2)
>>> lpl.get_n_splits(X, y, groups)
>>> lpgo = LeavePGroupsOut(n_groups=2)
>>> lpgo.get_n_splits(X, y, groups)
3
>>> print(lpl)
>>> print(lpgo)
LeavePGroupsOut(n_groups=2)
>>> for train_index, test_index in lpl.split(X, y, groups):
>>> for train_index, test_index in lpgo.split(X, y, groups):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
Expand All @@ -864,7 +870,7 @@ def __init__(self, n_groups):
def _iter_test_masks(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = np.array(groups, copy=True)
groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)
unique_groups = np.unique(groups)
if self.n_groups >= len(unique_groups):
raise ValueError(
Expand All @@ -886,9 +892,11 @@ def get_n_splits(self, X, y, groups):
----------
X : object
Always ignored, exists for compatibility.
``np.zeros(n_samples)`` may be used as a placeholder.
y : object
Always ignored, exists for compatibility.
``np.zeros(n_samples)`` may be used as a placeholder.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -901,6 +909,8 @@ def get_n_splits(self, X, y, groups):
"""
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = check_array(groups, ensure_2d=False, dtype=None)
X, y, groups = indexable(X, y, groups)
return int(comb(len(np.unique(groups)), self.n_groups, exact=True))


Expand Down Expand Up @@ -1097,6 +1107,7 @@ def __init__(self, n_splits=5, test_size=0.2, train_size=None,
def _iter_indices(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = check_array(groups, ensure_2d=False, dtype=None)
classes, group_indices = np.unique(groups, return_inverse=True)
for group_train, group_test in super(
GroupShuffleSplit, self)._iter_indices(X=classes):
Expand Down Expand Up @@ -1237,6 +1248,7 @@ def __init__(self, n_splits=10, test_size=0.1, train_size=None,

def _iter_indices(self, X, y, groups=None):
n_samples = _num_samples(X)
y = check_array(y, ensure_2d=False, dtype=None)
n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
self.train_size)
classes, y_indices = np.unique(y, return_inverse=True)
Expand Down Expand Up @@ -1290,12 +1302,16 @@ def split(self, X, y, groups=None):
Training data, where n_samples is the number of samples
and n_features is the number of features.
Note that providing ``y`` is sufficient to generate the splits and
hence ``np.zeros(n_samples)`` may be used as a placeholder for
``X`` instead of actual training data.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems.
Stratification is done based on the y labels.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.
groups : object
Always ignored, exists for compatibility.
Returns
-------
Expand All @@ -1305,6 +1321,7 @@ def split(self, X, y, groups=None):
test : ndarray
The testing set indices for that split.
"""
y = check_array(y, ensure_2d=False, dtype=None)
return super(StratifiedShuffleSplit, self).split(X, y, groups)


Expand Down Expand Up @@ -1613,7 +1630,7 @@ def train_test_split(*arrays, **options):
stratify : array-like or None (default is None)
If not None, data is split in a stratified fashion, using this as
the groups array.
the class labels.
Returns
-------
Expand Down

0 comments on commit 547ccb3

Please sign in to comment.