Skip to content

Commit

Permalink
[MRG] FIX Validate and convert X, y and groups to ndarray before split
Browse files Browse the repository at this point in the history
  • Loading branch information
raghavrv authored and paulha committed Aug 19, 2017
1 parent c6214e5 commit 547ccb3
Show file tree
Hide file tree
Showing 2 changed files with 215 additions and 37 deletions.
57 changes: 37 additions & 20 deletions sklearn/model_selection/_split.py
Expand Up @@ -25,6 +25,7 @@
from scipy.misc import comb
from ..utils import indexable, check_random_state, safe_indexing
from ..utils.validation import _num_samples, column_or_1d
from ..utils.validation import check_array
from ..utils.multiclass import type_of_target
from ..externals.six import with_metaclass
from ..externals.six.moves import zip
Expand Down Expand Up @@ -472,6 +473,7 @@ def __init__(self, n_splits=3):
def _iter_test_indices(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = check_array(groups, ensure_2d=False, dtype=None)

unique_groups, groups = np.unique(groups, return_inverse=True)
n_groups = len(unique_groups)
Expand Down Expand Up @@ -618,12 +620,16 @@ def split(self, X, y, groups=None):
Training data, where n_samples is the number of samples
and n_features is the number of features.
Note that providing ``y`` is sufficient to generate the splits and
hence ``np.zeros(n_samples)`` may be used as a placeholder for
``X`` instead of actual training data.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems.
Stratification is done based on the y labels.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.
groups : object
Always ignored, exists for compatibility.
Returns
-------
Expand All @@ -633,6 +639,7 @@ def split(self, X, y, groups=None):
test : ndarray
The testing set indices for that split.
"""
y = check_array(y, ensure_2d=False, dtype=None)
return super(StratifiedKFold, self).split(X, y, groups)


Expand Down Expand Up @@ -696,11 +703,10 @@ def split(self, X, y=None, groups=None):
and n_features is the number of features.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems.
Always ignored, exists for compatibility.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.
Always ignored, exists for compatibility.
Returns
-------
Expand Down Expand Up @@ -746,12 +752,12 @@ class LeaveOneGroupOut(BaseCrossValidator):
>>> X = np.array([[1, 2], [3, 4], [5, 6], [7, 8]])
>>> y = np.array([1, 2, 1, 2])
>>> groups = np.array([1, 1, 2, 2])
>>> lol = LeaveOneGroupOut()
>>> lol.get_n_splits(X, y, groups)
>>> logo = LeaveOneGroupOut()
>>> logo.get_n_splits(X, y, groups)
2
>>> print(lol)
>>> print(logo)
LeaveOneGroupOut()
>>> for train_index, test_index in lol.split(X, y, groups):
>>> for train_index, test_index in logo.split(X, y, groups):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
Expand All @@ -771,7 +777,7 @@ def _iter_test_masks(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
# We make a copy of groups to avoid side-effects during iteration
groups = np.array(groups, copy=True)
groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)
unique_groups = np.unique(groups)
if len(unique_groups) <= 1:
raise ValueError(
Expand Down Expand Up @@ -833,12 +839,12 @@ class LeavePGroupsOut(BaseCrossValidator):
>>> X = np.array([[1, 2], [3, 4], [5, 6]])
>>> y = np.array([1, 2, 1])
>>> groups = np.array([1, 2, 3])
>>> lpl = LeavePGroupsOut(n_groups=2)
>>> lpl.get_n_splits(X, y, groups)
>>> lpgo = LeavePGroupsOut(n_groups=2)
>>> lpgo.get_n_splits(X, y, groups)
3
>>> print(lpl)
>>> print(lpgo)
LeavePGroupsOut(n_groups=2)
>>> for train_index, test_index in lpl.split(X, y, groups):
>>> for train_index, test_index in lpgo.split(X, y, groups):
... print("TRAIN:", train_index, "TEST:", test_index)
... X_train, X_test = X[train_index], X[test_index]
... y_train, y_test = y[train_index], y[test_index]
Expand All @@ -864,7 +870,7 @@ def __init__(self, n_groups):
def _iter_test_masks(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = np.array(groups, copy=True)
groups = check_array(groups, copy=True, ensure_2d=False, dtype=None)
unique_groups = np.unique(groups)
if self.n_groups >= len(unique_groups):
raise ValueError(
Expand All @@ -886,9 +892,11 @@ def get_n_splits(self, X, y, groups):
----------
X : object
Always ignored, exists for compatibility.
``np.zeros(n_samples)`` may be used as a placeholder.
y : object
Always ignored, exists for compatibility.
``np.zeros(n_samples)`` may be used as a placeholder.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
Expand All @@ -901,6 +909,8 @@ def get_n_splits(self, X, y, groups):
"""
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = check_array(groups, ensure_2d=False, dtype=None)
X, y, groups = indexable(X, y, groups)
return int(comb(len(np.unique(groups)), self.n_groups, exact=True))


Expand Down Expand Up @@ -1097,6 +1107,7 @@ def __init__(self, n_splits=5, test_size=0.2, train_size=None,
def _iter_indices(self, X, y, groups):
if groups is None:
raise ValueError("The groups parameter should not be None")
groups = check_array(groups, ensure_2d=False, dtype=None)
classes, group_indices = np.unique(groups, return_inverse=True)
for group_train, group_test in super(
GroupShuffleSplit, self)._iter_indices(X=classes):
Expand Down Expand Up @@ -1237,6 +1248,7 @@ def __init__(self, n_splits=10, test_size=0.1, train_size=None,

def _iter_indices(self, X, y, groups=None):
n_samples = _num_samples(X)
y = check_array(y, ensure_2d=False, dtype=None)
n_train, n_test = _validate_shuffle_split(n_samples, self.test_size,
self.train_size)
classes, y_indices = np.unique(y, return_inverse=True)
Expand Down Expand Up @@ -1290,12 +1302,16 @@ def split(self, X, y, groups=None):
Training data, where n_samples is the number of samples
and n_features is the number of features.
Note that providing ``y`` is sufficient to generate the splits and
hence ``np.zeros(n_samples)`` may be used as a placeholder for
``X`` instead of actual training data.
y : array-like, shape (n_samples,)
The target variable for supervised learning problems.
Stratification is done based on the y labels.
groups : array-like, with shape (n_samples,), optional
Group labels for the samples used while splitting the dataset into
train/test set.
groups : object
Always ignored, exists for compatibility.
Returns
-------
Expand All @@ -1305,6 +1321,7 @@ def split(self, X, y, groups=None):
test : ndarray
The testing set indices for that split.
"""
y = check_array(y, ensure_2d=False, dtype=None)
return super(StratifiedShuffleSplit, self).split(X, y, groups)


Expand Down Expand Up @@ -1613,7 +1630,7 @@ def train_test_split(*arrays, **options):
stratify : array-like or None (default is None)
If not None, data is split in a stratified fashion, using this as
the groups array.
the class labels.
Returns
-------
Expand Down

0 comments on commit 547ccb3

Please sign in to comment.