[MRG + 1] fix kdd_kddcup99 shuffle logic (scikit-learn#9731)

maskani-moh · Nov 15, 2017 · f8a9528
1 parent 960707f
commit f8a9528
Show file tree

Hide file tree

Showing 3 changed files with 18 additions and 8 deletions.
diff --git a/doc/whats_new/v0.20.rst b/doc/whats_new/v0.20.rst
@@ -100,6 +100,9 @@ Decomposition, manifold learning and clustering
   Similarly, the ``n_components=None`` case now selects the minimum of
   n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze <wallygauze>`.
 
+- Fixed a bug in :func:`datasets.fetch_kddcup99`, where data were not properly
+  shuffled. :issue:`9731` by `Nicolas Goix`_.
+
 API changes summary
 -------------------
 

diff --git a/sklearn/datasets/kddcup99.py b/sklearn/datasets/kddcup99.py
@@ -177,7 +177,7 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,
 
     """
     data_home = get_data_home(data_home=data_home)
-    kddcup99 = _fetch_brute_kddcup99(data_home=data_home, shuffle=shuffle,
+    kddcup99 = _fetch_brute_kddcup99(data_home=data_home,
                                      percent10=percent10,
                                      download_if_missing=download_if_missing)
 
@@ -227,12 +227,15 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,
         if subset == 'SF':
             data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]]
 
+    if shuffle:
+        data, target = shuffle_method(data, target, random_state=random_state)
+
     return Bunch(data=data, target=target)
 
 
 def _fetch_brute_kddcup99(data_home=None,
                           download_if_missing=True, random_state=None,
-                          shuffle=False, percent10=True):
+                          percent10=True):
 
     """Load the kddcup99 dataset, downloading it if necessary.
 
@@ -253,9 +256,6 @@ def _fetch_brute_kddcup99(data_home=None,
         If None, the random number generator is the RandomState instance used
         by `np.random`.
 
-    shuffle : bool, default=False
-        Whether to shuffle dataset.
-
     percent10 : bool, default=True
         Whether to load only 10 percent of the data.
 
@@ -374,9 +374,6 @@ def _fetch_brute_kddcup99(data_home=None,
         X = joblib.load(samples_path)
         y = joblib.load(targets_path)
 
-    if shuffle:
-        X, y = shuffle_method(X, y, random_state=random_state)
-
     return Bunch(data=X, target=y, DESCR=__doc__)
 
 

diff --git a/sklearn/datasets/tests/test_kddcup99.py b/sklearn/datasets/tests/test_kddcup99.py
@@ -37,3 +37,13 @@ def test_percent10():
     data = fetch_kddcup99('smtp')
     assert_equal(data.data.shape, (9571, 3))
     assert_equal(data.target.shape, (9571,))
+
+
+def test_shuffle():
+    try:
+        dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True,
+                                 percent10=True, download_if_missing=False)
+    except IOError:
+        raise SkipTest("kddcup99 dataset can not be loaded.")
+
+    assert(any(dataset.target[-100:] == b'normal.'))