Skip to content

Commit

Permalink
[MRG + 1] fix kdd_kddcup99 shuffle logic (scikit-learn#9731)
Browse files Browse the repository at this point in the history
  • Loading branch information
ngoix authored and maskani-moh committed Nov 15, 2017
1 parent 960707f commit f8a9528
Show file tree
Hide file tree
Showing 3 changed files with 18 additions and 8 deletions.
3 changes: 3 additions & 0 deletions doc/whats_new/v0.20.rst
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,9 @@ Decomposition, manifold learning and clustering
Similarly, the ``n_components=None`` case now selects the minimum of
n_samples and n_features. :issue:`8484`. By :user:`Wally Gauze <wallygauze>`.

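- Fixed a bug in :func:`datasets.fetch_kddcup99`, where the returned data were
not properly shuffled when ``shuffle=True``: the shuffle is now applied after
the requested ``subset`` has been extracted. :issue:`9731` by `Nicolas Goix`_.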

API changes summary
-------------------

Expand Down
13 changes: 5 additions & 8 deletions sklearn/datasets/kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,7 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,
"""
data_home = get_data_home(data_home=data_home)
kddcup99 = _fetch_brute_kddcup99(data_home=data_home, shuffle=shuffle,
kddcup99 = _fetch_brute_kddcup99(data_home=data_home,
percent10=percent10,
download_if_missing=download_if_missing)

Expand Down Expand Up @@ -227,12 +227,15 @@ def fetch_kddcup99(subset=None, data_home=None, shuffle=False,
if subset == 'SF':
data = np.c_[data[:, 0], data[:, 2], data[:, 4], data[:, 5]]

if shuffle:
data, target = shuffle_method(data, target, random_state=random_state)

return Bunch(data=data, target=target)


def _fetch_brute_kddcup99(data_home=None,
download_if_missing=True, random_state=None,
shuffle=False, percent10=True):
percent10=True):

"""Load the kddcup99 dataset, downloading it if necessary.
Expand All @@ -253,9 +256,6 @@ def _fetch_brute_kddcup99(data_home=None,
If None, the random number generator is the RandomState instance used
by `np.random`.
shuffle : bool, default=False
Whether to shuffle dataset.
percent10 : bool, default=True
Whether to load only 10 percent of the data.
Expand Down Expand Up @@ -374,9 +374,6 @@ def _fetch_brute_kddcup99(data_home=None,
X = joblib.load(samples_path)
y = joblib.load(targets_path)

if shuffle:
X, y = shuffle_method(X, y, random_state=random_state)

return Bunch(data=X, target=y, DESCR=__doc__)


Expand Down
10 changes: 10 additions & 0 deletions sklearn/datasets/tests/test_kddcup99.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,3 +37,13 @@ def test_percent10():
data = fetch_kddcup99('smtp')
assert_equal(data.data.shape, (9571, 3))
assert_equal(data.target.shape, (9571,))


def test_shuffle():
    """Check that shuffling the 'SA' subset mixes b'normal.' labels into the tail.

    Loads the 10-percent 'SA' subset with ``shuffle=True`` and a fixed
    ``random_state``, then asserts that at least one of the last 100 targets
    is ``b'normal.'``.  The test is skipped when the dataset is not available
    locally (``download_if_missing=False`` raises ``IOError``).
    """
    try:
        dataset = fetch_kddcup99(subset='SA', percent10=True, shuffle=True,
                                 random_state=0, download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    # NOTE(review): presumably an unshuffled 'SA' subset ends with only
    # abnormal samples, which is why the tail is inspected -- confirm
    # against fetch_kddcup99.
    last_targets = dataset.target[-100:]
    assert any(last_targets == b'normal.')

0 comments on commit f8a9528

Please sign in to comment.