From 6ee32fc653ff694045276670b91400fc33f2cdb1 Mon Sep 17 00:00:00 2001 From: rahultuli Date: Wed, 14 Jul 2021 11:35:22 -0400 Subject: [PATCH 1/9] Add:Support for batched iteration --- src/sparsezoo/utils/data.py | 56 +++++++++++++++++++++++++++-- tests/test_dataset.py | 72 +++++++++++++++++++++++++++++++++++++ 2 files changed, 126 insertions(+), 2 deletions(-) create mode 100644 tests/test_dataset.py diff --git a/src/sparsezoo/utils/data.py b/src/sparsezoo/utils/data.py index fc424fbd..d5989078 100644 --- a/src/sparsezoo/utils/data.py +++ b/src/sparsezoo/utils/data.py @@ -19,7 +19,7 @@ import logging import math from collections import OrderedDict -from typing import Dict, Iterable, Iterator, List, Tuple, Union +from typing import Dict, Generator, Iterable, Iterator, List, Tuple, Union import numpy @@ -28,7 +28,6 @@ __all__ = ["Dataset", "RandomDataset", "DataLoader"] - _LOGGER = logging.getLogger(__name__) @@ -62,6 +61,59 @@ def __iter__(self) -> Iterator[Union[numpy.ndarray, Dict[str, numpy.ndarray]]]: for item in self._data: yield item + def iter_batches( + self, + batch_size: int, + iterations: int, + ) -> Generator[List[numpy.ndarray], None, None]: + """ + A method to iterate over data in batches + + :param batch_size: Positive integer representing the size of each batch + :param iterations: Positive integer representing number of batches to yield + + :pre: batch_size should be within the range [0, number_of_samples) + :pre: iterations should be a positive integer + + :return: A batch generator for self.data + """ + assert iterations >= 0, "number of iterations should be positive" + assert ( + self._data.shape[0] >= batch_size >= 0 + ), "batch_size should be positive and less than or equal to the size of data" + + _dataset = self._data + + # handle 1-d numpy arrays + if len(_dataset.shape) == 1: + _dataset = _dataset.reshape(_dataset.shape[0], 1) + + iteration = 0 + batch_buffer = [] + batch_template = [ + numpy.ascontiguousarray( + numpy.zeros((batch_size, *array.shape), dtype=array.dtype) + ) + for array in _dataset[0] + ] + while iteration < iterations: + for sample in _dataset: + batch_buffer.append(sample) + + if len(batch_buffer) == batch_size: + yield [ + numpy.stack( + [sample[idx] for sample in batch_buffer], out=template + ) + for idx, template in enumerate(batch_template) + ] + + batch_buffer = [] + iteration += 1 + + if iteration >= iterations: + break + @property def name(self) -> str: """ diff --git a/tests/test_dataset.py b/tests/test_dataset.py new file mode 100644 index 00000000..26573d54 --- /dev/null +++ b/tests/test_dataset.py @@ -0,0 +1,72 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import pytest as pytest + +from sparsezoo.utils import Dataset + + +@pytest.mark.parametrize( + "_data", + [ + Dataset(data=np.empty(shape=(100, 100, 10, 10)), name="4d"), + ], +) +def test_has_iter_batches(_data): + assert hasattr(_data, "iter_batches") + + +@pytest.mark.parametrize( + "_data", + [ + Dataset(data=np.empty(shape=(100, 100, 10, 10)), name="4d"), + Dataset(data=np.empty(shape=(100, 100, 10)), name="3d"), + Dataset(data=np.empty(shape=(100, 10)), name="2d"), + Dataset(data=np.empty(shape=(100,)), name="1d"), + ], +) +@pytest.mark.parametrize( + "batch_size", + [ + 1, + 10, + 100, + ], +) +@pytest.mark.parametrize( + "iterations", + [ + 1, + 10, + 100, + ], +) +def test_batched_iteration(_data, batch_size, iterations): + data_loader = _data.iter_batches(batch_size=batch_size, iterations=iterations) + data_shape = _data.data.shape + + # fix 1-d numpy array shape + if len(data_shape) == 1: + data_shape = (data_shape[0], 1) + + for iteration, batch in enumerate(data_loader): + batch_shape = batch[0].shape + + assert isinstance(batch, list) + assert len(batch) == data_shape[1] + assert len(batch_shape) == len(data_shape) - 1 + assert all([a == b for a, b in zip(batch_shape[1:], data_shape[2:])]) + + assert iteration + 1 == iterations From 5ae1281d6118df571331a26d062fae212152adea Mon Sep 17 00:00:00 2001 From: rahultuli Date: Wed, 14 Jul 2021 16:15:22 -0400 Subject: [PATCH 2/9] Refactor:Data to be list of numpy arrays --- src/sparsezoo/utils/data.py | 16 ++------ tests/{test_dataset.py => test_data.py} | 54 ++++++++++++------------- 2 files changed, 29 insertions(+), 41 deletions(-) rename tests/{test_dataset.py => test_data.py} (50%) diff --git a/src/sparsezoo/utils/data.py b/src/sparsezoo/utils/data.py index d5989078..a3e3c9e6 100644 --- a/src/sparsezoo/utils/data.py +++ b/src/sparsezoo/utils/data.py @@ -77,27 +77,17 @@ def iter_batches( :return: A batch generator for self.data """ - assert iterations >= 0, "number of iterations should be positive" - assert ( - self._data.shape[0] >= batch_size >= 0 - ), "batch_size should be positive and less than or equal to the size of data" - - _dataset = self._data - - # handle 1-d numpy arrays - if len(_dataset.shape) == 1: - _dataset = _dataset.reshape(_dataset.shape[0], 1) - + dataset = self._data iteration = 0 batch_buffer = [] batch_template = [ numpy.ascontiguousarray( numpy.zeros((batch_size, *array.shape), dtype=array.dtype) ) - for array in _dataset[0] + for array in dataset[0] ] while iteration < iterations: - for sample in _dataset: + for sample in dataset: batch_buffer.append(sample) if len(batch_buffer) == batch_size: diff --git a/tests/test_dataset.py b/tests/test_data.py similarity index 50% rename from tests/test_dataset.py rename to tests/test_data.py index 26573d54..a627f798 100644 --- a/tests/test_dataset.py +++ b/tests/test_data.py @@ -18,29 +18,23 @@ from sparsezoo.utils import Dataset -@pytest.mark.parametrize( - "_data", - [ - Dataset(data=np.empty(shape=(100, 100, 10, 10)), name="4d"), - ], -) -def test_has_iter_batches(_data): - assert hasattr(_data, "iter_batches") +@pytest.fixture +def dummy_dataset(): + _dummy_array_1 = np.random.rand(2, 3) + _dummy_array_2 = np.random.rand(34, 3) + return Dataset(data=[_dummy_array_1, _dummy_array_2], name="dummy") + + +def test_has_iter_batches(dummy_dataset): + assert hasattr(dummy_dataset, "iter_batches") -@pytest.mark.parametrize( - "_data", - [ - Dataset(data=np.empty(shape=(100, 100, 10, 10)), name="4d"), - Dataset(data=np.empty(shape=(100, 100, 10)), name="3d"), - Dataset(data=np.empty(shape=(100, 10)), name="2d"), - Dataset(data=np.empty(shape=(100,)), name="1d"), - ], -) @pytest.mark.parametrize( "batch_size", [ 1, + 2, + 3, 10, 100, ], @@ -53,20 +47,24 @@ def test_has_iter_batches(_data): 100, ], ) -def test_batched_iteration(_data, batch_size, iterations): - data_loader = _data.iter_batches(batch_size=batch_size, iterations=iterations) - data_shape = _data.data.shape - - # fix 1-d numpy array shape - if len(data_shape) == 1: - data_shape = (data_shape[0], 1) +def test_batched_iteration(dummy_dataset, batch_size, iterations): + data_loader = dummy_dataset.iter_batches( + batch_size=batch_size, iterations=iterations + ) + data_shape = dummy_dataset.data[0].shape for iteration, batch in enumerate(data_loader): - batch_shape = batch[0].shape + batch_element_shape = batch[0].shape assert isinstance(batch, list) - assert len(batch) == data_shape[1] - assert len(batch_shape) == len(data_shape) - 1 - assert all([a == b for a, b in zip(batch_shape[1:], data_shape[2:])]) + assert batch_element_shape[0] == batch_size + assert all( + ( + batch_dimension == data_dimension + for batch_dimension, data_dimension in zip( + batch_element_shape[1:], data_shape[1:] + ) + ) + ) assert iteration + 1 == iterations From 5621b16c97c675002f4e7f186e527ab6c7be5b14 Mon Sep 17 00:00:00 2001 From: rahultuli Date: Thu, 15 Jul 2021 11:22:31 -0400 Subject: [PATCH 3/9] Add:test for tuple of numpy arrays --- tests/test_data.py | 30 ++++++++++++++++-------------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/tests/test_data.py b/tests/test_data.py index a627f798..b8bb33c7 100644 --- a/tests/test_data.py +++ b/tests/test_data.py @@ -18,17 +18,21 @@ from sparsezoo.utils import Dataset -@pytest.fixture -def dummy_dataset(): - _dummy_array_1 = np.random.rand(2, 3) - _dummy_array_2 = np.random.rand(34, 3) - return Dataset(data=[_dummy_array_1, _dummy_array_2], name="dummy") - - -def test_has_iter_batches(dummy_dataset): - assert hasattr(dummy_dataset, "iter_batches") +@pytest.mark.parametrize( + "dataset", + [Dataset(data=(np.random.rand(2, 45), np.random.rand(2, 45)), name="tuple")], +) +def test_has_iter_batches(dataset): + assert hasattr(dataset, "iter_batches") +@pytest.mark.parametrize( + "dataset", + [ + Dataset(data=[np.random.rand(2, 45), np.random.rand(2, 45)], name="list"), + Dataset(data=(np.random.rand(2, 45), np.random.rand(2, 45)), name="tuple"), + ], +) @pytest.mark.parametrize( "batch_size", [ @@ -47,11 +51,9 @@ def test_has_iter_batches(dummy_dataset): 100, ], ) -def test_batched_iteration(dummy_dataset, batch_size, iterations): - data_loader = dummy_dataset.iter_batches( - batch_size=batch_size, iterations=iterations - ) - data_shape = dummy_dataset.data[0].shape +def test_batched_iteration(dataset, batch_size, iterations): + data_loader = dataset.iter_batches(batch_size=batch_size, iterations=iterations) + data_shape = dataset.data[0].shape for iteration, batch in enumerate(data_loader): batch_element_shape = batch[0].shape From 57325b47bd646df841cb1b89e2bceef4da0d6849 Mon Sep 17 00:00:00 2001 From: rahultuli Date: Fri, 16 Jul 2021 13:57:05 -0400 Subject: [PATCH 4/9] Add:batched iteration support * iter_batches function in Dataset class returns a BatchLoader object * BatchLoader class added * Moved utils.py * Renamed utils.py * Created test_data.py * Cleanup * Fix Typo --- src/sparsezoo/utils/data.py | 202 ++++++++++++++---- tests/sparsezoo/utils/__init__.py | 15 ++ tests/sparsezoo/utils/test_data.py | 186 ++++++++++++++++ .../{utils.py => utils/utilities.py} | 8 + tests/test_data.py | 72 ------- 5 files changed, 368 insertions(+), 115 deletions(-) create mode 100644 tests/sparsezoo/utils/__init__.py create mode 100644 tests/sparsezoo/utils/test_data.py rename tests/sparsezoo/{utils.py => utils/utilities.py} (97%) delete mode 100644 tests/test_data.py diff --git a/src/sparsezoo/utils/data.py b/src/sparsezoo/utils/data.py index a3e3c9e6..7204eff6 100644 --- a/src/sparsezoo/utils/data.py +++ b/src/sparsezoo/utils/data.py @@ -31,6 +31,155 @@ _LOGGER = logging.getLogger(__name__) +class _BatchLoader: + """ + A utility class to load data in batches for fixed number of iterations + + :param data: An iterable of numpy arrays or a list of list of numpy arrays + for multi-input models + :param batch_size: non-negative integer representing the size of each + :param iterations: non-negative integer representing + the number of batches to return + """ + + __slots__ = [ + "_data", + "_batch_size", + "_iterations", + "_batch_buffer", + "_batch_template", + "_batches_created", + ] + + def __init__( + self, + data: Iterable[Union[numpy.ndarray, List[numpy.ndarray]]], + batch_size: int, + iterations: int, + ): + self._data = data + self._batch_size = batch_size + self._iterations = iterations + if batch_size < 0 or iterations < 0: + raise ValueError( + f"Both batch size and number of _iterations should be non-negative, " + f"supplied values (_batch_size, _iterations):{(batch_size, iterations)}" + ) + + self._batch_buffer = [] + self._batch_template = self._init_batch_template() + self._batches_created = 0 + + def __iter__(self) -> Generator[List[numpy.ndarray], None, None]: + """ + Iterate over the data in batches, + (yields from appropriate generator) + return: A generator for batches, each batch is enclosed in a list + """ + if self._is_multi_input: + yield from self._multi_input_batch_generator() + else: + yield from self._single_input_batch_generator() + + @property + def _is_multi_input(self) -> bool: + """ + Check if data consists of multi-input elements + return: True if data consists of multi-input elements else False + """ + return type(self._data[0]) is not numpy.ndarray + + @property + def _buffer_is_full(self) -> bool: + """ + Check if buffer has reached the batch size + """ + return len(self._batch_buffer) == self._batch_size + + @property + def _all_batches_loaded(self) -> bool: + """ + Check if all batches have been loaded + """ + return self._batches_created >= self._iterations + + def _single_input_batch_generator( + self, + ) -> Generator[List[numpy.ndarray], None, None]: + """ + :returns: A generator for single input batches, each element is + of the form [(batch_size, features)] + """ + while not self._all_batches_loaded: + for source in self._data: + yield from self._batch_generator(source=source) + if self._all_batches_loaded: + break + + def _multi_input_batch_generator( + self, + ) -> Generator[List[numpy.ndarray], None, None]: + """ + :returns: A generator for multi input batches, each element is + of the form [[(batch_size, features_a), (batch_size, features_b), ...]] + """ + while not self._all_batches_loaded: + yield from self._batch_generator(source=self._data) + + def _batch_generator(self, source) -> Generator[List[numpy.ndarray], None, None]: + """ + A helper function to create batches from source + """ + for sample in source: + self._batch_buffer.append(sample) + if self._buffer_is_full: + _batch = self._make_batch() + yield _batch + self._empty_buffer() + self._batches_created += 1 + if self._all_batches_loaded: + break + + def _empty_buffer(self): + self._batch_buffer = [] + + def _init_batch_template( + self, + ) -> Iterable[Union[List[numpy.ndarray], numpy.ndarray]]: + """ + Initialize a placeholder for batches + :returns: A placeholder numpy array or list of numpy arrays of specific shape + and size (filled with zeros) required for future batches + """ + if self._is_multi_input: + return [ + numpy.ascontiguousarray( + numpy.zeros((self._batch_size, *_input.shape), dtype=_input.dtype) + ) + for _input in self._data[0] + ] + + return numpy.ascontiguousarray( + numpy.zeros( + (self._batch_size, *self._data[0].shape[1:]), dtype=self._data[0].dtype + ) + ) + + def _make_batch(self) -> Iterable[Union[numpy.ndarray, List[numpy.ndarray]]]: + """ + Copy contents of buffer to batch placeholder + return: A numpy array or list of numpy arrays representing the batch + """ + if self._is_multi_input: + return [ + numpy.stack( + [sample[idx] for sample in self._batch_buffer], out=template + ) + for idx, template in enumerate(self._batch_template) + ] + return [numpy.stack(self._batch_buffer, out=self._batch_template)] + + class Dataset(Iterable): """ A numpy dataset implementation @@ -61,49 +210,6 @@ def __iter__(self) -> Iterator[Union[numpy.ndarray, Dict[str, numpy.ndarray]]]: for item in self._data: yield item - def iter_batches( - self, - batch_size: int, - iterations: int, - ) -> Generator[List[numpy.ndarray], None, None]: - """ - A method to iterate over data in batches - - :param batch_size: Positive integer representing the size of each batch - :param iterations: Positive integer representing number of batches to yield - - :pre: batch_size should be within the range [0, number_of_samples) - :pre: iterations should be a positive integer - - :return: A batch generator for self.data - """ - dataset = self._data - iteration = 0 - batch_buffer = [] - batch_template = [ - numpy.ascontiguousarray( - numpy.zeros((batch_size, *array.shape), dtype=array.dtype) - ) - for array in dataset[0] - ] - while iteration < iterations: - for sample in dataset: - batch_buffer.append(sample) - - if len(batch_buffer) == batch_size: - yield [ - numpy.stack( - [sample[idx] for sample in batch_buffer], out=template - ) - for idx, template in enumerate(batch_template) - ] - - batch_buffer = [] - iteration += 1 - - if iteration >= iterations: - break - @property def name(self) -> str: """ @@ -118,6 +224,16 @@ def data(self) -> List[Union[numpy.ndarray, Dict[str, numpy.ndarray]]]: """ return self._data + def iter_batches( + self, batch_size: int, iterations: int + ) -> Generator[List[numpy.ndarray], None, None]: + """ + A function to iterate over data in batches + """ + return _BatchLoader( + data=self.data, batch_size=batch_size, iterations=iterations + ) + class RandomDataset(Dataset): """ diff --git a/tests/sparsezoo/utils/__init__.py b/tests/sparsezoo/utils/__init__.py new file mode 100644 index 00000000..e8d586a1 --- /dev/null +++ b/tests/sparsezoo/utils/__init__.py @@ -0,0 +1,15 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .utilities import * # noqa diff --git a/tests/sparsezoo/utils/test_data.py b/tests/sparsezoo/utils/test_data.py new file mode 100644 index 00000000..8e2fea1a --- /dev/null +++ b/tests/sparsezoo/utils/test_data.py @@ -0,0 +1,186 @@ +# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Iterable + +import numpy as np +import pytest + +from sparsezoo.utils import Dataset + + +@pytest.fixture +def dummy_dataset(): + return Dataset(data=[np.random.rand(100, 10)], name="dummy-dataset") + + +@pytest.fixture +def single_input_dataset(): + data = [np.random.rand(1, 2), np.random.rand(2, 2), np.random.rand(3, 2)] + return Dataset(data=data, name="single-input-test-dataset") + + +@pytest.fixture +def multi_input_dataset(): + data = [ + [np.random.rand(1, 2), np.random.rand(1, 3)], + [np.random.rand(1, 2), np.random.rand(1, 3)], + ] + return Dataset(data=data, name="multi-input-test-dataset") + + +@pytest.fixture +def both_datasets(single_input_dataset, multi_input_dataset): + return [single_input_dataset, multi_input_dataset] + + +def test_has_iter_batches(dummy_dataset): + assert hasattr(dummy_dataset, "iter_batches") + + +@pytest.mark.parametrize( + "batch_size", + [ + 1, + 2, + 10, + ], +) +@pytest.mark.parametrize( + "iterations", + [ + 1, + 2, + 4, + 10, + ], +) +def test_iter_batches_returns_iterable(both_datasets, batch_size, iterations): + for dataset in both_datasets: + loader = dataset.iter_batches(batch_size=batch_size, iterations=iterations) + assert isinstance(loader, Iterable) + + +@pytest.mark.parametrize( + "batch_size", + [ + 1, + 2, + 10, + ], +) +@pytest.mark.parametrize( + "iterations", + [ + 1, + 2, + 4, + 10, + ], +) +def test_batch_is_in_list(both_datasets, batch_size, iterations): + for dataset in both_datasets: + loader = dataset.iter_batches(batch_size=batch_size, iterations=iterations) + for batch in loader: + assert isinstance(batch, list) + + +@pytest.mark.parametrize( + "batch_size", + [ + 1, + 2, + 10, + ], +) +@pytest.mark.parametrize( + "iterations", + [ + 1, + 2, + 4, + 10, + ], +) +def test_iter_batches_single_input_batch_shape( + single_input_dataset, batch_size, iterations +): + loader = single_input_dataset.iter_batches( + batch_size=batch_size, iterations=iterations + ) + + _data_dimensions = single_input_dataset.data[0].shape + expected_batch_shape = (batch_size, *_data_dimensions[1:]) + + for batch in loader: + batch_data = batch[0] + batch_shape = batch_data.shape + assert batch_shape == expected_batch_shape + + +@pytest.mark.parametrize( + "batch_size", + [ + 1, + 2, + 10, + ], +) +@pytest.mark.parametrize( + "iterations", + [ + 1, + 2, + 4, + 10, + ], +) +def test_iter_batches_number_of_iterations(both_datasets, batch_size, iterations): + for dataset in both_datasets: + loader = dataset.iter_batches(batch_size=batch_size, iterations=iterations) + for iteration, batch in enumerate(loader): + pass + assert iteration + 1 == iterations + + +@pytest.mark.parametrize( + "batch_size", + [ + 1, + 2, + 3, + ], +) +@pytest.mark.parametrize( + "iterations", + [ + 1, + 2, + 3, + ], +) +def test_iter_batches_multi_input_batch_shape( + multi_input_dataset, batch_size, iterations +): + expected_batch_dimensions = [ + (batch_size, *multi_input.shape) for multi_input in multi_input_dataset.data[0] + ] + loader = multi_input_dataset.iter_batches( + batch_size=batch_size, iterations=iterations + ) + + for batch in loader: + assert all( + expected_batch_dimensions[idx] == multi_input.shape + for idx, multi_input in enumerate(batch) + ) diff --git a/tests/sparsezoo/utils.py b/tests/sparsezoo/utils/utilities.py similarity index 97% rename from tests/sparsezoo/utils.py rename to tests/sparsezoo/utils/utilities.py index b618a073..407f1162 100644 --- a/tests/sparsezoo/utils.py +++ b/tests/sparsezoo/utils/utilities.py @@ -23,6 +23,14 @@ from sparsezoo.utils import CACHE_DIR +__all__ = [ + "download_and_verify", + "model_constructor", + "validate_with_ort", + "validate_downloaded_model", +] + + def download_and_verify(model: str, other_args: Optional[Dict] = None): if other_args is None: other_args = { diff --git a/tests/test_data.py b/tests/test_data.py deleted file mode 100644 index b8bb33c7..00000000 --- a/tests/test_data.py +++ /dev/null @@ -1,72 +0,0 @@ -# Copyright (c) 2021 - present / Neuralmagic, Inc. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import numpy as np -import pytest as pytest - -from sparsezoo.utils import Dataset - - -@pytest.mark.parametrize( - "dataset", - [Dataset(data=(np.random.rand(2, 45), np.random.rand(2, 45)), name="tuple")], -) -def test_has_iter_batches(dataset): - assert hasattr(dataset, "iter_batches") - - -@pytest.mark.parametrize( - "dataset", - [ - Dataset(data=[np.random.rand(2, 45), np.random.rand(2, 45)], name="list"), - Dataset(data=(np.random.rand(2, 45), np.random.rand(2, 45)), name="tuple"), - ], -) -@pytest.mark.parametrize( - "batch_size", - [ - 1, - 2, - 3, - 10, - 100, - ], -) -@pytest.mark.parametrize( - "iterations", - [ - 1, - 10, - 100, - ], -) -def test_batched_iteration(dataset, batch_size, iterations): - data_loader = dataset.iter_batches(batch_size=batch_size, iterations=iterations) - data_shape = dataset.data[0].shape - - for iteration, batch in enumerate(data_loader): - batch_element_shape = batch[0].shape - - assert isinstance(batch, list) - assert batch_element_shape[0] == batch_size - assert all( - ( - batch_dimension == data_dimension - for batch_dimension, data_dimension in zip( - batch_element_shape[1:], data_shape[1:] - ) - ) - ) - - assert iteration + 1 == iterations From 7d82cd4629e30d3ba2fb3ed1a3297216fa654eef Mon Sep 17 00:00:00 2001 From: Rahul Tuli Date: Mon, 19 Jul 2021 12:19:35 -0400 Subject: [PATCH 5/9] Update src/sparsezoo/utils/data.py Co-authored-by: Benjamin Fineran --- src/sparsezoo/utils/data.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sparsezoo/utils/data.py b/src/sparsezoo/utils/data.py index 7204eff6..b6d893d5 100644 --- a/src/sparsezoo/utils/data.py +++ b/src/sparsezoo/utils/data.py @@ -62,8 +62,8 @@ def __init__( self._iterations = iterations if batch_size < 0 or iterations < 0: raise ValueError( - f"Both batch size and number of _iterations should be non-negative, " - f"supplied values (_batch_size, _iterations):{(batch_size, iterations)}" + f"Both batch size and number of iterations should be non-negative, " + f"supplied values (batch_size, iterations):{(batch_size, iterations)}" ) self._batch_buffer = [] From f7870048e0f2f7d49cec142f56f8788c1422127e Mon Sep 17 00:00:00 2001 From: rahultuli Date: Mon, 19 Jul 2021 12:59:54 -0400 Subject: [PATCH 6/9] Fix:Single-Input cases Address:PR review comments --- src/sparsezoo/utils/data.py | 114 ++++++++--------------------- tests/sparsezoo/utils/test_data.py | 4 +- tests/sparsezoo/utils/utilities.py | 8 -- 3 files changed, 32 insertions(+), 94 deletions(-) diff --git a/src/sparsezoo/utils/data.py b/src/sparsezoo/utils/data.py index b6d893d5..19080764 100644 --- a/src/sparsezoo/utils/data.py +++ b/src/sparsezoo/utils/data.py @@ -31,17 +31,10 @@ _LOGGER = logging.getLogger(__name__) -class _BatchLoader: - """ - A utility class to load data in batches for fixed number of iterations +# A utility class to load data in batches for fixed number of iterations - :param data: An iterable of numpy arrays or a list of list of numpy arrays - for multi-input models - :param batch_size: non-negative integer representing the size of each - :param iterations: non-negative integer representing - the number of batches to return - """ +class _BatchLoader: __slots__ = [ "_data", "_batch_size", @@ -58,6 +51,9 @@ def __init__( iterations: int, ): self._data = data + _single_input = type(self._data[0]) is numpy.ndarray + if _single_input: + self._data = [self._data] self._batch_size = batch_size self._iterations = iterations if batch_size < 0 or iterations < 0: @@ -71,113 +67,57 @@ def __init__( self._batches_created = 0 def __iter__(self) -> Generator[List[numpy.ndarray], None, None]: - """ - Iterate over the data in batches, - (yields from appropriate generator) - return: A generator for batches, each batch is enclosed in a list - """ - if self._is_multi_input: - yield from self._multi_input_batch_generator() - else: - yield from self._single_input_batch_generator() - - @property - def _is_multi_input(self) -> bool: - """ - Check if data consists of multi-input elements - return: True if data consists of multi-input elements else False - """ - return type(self._data[0]) is not numpy.ndarray + yield from self._multi_input_batch_generator() @property def _buffer_is_full(self) -> bool: - """ - Check if buffer has reached the batch size - """ return len(self._batch_buffer) == self._batch_size @property def _all_batches_loaded(self) -> bool: - """ - Check if all batches have been loaded - """ return self._batches_created >= self._iterations - def _single_input_batch_generator( - self, - ) -> Generator[List[numpy.ndarray], None, None]: - """ - :returns: A generator for single input batches, each element is - of the form [(batch_size, features)] - """ - while not self._all_batches_loaded: - for source in self._data: - yield from self._batch_generator(source=source) - if self._all_batches_loaded: - break - def _multi_input_batch_generator( self, ) -> Generator[List[numpy.ndarray], None, None]: - """ - :returns: A generator for multi input batches, each element is - of the form [[(batch_size, features_a), (batch_size, features_b), ...]] - """ + # A generator for with each element of the form + # [[(batch_size, features_a), (batch_size, features_b), ...]] while not self._all_batches_loaded: yield from self._batch_generator(source=self._data) def _batch_generator(self, source) -> Generator[List[numpy.ndarray], None, None]: - """ - A helper function to create batches from source - """ + # batches from source for sample in source: self._batch_buffer.append(sample) if self._buffer_is_full: _batch = self._make_batch() yield _batch - self._empty_buffer() + self._batch_buffer = [] self._batches_created += 1 if self._all_batches_loaded: break - def _empty_buffer(self): - self._batch_buffer = [] - def _init_batch_template( self, ) -> Iterable[Union[List[numpy.ndarray], numpy.ndarray]]: - """ - Initialize a placeholder for batches - :returns: A placeholder numpy array or list of numpy arrays of specific shape - and size (filled with zeros) required for future batches - """ - if self._is_multi_input: - return [ - numpy.ascontiguousarray( - numpy.zeros((self._batch_size, *_input.shape), dtype=_input.dtype) - ) - for _input in self._data[0] - ] - return numpy.ascontiguousarray( - numpy.zeros( - (self._batch_size, *self._data[0].shape[1:]), dtype=self._data[0].dtype + # A placeholder for batches + + return [ + numpy.ascontiguousarray( + numpy.zeros((self._batch_size, *_input.shape), dtype=_input.dtype) ) - ) + for _input in self._data[0] + ] def _make_batch(self) -> Iterable[Union[numpy.ndarray, List[numpy.ndarray]]]: - """ - Copy contents of buffer to batch placeholder - return: A numpy array or list of numpy arrays representing the batch - """ - if self._is_multi_input: - return [ - numpy.stack( - [sample[idx] for sample in self._batch_buffer], out=template - ) - for idx, template in enumerate(self._batch_template) - ] - return [numpy.stack(self._batch_buffer, out=self._batch_template)] + # Copy contents of buffer to batch placeholder + # and return A list of numpy array(s) representing the batch + + return [ + numpy.stack([sample[idx] for sample in self._batch_buffer], out=template) + for idx, template in enumerate(self._batch_template) + ] class Dataset(Iterable): @@ -229,6 +169,12 @@ def iter_batches( ) -> Generator[List[numpy.ndarray], None, None]: """ A function to iterate over data in batches + + :param batch_size: non-negative integer representing the size of each + :param iterations: non-negative integer representing + the number of batches to return + :returns: A generator for batches, each batch is enclosed in a list + Each batch is of the form [(batch_size, *feature_shape)] """ return _BatchLoader( data=self.data, batch_size=batch_size, iterations=iterations diff --git a/tests/sparsezoo/utils/test_data.py b/tests/sparsezoo/utils/test_data.py index 8e2fea1a..132c50e5 100644 --- a/tests/sparsezoo/utils/test_data.py +++ b/tests/sparsezoo/utils/test_data.py @@ -26,7 +26,7 @@ def dummy_dataset(): @pytest.fixture def single_input_dataset(): - data = [np.random.rand(1, 2), np.random.rand(2, 2), np.random.rand(3, 2)] + data = [np.random.rand(3, 2), np.random.rand(3, 2), np.random.rand(3, 2)] return Dataset(data=data, name="single-input-test-dataset") @@ -120,7 +120,7 @@ def test_iter_batches_single_input_batch_shape( ) _data_dimensions = single_input_dataset.data[0].shape - expected_batch_shape = (batch_size, *_data_dimensions[1:]) + expected_batch_shape = (batch_size, *_data_dimensions) for batch in loader: batch_data = batch[0] diff --git a/tests/sparsezoo/utils/utilities.py b/tests/sparsezoo/utils/utilities.py index 407f1162..b618a073 100644 --- a/tests/sparsezoo/utils/utilities.py +++ b/tests/sparsezoo/utils/utilities.py @@ -23,14 +23,6 @@ from sparsezoo.utils import CACHE_DIR -__all__ = [ - "download_and_verify", - "model_constructor", - "validate_with_ort", - "validate_downloaded_model", -] - - def download_and_verify(model: str, other_args: Optional[Dict] = None): if other_args is None: other_args = { From 477191ea2c5b558b359bea52f3a7a22b20109fa0 Mon Sep 17 00:00:00 2001 From: rahultuli Date: Mon, 19 Jul 2021 14:39:30 -0400 Subject: [PATCH 7/9] Update:Rename tests/utils.py to tests/helpers.py Fix:Unwrapping Single Input Errors --- src/sparsezoo/utils/data.py | 11 ++++++++--- tests/sparsezoo/{utils/utilities.py => helpers.py} | 0 .../models/classification/test_efficientnet.py | 2 +- .../sparsezoo/models/classification/test_inception.py | 2 +- .../sparsezoo/models/classification/test_mobilenet.py | 2 +- tests/sparsezoo/models/classification/test_resnet.py | 2 +- tests/sparsezoo/models/classification/test_vgg.py | 2 +- tests/sparsezoo/models/detection/test_ssd.py | 2 +- tests/sparsezoo/models/detection/test_yolo.py | 2 +- tests/sparsezoo/models/test_zoo.py | 2 +- tests/sparsezoo/models/test_zoo_extensive.py | 2 +- tests/sparsezoo/utils/__init__.py | 2 -- tests/sparsezoo/utils/test_data.py | 7 +++---- 13 files changed, 20 insertions(+), 18 deletions(-) rename tests/sparsezoo/{utils/utilities.py => helpers.py} (100%) diff --git a/src/sparsezoo/utils/data.py b/src/sparsezoo/utils/data.py index 19080764..b458a338 100644 --- a/src/sparsezoo/utils/data.py +++ b/src/sparsezoo/utils/data.py @@ -38,6 +38,7 @@ class _BatchLoader: __slots__ = [ "_data", "_batch_size", + "_single_input", "_iterations", "_batch_buffer", "_batch_template", @@ -51,8 +52,8 @@ def __init__( iterations: int, ): self._data = data - _single_input = type(self._data[0]) is numpy.ndarray - if _single_input: + self._single_input = type(self._data[0]) is numpy.ndarray + if self._single_input: self._data = [self._data] self._batch_size = batch_size self._iterations = iterations @@ -114,11 +115,15 @@ def _make_batch(self) -> Iterable[Union[numpy.ndarray, List[numpy.ndarray]]]: # Copy contents of buffer to batch placeholder # and return A list of numpy array(s) representing the batch - return [ + batch = [ numpy.stack([sample[idx] for sample in self._batch_buffer], out=template) for idx, template in enumerate(self._batch_template) ] + if self._single_input: + batch = batch[0] + return batch + class Dataset(Iterable): """ diff --git a/tests/sparsezoo/utils/utilities.py b/tests/sparsezoo/helpers.py similarity index 100% rename from tests/sparsezoo/utils/utilities.py rename to tests/sparsezoo/helpers.py diff --git a/tests/sparsezoo/models/classification/test_efficientnet.py b/tests/sparsezoo/models/classification/test_efficientnet.py index 4e8b73f8..29eae4c5 100644 --- a/tests/sparsezoo/models/classification/test_efficientnet.py +++ b/tests/sparsezoo/models/classification/test_efficientnet.py @@ -15,7 +15,7 @@ import pytest from sparsezoo.models.classification import efficientnet_b0, efficientnet_b4 -from tests.sparsezoo.utils import model_constructor +from tests.sparsezoo.helpers import model_constructor @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/classification/test_inception.py b/tests/sparsezoo/models/classification/test_inception.py index 208992dd..e4d4a6df 100644 --- a/tests/sparsezoo/models/classification/test_inception.py +++ b/tests/sparsezoo/models/classification/test_inception.py @@ -15,7 +15,7 @@ import pytest from sparsezoo.models.classification import inception_v3 -from tests.sparsezoo.utils import model_constructor +from tests.sparsezoo.helpers import model_constructor @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/classification/test_mobilenet.py b/tests/sparsezoo/models/classification/test_mobilenet.py index 0c3ae7b9..bab2427b 100644 --- a/tests/sparsezoo/models/classification/test_mobilenet.py +++ b/tests/sparsezoo/models/classification/test_mobilenet.py @@ -14,7 +14,7 @@ import pytest from sparsezoo.models.classification import mobilenet_v1, mobilenet_v2 -from tests.sparsezoo.utils import model_constructor +from tests.sparsezoo.helpers import model_constructor @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/classification/test_resnet.py b/tests/sparsezoo/models/classification/test_resnet.py index 8d50b02e..184355c2 100644 --- a/tests/sparsezoo/models/classification/test_resnet.py +++ b/tests/sparsezoo/models/classification/test_resnet.py @@ -23,7 +23,7 @@ resnet_101_2x, resnet_152, ) -from tests.sparsezoo.utils import model_constructor +from tests.sparsezoo.helpers import model_constructor @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/classification/test_vgg.py b/tests/sparsezoo/models/classification/test_vgg.py index 4910a1e6..b34948cc 100644 --- a/tests/sparsezoo/models/classification/test_vgg.py +++ b/tests/sparsezoo/models/classification/test_vgg.py @@ -24,7 +24,7 @@ vgg_19, vgg_19bn, ) -from tests.sparsezoo.utils import model_constructor +from tests.sparsezoo.helpers import model_constructor @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/detection/test_ssd.py b/tests/sparsezoo/models/detection/test_ssd.py index 8f8111e8..cf44b96c 100644 --- a/tests/sparsezoo/models/detection/test_ssd.py +++ b/tests/sparsezoo/models/detection/test_ssd.py @@ -15,7 +15,7 @@ import pytest from sparsezoo.models.detection import ssd_resnet50_300 -from tests.sparsezoo.utils import model_constructor +from tests.sparsezoo.helpers import model_constructor @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/detection/test_yolo.py b/tests/sparsezoo/models/detection/test_yolo.py index 31029151..dbd0c4df 100644 --- a/tests/sparsezoo/models/detection/test_yolo.py +++ b/tests/sparsezoo/models/detection/test_yolo.py @@ -15,7 +15,7 @@ import pytest from sparsezoo.models.detection import yolo_v3 -from tests.sparsezoo.utils import model_constructor +from tests.sparsezoo.helpers import model_constructor @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/test_zoo.py b/tests/sparsezoo/models/test_zoo.py index 9364eaee..855b58e0 100644 --- a/tests/sparsezoo/models/test_zoo.py +++ b/tests/sparsezoo/models/test_zoo.py @@ -19,7 +19,7 @@ from sparsezoo import Zoo from sparsezoo.utils import CACHE_DIR -from tests.sparsezoo.utils import validate_downloaded_model +from tests.sparsezoo.helpers import validate_downloaded_model @pytest.mark.parametrize( diff --git a/tests/sparsezoo/models/test_zoo_extensive.py b/tests/sparsezoo/models/test_zoo_extensive.py index bd9f8b98..5a1dd9d3 100644 --- a/tests/sparsezoo/models/test_zoo_extensive.py +++ b/tests/sparsezoo/models/test_zoo_extensive.py @@ -17,7 +17,7 @@ import pytest from sparsezoo.models import Zoo -from tests.sparsezoo.utils import download_and_verify +from tests.sparsezoo.helpers import download_and_verify def _get_models(domain, sub_domain) -> List[str]: diff --git a/tests/sparsezoo/utils/__init__.py b/tests/sparsezoo/utils/__init__.py index e8d586a1..0c44f887 100644 --- a/tests/sparsezoo/utils/__init__.py +++ b/tests/sparsezoo/utils/__init__.py @@ -11,5 +11,3 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - -from .utilities import * # noqa diff --git a/tests/sparsezoo/utils/test_data.py b/tests/sparsezoo/utils/test_data.py index 132c50e5..3dc5b74d 100644 --- a/tests/sparsezoo/utils/test_data.py +++ b/tests/sparsezoo/utils/test_data.py @@ -120,12 +120,11 @@ def test_iter_batches_single_input_batch_shape( ) _data_dimensions = single_input_dataset.data[0].shape + print("data dimensions", _data_dimensions) expected_batch_shape = (batch_size, *_data_dimensions) - + print(expected_batch_shape) for batch in loader: - batch_data = batch[0] - batch_shape = batch_data.shape - assert batch_shape == expected_batch_shape + assert batch.shape == expected_batch_shape @pytest.mark.parametrize( From f7f385761e79943c7c62428d1c66d126c0106d1d Mon Sep 17 00:00:00 2001 From: rahultuli Date: Mon, 19 Jul 2021 14:39:30 -0400 Subject: [PATCH 8/9] Update:Rename tests/utils.py to tests/helpers.py Fix:Unwrapping Single Input Errors Update:tests_data.py --- tests/sparsezoo/utils/test_data.py | 39 ++++++++++++++++++++++++++---- 1 file changed, 34 insertions(+), 5 deletions(-) diff --git a/tests/sparsezoo/utils/test_data.py b/tests/sparsezoo/utils/test_data.py index 3dc5b74d..6f0f0ec0 100644 --- a/tests/sparsezoo/utils/test_data.py +++ b/tests/sparsezoo/utils/test_data.py @@ -13,6 +13,7 @@ # limitations under the License. from typing import Iterable +import numpy import numpy as np import pytest @@ -88,11 +89,39 @@ def test_iter_batches_returns_iterable(both_datasets, batch_size, iterations): 10, ], ) -def test_batch_is_in_list(both_datasets, batch_size, iterations): - for dataset in both_datasets: - loader = dataset.iter_batches(batch_size=batch_size, iterations=iterations) - for batch in loader: - assert isinstance(batch, list) +def test_batch_is_in_list(multi_input_dataset, batch_size, iterations): + loader = multi_input_dataset.iter_batches( + batch_size=batch_size, iterations=iterations + ) + for batch in loader: + assert isinstance(batch, list) + + +@pytest.mark.parametrize( + "batch_size", + [ + 1, + 2, + 10, + ], +) +@pytest.mark.parametrize( + "iterations", + [ + 1, + 2, + 4, + 10, + ], +) +def test_batch_not_in_list_for_single_input( + single_input_dataset, batch_size, iterations +): + loader = single_input_dataset.iter_batches( + batch_size=batch_size, iterations=iterations + ) + for batch in loader: + assert not isinstance(batch, list) and isinstance(batch, numpy.ndarray) @pytest.mark.parametrize( From f6b31c228da327abf6e96625d3ff56bd0dc5f67f Mon Sep 17 00:00:00 2001 From: rahultuli Date: Tue, 20 Jul 2021 10:28:52 -0400 Subject: [PATCH 9/9] Update:fixes from PR comments --- src/sparsezoo/utils/data.py | 15 +++++++-------- 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/src/sparsezoo/utils/data.py b/src/sparsezoo/utils/data.py index b458a338..c1fe2056 100644 --- a/src/sparsezoo/utils/data.py +++ b/src/sparsezoo/utils/data.py @@ -38,7 +38,7 @@ class _BatchLoader: __slots__ = [ "_data", "_batch_size", - "_single_input", + "_was_wrapped_originally", "_iterations", "_batch_buffer", "_batch_template", @@ -52,14 +52,14 @@ def __init__( iterations: int, ): self._data = data - self._single_input = type(self._data[0]) is numpy.ndarray - if self._single_input: + self._was_wrapped_originally = type(self._data[0]) is list + if not self._was_wrapped_originally: self._data = [self._data] self._batch_size = batch_size self._iterations = iterations - if batch_size < 0 or iterations < 0: + if batch_size <= 0 or iterations <= 0: raise ValueError( - f"Both batch size and number of iterations should be non-negative, " + f"Both batch size and number of iterations should be positive, " f"supplied values (batch_size, iterations):{(batch_size, iterations)}" ) @@ -101,9 +101,7 @@ def _batch_generator(self, source) -> Generator[List[numpy.ndarray], None, None] def _init_batch_template( self, ) -> Iterable[Union[List[numpy.ndarray], numpy.ndarray]]: - # A placeholder for batches - return [ numpy.ascontiguousarray( numpy.zeros((self._batch_size, *_input.shape), dtype=_input.dtype) @@ -120,7 +118,8 @@ def _make_batch(self) -> Iterable[Union[numpy.ndarray, List[numpy.ndarray]]]: for idx, template in enumerate(self._batch_template) ] - if self._single_input: + if not self._was_wrapped_originally: + # unwrap outer list batch = batch[0] return batch