Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ENH: add functionality symlog to numpy.histogram #26287

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
85 changes: 80 additions & 5 deletions numpy/lib/_histograms_impl.py
Expand Up @@ -359,15 +359,15 @@ def _unsigned_subtract(a, b):
casting='unsafe', dtype=unsigned_dt)


def _get_bin_edges(a, bins, range, weights):
def _get_bin_edges(a, bins, range, weights, symlog=None):
"""
Computes the bins used internally by `histogram`.

Parameters
==========
a : ndarray
Ravelled data array
bins, range
bins, range, symlog
Forwarded arguments from `histogram`.
weights : ndarray, optional
Ravelled weights array, or None
Expand All @@ -383,6 +383,17 @@ def _get_bin_edges(a, bins, range, weights):
# parse the overloaded bins argument
n_equal_bins = None
bin_edges = None
if symlog:
if np.ndim(bins) != 0:
warnings.warn(
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This should raise an error

"symlog option is only valid when bins is an integer. "
"Attempting without symlog.",
stacklevel=2
)
return _get_bin_edges(a, bins, range, weights, symlog=None)
n_unequal_bins = bins
bin_edges = _get_geomspace_edges(n_unequal_bins, a)
Comment on lines +394 to +395
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
n_unequal_bins = bins
bin_edges = _get_geomspace_edges(n_unequal_bins, a)
bin_edges = _get_geomspace_edges(bins, a)

return bin_edges, None

if isinstance(bins, str):
bin_name = bins
Expand Down Expand Up @@ -453,6 +464,37 @@ def _get_bin_edges(a, bins, range, weights):
return bin_edges, None


def _get_geomspace_edges(n_unequal_bins, a):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
def _get_geomspace_edges(n_unequal_bins, a):
def _get_geomspace_edges(a, n_unequal_bins):

(keep the same order as _get_bin_edges)

"""
Compute the bin edges for a histogram with geometrically spaced bins.
The bins are spaced such that the width of each bin is constant in
log-space.
Reference issue: https://github.com/numpy/numpy/issues/24368
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
Reference issue: https://github.com/numpy/numpy/issues/24368

No need to include references to github issues for user documentation

Returns
-------
bin_edges : ndarray
The edges of the bins.
"""
# The idea is to use the absolute min and max of the data to compute the
# range, and then the pseudo-first and last edge.
pseudo_first_edge, pseudo_last_edge = abs(a).min(), abs(a).max()
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
pseudo_first_edge, pseudo_last_edge = abs(a).min(), abs(a).max()
abs_a = abs(a)
pseudo_first_edge, pseudo_last_edge = abs_a.min(), abs_a.max()

# Compute the edges of the bins.
num_bins = int(n_unequal_bins)
if n_unequal_bins % 2 == 0:
num_bins = int(n_unequal_bins / 2)
bin_edges_geomspaced = np.geomspace(
pseudo_first_edge, pseudo_last_edge, num=num_bins
)
bin_edges_concatenated = np.concatenate(
(-bin_edges_geomspaced, [0], bin_edges_geomspaced)
)
bin_edges_sorted = np.sort(bin_edges_concatenated)
bin_edges = bin_edges_sorted
if n_unequal_bins % 2 == 1:
bin_edges = bin_edges_sorted[::2]
return bin_edges


def _search_sorted_inclusive(a, v):
"""
Like `searchsorted`, but where the last item in `v` is placed on the right.
Expand Down Expand Up @@ -673,12 +715,13 @@ def histogram_bin_edges(a, bins=10, range=None, weights=None):


def _histogram_dispatcher(
a, bins=None, range=None, density=None, weights=None):
a, bins=None, range=None, density=None, weights=None, symlog=None
):
return (a, bins, weights)


@array_function_dispatch(_histogram_dispatcher)
def histogram(a, bins=10, range=None, density=None, weights=None):
def histogram(a, bins=10, range=None, density=None, weights=None, symlog=None):
r"""
Compute the histogram of a dataset.

Expand Down Expand Up @@ -721,6 +764,10 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
the *integral* over the range is 1. Note that the sum of the
histogram values will not be equal to 1 unless bins of unity
width are chosen; it is not a probability *mass* function.
symlog : bool, optional
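If True, compute bin edges that are geometrically spaced and symmetric
about zero, so that data containing both negative and positive numbers
can be binned. Only honoured when `bins` is an integer; otherwise a
warning is issued and `symlog` is ignored.
Based on https://github.com/numpy/numpy/issues/24368.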

Returns
-------
Expand Down Expand Up @@ -768,6 +815,34 @@ def histogram(a, bins=10, range=None, density=None, weights=None):

.. versionadded:: 1.11.0

>>> # Example for symlog
>>> data = np.array([
... -2.40000e-03, -3.50000e-01, -4.60000e-02, -2.00000e-01,
... -3.60000e-04, -4.00000e+00, -2.60000e+01, -3.64000e+02,
... -2.43000e+02, -1.53240e+04, -1.35525e+05, 1.20000e-02,
... 3.00000e-01, 1.40000e-03, 7.00000e-01, 7.00000e+00,
... 3.60000e+01, 9.46000e+02, 2.54520e+04, -4.80000e-03,
... -7.00000e-01, -9.20000e-02, -4.00000e-01, -7.20000e-04,
... -8.00000e+00, -5.20000e+01, -7.28000e+02, -4.86000e+02,
... -3.06480e+04, -2.71050e+05, 2.40000e-02, 6.00000e-01,
... 2.80000e-03, 1.40000e+00, 1.40000e+01, 7.20000e+01,
... 1.89200e+03, 5.09040e+04, -1.20000e-03, -1.75000e-01,
... -2.30000e-02, -1.00000e-01, -1.80000e-04, -2.00000e+00,
... -1.30000e+01, -1.82000e+02, -1.21500e+02, -7.66200e+03,
... -6.77625e+04, 6.00000e-03, 1.50000e-01, 7.00000e-04,
... 3.50000e-01, 3.50000e+00, 1.80000e+01, 4.73000e+02,
... 1.27260e+04])
>>> np.histogram(data, bins=5, symlog=True)
(array([16, 16, 1, 13, 11]),
array([-2.71050000e+05, -6.98491231e+00, -1.80000000e-04, 1.80000000e-04,
6.98491231e+00, 2.71050000e+05]))

>>> np.histogram(data, bins=10, symlog=True)
(array([ 6, 10, 10, 6, 1, 0, 6, 7, 7, 4]),
array([-2.71050000e+05, -1.37595802e+03, -6.98491231e+00, -3.54582038e-02,
-1.80000000e-04, 0.00000000e+00, 1.80000000e-04, 3.54582038e-02,
6.98491231e+00, 1.37595802e+03, 2.71050000e+05]))

Automated Bin Selection Methods example, using 2 peak random data
with 2000 points.

Expand All @@ -787,7 +862,7 @@ def histogram(a, bins=10, range=None, density=None, weights=None):
"""
a, weights = _ravel_and_check_weights(a, weights)

bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights)
bin_edges, uniform_bins = _get_bin_edges(a, bins, range, weights, symlog)

# Histogram is an integer or a float array depending on the weights.
if weights is None:
Expand Down
1 change: 1 addition & 0 deletions numpy/lib/_histograms_impl.pyi
Expand Up @@ -36,6 +36,7 @@ def histogram(
range: None | tuple[float, float] = ...,
density: bool = ...,
weights: None | ArrayLike = ...,
symlog: None | bool = ...,
) -> tuple[NDArray[Any], NDArray[Any]]: ...

def histogramdd(
Expand Down
113 changes: 113 additions & 0 deletions numpy/lib/tests/test_histograms.py
Expand Up @@ -413,6 +413,119 @@ def test_gh_23110(self):
expected_hist = np.array([1, 0])
assert_array_equal(hist, expected_hist)

def test_histogram_positive_data(self):
    # With symlog enabled, strictly positive samples must still yield
    # exactly the requested number of bins (and one more edge than bins).
    n_bins = 5
    samples = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10])
    counts, edges = np.histogram(samples, bins=n_bins, symlog=True)
    assert len(counts) == n_bins
    assert len(edges) == n_bins + 1

def test_symlog(self):
    """
    Exercise `np.histogram` with the ``symlog`` option.

    Integer `bins` must yield the requested number of bins with the
    expected counts; explicit bin edges must trigger a warning and give
    the same counts as a call without ``symlog``.
    """
    samples = np.array([
        -2.40000e-03, -3.50000e-01, -4.60000e-02, -2.00000e-01,
        -3.60000e-04, -4.00000e00, -2.60000e01, -3.64000e02,
        -2.43000e02, -1.53240e04, -1.35525e05, 1.20000e-02,
        3.00000e-01, 1.40000e-03, 7.00000e-01, 7.00000e00,
        3.60000e01, 9.46000e02, 2.54520e04, -4.80000e-03,
        -7.00000e-01, -9.20000e-02, -4.00000e-01, -7.20000e-04,
        -8.00000e00, -5.20000e01, -7.28000e02, -4.86000e02,
        -3.06480e04, -2.71050e05, 2.40000e-02, 6.00000e-01,
        2.80000e-03, 1.40000e00, 1.40000e01, 7.20000e01,
        1.89200e03, 5.09040e04, -1.20000e-03, -1.75000e-01,
        -2.30000e-02, -1.00000e-01, -1.80000e-04, -2.00000e00,
        -1.30000e01, -1.82000e02, -1.21500e02, -7.66200e03,
        -6.77625e04, 6.00000e-03, 1.50000e-01, 7.00000e-04,
        3.50000e-01, 3.50000e00, 1.80000e01, 4.73000e02,
        1.27260e04,
    ])

    # Integer bin counts: one odd and one even.
    integer_cases = (
        (5, [16, 16, 1, 13, 11]),
        (10, [6, 10, 10, 6, 1, 0, 6, 7, 7, 4]),
    )
    for n_bins, expected in integer_cases:
        counts, edges = np.histogram(samples, bins=n_bins, symlog=True)
        assert len(counts) == n_bins
        assert len(edges) == n_bins + 1
        assert np.array_equal(counts, np.array(expected))

    # Explicit edges: symlog is not applicable and must only warn.
    explicit_edges = [
        -2.71050000e05,
        -6.98491231e00,
        -1.80000000e-04,
        1.80000000e-04,
        6.98491231e00,
        2.71050000e05,
    ]
    n_edges = len(explicit_edges)
    expected_counts = np.array([16, 16, 1, 13, 11])
    warning_pattern = (
        "symlog option is only valid when bins is an integer. "
        "Attempting without symlog."
    )

    with pytest.warns(UserWarning, match=warning_pattern):
        counts, edges = np.histogram(
            samples, bins=explicit_edges, symlog=True
        )
    assert len(counts) == n_edges - 1
    assert len(edges) == n_edges
    assert np.array_equal(counts, expected_counts)

    # The same edges without symlog give the same result.
    counts, edges = np.histogram(samples, bins=explicit_edges)
    assert len(counts) == n_edges - 1
    assert len(edges) == n_edges
    assert np.array_equal(counts, expected_counts)

class TestHistogramOptimBinNums:
"""
Expand Down