ENH: Automatic number of bins for np.histogram #6029

Merged — 1 commit, Aug 15, 2015
4 changes: 3 additions & 1 deletion doc/release/1.11.0-notes.rst
@@ -20,7 +20,9 @@ Compatibility notes

New Features
============

* `np.histogram` now provides plug-in estimators for automatically estimating the
optimal number of bins. Passing one of ['auto', 'fd', 'scott', 'rice', 'sturges']
as the argument to 'bins' results in the corresponding estimator being used.

Improvements
============
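As a quick illustration of the new API in the release note above (a minimal
sketch; the bin count follows from the Sturges formula added in this patch):

>>> import numpy as np
>>> hist, bin_edges = np.histogram(np.arange(1000), bins='sturges')
>>> hist.size  # ceil(log2(1000)) + 1
11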
171 changes: 170 additions & 1 deletion numpy/lib/function_base.py
@@ -27,6 +27,7 @@
from numpy.core.multiarray import digitize, bincount, interp as compiled_interp
from numpy.core.umath import _add_newdoc_ufunc as add_newdoc_ufunc
from numpy.compat import long
from numpy.compat.py3k import basestring

# Force range to be a generator, for np.delete's usage.
if sys.version_info[0] < 3:
@@ -75,6 +76,91 @@ def iterable(y):
return 1


def _hist_optim_numbins_estimator(a, estimator):
"""
A helper function called from histogram to deal with estimating the optimal number of bins.

estimator: str
If estimator is one of ['auto', 'fd', 'scott', 'rice', 'sturges'], this function
will choose the appropriate estimator and return its estimate for the optimal
number of bins.
"""
assert isinstance(estimator, basestring)
# private function should not be called otherwise

if a.size == 0:
return 1

def sturges(x):
"""
Sturges Estimator
A very simplistic estimator based on the assumption of normality of the data.
Poor performance for non-normal data, especially obvious for large datasets.
Depends only on the size of the data.
"""
return np.ceil(np.log2(x.size)) + 1

def rice(x):
"""
Rice Estimator
Another simple estimator, with no normality assumption.
It has better performance for large data, but tends to overestimate the number of bins.
The number of bins is proportional to the cube root of the data size (asymptotically optimal).
Depends only on the size of the data.
"""
return np.ceil(2 * x.size ** (1.0 / 3))

def scott(x):
"""
Scott Estimator
The binwidth is proportional to the standard deviation of the data and
inversely proportional to the cube root of the data size (asymptotically optimal).
"""
h = 3.5 * x.std() * x.size ** (-1.0 / 3)
if h > 0:
return np.ceil(x.ptp() / h)
return 1

def fd(x):
"""
Freedman-Diaconis rule, using the interquartile range (IQR) for the binwidth.
Considered a variation of the Scott rule with more robustness, as the IQR
is less affected by outliers than the standard deviation. However, the IQR depends
on fewer points than the sd, so it is less accurate, especially for long-tailed
distributions.

If the IQR is 0, we return 1 for the number of bins.
The binwidth is inversely proportional to the cube root of the data size (asymptotically optimal).
"""
iqr = np.subtract(*np.percentile(x, [75, 25]))

if iqr > 0:
h = (2 * iqr * x.size ** (-1.0 / 3))
return np.ceil(x.ptp() / h)

# If iqr is 0, default number of bins is 1
return 1

def auto(x):
"""
The FD estimator is usually the most robust method, but it tends to produce
too few bins for small datasets. The Sturges estimator is quite good for small
(<1000) datasets and is the default in R.
This method gives good off-the-shelf behaviour.
"""
return max(fd(x), sturges(x))

optimal_numbins_methods = {'sturges': sturges, 'rice': rice, 'scott': scott,
'fd': fd, 'auto': auto}
try:
estimator_func = optimal_numbins_methods[estimator.lower()]
except KeyError:
raise ValueError("{0} is not a valid method for `bins`".format(estimator))
else:
# these methods return floats, np.histogram requires an int
return int(estimator_func(a))

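# Illustrative values for the estimators above (a sketch, not part of the
# patch), for x = np.arange(1000), where x.ptp() = 999, x.std() ~ 288.675,
# IQR = 499.5 and 1000**(1.0/3) = 10:
#   sturges -> ceil(log2(1000)) + 1            = 11
#   rice    -> ceil(2 * 1000**(1.0/3))         = 20
#   scott   -> ceil(999 / (3.5 * 288.675/10))  = 10
#   fd      -> ceil(999 / (2 * 499.5 / 10))    = 10
#   auto    -> max(fd, sturges)                = 11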

def histogram(a, bins=10, range=None, normed=False, weights=None,
density=None):
"""
@@ -84,11 +170,37 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
----------
a : array_like
Input data. The histogram is computed over the flattened array.
bins : int or sequence of scalars, optional
bins : int or sequence of scalars or str, optional
If `bins` is an int, it defines the number of equal-width
bins in the given range (10, by default). If `bins` is a sequence,
it defines the bin edges, including the rightmost edge, allowing
for non-uniform bin widths.

.. versionadded:: 1.11.0

If `bins` is a string from the list below, `histogram` will use the
chosen method to calculate the optimal number of bins (see Notes for more
detail on the estimators). For visualisation, we suggest using the 'auto' option.

'auto'
Maximum of the 'sturges' and 'fd' estimators. Provides good all-round performance.

'fd' (Freedman Diaconis Estimator)
Robust (resilient to outliers) estimator that takes into account data
variability and data size.

'scott'
Less robust estimator that takes into account data
variability and data size.

'rice'
Estimator does not take variability into account, only data size.
Commonly overestimates the number of bins required.

'sturges'
R's default method, which only accounts for data size. Only optimal for
Gaussian data; underestimates the number of bins for large non-Gaussian datasets.

range : (float, float), optional
The lower and upper range of the bins. If not provided, range
is simply ``(a.min(), a.max())``. Values outside the range are
@@ -140,6 +252,48 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
second ``[2, 3)``. The last bin, however, is ``[3, 4]``, which *includes*
4.

.. versionadded:: 1.11.0
Review comment (Member):
I would say putting it only with the keyword is sufficient (but frankly we have reached bikeshedding). The other option is to put it only in the Notes section. I will put this in soon (unless someone beats me to it). Let's not start breaking rules again; rather, hope that 1.11 is soon. I feel there may be some nice things coming up soon enough ;).


The methods to estimate the optimal number of bins are well established in the
literature, and are inspired by the choices R provides for histogram visualisation.
Note that having the number of bins proportional to :math:`n^{1/3}` is asymptotically optimal,
which is why it appears in most estimators.
These are simply plug-in methods that give good starting points for the number of bins.
In the equations below, :math:`h` is the binwidth and :math:`n_h` is the number of bins.

'Auto' (maximum of the 'Sturges' and 'FD' estimators)
A compromise to get a good value. For small datasets the Sturges
value will usually be chosen, while larger datasets will usually default to FD.
Avoids the overly conservative behaviour of FD and Sturges for small and
large datasets respectively. The switchover point is usually around 1000 data points.

'FD' (Freedman Diaconis Estimator)
.. math:: h = 2 \\frac{IQR}{n^{1/3}}
The binwidth is proportional to the interquartile range (IQR)
and inversely proportional to the cube root of ``a.size``. Can be too
conservative for small datasets, but is quite good for large datasets.
The IQR is very robust to outliers. A worked example follows this list.

'Scott'
.. math:: h = \\frac{3.5\\sigma}{n^{1/3}}
The binwidth is proportional to the standard deviation (sd) of the data
and inversely proportional to the cube root of ``a.size``. Can be too
conservative for small datasets, but is quite good for large datasets.
The sd is not very robust to outliers. Values
are very similar to the Freedman Diaconis estimator in the absence of outliers.

'Rice'
.. math:: n_h = \\left\\lceil 2n^{1/3} \\right\\rceil
The number of bins is only proportional to the cube root of ``a.size``.
It tends to overestimate the number of bins,
and it does not take data variability into account.

'Sturges'
.. math:: n_h = \\left\\lceil \\log _{2}n+1 \\right\\rceil
The number of bins is one plus the base-2 log of ``a.size``, rounded up.
This estimator assumes normality of the data and is too conservative for larger,
non-normal datasets. This is the default method in R's ``hist`` function.
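As a worked example of the FD rule above (illustrative; assume 1000 evenly
spaced points on [0, 999], so the IQR is 749.25 - 249.75 = 499.5):

.. math:: h = 2 \\frac{499.5}{1000^{1/3}} = 99.9, \\qquad
n_h = \\left\\lceil \\frac{999}{99.9} \\right\\rceil = 10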

Examples
--------
>>> np.histogram([1, 2, 1], bins=[0, 1, 2, 3])
@@ -158,6 +312,16 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
>>> np.sum(hist*np.diff(bin_edges))
1.0

.. versionadded:: 1.11.0

Automated bin selection method example, using two-peaked random data with 2000 points:

>>> import matplotlib.pyplot as plt
>>> rng = np.random.RandomState(10)  # deterministic random data
>>> a = np.hstack((rng.normal(size=1000), rng.normal(loc=5, scale=2, size=1000)))
>>> plt.hist(a, bins='auto')  # plt.hist passes its arguments to np.histogram
>>> plt.title("Histogram with 'auto' bins")
>>> plt.show()
"""

a = asarray(a)
@@ -175,6 +339,11 @@ def histogram(a, bins=10, range=None, normed=False, weights=None,
raise AttributeError(
'max must be larger than min in range parameter.')

if isinstance(bins, basestring):
bins = _hist_optim_numbins_estimator(a, bins)
# if `bins` is a string for an automatic method,
# this will replace it with the calculated number of bins

# Histogram is an integer or a float array depending on the weights.
if weights is None:
ntype = np.dtype(np.intp)
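To see the string dispatch end to end, a quick sketch (the data and expected
bin counts mirror the outlier test added below):

>>> import numpy as np
>>> data = np.hstack((np.linspace(-110, -100, 5), np.linspace(-10, 10, 50)))
>>> np.histogram(data, bins='fd')[0].size     # the IQR is barely moved by the outliers
21
>>> np.histogram(data, bins='scott')[0].size  # the sd is inflated by them, so fewer bins
5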
90 changes: 90 additions & 0 deletions numpy/lib/tests/test_function_base.py
@@ -1234,6 +1234,96 @@ def test_empty(self):
assert_array_equal(b, np.array([0, 1]))


class TestHistogramOptimBinNums(TestCase):
"""
Provide test coverage for the provided estimators of the optimal number of bins
"""

def test_empty(self):
estimator_list = ['fd', 'scott', 'rice', 'sturges', 'auto']
# check it can deal with empty data
for estimator in estimator_list:
a, b = histogram([], bins=estimator)
assert_array_equal(a, np.array([0]))
assert_array_equal(b, np.array([0, 1]))

def test_simple(self):
"""
Straightforward testing with a mixture of linspace data (for consistency).
All test values have been precomputed and the values shouldn't change.
"""
# some basic sanity checking, with some fixed data. Checking for the correct number of bins
basic_test = {50: {'fd': 4, 'scott': 4, 'rice': 8, 'sturges': 7, 'auto': 7},
500: {'fd': 8, 'scott': 8, 'rice': 16, 'sturges': 10, 'auto': 10},
5000: {'fd': 17, 'scott': 17, 'rice': 35, 'sturges': 14, 'auto': 17}}

for testlen, expectedResults in basic_test.items():
# create some sort of non uniform data to test with (2 peak uniform mixture)
x1 = np.linspace(-10, -1, testlen // 5 * 2)
x2 = np.linspace(1, 10, testlen // 5 * 3)
x = np.hstack((x1, x2))
for estimator, numbins in expectedResults.items():
a, b = np.histogram(x, estimator)
assert_equal(len(a), numbins,
err_msg="For the {0} estimator with datasize of {1} ".format(estimator, testlen))

def test_small(self):
"""
Smaller datasets have the potential to cause issues with the data-adaptive methods,
especially the FD method.
All bin numbers have been precalculated.
"""
small_dat = {1: {'fd': 1, 'scott': 1, 'rice': 2, 'sturges': 1},
2: {'fd': 2, 'scott': 1, 'rice': 3, 'sturges': 2},
3: {'fd': 2, 'scott': 2, 'rice': 3, 'sturges': 3}}

for testlen, expectedResults in small_dat.items():
testdat = np.arange(testlen)
for estimator, expbins in expectedResults.items():
a, b = np.histogram(testdat, estimator)
assert_equal(len(a), expbins,
err_msg="For the {0} estimator with datasize of {1} ".format(estimator, testlen))

def test_incorrect_methods(self):
"""
Check that a ValueError is raised when an unknown string is passed in
"""
check_list = ['mad', 'freeman', 'histograms', 'IQR']
for estimator in check_list:
assert_raises(ValueError, histogram, [1, 2, 3], estimator)

def test_novariance(self):
"""
Check that methods handle no variance in data
Primarily for Scott and FD as the SD and IQR are both 0 in this case
"""
novar_dataset = np.ones(100)
novar_resultdict = {'fd': 1, 'scott': 1, 'rice': 10, 'sturges': 8, 'auto': 8}

for estimator, numbins in novar_resultdict.items():
a, b = np.histogram(novar_dataset, estimator)
assert_equal(len(a), numbins,
err_msg="{0} estimator, No Variance test".format(estimator))

def test_outlier(self):
"""
Check the fd and scott with outliers
The fd determines a smaller binwidth since it's less affected by outliers
since the range is so (artificially) large this means more bins
most of which will be empty, but the data of interest usually is unaffected.
The Scott estimator is more affected and returns fewer bins, despite most of
the variance being in one area of the data
"""
xcenter = np.linspace(-10, 10, 50)
outlier_dataset = np.hstack((np.linspace(-110, -100, 5), xcenter))

outlier_resultdict = {'fd': 21, 'scott': 5}

for estimator, numbins in outlier_resultdict.items():
a, b = np.histogram(outlier_dataset, estimator)
assert_equal(len(a), numbins)


class TestHistogramdd(TestCase):

def test_simple(self):