Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

Already on GitHub? Sign in to your account

VIS/ENH Hexbin plot #5478

Merged
merged 1 commit into from Feb 14, 2014
Jump to file or symbol
Failed to load files and symbols.
+194 −2
Split
View
@@ -53,6 +53,8 @@ pandas 0.14.0
New features
~~~~~~~~~~~~
+- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
+
API Changes
~~~~~~~~~~~
View
@@ -154,6 +154,7 @@ Enhancements
- ``plot(legend='reverse')`` will now reverse the order of legend labels for
most plot kinds. (:issue:`6014`)
- improve performance of slice indexing on Series with string keys (:issue:`6341`)
+- Hexagonal bin plots from ``DataFrame.plot`` with ``kind='hexbin'`` (:issue:`5478`)
Performance
~~~~~~~~~~~
@@ -414,6 +414,59 @@ setting `kind='kde'`:
@savefig kde_plot.png
ser.plot(kind='kde')
+.. _visualization.hexbin:
+
+Hexagonal Bin plot
+~~~~~~~~~~~~~~~~~~
+*New in 0.14* You can create hexagonal bin plots with ``DataFrame.plot`` and
+``kind='hexbin'``.
+Hexbin plots can be a useful alternative to scatter plots if your data are
+too dense to plot each point individually.
+
+.. ipython:: python
+ :suppress:
+
+ plt.figure();
+
+.. ipython:: python
+
+ df = DataFrame(randn(1000, 2), columns=['a', 'b'])
+ df['b'] = df['b'] + np.arange(1000)
+
+ @savefig hexbin_plot.png
+ df.plot(kind='hexbin', x='a', y='b', gridsize=25)
+
+
+A useful keyword argument is ``gridsize``; it controls the number of hexagons
+in the x-direction, and defaults to 100. A larger ``gridsize`` means more, smaller
+bins.
+
+By default, a histogram of the counts around each ``(x, y)`` point is computed.
+You can specify alternative aggregations by passing values to the ``C`` and
+``reduce_C_function`` arguments. ``C`` specifies the value at each ``(x, y)`` point
+and ``reduce_C_function`` is a function of one argument that reduces all the
+values in a bin to a single number (e.g. ``mean``, ``max``, ``sum``, ``std``). In this
+example the positions are given by columns ``a`` and ``b``, while the value is
+given by column ``z``. The bins are aggregated with numpy's ``max`` function.
+
+.. ipython:: python
+ :suppress:
+
+ plt.figure();
+
+.. ipython:: python
+
+ df = DataFrame(randn(1000, 2), columns=['a', 'b'])
+ df['b'] = df['b'] + np.arange(1000)
+ df['z'] = np.random.uniform(0, 3, 1000)
+
+ @savefig hexbin_plot_agg.png
+ df.plot(kind='hexbin', x='a', y='b', C='z', reduce_C_function=np.max,
+ gridsize=25)
+
+
+See the `matplotlib hexbin documentation <http://matplotlib.org/api/pyplot_api.html#matplotlib.pyplot.hexbin>`__ for more.
+
.. _visualization.andrews_curves:
Andrews Curves
@@ -956,6 +956,65 @@ def test_invalid_kind(self):
with tm.assertRaises(ValueError):
df.plot(kind='aasdf')
@slow
def test_hexbin_basic(self):
    """A plain hexbin call draws exactly one collection on the axes."""
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    ax = df.plot(kind='hexbin', x='A', y='B', gridsize=10)
    # TODO: need better way to test. This just does existence.
    # assertEqual instead of the deprecated ``assert_`` alias: it reports
    # the actual count on failure.
    self.assertEqual(len(ax.collections), 1)
+
@slow
def test_hexbin_with_c(self):
    """Hexbin with a ``C`` column, with and without a custom reducer."""
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    # Values taken from column 'C', reduced with matplotlib's default.
    ax = df.plot(kind='hexbin', x='A', y='B', C='C')
    self.assertEqual(len(ax.collections), 1)

    # Same values, reduced per bin with an explicit function.
    ax = df.plot(kind='hexbin', x='A', y='B', C='C',
                 reduce_C_function=np.std)
    self.assertEqual(len(ax.collections), 1)
+
@slow
def test_hexbin_cmap(self):
    """The colormap defaults to 'BuGn' and honours ``colormap=``."""
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    # Default to BuGn
    ax = df.plot(kind='hexbin', x='A', y='B')
    self.assertEqual(ax.collections[0].cmap.name, 'BuGn')

    cm = 'cubehelix'
    ax = df.plot(kind='hexbin', x='A', y='B', colormap=cm)
    self.assertEqual(ax.collections[0].cmap.name, cm)
+
@slow
def test_no_color_bar(self):
    """With ``colorbar=None`` the hexbin collection has no colorbar attached."""
    columns = {"A": np.random.uniform(size=20),
               "B": np.random.uniform(size=20),
               "C": np.arange(20) + np.random.uniform(size=20)}
    frame = DataFrame(columns)

    axes = frame.plot(kind='hexbin', x='A', y='B', colorbar=None)
    hexes = axes.collections[0]
    self.assertIs(hexes.colorbar, None)
+
@slow
def test_allow_cmap(self):
    """matplotlib's ``cmap`` name is accepted, but not together with ``colormap``."""
    df = DataFrame({"A": np.random.uniform(size=20),
                    "B": np.random.uniform(size=20),
                    "C": np.arange(20) + np.random.uniform(size=20)})

    ax = df.plot(kind='hexbin', x='A', y='B', cmap='YlGn')
    self.assertEqual(ax.collections[0].cmap.name, 'YlGn')

    # Supplying both spellings is ambiguous and must be rejected.
    with tm.assertRaises(TypeError):
        df.plot(kind='hexbin', x='A', y='B', cmap='YlGn',
                colormap='BuGn')
+
@tm.mplskip
class TestDataFrameGroupByPlots(tm.TestCase):
View
@@ -835,7 +835,14 @@ def __init__(self, data, kind=None, by=None, subplots=False, sharex=True,
secondary_y = [secondary_y]
self.secondary_y = secondary_y
- self.colormap = colormap
+ # ugly TypeError if user passes matplotlib's `cmap` name.
+ # Probably better to accept either.
+ if 'cmap' in kwds and colormap:
+ raise TypeError("Only specify one of `cmap` and `colormap`.")
+ elif 'cmap' in kwds:
+ self.colormap = kwds.pop('cmap')
+ else:
+ self.colormap = colormap
self.kwds = kwds
@@ -1263,6 +1270,52 @@ def _post_plot_logic(self):
ax.set_xlabel(com.pprint_thing(x))
class HexBinPlot(MPLPlot):
    """Hexagonal-bin plot of two columns, selected with ``kind='hexbin'``.

    Parameters
    ----------
    data : passed through to ``MPLPlot.__init__``
    x, y : column label or integer position
        Columns providing the point coordinates. Both are required.
    C : column label or integer position, optional
        Column whose values are handed to matplotlib's ``hexbin`` as ``C``.
        When omitted, ``C=None`` is passed.
    **kwargs
        Forwarded to ``MPLPlot.__init__``. Whatever remains in ``self.kwds``
        is later forwarded to ``ax.hexbin``, except ``colorbar`` (default
        True), which controls whether a colorbar is drawn.

    Raises
    ------
    ValueError
        If ``x`` or ``y`` is None.
    """

    def __init__(self, data, x, y, C=None, **kwargs):
        MPLPlot.__init__(self, data, **kwargs)

        if x is None or y is None:
            raise ValueError('hexbin requires an x and y column')

        self.x = self._resolve_label(x)
        self.y = self._resolve_label(y)
        self.C = self._resolve_label(C)

    def _resolve_label(self, key):
        """Map an integer position to a column label when columns are not integer-labelled."""
        columns = self.data.columns
        if com.is_integer(key) and not columns.holds_integer():
            return columns[key]
        return key

    def _make_plot(self):
        import matplotlib.pyplot as plt

        data = self.data
        ax = self.axes[0]
        # pandas uses colormap, matplotlib uses cmap.
        cmap = plt.cm.get_cmap(self.colormap or 'BuGn')
        # ``colorbar`` is a pandas-level option; remove it before the
        # remaining keywords are forwarded to matplotlib.
        cb = self.kwds.pop('colorbar', True)

        c_values = None if self.C is None else data[self.C].values

        # Keep the collection returned by hexbin rather than reading
        # ax.collections[0]: the axes may already hold other collections
        # when the caller supplied an existing ``ax``.
        collection = ax.hexbin(data[self.x].values, data[self.y].values,
                               C=c_values, cmap=cmap, **self.kwds)
        if cb:
            self.fig.colorbar(collection, ax=ax)

    def _post_plot_logic(self):
        # Label the axes with the column names that were plotted.
        ax = self.axes[0]
        ax.set_ylabel(com.pprint_thing(self.y))
        ax.set_xlabel(com.pprint_thing(self.x))
+
+
class LinePlot(MPLPlot):
def __init__(self, data, **kwargs):
@@ -1663,11 +1716,12 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
ax : matplotlib axis object, default None
style : list or dict
matplotlib line style per column
- kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter'}
+ kind : {'line', 'bar', 'barh', 'kde', 'density', 'scatter', 'hexbin'}
bar : vertical bar plot
barh : horizontal bar plot
kde/density : Kernel Density Estimation plot
scatter: scatter plot
+ hexbin: hexbin plot
logx : boolean, default False
For line plots, use log scaling on x axis
logy : boolean, default False
@@ -1695,6 +1749,17 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
Returns
-------
ax_or_axes : matplotlib.AxesSubplot or list of them
+
+ Notes
+ -----
+
+ If `kind`='hexbin', you can control the size of the bins with the
+ `gridsize` argument. By default, a histogram of the counts around each
+ `(x, y)` point is computed. You can specify alternative aggregations
+ by passing values to the `C` and `reduce_C_function` arguments.
+ `C` specifies the value at each `(x, y)` point and `reduce_C_function`
+ is a function of one argument that reduces all the values in a bin to
+ a single number (e.g. `mean`, `max`, `sum`, `std`).
"""
kind = _get_standard_kind(kind.lower().strip())
if kind == 'line':
@@ -1705,6 +1770,8 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
klass = KdePlot
elif kind == 'scatter':
klass = ScatterPlot
+ elif kind == 'hexbin':
+ klass = HexBinPlot
else:
raise ValueError('Invalid chart type given %s' % kind)
@@ -1717,6 +1784,16 @@ def plot_frame(frame=None, x=None, y=None, subplots=False, sharex=True,
figsize=figsize, logx=logx, logy=logy,
sort_columns=sort_columns, secondary_y=secondary_y,
**kwds)
+ elif kind == 'hexbin':
+ C = kwds.pop('C', None) # remove from kwargs so we can set default
+ plot_obj = klass(frame, x=x, y=y, kind=kind, subplots=subplots,
+ rot=rot,legend=legend, ax=ax, style=style,
+ fontsize=fontsize, use_index=use_index, sharex=sharex,
+ sharey=sharey, xticks=xticks, yticks=yticks,
+ xlim=xlim, ylim=ylim, title=title, grid=grid,
+ figsize=figsize, logx=logx, logy=logy,
+ sort_columns=sort_columns, secondary_y=secondary_y,
+ C=C, **kwds)
else:
if x is not None:
if com.is_integer(x) and not frame.columns.holds_integer():