Histogram refactoring and improvements (#3045)

* Transition histplot internatls to use _stats/histogram * Simplify the way we use Hist * Note status of Histogram class * Store stat value in Hist output and simplify interface * Add API examples for Hist * Add parameter validation for Hist.stat * Add some checks on common_norm/common_bin
mwaskom · Oct 4, 2022 · ed3d367 · ed3d367
1 parent ad11bdc
commit ed3d367
Show file tree

Hide file tree

Showing 8 changed files with 386 additions and 65 deletions.
diff --git a/doc/_docstrings/objects.Hist.ipynb b/doc/_docstrings/objects.Hist.ipynb
@@ -0,0 +1,231 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "59690096-a0ad-4ff3-b82c-0258d724035a",
+   "metadata": {
+    "tags": [
+     "hide"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "import seaborn.objects as so\n",
+    "from seaborn import load_dataset\n",
+    "penguins = load_dataset(\"penguins\")"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "c345a35c-bac8-4163-ba40-e7c208df1033",
+   "metadata": {},
+   "source": [
+    "For discrete or categorical variables, this stat is commonly combined with a :class:`Bar` mark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "6a96ac9b-1240-496d-9385-840205945208",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "so.Plot(penguins, \"island\").add(so.Bar(), so.Hist())"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "1e5ff9d5-c6a9-4adc-a9be-0f155b1575be",
+   "metadata": {},
+   "source": [
+    "When used to estimate a univariate distribution, it is better to use the :class:`Bars` mark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "7f3e3144-752a-4d71-9528-85eb1ed0a9a4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p = so.Plot(penguins, \"flipper_length_mm\")\n",
+    "p.add(so.Bars(), so.Hist())"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "008b9ffe-da74-4406-9756-4f70e333f33b",
+   "metadata": {},
+   "source": [
+    "The granularity of the bins will influence whether the underlying distribution is accurately represented. Adjust it by setting the total number:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "27d221d5-add5-40a8-85d2-05102384dad1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.add(so.Bars(), so.Hist(bins=20))"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "fffebb54-0299-45c5-b7fb-6fcad6427239",
+   "metadata": {},
+   "source": [
+    "Alternatively, specify the *width* of the bins:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "d036ca65-7dcf-45ac-a2d1-caafb9f922a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.add(so.Bars(), so.Hist(binwidth=5))"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "bc1e4bd3-2a16-42bd-9c13-a660dd381f66",
+   "metadata": {},
+   "source": [
+    "By default, the transform returns the count of observations in each bin. The counts can be normalized, e.g. to show a proportion:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dbf23712-2231-4226-8265-0e2a5299c4bb",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.add(so.Bars(), so.Hist(stat=\"proportion\"))"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "6c6fb23e-78c5-4630-a958-62cb4dee4ec8",
+   "metadata": {},
+   "source": [
+    "When additional variables define groups, the default behavior is to normalize across all groups:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ac3fe4ef-56e3-4ec7-b580-596d2a3d924b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p = p.facet(\"island\")\n",
+    "p.add(so.Bars(), so.Hist(stat=\"proportion\"))"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "f7afc403-26cc-4325-a28a-913c2291aa35",
+   "metadata": {},
+   "source": [
+    "Pass `common_norm=False` to normalize each distribution independently:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "b2029324-069f-4261-a178-1efad2fd0e88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.add(so.Bars(), so.Hist(stat=\"proportion\", common_norm=False))"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "0f83401a-e456-4a14-af69-f1483c6c03c4",
+   "metadata": {},
+   "source": [
+    "Or, with more than one grouping varible, specify a subset to normalize within:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5c092262-8a8f-4a3e-8cae-9e0f23dd94ba",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.add(so.Bars(), so.Hist(stat=\"proportion\", common_norm=[\"col\"]), color=\"sex\")"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "86532133-bf33-4674-9614-86ae3408aa51",
+   "metadata": {},
+   "source": [
+    "When distributions overlap it may be easier to discern their shapes with an :class:`Area` mark:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "00b18ad8-52d4-460a-a012-d87c66b3e71e",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.add(so.Area(), so.Hist(), color=\"sex\")"
+   ]
+  },
+  {
+   "cell_type": "raw",
+   "id": "2b34d435-abbf-41aa-b219-91883d7d29f3",
+   "metadata": {},
+   "source": [
+    "Or add :class:`Stack` move to represent a part-whole relationship:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "3a7a0c05-d774-4f99-950f-5dc9865027c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "p.add(so.Bars(), so.Hist(), so.Stack(), color=\"sex\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "e247e74b-2c09-40f0-8f45-9fa5f8264d78",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "py310",
+   "language": "python",
+   "name": "py310"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.0"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/seaborn/_statistics.py b/seaborn/_statistics.py
@@ -194,6 +194,8 @@ def __call__(self, x1, x2=None, weights=None):
             return self._eval_bivariate(x1, x2, weights)
 
 
+# Note: we no longer use this for univariate histograms in histplot,
+# preferring _stats.Hist. We'll deprecate this once we have a bivariate Stat class.
 class Histogram:
     """Univariate and bivariate histogram estimator."""
     def __init__(

diff --git a/seaborn/_stats/base.py b/seaborn/_stats/base.py
@@ -1,7 +1,8 @@
 """Base module for statistical transformations."""
 from __future__ import annotations
+from collections.abc import Iterable
 from dataclasses import dataclass
-from typing import ClassVar
+from typing import ClassVar, Any
 
 from typing import TYPE_CHECKING
 if TYPE_CHECKING:
@@ -28,6 +29,18 @@ class Stat:
     # value on the orient axis, but we would not in the latter case.
     group_by_orient: ClassVar[bool] = False
 
+    def _check_param_one_of(self, param: Any, options: Iterable[Any]) -> None:
+        """Raise when parameter value is not one of a specified set."""
+        value = getattr(self, param)
+        if value not in options:
+            *most, last = options
+            option_str = ", ".join(f"{x!r}" for x in most[:-1]) + f" or {last!r}"
+            err = " ".join([
+                f"The `{param}` parameter for `{self.__class__.__name__}` must be",
+                f"one of {option_str}; not {value!r}.",
+            ])
+            raise ValueError(err)
+
     def __call__(
         self,
         data: DataFrame,