Skip to content

Commit

Permalink
Histogram refactoring and improvements (#3045)
Browse files Browse the repository at this point in the history
* Transition histplot internals to use _stats/histogram

* Simplify the way we use Hist

* Note status of Histogram class

* Store stat value in Hist output and simplify interface

* Add API examples for Hist

* Add parameter validation for Hist.stat

* Add some checks on common_norm/common_bin
  • Loading branch information
mwaskom committed Oct 4, 2022
1 parent ad11bdc commit ed3d367
Show file tree
Hide file tree
Showing 8 changed files with 386 additions and 65 deletions.
231 changes: 231 additions & 0 deletions doc/_docstrings/objects.Hist.ipynb
@@ -0,0 +1,231 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "59690096-a0ad-4ff3-b82c-0258d724035a",
"metadata": {
"tags": [
"hide"
]
},
"outputs": [],
"source": [
"import seaborn.objects as so\n",
"from seaborn import load_dataset\n",
"penguins = load_dataset(\"penguins\")"
]
},
{
"cell_type": "raw",
"id": "c345a35c-bac8-4163-ba40-e7c208df1033",
"metadata": {},
"source": [
"For discrete or categorical variables, this stat is commonly combined with a :class:`Bar` mark:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "6a96ac9b-1240-496d-9385-840205945208",
"metadata": {},
"outputs": [],
"source": [
"so.Plot(penguins, \"island\").add(so.Bar(), so.Hist())"
]
},
{
"cell_type": "raw",
"id": "1e5ff9d5-c6a9-4adc-a9be-0f155b1575be",
"metadata": {},
"source": [
"When used to estimate a univariate distribution, it is better to use the :class:`Bars` mark:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f3e3144-752a-4d71-9528-85eb1ed0a9a4",
"metadata": {},
"outputs": [],
"source": [
"p = so.Plot(penguins, \"flipper_length_mm\")\n",
"p.add(so.Bars(), so.Hist())"
]
},
{
"cell_type": "raw",
"id": "008b9ffe-da74-4406-9756-4f70e333f33b",
"metadata": {},
"source": [
"The granularity of the bins will influence whether the underlying distribution is accurately represented. Adjust it by setting the total number:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27d221d5-add5-40a8-85d2-05102384dad1",
"metadata": {},
"outputs": [],
"source": [
"p.add(so.Bars(), so.Hist(bins=20))"
]
},
{
"cell_type": "raw",
"id": "fffebb54-0299-45c5-b7fb-6fcad6427239",
"metadata": {},
"source": [
"Alternatively, specify the *width* of the bins:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d036ca65-7dcf-45ac-a2d1-caafb9f922a7",
"metadata": {},
"outputs": [],
"source": [
"p.add(so.Bars(), so.Hist(binwidth=5))"
]
},
{
"cell_type": "raw",
"id": "bc1e4bd3-2a16-42bd-9c13-a660dd381f66",
"metadata": {},
"source": [
"By default, the transform returns the count of observations in each bin. The counts can be normalized, e.g. to show a proportion:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dbf23712-2231-4226-8265-0e2a5299c4bb",
"metadata": {},
"outputs": [],
"source": [
"p.add(so.Bars(), so.Hist(stat=\"proportion\"))"
]
},
{
"cell_type": "raw",
"id": "6c6fb23e-78c5-4630-a958-62cb4dee4ec8",
"metadata": {},
"source": [
"When additional variables define groups, the default behavior is to normalize across all groups:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ac3fe4ef-56e3-4ec7-b580-596d2a3d924b",
"metadata": {},
"outputs": [],
"source": [
"p = p.facet(\"island\")\n",
"p.add(so.Bars(), so.Hist(stat=\"proportion\"))"
]
},
{
"cell_type": "raw",
"id": "f7afc403-26cc-4325-a28a-913c2291aa35",
"metadata": {},
"source": [
"Pass `common_norm=False` to normalize each distribution independently:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b2029324-069f-4261-a178-1efad2fd0e88",
"metadata": {},
"outputs": [],
"source": [
"p.add(so.Bars(), so.Hist(stat=\"proportion\", common_norm=False))"
]
},
{
"cell_type": "raw",
"id": "0f83401a-e456-4a14-af69-f1483c6c03c4",
"metadata": {},
"source": [
"Or, with more than one grouping variable, specify a subset to normalize within:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5c092262-8a8f-4a3e-8cae-9e0f23dd94ba",
"metadata": {},
"outputs": [],
"source": [
"p.add(so.Bars(), so.Hist(stat=\"proportion\", common_norm=[\"col\"]), color=\"sex\")"
]
},
{
"cell_type": "raw",
"id": "86532133-bf33-4674-9614-86ae3408aa51",
"metadata": {},
"source": [
"When distributions overlap it may be easier to discern their shapes with an :class:`Area` mark:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "00b18ad8-52d4-460a-a012-d87c66b3e71e",
"metadata": {},
"outputs": [],
"source": [
"p.add(so.Area(), so.Hist(), color=\"sex\")"
]
},
{
"cell_type": "raw",
"id": "2b34d435-abbf-41aa-b219-91883d7d29f3",
"metadata": {},
"source": [
"Or add a :class:`Stack` move to represent a part-whole relationship:"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "3a7a0c05-d774-4f99-950f-5dc9865027c4",
"metadata": {},
"outputs": [],
"source": [
"p.add(so.Bars(), so.Hist(), so.Stack(), color=\"sex\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "e247e74b-2c09-40f0-8f45-9fa5f8264d78",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "py310",
"language": "python",
"name": "py310"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.0"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
2 changes: 2 additions & 0 deletions seaborn/_statistics.py
Expand Up @@ -194,6 +194,8 @@ def __call__(self, x1, x2=None, weights=None):
return self._eval_bivariate(x1, x2, weights)


# Note: we no longer use this for univariate histograms in histplot,
# preferring _stats.Hist. We'll deprecate this once we have a bivariate Stat class.
class Histogram:
"""Univariate and bivariate histogram estimator."""
def __init__(
Expand Down
15 changes: 14 additions & 1 deletion seaborn/_stats/base.py
@@ -1,7 +1,8 @@
"""Base module for statistical transformations."""
from __future__ import annotations
from collections.abc import Iterable
from dataclasses import dataclass
from typing import ClassVar
from typing import ClassVar, Any

from typing import TYPE_CHECKING
if TYPE_CHECKING:
Expand All @@ -28,6 +29,18 @@ class Stat:
# value on the orient axis, but we would not in the latter case.
group_by_orient: ClassVar[bool] = False

def _check_param_one_of(self, param: Any, options: Iterable[Any]) -> None:
"""Raise when parameter value is not one of a specified set."""
value = getattr(self, param)
if value not in options:
*most, last = options
option_str = ", ".join(f"{x!r}" for x in most[:-1]) + f" or {last!r}"
err = " ".join([
f"The `{param}` parameter for `{self.__class__.__name__}` must be",
f"one of {option_str}; not {value!r}.",
])
raise ValueError(err)

def __call__(
self,
data: DataFrame,
Expand Down

0 comments on commit ed3d367

Please sign in to comment.