Skip to content

Commit

Permalink
Improve automatic bin determination for histograms via start, end, an…
Browse files Browse the repository at this point in the history
…d step attributes (#285)

* Merge upstream

* Sync with master

* Fix bin size variance from #217

* Format and test

* Change labelOverlap to True

Co-authored-by: Dominik Moritz <domoritz@gmail.com>

* Modify markbar; currently questioning whether or not it's needed

* Remove markbar entirely, rely on Altair automatic bin detection https://altair-viz.github.io/user_guide/generated/core/altair.BinParams.html

* Modify code snippet

* Revert "Remove markbar enitrely, rely on Altair automatic bin detection https://altair-viz.github.io/user_guide/generated/core/altair.BinParams.html"

This reverts commit 9cb9418.

* Implement bin size estimation via Freedman Diaconis's Rule

* Use numpy to compute IQR for better performance (pandas too slow)

* Add tests

* Add test cases for histogram binning

* Address changes from @domoritz review (small optimizations)

* Black and format

* Move histogram bin width computation to pandas executor (execute_binning)

* Center bars between ticks in distribution setting

* Renaming in execute_binning

* Bin width computed accurately in execute_binning; no need for get_bin_size()

* Revert to Freedman rule; maintain correct ticks

Co-authored-by: Micah Yong <micahyong@Micahs-MacBook-Pro.local>
Co-authored-by: Dominik Moritz <domoritz@gmail.com>
  • Loading branch information
3 people committed Mar 3, 2021
1 parent c49cbdb commit 952b642
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 19 deletions.
6 changes: 3 additions & 3 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,10 +282,10 @@ def execute_binning(vis: Vis):
series = vis.data[bin_attr].dropna()
# TODO: binning runs for the name attribute. The name attribute has datatype quantitative, which is wrong.
counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_center as the bin location
bin_center = np.mean(np.vstack([bin_edges[0:-1], bin_edges[1:]]), axis=0)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
# TODO: Should vis.data be a LuxDataFrame or a Pandas DataFrame?
binned_result = np.array([bin_center, counts]).T
binned_result = np.array([bin_start, counts]).T
vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])

@staticmethod
Expand Down
51 changes: 35 additions & 16 deletions lux/vislib/altair/Histogram.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,51 +43,55 @@ def initialize_chart(self):
if len(msr_attr_abv) > 17:
msr_attr_abv = msr_attr_abv[:10] + "..." + msr_attr_abv[-7:]

x_min = self.vis.min_max[msr_attr.attribute][0]
x_max = self.vis.min_max[msr_attr.attribute][1]
x_min, x_max = self.vis.min_max[msr_attr.attribute]
x_range = abs(x_max - x_min)

if isinstance(msr_attr.attribute, str):
msr_attr.attribute = msr_attr.attribute.replace(".", "")
markbar = compute_bin_width(self.data[msr_attr.attribute])
step = abs(self.data[msr_attr.attribute][1] - self.data[msr_attr.attribute][0])

colval = self.vis.data[msr_attr.attribute]
x_range = abs(max(colval) - min(colval))
plot_range = abs(x_max - x_min)
markbar = x_range / plot_range * 12
# Default when bin too small
if markbar < (x_range / 24):
markbar = (x_max - x_min) / 12

self.data = AltairChart.sanitize_dataframe(self.data)
end_attr_abv = str(msr_attr.attribute) + "_end"
self.data[end_attr_abv] = self.data[str(msr_attr.attribute)] + markbar

axis_title = f"{msr_attr_abv} (binned)"
if msr_attr.attribute == " ":
axis_title = "Series (binned)"
if measure.channel == "x":
chart = (
alt.Chart(self.data)
.mark_bar(size=markbar)
.mark_bar()
.encode(
alt.X(
x=alt.X(
str(msr_attr.attribute),
title=axis_title,
bin=alt.Bin(binned=True),
bin=alt.Bin(binned=True, step=step),
type=msr_attr.data_type,
axis=alt.Axis(labelOverlap=True, title=axis_title),
scale=alt.Scale(domain=(x_min, x_max)),
axis=alt.Axis(title=axis_title),
scale=alt.Scale(domain=[x_min, x_max]),
),
alt.Y("Number of Records", type="quantitative"),
x2=end_attr_abv,
y=alt.Y("Number of Records", type="quantitative"),
)
)
elif measure.channel == "y":
chart = (
alt.Chart(self.data)
.mark_bar(size=markbar)
.mark_bar()
.encode(
x=alt.X("Number of Records", type="quantitative"),
y=alt.Y(
str(msr_attr.attribute),
title=axis_title,
bin=alt.Bin(binned=True),
axis=alt.Axis(labelOverlap=True, title=axis_title),
scale=alt.Scale(domain=(x_min, x_max)),
bin=alt.Bin(binned=True, step=markbar),
axis=alt.Axis(title=axis_title),
),
y2=end_attr_abv,
)
)
#####################################
Expand All @@ -112,3 +116,18 @@ def initialize_chart(self):
)
"""
return chart


def compute_bin_width(series):
    """
    Estimate a histogram bin width using the Freedman-Diaconis rule.

    The rule defines the width as ``2 * IQR / n ** (1 / 3)``, where ``IQR`` is
    the interquartile range of the data and ``n`` the number of observations.
    Source: https://en.wikipedia.org/wiki/Freedman%E2%80%93Diaconis_rule

    The rule's width is then multiplied by 3.5 (scaling factor kept from the
    previous implementation) and rounded to two decimal places.

    Parameters
    ----------
    series : array-like
        Numeric observations; anything ``np.asarray`` accepts.

    Returns
    -------
    float
        The scaled bin width rounded to 2 decimals. ``0.0`` is returned for
        empty input, and the width is 0 when the IQR is 0 (e.g. constant data).
    """
    import numpy as np

    data = np.asarray(series)
    num_pts = data.size
    if num_pts == 0:
        # No observations: avoid dividing by zero / percentile of nothing.
        return 0.0

    upper_quartile, lower_quartile = np.percentile(data, [75, 25])
    iqr = upper_quartile - lower_quartile
    # Divide by the cube root of n. The former expression `n ** -1 / 3`
    # parsed as `(n ** -1) / 3`, i.e. 1 / (3 * n), because `**` binds
    # tighter than `/`.
    size = 2 * iqr / np.cbrt(num_pts)
    return round(size * 3.5, 2)
3 changes: 3 additions & 0 deletions tests/test_vis.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
import pandas as pd
from lux.vis.VisList import VisList
from lux.vis.Vis import Vis
from lux.vislib.altair.Histogram import compute_bin_width


def test_vis(global_var):
Expand Down Expand Up @@ -340,7 +341,9 @@ def test_histogram_chart(global_var):
lux.config.plotting_backend = "vegalite"
vis = Vis(["Displacement"], df)
vis_code = vis.to_Altair()
expected_bin_size = compute_bin_width(vis.data["Displacement"])
assert "alt.Chart(visData).mark_bar" in vis_code
assert str(expected_bin_size) in vis_code
assert (
"alt.X('Displacement', title='Displacement (binned)',bin=alt.Bin(binned=True), type='quantitative', axis=alt.Axis(labelOverlap=True, title='Displacement (binned)'), scale=alt.Scale(domain=(68.0, 455.0)))"
in vis_code
Expand Down

0 comments on commit 952b642

Please sign in to comment.