Revert Cardinality Requirement for Histograms (#301)
* coalesce data_types into data_type_lookup

* merge fixed

* merge conflicts

* first commit

* requirements.txt updated for pandas 1.2.2

* revert cardinality requirement

* black reformat

* all tests passing with cardinality optimization

* remove abs value

* tests added

* black reformat

* minor fixes

* black

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
jinimukh and dorisjlee committed Mar 22, 2021
1 parent 127806f commit 950eba6
Showing 6 changed files with 37 additions and 21 deletions.
14 changes: 4 additions & 10 deletions lux/action/univariate.py
@@ -46,9 +46,7 @@ def univariate(ldf, *args):
     ignore_rec_flag = False
     if data_type_constraint == "quantitative":
         possible_attributes = [
-            c
-            for c in ldf.columns
-            if ldf.data_type[c] == "quantitative" and ldf.cardinality[c] > 5 and c != "Number of Records"
+            c for c in ldf.columns if ldf.data_type[c] == "quantitative" and c != "Number of Records"
         ]
         intent = [lux.Clause(possible_attributes)]
         intent.extend(filter_specs)
@@ -65,14 +63,12 @@
             ignore_rec_flag = True
     elif data_type_constraint == "nominal":
         possible_attributes = [
-            c
-            for c in ldf.columns
-            if ldf.data_type[c] == "nominal" and ldf.cardinality[c] > 5 and c != "Number of Records"
+            c for c in ldf.columns if ldf.data_type[c] == "nominal" and c != "Number of Records"
         ]
         examples = ""
         if len(possible_attributes) >= 1:
             examples = f" (e.g., {possible_attributes[0]})"
-        intent = [lux.Clause("?", data_type="nominal")]
+        intent = [lux.Clause(possible_attributes)]
         intent.extend(filter_specs)
         recommendation = {
             "action": "Occurrence",
@@ -81,9 +77,7 @@
         }
     elif data_type_constraint == "geographical":
         possible_attributes = [
-            c
-            for c in ldf.columns
-            if ldf.data_type[c] == "geographical" and ldf.cardinality[c] > 5 and c != "Number of Records"
+            c for c in ldf.columns if ldf.data_type[c] == "geographical" and c != "Number of Records"
         ]
         examples = ""
         if len(possible_attributes) >= 1:
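For context, a minimal sketch of what relaxing this filter changes (toy column names and cardinalities, not Lux code): a constant or low-cardinality column is no longer dropped from the candidate attributes, so it can now surface in the univariate recommendations.

    # Toy metadata standing in for ldf.data_type / ldf.cardinality
    data_type = {"Horsepower": "quantitative", "Units": "quantitative", "Origin": "nominal"}
    cardinality = {"Horsepower": 94, "Units": 1, "Origin": 3}
    columns = list(data_type)

    # Previous filter: low-cardinality columns such as the constant "Units" were excluded.
    before = [
        c for c in columns
        if data_type[c] == "quantitative" and cardinality[c] > 5 and c != "Number of Records"
    ]
    # Reverted filter: every quantitative column is kept, regardless of cardinality.
    after = [c for c in columns if data_type[c] == "quantitative" and c != "Number of Records"]

    print(before)  # ['Horsepower']
    print(after)   # ['Horsepower', 'Units']
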
12 changes: 8 additions & 4 deletions lux/executor/PandasExecutor.py
@@ -132,6 +132,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
         has_color = False
         groupby_attr = ""
         measure_attr = ""
+        attr_unique_vals = []
         if x_attr.aggregation is None or y_attr.aggregation is None:
             return
         if y_attr.aggregation != "":
@@ -143,7 +144,7 @@
             measure_attr = x_attr
             agg_func = x_attr.aggregation
         if groupby_attr.attribute in vis.data.unique_values.keys():
-            attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]
+            attr_unique_vals = vis.data.unique_values.get(groupby_attr.attribute)
         # checks if color is specified in the Vis
         if len(vis.get_attr_by_channel("color")) == 1:
             color_attr = vis.get_attr_by_channel("color")[0]
@@ -426,7 +427,7 @@ def compute_data_type(self, ldf: LuxDataFrame):
                     if (
                         convertible2int
                         and ldf.cardinality[attr] != len(ldf)
-                        and ldf.cardinality[attr] < 20
+                        and (len(ldf[attr].convert_dtypes().unique() < 20))
                     ):
                         ldf._data_type[attr] = "nominal"
                     else:
@@ -524,8 +525,11 @@ def compute_stats(self, ldf: LuxDataFrame):
             else:
                 attribute_repr = attribute

-            ldf.unique_values[attribute_repr] = list(ldf[attribute_repr].unique())
-            ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])
+            if ldf.dtypes[attribute] != "float64" or ldf[attribute].isnull().values.any():
+                ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
+                ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute])
+            else:
+                ldf.cardinality[attribute_repr] = 999  # special value for non-numeric attribute

             if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype(
                 ldf.dtypes[attribute]
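A rough sketch of the new compute_stats branch under toy data (pandas assumed; the DataFrame below is illustrative, not from the repo): float64 columns without missing values skip the unique-value scan and receive the sentinel cardinality of 999, while every other column is enumerated as before.

    import pandas as pd

    df = pd.DataFrame({"MilesPerGal": [18.0, 15.0, 36.0], "Origin": ["USA", "USA", "Japan"]})
    unique_values, cardinality = {}, {}

    for attribute in df.columns:
        # Non-float columns, and float columns containing nulls, are enumerated;
        # clean float64 columns get the sentinel value used in the commit.
        if df.dtypes[attribute] != "float64" or df[attribute].isnull().values.any():
            unique_values[attribute] = list(df[attribute].unique())
            cardinality[attribute] = len(unique_values[attribute])
        else:
            cardinality[attribute] = 999

    print(cardinality)  # {'MilesPerGal': 999, 'Origin': 2}
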
2 changes: 1 addition & 1 deletion lux/interestingness/interestingness.py
@@ -357,7 +357,7 @@ def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> int:
     warnings.filterwarnings("error")
     try:
         score = np.abs(pearsonr(v_x, v_y)[0])
-    except (RuntimeWarning):
+    except:
         # RuntimeWarning: invalid value encountered in true_divide (occurs when v_x and v_y are uniform, stdev in denominator is zero, leading to spearman's correlation as nan), ignore these cases.
         score = -1

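To illustrate the case the inline comment describes, a small sketch (numpy and scipy assumed; the arrays are made up): with warnings promoted to errors, a uniform series makes pearsonr raise instead of quietly returning nan, and the score falls back to -1. The exact warning class differs across SciPy versions, which is presumably why the handler was widened beyond RuntimeWarning.

    import warnings

    import numpy as np
    from scipy.stats import pearsonr

    v_x = np.array([1.0, 1.0, 1.0])  # uniform series: zero standard deviation
    v_y = np.array([2.0, 3.0, 4.0])

    warnings.filterwarnings("error")
    try:
        score = np.abs(pearsonr(v_x, v_y)[0])
    except Exception:  # the commit uses a bare except; Exception suffices for this sketch
        score = -1

    print(score)  # -1 (the elevated warning is caught)
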
2 changes: 1 addition & 1 deletion lux/vislib/altair/Histogram.py
@@ -53,7 +53,7 @@ def initialize_chart(self):

         # Default when bin too small
         if markbar < (x_range / 24):
-            markbar = (x_max - x_min) / 12
+            markbar = x_max - x_min / 12

         self.data = AltairChart.sanitize_dataframe(self.data)
         end_attr_abv = str(msr_attr.attribute) + "_end"
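A quick worked check of the fallback bar width under assumed values (not from the repo), since the two forms differ: division binds tighter than subtraction, so the unparenthesized expression subtracts only x_min / 12.

    x_min, x_max = 10.0, 34.0  # assumed values for illustration
    print((x_max - x_min) / 12)  # 2.0         (previous form)
    print(x_max - x_min / 12)    # 33.1666...  (committed form)
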
10 changes: 5 additions & 5 deletions tests/test_pandas_coverage.py
@@ -257,7 +257,7 @@ def test_transform(global_var):
     df["Year"] = pd.to_datetime(df["Year"], format="%Y")
     new_df = df.iloc[:, 1:].groupby("Origin").transform(sum)
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Occurrence"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 7


@@ -405,11 +405,11 @@ def test_loc(global_var):
     assert len(new_df.cardinality) == 6
     new_df = df.loc[0:10, "Displacement":"Origin"]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 6
     new_df = df.loc[0:10, "Displacement":"Horsepower"]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 2
     import numpy as np

@@ -434,11 +434,11 @@ def test_iloc(global_var):
     assert len(new_df.cardinality) == 6
     new_df = df.iloc[0:11, 3:9]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 6
     new_df = df.iloc[0:11, 3:5]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 2
     import numpy as np

18 changes: 18 additions & 0 deletions tests/test_vis.py
@@ -247,6 +247,15 @@ def test_colored_bar_chart(global_var):
     assert "ax.set_ylabel('Cylinders')" in vis_code


+def test_bar_uniform():
+    df = pd.read_csv("lux/data/car.csv")
+    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
+    df["Type"] = "A"
+    vis = Vis(["Type"], df)
+    vis_code = vis.to_Altair()
+    assert "y = alt.Y('Type', type= 'nominal'" in vis_code
+
+
 def test_scatter_chart(global_var):
     df = pytest.car_df
     lux.config.plotting_backend = "vegalite"
@@ -361,6 +370,15 @@ def test_histogram_chart(global_var):
     assert "ax.set_ylabel('Number of Records')" in vis_code


+def test_histogram_uniform():
+    df = pd.read_csv("lux/data/car.csv")
+    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
+    df["Units"] = 4.0
+    vis = Vis(["Units"], df)
+    vis_code = vis.to_Altair()
+    assert "y = alt.Y('Units', type= 'nominal'" in vis_code
+
+
 def test_heatmap_chart(global_var):
     df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
     lux.config.plotting_backend = "vegalite"
