Revert Cardinality Requirement for Histograms (#301)
* coalesce data_types into data_type_lookup

* merge fixed

* merge conflicts

* first commit

* requirements.txt updated for pandas 1.2.2

* revert cardinality requirement

* black reformat

* all tests passing with cardinality optimization

* remove abs value

* tests added

* black reformat

* minor fixes

* black

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
jinimukh and dorisjlee committed Mar 22, 2021
1 parent 127806f commit 950eba6
Showing 6 changed files with 37 additions and 21 deletions.
14 changes: 4 additions & 10 deletions lux/action/univariate.py
@@ -46,9 +46,7 @@ def univariate(ldf, *args):
     ignore_rec_flag = False
     if data_type_constraint == "quantitative":
         possible_attributes = [
-            c
-            for c in ldf.columns
-            if ldf.data_type[c] == "quantitative" and ldf.cardinality[c] > 5 and c != "Number of Records"
+            c for c in ldf.columns if ldf.data_type[c] == "quantitative" and c != "Number of Records"
         ]
         intent = [lux.Clause(possible_attributes)]
         intent.extend(filter_specs)
@@ -65,14 +63,12 @@
             ignore_rec_flag = True
     elif data_type_constraint == "nominal":
         possible_attributes = [
-            c
-            for c in ldf.columns
-            if ldf.data_type[c] == "nominal" and ldf.cardinality[c] > 5 and c != "Number of Records"
+            c for c in ldf.columns if ldf.data_type[c] == "nominal" and c != "Number of Records"
         ]
         examples = ""
         if len(possible_attributes) >= 1:
             examples = f" (e.g., {possible_attributes[0]})"
-        intent = [lux.Clause("?", data_type="nominal")]
+        intent = [lux.Clause(possible_attributes)]
         intent.extend(filter_specs)
         recommendation = {
             "action": "Occurrence",
@@ -81,9 +77,7 @@
         }
     elif data_type_constraint == "geographical":
         possible_attributes = [
-            c
-            for c in ldf.columns
-            if ldf.data_type[c] == "geographical" and ldf.cardinality[c] > 5 and c != "Number of Records"
+            c for c in ldf.columns if ldf.data_type[c] == "geographical" and c != "Number of Records"
         ]
         examples = ""
         if len(possible_attributes) >= 1:
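For context, a minimal sketch of what relaxing this filter changes (toy column names and cardinalities, not Lux code): a constant or low-cardinality column is no longer dropped from the candidate attributes, so it can now surface in the univariate recommendations.

    # Toy metadata standing in for ldf.data_type / ldf.cardinality
    data_type = {"Horsepower": "quantitative", "Units": "quantitative", "Origin": "nominal"}
    cardinality = {"Horsepower": 94, "Units": 1, "Origin": 3}
    columns = list(data_type)

    # Previous filter: low-cardinality columns such as the constant "Units" were excluded.
    before = [
        c for c in columns
        if data_type[c] == "quantitative" and cardinality[c] > 5 and c != "Number of Records"
    ]
    # Reverted filter: every quantitative column is kept, regardless of cardinality.
    after = [c for c in columns if data_type[c] == "quantitative" and c != "Number of Records"]

    print(before)  # ['Horsepower']
    print(after)   # ['Horsepower', 'Units']
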
12 changes: 8 additions & 4 deletions lux/executor/PandasExecutor.py
@@ -132,6 +132,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
         has_color = False
         groupby_attr = ""
         measure_attr = ""
+        attr_unique_vals = []
         if x_attr.aggregation is None or y_attr.aggregation is None:
             return
         if y_attr.aggregation != "":
@@ -143,7 +144,7 @@
             measure_attr = x_attr
             agg_func = x_attr.aggregation
         if groupby_attr.attribute in vis.data.unique_values.keys():
-            attr_unique_vals = vis.data.unique_values[groupby_attr.attribute]
+            attr_unique_vals = vis.data.unique_values.get(groupby_attr.attribute)
         # checks if color is specified in the Vis
         if len(vis.get_attr_by_channel("color")) == 1:
             color_attr = vis.get_attr_by_channel("color")[0]
@@ -426,7 +427,7 @@ def compute_data_type(self, ldf: LuxDataFrame):
                     if (
                         convertible2int
                         and ldf.cardinality[attr] != len(ldf)
-                        and ldf.cardinality[attr] < 20
+                        and (len(ldf[attr].convert_dtypes().unique() < 20))
                     ):
                         ldf._data_type[attr] = "nominal"
                     else:
@@ -524,8 +525,11 @@ def compute_stats(self, ldf: LuxDataFrame):
             else:
                 attribute_repr = attribute

-            ldf.unique_values[attribute_repr] = list(ldf[attribute_repr].unique())
-            ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])
+            if ldf.dtypes[attribute] != "float64" or ldf[attribute].isnull().values.any():
+                ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
+                ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute])
+            else:
+                ldf.cardinality[attribute_repr] = 999  # special value for non-numeric attribute

             if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype(
                 ldf.dtypes[attribute]
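A rough sketch of the new compute_stats branch under toy data (pandas assumed; the DataFrame below is illustrative, not from the repo): float64 columns without missing values skip the unique-value scan and receive the sentinel cardinality of 999, while every other column is enumerated as before.

    import pandas as pd

    df = pd.DataFrame({"MilesPerGal": [18.0, 15.0, 36.0], "Origin": ["USA", "USA", "Japan"]})
    unique_values, cardinality = {}, {}

    for attribute in df.columns:
        # Non-float columns, and float columns containing nulls, are enumerated;
        # clean float64 columns get the sentinel value used in the commit.
        if df.dtypes[attribute] != "float64" or df[attribute].isnull().values.any():
            unique_values[attribute] = list(df[attribute].unique())
            cardinality[attribute] = len(unique_values[attribute])
        else:
            cardinality[attribute] = 999

    print(cardinality)  # {'MilesPerGal': 999, 'Origin': 2}
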
2 changes: 1 addition & 1 deletion lux/interestingness/interestingness.py
@@ -357,7 +357,7 @@ def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> int:
     warnings.filterwarnings("error")
     try:
         score = np.abs(pearsonr(v_x, v_y)[0])
-    except (RuntimeWarning):
+    except:
         # RuntimeWarning: invalid value encountered in true_divide (occurs when v_x and v_y are uniform, stdev in denominator is zero, leading to spearman's correlation as nan), ignore these cases.
         score = -1

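To illustrate the case the inline comment describes, a small sketch (numpy and scipy assumed; the arrays are made up): with warnings promoted to errors, a uniform series makes pearsonr raise instead of quietly returning nan, and the score falls back to -1. The exact warning class differs across SciPy versions, which is presumably why the handler was widened beyond RuntimeWarning.

    import warnings

    import numpy as np
    from scipy.stats import pearsonr

    v_x = np.array([1.0, 1.0, 1.0])  # uniform series: zero standard deviation
    v_y = np.array([2.0, 3.0, 4.0])

    warnings.filterwarnings("error")
    try:
        score = np.abs(pearsonr(v_x, v_y)[0])
    except Exception:  # the commit uses a bare except; Exception suffices for this sketch
        score = -1

    print(score)  # -1 (the elevated warning is caught)
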
2 changes: 1 addition & 1 deletion lux/vislib/altair/Histogram.py
@@ -53,7 +53,7 @@ def initialize_chart(self):

         # Default when bin too small
         if markbar < (x_range / 24):
-            markbar = (x_max - x_min) / 12
+            markbar = x_max - x_min / 12

         self.data = AltairChart.sanitize_dataframe(self.data)
         end_attr_abv = str(msr_attr.attribute) + "_end"
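A quick worked check of the fallback bar width under assumed values (not from the repo), since the two forms differ: division binds tighter than subtraction, so the unparenthesized expression subtracts only x_min / 12.

    x_min, x_max = 10.0, 34.0  # assumed values for illustration
    print((x_max - x_min) / 12)  # 2.0         (previous form)
    print(x_max - x_min / 12)    # 33.1666...  (committed form)
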
10 changes: 5 additions & 5 deletions tests/test_pandas_coverage.py
@@ -257,7 +257,7 @@ def test_transform(global_var):
     df["Year"] = pd.to_datetime(df["Year"], format="%Y")
     new_df = df.iloc[:, 1:].groupby("Origin").transform(sum)
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Occurrence"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 7


@@ -405,11 +405,11 @@ def test_loc(global_var):
     assert len(new_df.cardinality) == 6
     new_df = df.loc[0:10, "Displacement":"Origin"]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 6
     new_df = df.loc[0:10, "Displacement":"Horsepower"]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 2
     import numpy as np

@@ -434,11 +434,11 @@ def test_iloc(global_var):
     assert len(new_df.cardinality) == 6
     new_df = df.iloc[0:11, 3:9]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 6
     new_df = df.iloc[0:11, 3:5]
     new_df._repr_html_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
+    assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
     assert len(new_df.cardinality) == 2
     import numpy as np

18 changes: 18 additions & 0 deletions tests/test_vis.py
@@ -247,6 +247,15 @@ def test_colored_bar_chart(global_var):
     assert "ax.set_ylabel('Cylinders')" in vis_code


+def test_bar_uniform():
+    df = pd.read_csv("lux/data/car.csv")
+    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
+    df["Type"] = "A"
+    vis = Vis(["Type"], df)
+    vis_code = vis.to_Altair()
+    assert "y = alt.Y('Type', type= 'nominal'" in vis_code
+
+
 def test_scatter_chart(global_var):
     df = pytest.car_df
     lux.config.plotting_backend = "vegalite"
@@ -361,6 +370,15 @@ def test_histogram_chart(global_var):
     assert "ax.set_ylabel('Number of Records')" in vis_code


+def test_histogram_uniform():
+    df = pd.read_csv("lux/data/car.csv")
+    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
+    df["Units"] = 4.0
+    vis = Vis(["Units"], df)
+    vis_code = vis.to_Altair()
+    assert "y = alt.Y('Units', type= 'nominal'" in vis_code
+
+
 def test_heatmap_chart(global_var):
     df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
     lux.config.plotting_backend = "vegalite"
