Making Lux more robust with missing values and NaN (#179) (#180)
* improve datetime warning message with starter templates

* Handling NaN value errors
* skipping validator check for NaN filter values
* adding special case for PandasExecutor to map filter NaN to isna()
* fixing unevenness metric when bar values are NaN
* eliminate 1-cardinality filters in Filter action (since equal to overall)
* fixed bug where deviation arrays had unequal lengths when NaN present

* Handling NaN filter and data type
* fixed data type detection for int columns coerced to float when they contain NaN
* added test for applying NaN filter

* Ensure that LuxSeries is displayed when there is NaN
* ensure that NaNs are not dropped in groupbys
* exclude NaN values in deviation calculation
* fix unnamed series issue
* improved debugging message for LuxSeries

* Override pd.Series with LuxSeries

* Fixes for type checking and line charts with NaNs
* exclude NaN for line charts to prevent large axis offsets
* improved type checking for float columns that no longer contain NaN
* fixed and improved deviation calculation test

* added float categorical test
dorisjlee committed Dec 21, 2020
1 parent b7635c0 commit e08460b
Showing 11 changed files with 254 additions and 44 deletions.
2 changes: 1 addition & 1 deletion lux/action/filter.py
@@ -102,7 +102,7 @@ def get_complementary_ops(fltr_op):
     categorical_vars = []
     for col in list(ldf.columns):
         # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
-        if ldf.cardinality[col] < 30 and col not in column_spec_attr:
+        if 1 < ldf.cardinality[col] < 30 and col not in column_spec_attr:
            categorical_vars.append(col)
     for cat in categorical_vars:
         unique_values = ldf.unique_values[cat]
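Note: the new lower bound drops 1-cardinality columns because a filter on a single-valued attribute selects every row, so the filtered vis is identical to the overall vis. A minimal sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"country": ["US", "US", "US"], "sales": [1, 2, 3]})
    assert df[df["country"] == "US"].equals(df)  # the filter adds no information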
4 changes: 4 additions & 0 deletions lux/core/__init__.py
@@ -14,19 +14,23 @@

 import pandas as pd
 from .frame import LuxDataFrame
+from .series import LuxSeries

 global originalDF
 # Keep variable scope of original pandas df
 originalDF = pd.core.frame.DataFrame
+originalSeries = pd.core.series.Series


 def setOption(overridePandas=True):
     if overridePandas:
         pd.DataFrame = (
             pd.io.json._json.DataFrame
         ) = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = LuxDataFrame
+        pd.Series = LuxSeries
     else:
         pd.DataFrame = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = originalDF
+        pd.Series = originalSeries


 setOption(overridePandas=True)
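A quick sanity check of the override (hypothetical session; importing lux runs setOption(overridePandas=True) at module load):

    import lux  # activates the override
    import pandas as pd

    s = pd.Series([1.0, float("nan"), 3.0])
    print(type(s).__name__)  # "LuxSeries": pd.Series now resolves to the Lux subclass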
21 changes: 17 additions & 4 deletions lux/core/series.py
@@ -15,6 +15,7 @@
 import pandas as pd
 import lux
 import warnings
+import traceback


 class LuxSeries(pd.Series):
@@ -56,13 +57,24 @@ def f(*args, **kwargs):
         f._get_axis_number = super(LuxSeries, self)._get_axis_number
         return f

+    def to_pandas(self):
+        import lux.core
+
+        return lux.core.originalSeries(self, copy=False)
+
+    def display_pandas(self):
+        return self.to_pandas()
+
     def __repr__(self):
         from IPython.display import display
         from IPython.display import clear_output
         import ipywidgets as widgets
         from lux.core.frame import LuxDataFrame

         series_repr = super(LuxSeries, self).__repr__()
+        # Default column name 0 causes errors
+        if self.name is None:
+            self.name = " "
         ldf = LuxDataFrame(self)

         try:
@@ -137,12 +149,13 @@ def on_button_clicked(b):

         except (KeyboardInterrupt, SystemExit):
             raise
-        except:
+        except Exception:
             warnings.warn(
                 "\nUnexpected error in rendering Lux widget and recommendations. "
-                "Falling back to Pandas display.\n\n"
-                "Please report this issue on Github: https://github.com/lux-org/lux/issues ",
+                "Falling back to Pandas display.\n"
+                "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n",
                 stacklevel=2,
             )
-            print(series_repr)
+            warnings.warn(traceback.format_exc())
+            display(self.display_pandas())
             return ""
55 changes: 34 additions & 21 deletions lux/executor/PandasExecutor.py
@@ -159,22 +159,26 @@ def execute_aggregate(vis: Vis, isFiltered=True):

         if has_color:
             vis._vis_data = (
-                vis.data.groupby([groupby_attr.attribute, color_attr.attribute])
+                vis.data.groupby([groupby_attr.attribute, color_attr.attribute], dropna=False)
                 .count()
                 .reset_index()
             )
             vis._vis_data = vis.data.rename(columns={"index": "Record"})
             vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, "Record"]]
         else:
-            vis._vis_data = vis.data.groupby(groupby_attr.attribute).count().reset_index()
+            vis._vis_data = (
+                vis.data.groupby(groupby_attr.attribute, dropna=False).count().reset_index()
+            )
             vis._vis_data = vis.data.rename(columns={"index": "Record"})
             vis._vis_data = vis.data[[groupby_attr.attribute, "Record"]]
     else:
         # if color is specified, need to group by groupby_attr and color_attr
         if has_color:
-            groupby_result = vis.data.groupby([groupby_attr.attribute, color_attr.attribute])
+            groupby_result = vis.data.groupby(
+                [groupby_attr.attribute, color_attr.attribute], dropna=False
+            )
         else:
-            groupby_result = vis.data.groupby(groupby_attr.attribute)
+            groupby_result = vis.data.groupby(groupby_attr.attribute, dropna=False)
         groupby_result = groupby_result.agg(agg_func)
         intermediate = groupby_result.reset_index()
         vis._vis_data = intermediate.__finalize__(vis.data)
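Note: dropna=False in DataFrame.groupby requires pandas >= 1.1 and keeps NaN as its own group instead of silently dropping those rows, which is what previously made bars disappear. A minimal sketch with hypothetical data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"category": ["A", np.nan, "A"], "value": [1, 2, 3]})
    print(df.groupby("category").count())                # NaN rows silently dropped
    print(df.groupby("category", dropna=False).count())  # NaN kept as its own group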
@@ -225,6 +229,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
             assert (
                 len(list(vis.data[groupby_attr.attribute])) == N_unique_vals
             ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
+        vis._vis_data = vis.data.dropna(subset=[measure_attr.attribute])
         vis._vis_data = vis.data.sort_values(by=groupby_attr.attribute, ascending=True)
         vis._vis_data = vis.data.reset_index()
         vis._vis_data = vis.data.drop(columns="index")
@@ -298,6 +303,16 @@ def apply_filter(df: pd.DataFrame, attribute: str, op: str, val: object) -> pd.DataFrame:
         df: pandas.DataFrame
             Dataframe resulting from the filter operation
         """
+        # Handling NaN filter values
+        if utils.like_nan(val):
+            if op != "=" and op != "!=":
+                warnings.warn("Filter on NaN must be used with equality operations (i.e., `=` or `!=`)")
+            else:
+                if op == "=":
+                    return df[df[attribute].isna()]
+                elif op == "!=":
+                    return df[~df[attribute].isna()]
+        # Applying filter in regular, non-NaN cases
         if op == "=":
             return df[df[attribute] == val]
         elif op == "<":
@@ -380,7 +395,12 @@ def compute_data_type(self, ldf: LuxDataFrame):
             elif str(attr).lower() in temporal_var_list:
                 ldf.data_type_lookup[attr] = "temporal"
             elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
-                ldf.data_type_lookup[attr] = "quantitative"
+                # int columns get coerced into floats if they contain NaN
+                convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
+                if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20:
+                    ldf.data_type_lookup[attr] = "nominal"
+                else:
+                    ldf.data_type_lookup[attr] = "quantitative"
             elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                 # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
                 if ldf.pre_aggregated:
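A sketch of the coercion check on a hypothetical column: convert_dtypes re-infers a nullable integer dtype for float columns whose non-missing values are all integral, which flags columns that are float only because NaN forced the cast:

    import pandas as pd

    s = pd.Series([1, 2, None])  # dtype float64: NaN forces the int -> float coercion
    s.convert_dtypes().dtype     # Int64, the nullable integer dtype
    pd.api.types.is_integer_dtype(s.convert_dtypes())  # True, so low-cardinality columns become nominal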
@@ -413,24 +433,17 @@ def compute_data_type(self, ldf: LuxDataFrame):
         for attr in ldf.columns:
             if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]):
                 non_datetime_attrs.append(attr)
+        warn_msg = ""
         if len(non_datetime_attrs) == 1:
-            warnings.warn(
-                f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
-                "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
-                "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
-                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
-                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
-                stacklevel=2,
-            )
+            warn_msg += f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
         elif len(non_datetime_attrs) > 1:
-            warnings.warn(
-                f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
-                "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
-                "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
-                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
-                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
-                stacklevel=2,
-            )
+            warn_msg += f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
+        if len(non_datetime_attrs) > 0:
+            warn_msg += "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n"
+            for attr in non_datetime_attrs:
+                warn_msg += f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='<replace-with-datetime-format>')\n"
+            warn_msg += "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html"
+            warnings.warn(warn_msg, stacklevel=2)

     def _is_datetime_string(self, series):
         if len(series) > 100:
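For instance, the starter template printed by the warning, filled in for a hypothetical year-only column:

    df["year"] = pd.to_datetime(df["year"], format="%Y")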
33 changes: 25 additions & 8 deletions lux/interestingness/interestingness.py
@@ -73,6 +73,7 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
     if n_dim == 1 and (n_msr == 0 or n_msr == 1):
         if v_size < 2:
             return -1
+
         if n_filter == 0:
             return unevenness(vis, ldf, measure_lst, dimension_lst)
         elif n_filter == 1:
@@ -184,7 +185,9 @@ def weighted_correlation(x, y, w):
     return weighted_cov(x, y, w) / np.sqrt(weighted_cov(x, x, w) * weighted_cov(y, y, w))


-def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
+def deviation_from_overall(
+    vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str, exclude_nan: bool = True
+) -> int:
     """
     Difference in bar chart/histogram shape from overall chart
     Note: this function assumes that the filtered vis.data is operating on the same range as the unfiltered vis.data.
@@ -197,15 +200,22 @@ def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
         List of filters from the Vis
     msr_attribute : str
         The attribute name of the measure value of the chart
+    exclude_nan: bool
+        Whether to include/exclude NaN values as part of the deviation calculation
     Returns
     -------
     int
         Score describing how different the vis is from the overall vis
     """
     v_filter_size = get_filtered_size(filter_specs, ldf)
-    v_size = len(vis.data)
-    v_filter = vis.data[msr_attribute]
+    if exclude_nan:
+        vdata = vis.data.dropna()
+    else:
+        vdata = vis.data
+    v_size = len(vdata)
+    v_filter = vdata[msr_attribute]
     total = v_filter.sum()
     v_filter = v_filter / total  # normalize by total to get ratio
     if total == 0:
@@ -217,8 +227,11 @@ def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
     # Remove filters, keep only attribute intent
     unfiltered_vis._inferred_intent = utils.get_attrs_specs(vis._inferred_intent)
     lux.config.executor.execute([unfiltered_vis], ldf)
-
-    v = unfiltered_vis.data[msr_attribute]
+    if exclude_nan:
+        uv = unfiltered_vis.data.dropna()
+    else:
+        uv = unfiltered_vis.data
+    v = uv[msr_attribute]
     v = v / v.sum()
     assert len(v) == len(v_filter), "Data for filtered and unfiltered vis have unequal length."
     sig = v_filter_size / v_size  # significance factor
@@ -230,8 +243,8 @@ def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
         dimList = vis.get_attr_by_data_model("dimension")

         # use Pandas rank function to calculate rank positions for each category
-        v_rank = unfiltered_vis.data.rank()
-        v_filter_rank = vis.data.rank()
+        v_rank = uv.rank()
+        v_filter_rank = vdata.rank()
         # go through and count the number of ranking changes between the filtered and unfiltered data
         numCategories = ldf.cardinality[dimList[0].attribute]
         for r in range(0, numCategories - 1):
@@ -267,12 +280,16 @@ def unevenness(vis: Vis, ldf: LuxDataFrame, measure_lst: list, dimension_lst: list) -> int:
     """
     v = vis.data[measure_lst[0].attribute]
     v = v / v.sum()  # normalize by total to get ratio
+    v = v.fillna(0)  # Some bar values may be NaN
     C = ldf.cardinality[dimension_lst[0].attribute]
     D = (0.9) ** C  # cardinality-based discounting factor
     v_flat = pd.Series([1 / C] * len(v))
     if is_datetime(v):
         v = v.astype("int")
-    return D * euclidean(v, v_flat)
+    try:
+        return D * euclidean(v, v_flat)
+    except (ValueError):
+        return 0.01


 def mutual_information(v_x: list, v_y: list) -> int:
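Context for the try/except: scipy's euclidean raises ValueError when the bar vector and the flat reference vector have different lengths, which can happen when NaN groups make the number of bars disagree with the recorded cardinality. A minimal reproduction (hypothetical values):

    from scipy.spatial.distance import euclidean

    euclidean([0.2, 0.8], [1 / 3, 1 / 3, 1 / 3])  # ValueError: unequal-length vectors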
23 changes: 14 additions & 9 deletions lux/processor/Validator.py
@@ -19,6 +19,7 @@
 from lux.utils.date_utils import is_datetime_series, is_datetime_string
 import warnings
 import lux
+import lux.utils.utils


 class Validator:
@@ -80,15 +81,19 @@ def validate_clause(clause):
             else:
                 warn_msg = f"\n- The input attribute '{clause.attribute}' does not exist in the DataFrame. \n Please check your input intent for typos."
         if clause.value and clause.attribute and clause.filter_op == "=":
-            series = ldf[clause.attribute]
-            if not is_datetime_series(series):
-                if isinstance(clause.value, list):
-                    vals = clause.value
-                else:
-                    vals = [clause.value]
-                for val in vals:
-                    if val not in series.values:
-                        warn_msg = f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame."
+            import math
+
+            # Skip check for NaN filter values
+            if not lux.utils.utils.like_nan(clause.value):
+                series = ldf[clause.attribute]
+                if not is_datetime_series(series):
+                    if isinstance(clause.value, list):
+                        vals = clause.value
+                    else:
+                        vals = [clause.value]
+                    for val in vals:
+                        if val not in series.values:
+                            warn_msg = f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame."
         return warn_msg

     warn_msg = ""
9 changes: 9 additions & 0 deletions lux/utils/utils.py
@@ -89,3 +89,12 @@ def check_if_id_like(df, attribute):
     else:
         # TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even)
         return high_cardinality and (attribute_contain_id or almost_all_vals_unique)
+
+
+def like_nan(val):
+    if isinstance(val, str):
+        return val.lower() == "nan"
+    elif isinstance(val, float) or isinstance(val, int):
+        import math
+
+        return math.isnan(val)
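Example behavior of the new helper (illustrative values):

    like_nan("NaN")         # True, case-insensitive string match
    like_nan(float("nan"))  # True
    like_nan(25)            # False
    like_nan([1, 2])        # None, which is falsy at the call sites above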
3 changes: 2 additions & 1 deletion lux/vislib/altair/LineChart.py
@@ -39,7 +39,8 @@ def initialize_chart(self):
         self.tooltip = False  # tooltip looks weird for line chart
         x_attr = self.vis.get_attr_by_channel("x")[0]
         y_attr = self.vis.get_attr_by_channel("y")[0]
-
+        # Remove NaNs only for line charts (NaN rows offset the axis range)
+        self.data = self.data.dropna(subset=[x_attr.attribute, y_attr.attribute])
         self.code += "import altair as alt\n"
         self.code += "import pandas._libs.tslibs.timestamps\n"
         self.code += "from pandas._libs.tslibs.timestamps import Timestamp\n"
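A sketch of why the drop matters (hypothetical frame): rows with NaN in either channel cannot form a line segment, yet they previously skewed the inferred axis range, so they are removed before the Altair chart is built:

    import pandas as pd

    df = pd.DataFrame(
        {"date": pd.to_datetime(["2020-01-01", None, "2020-01-03"]), "value": [1.0, 2.0, None]}
    )
    df.dropna(subset=["date", "value"])  # keeps only rows usable as line-chart points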
34 changes: 34 additions & 0 deletions tests/test_interestingness.py
@@ -277,3 +277,37 @@ def test_interestingness_0_2_1(global_var):
     df._repr_html_()
     # check that top recommended Generalize graph score is not none
     assert interestingness(df.recommendation["Generalize"][0], df) != None
+
+
+def test_interestingness_deviation_nan():
+    import numpy as np
+
+    dataset = [
+        {"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1},
+        {"date": np.nan, "category": "C", "value": 0.2},
+        {"date": np.nan, "category": "B", "value": 0.2},
+        {"date": np.nan, "category": "F", "value": 0.3},
+        {"date": np.nan, "category": "E", "value": 0.3},
+        {"date": np.nan, "category": "D", "value": 0.4},
+        {"date": np.nan, "category": "A", "value": 10.4},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1},
+    ]
+    test = pd.DataFrame(dataset)
+    from lux.vis.Vis import Vis
+
+    vis = Vis(["date", "value", "category=A"], test)
+    vis2 = Vis(["date", "value", "category=B"], test)
+    from lux.interestingness.interestingness import interestingness
+
+    smaller_diff_score = interestingness(vis, test)
+    bigger_diff_score = interestingness(vis2, test)
+    assert np.isclose(smaller_diff_score, 0.29, rtol=0.1)
+    assert np.isclose(bigger_diff_score, 0.94, rtol=0.1)
+    assert smaller_diff_score < bigger_diff_score