[WIP] update type inference for string columns (#343)

* update type inference for string columns
* #249 example working with histograms on NaN columns, added test
* rewrote is_numeric_nan_column in a more optimized way

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
lux-org · Apr 10, 2021 · bab48ff · bab48ff
1 parent 952d3c5
commit bab48ff
Show file tree

Hide file tree

Showing 4 changed files with 70 additions and 14 deletions.
diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py
@@ -19,7 +19,7 @@
 from lux.executor.Executor import Executor
 from lux.utils import utils
 from lux.utils.date_utils import is_datetime_series
-from lux.utils.utils import check_import_lux_widget, check_if_id_like
+from lux.utils.utils import check_import_lux_widget, check_if_id_like, is_numeric_nan_column
 import warnings
 import lux
 
@@ -97,7 +97,7 @@ def execute(vislist: VisList, ldf: LuxDataFrame):
             if vis.mark == "bar" or vis.mark == "line" or vis.mark == "geographical":
                 PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed)
             elif vis.mark == "histogram":
-                PandasExecutor.execute_binning(vis)
+                PandasExecutor.execute_binning(ldf, vis)
             elif vis.mark == "scatter":
                 HBIN_START = 5000
                 if lux.config.heatmap and len(ldf) > HBIN_START:
@@ -259,7 +259,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
             vis._vis_data = vis._vis_data.drop(columns="index")
 
     @staticmethod
-    def execute_binning(vis: Vis):
+    def execute_binning(ldf, vis: Vis):
         """
         Binning of data points for generating histograms
 
@@ -278,16 +278,22 @@ def execute_binning(vis: Vis):
 
         bin_attribute = list(filter(lambda x: x.bin_size != 0, vis._inferred_intent))[0]
         bin_attr = bin_attribute.attribute
-        if not np.isnan(vis.data[bin_attr]).all():
-            # np.histogram breaks if array contain NaN
-            series = vis.data[bin_attr].dropna()
-            # TODO:binning runs for name attribte. Name attribute has datatype quantitative which is wrong.
-            counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
-            # bin_edges of size N+1, so need to compute bin_start as the bin location
-            bin_start = bin_edges[0:-1]
-            # TODO: Should vis.data be a LuxDataFrame or a Pandas DataFrame?
-            binned_result = np.array([bin_start, counts]).T
-            vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])
+        series = vis.data[bin_attr]
+
+        if series.hasnans:
+            ldf._message.add_unique(
+                f"The column <code>{bin_attr}</code> contains missing values, not shown in the displayed histogram.",
+                priority=100,
+            )
+            series = series.dropna()
+        if pd.api.types.is_object_dtype(series):
+            series = series.astype("float", errors="ignore")
+
+        counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
+        # bin_edges of size N+1, so need to compute bin_start as the bin location
+        bin_start = bin_edges[0:-1]
+        binned_result = np.array([bin_start, counts]).T
+        vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])
 
     @staticmethod
     def execute_filter(vis: Vis):
@@ -440,7 +446,17 @@ def compute_data_type(self, ldf: LuxDataFrame):
                         ldf._data_type[attr] = "id"
                 # Eliminate this clause because a single NaN value can cause the dtype to be object
                 elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
-                    if check_if_id_like(ldf, attr):
+                    # Check first if it's castable to float after removing NaN
+                    is_numeric_nan, series = is_numeric_nan_column(ldf[attr])
+                    if is_numeric_nan:
+                        # int columns get coerced into floats if they contain NaN
+                        ldf._data_type[attr] = "quantitative"
+                        # min max was not computed since object type, so recompute here
+                        ldf._min_max[attr] = (
+                            series.min(),
+                            series.max(),
+                        )
+                    elif check_if_id_like(ldf, attr):
                         ldf._data_type[attr] = "id"
                     else:
                         ldf._data_type[attr] = "nominal"

diff --git a/lux/utils/utils.py b/lux/utils/utils.py
@@ -125,3 +125,15 @@ def matplotlib_setup(w, h):
     ax.spines["right"].set_color("#dddddd")
     ax.spines["top"].set_color("#dddddd")
     return fig, ax
+
+
+def is_numeric_nan_column(series):
+    """
+    Check whether an object-dtype series becomes numeric once its missing values are removed.
+
+    Parameters
+    ----------
+    series : pandas.Series
+        Column to inspect.
+
+    Returns
+    -------
+    tuple of (bool, pandas.Series)
+        ``(True, converted)`` when the series has object dtype and its non-missing values
+        can all be cast to float; ``converted`` is that float series with missing values dropped.
+        ``(False, series)`` otherwise. For an object-dtype series that could not be cast, the
+        returned series has its missing values dropped; for any other dtype the input is
+        returned untouched.
+    """
+    # Only object columns are candidates; every other dtype is reported as-is.
+    if series.dtype != object:
+        return False, series
+    non_missing = series.dropna() if series.hasnans else series
+    try:
+        as_float = non_missing.astype("float")
+    except Exception:
+        # Any failure to cast simply means the column is not numeric.
+        return False, non_missing
+    return True, as_float
diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -4,3 +4,4 @@ Sphinx>=3.0.2
 sphinx-rtd-theme>=0.4.3
 xlrd
 black
+lxml
diff --git a/tests/test_nan.py b/tests/test_nan.py
@@ -113,3 +113,30 @@ def test_nan_series_occurence():
     ldf = pd.DataFrame(nan_series, columns=["col"])
     ldf._ipython_display_()
     assert ldf.recommendation["Occurrence"][0].mark == "bar"
+
+
+def test_numeric_with_nan():
+    """
+    Check that numeric columns containing NaN are detected as quantitative and still yield
+    histograms, both while the missing values are present and after they are dropped.
+
+    NOTE: the data is read from a table on the UCI archive web page, so this test needs
+    network access and depends on that page's layout (sixth table, header in its first row).
+    """
+    df = pd.read_html(
+        "https://archive.ics.uci.edu/ml/datasets.php?format=&task=&att=&area=&numAtt=&numIns=&type=&sort=nameUp&view=table"
+    )[5]
+    # The first row of the parsed table holds the column names: promote it, then drop it.
+    df.columns = df.loc[0]
+    df = df.loc[1:]
+    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
+    assert (
+        df.data_type["# Instances"] == "quantitative"
+    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
+    assert (
+        df.data_type["# Attributes"] == "quantitative"
+    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
+    a = df[["# Instances", "# Attributes"]]
+    a._ipython_display_()
+    assert (
+        len(a.recommendation["Distribution"]) == 2
+    ), "Testing a numeric columns with NaN, check that histograms are displayed"
+    assert "contains missing values" in a._message.to_html(), "Warning message for NaN displayed"
+    # Same columns without the missing rows: histograms must remain, the warning must not.
+    a = a.dropna()
+    a._ipython_display_()
+    assert (
+        len(a.recommendation["Distribution"]) == 2
+    ), "Example where dtype might be off after dropna(), check if histograms are still displayed"
+    # `"" in text` is true for every string, so check for the absence of the warning text instead.
+    assert (
+        "contains missing values" not in a._message.to_html()
+    ), "No warning message for NaN should be displayed"