Various bug fixes in test and data type inference code (#349)

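* Remove the expensive `convert_dtypes` call from float data type inference.
* Bring back the cardinality calculation for all float columns.
* Nominal detection now applies to any float column with fewer than 20 distinct values, as long as its cardinality differs from the row count.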
lux-org · Apr 8, 2021 · 4a04723 · 4a04723
1 parent 1a72332
commit 4a04723
Show file tree

Hide file tree

Showing 3 changed files with 19 additions and 24 deletions.
diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py
@@ -422,13 +422,8 @@ def compute_data_type(self, ldf: LuxDataFrame):
                 elif self._is_geographical_attribute(ldf[attr]):
                     ldf._data_type[attr] = "geographical"
                 elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
-                    # int columns gets coerced into floats if contain NaN
-                    convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
-                    if (
-                        convertible2int
-                        and ldf.cardinality[attr] != len(ldf)
-                        and (len(ldf[attr].convert_dtypes().unique() < 20))
-                    ):
+
+                    if ldf.cardinality[attr] != len(ldf) and (ldf.cardinality[attr] < 20):
                         ldf._data_type[attr] = "nominal"
                     else:
                         ldf._data_type[attr] = "quantitative"
@@ -525,11 +520,8 @@ def compute_stats(self, ldf: LuxDataFrame):
             else:
                 attribute_repr = attribute
 
-            if ldf.dtypes[attribute] != "float64" or ldf[attribute].isnull().values.any():
-                ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
-                ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute])
-            else:
-                ldf.cardinality[attribute_repr] = 999  # special value for non-numeric attribute
+            ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
+            ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])
 
             if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype(
                 ldf.dtypes[attribute]

diff --git a/tests/test_interestingness.py b/tests/test_interestingness.py
@@ -273,25 +273,28 @@ def test_interestingness_deviation_nan():
     import numpy as np
 
     dataset = [
-        {"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0},
-        {"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2},
-        {"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3},
-        {"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4},
-        {"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5},
-        {"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1},
+        {"date": "2017-08-25", "category": "A", "value": 25.0},
+        {"date": "2017-08-25", "category": "B", "value": 1.2},
+        {"date": "2017-08-25", "category": "C", "value": 1.3},
+        {"date": "2017-08-25", "category": "D", "value": 1.4},
+        {"date": "2017-08-25", "category": "E", "value": 1.5},
+        {"date": "2017-08-25", "category": "F", "value": 0.1},
         {"date": np.nan, "category": "C", "value": 0.2},
         {"date": np.nan, "category": "B", "value": 0.2},
         {"date": np.nan, "category": "F", "value": 0.3},
         {"date": np.nan, "category": "E", "value": 0.3},
         {"date": np.nan, "category": "D", "value": 0.4},
         {"date": np.nan, "category": "A", "value": 10.4},
-        {"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5},
-        {"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0},
-        {"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1},
+        {"date": "2017-07-25", "category": "A", "value": 15.5},
+        {"date": "2017-07-25", "category": "F", "value": 1.0},
+        {"date": "2017-07-25", "category": "B", "value": 0.1},
     ]
     test = pd.DataFrame(dataset)
     from lux.vis.Vis import Vis
 
+    test["date"] = pd.to_datetime(test["date"], format="%Y-%M-%d")
+    test.set_data_type({"value": "quantitative"})
+
     vis = Vis(["date", "value", "category=A"], test)
     vis2 = Vis(["date", "value", "category=B"], test)
     from lux.interestingness.interestingness import interestingness

diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py
@@ -257,7 +257,7 @@ def test_transform(global_var):
     df["Year"] = pd.to_datetime(df["Year"], format="%Y")
     new_df = df.iloc[:, 1:].groupby("Origin").transform(sum)
     new_df._ipython_display_()
-    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
+    assert list(new_df.recommendation.keys()) == ["Occurrence"]
     assert len(new_df.cardinality) == 7
 
 
@@ -409,7 +409,7 @@ def test_loc(global_var):
     assert len(new_df.cardinality) == 6
     new_df = df.loc[0:10, "Displacement":"Horsepower"]
     new_df._ipython_display_()
-    assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
     assert len(new_df.cardinality) == 2
     import numpy as np
 
@@ -438,7 +438,7 @@ def test_iloc(global_var):
     assert len(new_df.cardinality) == 6
     new_df = df.iloc[0:11, 3:5]
     new_df._ipython_display_()
-    assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
+    assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
     assert len(new_df.cardinality) == 2
     import numpy as np