Mixed dtype issue (#205)

* coalesce data_types into data_type_lookup
* merge fixed
* merge conflicts
* add warning and suggestion on how to fix
* formatting for warnings version
* change to internal data
* legibility update
* test added
* update test
* test updated
* xlrd in dev reqs
* black
* update link
* changes to test logic, minor string format for warning

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
lux-org · Jan 9, 2021 · 14c141b · 14c141b
1 parent e1430df
commit 14c141b
Show file tree

Hide file tree

Showing 3 changed files with 25 additions and 4 deletions.
diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py
@@ -238,10 +238,20 @@ def execute_aggregate(vis: Vis, isFiltered=True):
                         assert (
                             len(list(vis.data[groupby_attr.attribute])) == N_unique_vals
                         ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
-            vis._vis_data = vis.data.dropna(subset=[measure_attr.attribute])
-            vis._vis_data = vis.data.sort_values(by=groupby_attr.attribute, ascending=True)
-            vis._vis_data = vis.data.reset_index()
-            vis._vis_data = vis.data.drop(columns="index")
+
+            vis._vis_data = vis._vis_data.dropna(subset=[measure_attr.attribute])
+            try:
+                vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
+            except TypeError:
+                warnings.warn(
+                    f"\nLux detects that the attribute '{groupby_attr.attribute}' maybe contain mixed type."
+                    + f"\nTo visualize this attribute, you may want to convert the '{groupby_attr.attribute}' into a uniform type as follows:"
+                    + f"\n\tdf['{groupby_attr.attribute}'] = df['{groupby_attr.attribute}'].astype(str)"
+                )
+                vis._vis_data[groupby_attr.attribute] = vis._vis_data[groupby_attr.attribute].astype(str)
+                vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True)
+            vis._vis_data = vis._vis_data.reset_index()
+            vis._vis_data = vis._vis_data.drop(columns="index")
 
     @staticmethod
     def execute_binning(vis: Vis):

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -2,4 +2,5 @@ pytest>=5.3.1
 pytest-cov>=2.8.1
 Sphinx>=3.0.2
 sphinx-rtd-theme>=0.4.3
+xlrd
 black
diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py
@@ -15,6 +15,8 @@
 from .context import lux
 import pytest
 import pandas as pd
+import numpy as np
+import warnings
 
 ###################
 # DataFrame Tests #
@@ -678,3 +680,11 @@ def test_read_sas(global_var):
     df._repr_html_()
     assert list(df.recommendation.keys()) == ["Correlation", "Distribution", "Temporal"]
     assert len(df.data_type) == 6
+
+
+def test_read_multi_dtype(global_var):
+    url = "https://github.com/lux-org/lux-datasets/blob/master/data/car-data.xls?raw=true"
+    df = pd.read_excel(url)
+    with pytest.warns(UserWarning, match="mixed type") as w:
+        df._repr_html_()
+        assert "df['Car Type'] = df['Car Type'].astype(str)" in str(w[-1].message)