Skip to content

Commit

Permalink
Merge branch 'master' into sql-engine
Browse files Browse the repository at this point in the history
  • Loading branch information
dorisjlee committed Apr 11, 2021
2 parents 40b85b1 + bab48ff commit 2298f13
Show file tree
Hide file tree
Showing 11 changed files with 125 additions and 46 deletions.
5 changes: 4 additions & 1 deletion lux/action/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,13 @@ def univariate(ldf, *args):
examples = f" (e.g., {possible_attributes[0]})"
intent = [lux.Clause("?", data_type="geographical"), lux.Clause("?", data_model="measure")]
intent.extend(filter_specs)
long_description = f"Geographical displays <a href='https://en.wikipedia.org/wiki/Choropleth_map'>choropleths</a> for geographic attribute{examples}, with colors indicating the average measure values. "
if lux.config.plotting_backend == "matplotlib":
long_description += "The map visualizations from the 'Geographical' tab are rendered using <a href='https://altair-viz.github.io/'>Altair</a>. Lux does not currently support geographical maps with Matplotlib. If you would like this feature, please leave us a comment at <a href='https://github.com/lux-org/lux/issues/310'>issue #310</a> to let us know!"
recommendation = {
"action": "Geographical",
"description": "Show choropleth maps of <p class='highlight-descriptor'>geographic</p> attributes",
"long_description": f"Occurence displays choropleths of averages for some geographic attribute{examples}. Visualizations are ranked by diversity of the geographic attribute.",
"long_description": long_description,
}
elif data_type_constraint == "temporal":
intent = [lux.Clause("?", data_type="temporal")]
Expand Down
60 changes: 34 additions & 26 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from lux.executor.Executor import Executor
from lux.utils import utils
from lux.utils.date_utils import is_datetime_series
from lux.utils.utils import check_import_lux_widget, check_if_id_like
from lux.utils.utils import check_import_lux_widget, check_if_id_like, is_numeric_nan_column
import warnings
import lux

Expand Down Expand Up @@ -97,7 +97,7 @@ def execute(vislist: VisList, ldf: LuxDataFrame):
if vis.mark == "bar" or vis.mark == "line" or vis.mark == "geographical":
PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed)
elif vis.mark == "histogram":
PandasExecutor.execute_binning(vis)
PandasExecutor.execute_binning(ldf, vis)
elif vis.mark == "scatter":
HBIN_START = 5000
if lux.config.heatmap and len(ldf) > HBIN_START:
Expand Down Expand Up @@ -259,7 +259,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
vis._vis_data = vis._vis_data.drop(columns="index")

@staticmethod
def execute_binning(vis: Vis):
def execute_binning(ldf, vis: Vis):
"""
Binning of data points for generating histograms
Expand All @@ -278,16 +278,22 @@ def execute_binning(vis: Vis):

bin_attribute = list(filter(lambda x: x.bin_size != 0, vis._inferred_intent))[0]
bin_attr = bin_attribute.attribute
if not np.isnan(vis.data[bin_attr]).all():
# np.histogram breaks if array contain NaN
series = vis.data[bin_attr].dropna()
# TODO: binning runs for the name attribute, whose datatype is (incorrectly) quantitative.
counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
# TODO: Should vis.data be a LuxDataFrame or a Pandas DataFrame?
binned_result = np.array([bin_start, counts]).T
vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])
series = vis.data[bin_attr]

if series.hasnans:
ldf._message.add_unique(
f"The column <code>{bin_attr}</code> contains missing values, not shown in the displayed histogram.",
priority=100,
)
series = series.dropna()
if pd.api.types.is_object_dtype(series):
series = series.astype("float", errors="ignore")

counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
binned_result = np.array([bin_start, counts]).T
vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])

@staticmethod
def execute_filter(vis: Vis):
Expand Down Expand Up @@ -422,13 +428,8 @@ def compute_data_type(self, ldf: LuxDataFrame):
elif self._is_geographical_attribute(ldf[attr]):
ldf._data_type[attr] = "geographical"
elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
# int columns gets coerced into floats if contain NaN
convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
if (
convertible2int
and ldf.cardinality[attr] != len(ldf)
and (len(ldf[attr].convert_dtypes().unique() < 20))
):

if ldf.cardinality[attr] != len(ldf) and (ldf.cardinality[attr] < 20):
ldf._data_type[attr] = "nominal"
else:
ldf._data_type[attr] = "quantitative"
Expand All @@ -445,7 +446,17 @@ def compute_data_type(self, ldf: LuxDataFrame):
ldf._data_type[attr] = "id"
# Eliminate this clause because a single NaN value can cause the dtype to be object
elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
if check_if_id_like(ldf, attr):
# Check first if it's castable to float after removing NaN
is_numeric_nan, series = is_numeric_nan_column(ldf[attr])
if is_numeric_nan:
# int columns gets coerced into floats if contain NaN
ldf._data_type[attr] = "quantitative"
# min max was not computed since object type, so recompute here
ldf._min_max[attr] = (
series.min(),
series.max(),
)
elif check_if_id_like(ldf, attr):
ldf._data_type[attr] = "id"
else:
ldf._data_type[attr] = "nominal"
Expand Down Expand Up @@ -527,11 +538,8 @@ def compute_stats(self, ldf: LuxDataFrame):
else:
attribute_repr = attribute

if ldf.dtypes[attribute] != "float64" or ldf[attribute].isnull().values.any():
ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute])
else:
ldf.cardinality[attribute_repr] = 999 # special value for non-numeric attribute
ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])

if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype(
ldf.dtypes[attribute]
Expand Down
12 changes: 12 additions & 0 deletions lux/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,15 @@ def matplotlib_setup(w, h):
ax.spines["right"].set_color("#dddddd")
ax.spines["top"].set_color("#dddddd")
return fig, ax


def is_numeric_nan_column(series):
    """Check whether an object-dtype series is numeric once NaNs are removed.

    Numeric columns that contain NaN can get coerced to ``object`` dtype, so a
    plain dtype check misclassifies them. This helper drops the NaNs and
    attempts a float cast to detect that case.

    Parameters
    ----------
    series : pd.Series
        The column to inspect.

    Returns
    -------
    tuple
        ``(True, cast_series)`` when the NaN-free values cast cleanly to
        float; otherwise ``(False, series)`` where ``series`` is the input
        (NaN-dropped if it was object dtype and had NaNs).
    """
    if series.dtype == object:
        if series.hasnans:
            series = series.dropna()
        try:
            return True, series.astype("float")
        # astype("float") raises ValueError for non-numeric strings and
        # TypeError for uncastable objects; catch only those instead of the
        # previous bare `except Exception as e` (which also bound an unused
        # variable).
        except (ValueError, TypeError):
            return False, series
    else:
        return False, series
2 changes: 1 addition & 1 deletion lux/vislib/altair/Choropleth.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, dobj):
super().__init__(dobj)

def __repr__(self):
return f"Proportional Symbol Map <{str(self.vis)}>"
return f"Choropleth Map <{str(self.vis)}>"

def initialize_chart(self):
x_attr = self.vis.get_attr_by_channel("x")[0]
Expand Down
3 changes: 3 additions & 0 deletions lux/vislib/matplotlib/MatplotlibRenderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from lux.vislib.matplotlib.LineChart import LineChart
from lux.vislib.matplotlib.Histogram import Histogram
from lux.vislib.matplotlib.Heatmap import Heatmap
from lux.vislib.altair.AltairRenderer import AltairRenderer
import matplotlib.pyplot as plt
from lux.utils.utils import matplotlib_setup

Expand Down Expand Up @@ -81,6 +82,8 @@ def create_vis(self, vis, standalone=True):
chart = LineChart(vis, fig, ax)
elif vis.mark == "heatmap":
chart = Heatmap(vis, fig, ax)
elif vis.mark == "geographical":
return AltairRenderer().create_vis(vis, False)
else:
chart = None
return chart
Expand Down
8 changes: 4 additions & 4 deletions lux/vislib/matplotlib/ScatterChart.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def initialize_chart(self):
vals = [unique.index(i) for i in colors]
if color_attr_type == "quantitative":
self.fig, self.ax = matplotlib_setup(7, 5)
set_fig_code = "fig, ax = plt.subplots(7, 5)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(7, 5))\n"
self.ax.scatter(x_pts, y_pts, c=vals, cmap="Blues", alpha=0.5)
plot_code += f"ax.scatter(x_pts, y_pts, c={vals}, cmap='Blues', alpha=0.5)\n"
my_cmap = plt.cm.get_cmap("Blues")
Expand Down Expand Up @@ -96,10 +96,10 @@ def initialize_chart(self):
maxlen = len(unique[i])
if maxlen > 20:
self.fig, self.ax = matplotlib_setup(9, 5)
set_fig_code = "fig, ax = plt.subplots(9, 5)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(9, 5))\n"
else:
self.fig, self.ax = matplotlib_setup(7, 5)
set_fig_code = "fig, ax = plt.subplots(7, 5)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(7, 5))\n"

cmap = "Set1"
if len(unique) > 9:
Expand Down Expand Up @@ -131,7 +131,7 @@ def initialize_chart(self):
fontsize='13')\n"""
plot_code += "scatter.set_alpha(0.5)\n"
else:
set_fig_code = "fig, ax = plt.subplots(4.5, 4)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(4.5, 4))\n"
self.ax.scatter(x_pts, y_pts, alpha=0.5)
plot_code += f"ax.scatter(x_pts, y_pts, alpha=0.5)\n"
self.ax.set_xlabel(x_attr_abv, fontsize="15")
Expand Down
3 changes: 2 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ xlrd
black
# Install only to use SQLExecutor
psycopg2>=2.8.5
psycopg2-binary>=2.8.5
psycopg2-binary>=2.8.5
lxml
21 changes: 12 additions & 9 deletions tests/test_interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,25 +328,28 @@ def test_interestingness_deviation_nan():
import numpy as np

dataset = [
{"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0},
{"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2},
{"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3},
{"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4},
{"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5},
{"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1},
{"date": "2017-08-25", "category": "A", "value": 25.0},
{"date": "2017-08-25", "category": "B", "value": 1.2},
{"date": "2017-08-25", "category": "C", "value": 1.3},
{"date": "2017-08-25", "category": "D", "value": 1.4},
{"date": "2017-08-25", "category": "E", "value": 1.5},
{"date": "2017-08-25", "category": "F", "value": 0.1},
{"date": np.nan, "category": "C", "value": 0.2},
{"date": np.nan, "category": "B", "value": 0.2},
{"date": np.nan, "category": "F", "value": 0.3},
{"date": np.nan, "category": "E", "value": 0.3},
{"date": np.nan, "category": "D", "value": 0.4},
{"date": np.nan, "category": "A", "value": 10.4},
{"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5},
{"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0},
{"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1},
{"date": "2017-07-25", "category": "A", "value": 15.5},
{"date": "2017-07-25", "category": "F", "value": 1.0},
{"date": "2017-07-25", "category": "B", "value": 0.1},
]
test = pd.DataFrame(dataset)
from lux.vis.Vis import Vis

test["date"] = pd.to_datetime(test["date"], format="%Y-%M-%d")
test.set_data_type({"value": "quantitative"})

vis = Vis(["date", "value", "category=A"], test)
vis2 = Vis(["date", "value", "category=B"], test)
from lux.interestingness.interestingness import interestingness
Expand Down
27 changes: 27 additions & 0 deletions tests/test_nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,30 @@ def test_nan_series_occurence():
ldf = pd.DataFrame(nan_series, columns=["col"])
ldf._ipython_display_()
assert ldf.recommendation["Occurrence"][0].mark == "bar"


def test_numeric_with_nan():
    """Numeric columns containing NaN should be typed quantitative and still plotted."""
    df = pd.read_html(
        "https://archive.ics.uci.edu/ml/datasets.php?format=&task=&att=&area=&numAtt=&numIns=&type=&sort=nameUp&view=table"
    )[5]
    # The first scraped row holds the header labels; promote it and drop it.
    df.columns = df.loc[0]
    df = df.loc[1:]
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    assert (
        df.data_type["# Instances"] == "quantitative"
    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
    assert (
        df.data_type["# Attributes"] == "quantitative"
    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
    a = df[["# Instances", "# Attributes"]]
    a._ipython_display_()
    assert (
        len(a.recommendation["Distribution"]) == 2
    ), "Testing a numeric columns with NaN, check that histograms are displayed"
    assert "contains missing values" in a._message.to_html(), "Warning message for NaN displayed"
    a = a.dropna()
    a._ipython_display_()
    assert (
        len(a.recommendation["Distribution"]) == 2
    ), "Example where dtype might be off after dropna(), check if histograms are still displayed"
    # BUG FIX: the original `assert "" in a._message.to_html()` was vacuously
    # true (the empty string is a substring of any string). Assert that the
    # NaN warning is actually absent after dropna().
    assert (
        "contains missing values" not in a._message.to_html()
    ), "No warning message for NaN should be displayed"
6 changes: 3 additions & 3 deletions tests/test_pandas_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def test_transform(global_var):
df["Year"] = pd.to_datetime(df["Year"], format="%Y")
new_df = df.iloc[:, 1:].groupby("Origin").transform(sum)
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Occurrence"]
assert len(new_df.cardinality) == 7


Expand Down Expand Up @@ -409,7 +409,7 @@ def test_loc(global_var):
assert len(new_df.cardinality) == 6
new_df = df.loc[0:10, "Displacement":"Horsepower"]
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
assert len(new_df.cardinality) == 2
import numpy as np

Expand Down Expand Up @@ -438,7 +438,7 @@ def test_iloc(global_var):
assert len(new_df.cardinality) == 6
new_df = df.iloc[0:11, 3:5]
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
assert len(new_df.cardinality) == 2
import numpy as np

Expand Down
24 changes: 23 additions & 1 deletion tests/test_vis.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def test_vegalite_default_actions_registered_2(global_var):
df["magnitude"] = np.random.randint(0, 20, size=len(df))
lux.config.plotting_backend = "vegalite"

# Symbol Map
# Choropleth Map
assert "Geographical" in df.recommendation
assert len(df.recommendation["Geographical"]) > 0

Expand Down Expand Up @@ -499,6 +499,28 @@ def test_matplotlib_default_actions_registered(global_var):
assert len(df.recommendation["Correlation"]) > 0


def test_matplotlib_default_actions_registered_2(global_var):
    """Default actions (choropleth, occurrence, scatter) register under matplotlib."""
    import numpy as np

    airports = pd.read_csv(
        "https://raw.githubusercontent.com/altair-viz/vega_datasets/master/vega_datasets/_data/airports.csv"
    )
    airports["magnitude"] = np.random.randint(0, 20, size=len(airports))
    lux.config.plotting_backend = "matplotlib"

    # Choropleth Map, Occurrence Chart, and Scatter Chart respectively.
    for action in ("Geographical", "Occurrence", "Correlation"):
        assert action in airports.recommendation
        assert len(airports.recommendation[action]) > 0


def test_vegalite_heatmap_flag_config():
df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
lux.config.plotting_backend = "vegalite"
Expand Down

0 comments on commit 2298f13

Please sign in to comment.