diff --git a/lux/action/filter.py b/lux/action/filter.py
index dde432fc..af9a495b 100644
--- a/lux/action/filter.py
+++ b/lux/action/filter.py
@@ -102,7 +102,7 @@ def get_complementary_ops(fltr_op):
     categorical_vars = []
     for col in list(ldf.columns):
         # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
-        if ldf.cardinality[col] < 30 and col not in column_spec_attr:
+        if 1 < ldf.cardinality[col] < 30 and col not in column_spec_attr:
             categorical_vars.append(col)
     for cat in categorical_vars:
         unique_values = ldf.unique_values[cat]
diff --git a/lux/core/__init__.py b/lux/core/__init__.py
index 23585503..9a13cd20 100644
--- a/lux/core/__init__.py
+++ b/lux/core/__init__.py
@@ -14,10 +14,12 @@
 import pandas as pd
 
 from .frame import LuxDataFrame
+from .series import LuxSeries
 
 global originalDF  # Keep variable scope of original pandas df
 
 originalDF = pd.core.frame.DataFrame
+originalSeries = pd.core.series.Series
 
 
 def setOption(overridePandas=True):
@@ -25,8 +27,10 @@ def setOption(overridePandas=True):
         pd.DataFrame = (
             pd.io.json._json.DataFrame
         ) = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = LuxDataFrame
+        pd.Series = LuxSeries
     else:
         pd.DataFrame = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = originalDF
+        pd.Series = originalSeries
 
 
 setOption(overridePandas=True)
diff --git a/lux/core/series.py b/lux/core/series.py
index 44c05bf7..0ba805ce 100644
--- a/lux/core/series.py
+++ b/lux/core/series.py
@@ -15,6 +15,7 @@
 import pandas as pd
 import lux
 import warnings
+import traceback
 
 
 class LuxSeries(pd.Series):
@@ -56,6 +57,14 @@ def f(*args, **kwargs):
         f._get_axis_number = super(LuxSeries, self)._get_axis_number
         return f
 
+    def to_pandas(self):
+        import lux.core
+
+        return lux.core.originalSeries(self, copy=False)
+
+    def display_pandas(self):
+        return self.to_pandas()
+
     def __repr__(self):
         from IPython.display import display
         from IPython.display import clear_output
@@ -63,6 +72,9 @@ def __repr__(self):
         from lux.core.frame import LuxDataFrame
 
         series_repr = super(LuxSeries, self).__repr__()
+        # An unnamed Series defaults to column name 0, which causes errors downstream
+        if self.name is None:
+            self.name = " "
         ldf = LuxDataFrame(self)
 
         try:
@@ -137,12 +149,13 @@ def on_button_clicked(b):
 
         except (KeyboardInterrupt, SystemExit):
             raise
-        except:
+        except Exception:
             warnings.warn(
                 "\nUnexpected error in rendering Lux widget and recommendations. "
-                "Falling back to Pandas display.\n\n"
-                "Please report this issue on Github: https://github.com/lux-org/lux/issues ",
+                "Falling back to Pandas display.\n"
+                "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n",
                 stacklevel=2,
             )
-            print(series_repr)
+            warnings.warn(traceback.format_exc())
+            display(self.display_pandas())
         return ""
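The series.py hunk above gives LuxSeries an escape hatch: to_pandas() rebuilds a plain pandas Series over the same underlying data (copy=False), and the widget-error path now displays that plain series instead of print()-ing the precomputed repr string. A minimal sketch of the round trip, assuming this patch is applied and that importing lux swaps pd.Series for LuxSeries via setOption:

    import lux  # noqa: F401 -- importing lux overrides pd.Series with LuxSeries
    import pandas as pd

    s = pd.Series([1.0, float("nan"), 3.0], name="vals")  # actually a LuxSeries here
    plain = s.to_pandas()  # back to the original pandas Series class, sharing data
    print(plain)  # what the new __repr__ fallback displays on widget errors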
" - "Falling back to Pandas display.\n\n" - "Please report this issue on Github: https://github.com/lux-org/lux/issues ", + "Falling back to Pandas display.\n" + "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n", stacklevel=2, ) - print(series_repr) + warnings.warn(traceback.format_exc()) + display(self.display_pandas()) return "" diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index e0c10a90..4d055f72 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -159,22 +159,26 @@ def execute_aggregate(vis: Vis, isFiltered=True): if has_color: vis._vis_data = ( - vis.data.groupby([groupby_attr.attribute, color_attr.attribute]) + vis.data.groupby([groupby_attr.attribute, color_attr.attribute], dropna=False) .count() .reset_index() ) vis._vis_data = vis.data.rename(columns={"index": "Record"}) vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, "Record"]] else: - vis._vis_data = vis.data.groupby(groupby_attr.attribute).count().reset_index() + vis._vis_data = ( + vis.data.groupby(groupby_attr.attribute, dropna=False).count().reset_index() + ) vis._vis_data = vis.data.rename(columns={"index": "Record"}) vis._vis_data = vis.data[[groupby_attr.attribute, "Record"]] else: # if color is specified, need to group by groupby_attr and color_attr if has_color: - groupby_result = vis.data.groupby([groupby_attr.attribute, color_attr.attribute]) + groupby_result = vis.data.groupby( + [groupby_attr.attribute, color_attr.attribute], dropna=False + ) else: - groupby_result = vis.data.groupby(groupby_attr.attribute) + groupby_result = vis.data.groupby(groupby_attr.attribute, dropna=False) groupby_result = groupby_result.agg(agg_func) intermediate = groupby_result.reset_index() vis._vis_data = intermediate.__finalize__(vis.data) @@ -225,6 +229,7 @@ def execute_aggregate(vis: Vis, isFiltered=True): assert ( len(list(vis.data[groupby_attr.attribute])) == N_unique_vals ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`." 
@@ -298,6 +303,16 @@ def apply_filter(df: pd.DataFrame, attribute: str, op: str, val: object) -> pd.DataFrame:
         df: pandas.DataFrame
             Dataframe resulting from the filter operation
         """
+        # Handling NaN filter values
+        if utils.like_nan(val):
+            if op != "=" and op != "!=":
+                warnings.warn("Filter on NaN must be used with equality operations (i.e., `=` or `!=`)")
+            else:
+                if op == "=":
+                    return df[df[attribute].isna()]
+                elif op == "!=":
+                    return df[~df[attribute].isna()]
+        # Applying filter in regular, non-NaN cases
         if op == "=":
             return df[df[attribute] == val]
         elif op == "<":
@@ -380,7 +395,12 @@ def compute_data_type(self, ldf: LuxDataFrame):
             elif str(attr).lower() in temporal_var_list:
                 ldf.data_type_lookup[attr] = "temporal"
             elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
-                ldf.data_type_lookup[attr] = "quantitative"
+                # int columns get coerced into floats if they contain NaN
+                convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
+                if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20:
+                    ldf.data_type_lookup[attr] = "nominal"
+                else:
+                    ldf.data_type_lookup[attr] = "quantitative"
             elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                 # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
                 if ldf.pre_aggregated:
@@ -413,24 +433,17 @@ def compute_data_type(self, ldf: LuxDataFrame):
         for attr in ldf.columns:
             if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]):
                 non_datetime_attrs.append(attr)
+        warn_msg = ""
         if len(non_datetime_attrs) == 1:
-            warnings.warn(
-                f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
-                "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
-                "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
-                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
-                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
-                stacklevel=2,
-            )
+            warn_msg += f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
         elif len(non_datetime_attrs) > 1:
-            warnings.warn(
-                f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
-                "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
-                "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
-                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
-                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
-                stacklevel=2,
-            )
+            warn_msg += f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
+        if len(non_datetime_attrs) > 0:
+            warn_msg += "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n"
+            for attr in non_datetime_attrs:
+                warn_msg += f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='')\n"
+            warn_msg += "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html"
+            warnings.warn(warn_msg, stacklevel=2)
 
     def _is_datetime_string(self, series):
         if len(series) > 100:
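Why convert_dtypes() works for the float-type check above: a float64 column that is really integer codes plus NaNs converts to the nullable Int64 dtype, so is_integer_dtype() on the converted column flags it, while the added cardinality guards keep genuinely continuous columns quantitative. A small sketch of just that detection step:

    import numpy as np
    import pandas as pd

    s = pd.Series([1.0, 2.0, np.nan, 2.0])  # ints coerced to float64 by the NaN
    print(s.dtype)                           # float64
    print(s.convert_dtypes().dtype)          # Int64 (nullable integer)
    print(pd.api.types.is_integer_dtype(s.convert_dtypes()))  # True -> candidate nominal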
diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py
index 0734dc6d..0c94757e 100644
--- a/lux/interestingness/interestingness.py
+++ b/lux/interestingness/interestingness.py
@@ -73,6 +73,7 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
     if n_dim == 1 and (n_msr == 0 or n_msr == 1):
         if v_size < 2:
             return -1
+
         if n_filter == 0:
             return unevenness(vis, ldf, measure_lst, dimension_lst)
         elif n_filter == 1:
@@ -184,7 +185,9 @@ def weighted_correlation(x, y, w):
     return weighted_cov(x, y, w) / np.sqrt(weighted_cov(x, x, w) * weighted_cov(y, y, w))
 
 
-def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
+def deviation_from_overall(
+    vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str, exclude_nan: bool = True
+) -> int:
     """
     Difference in bar chart/histogram shape from overall chart
     Note: this function assumes that the filtered vis.data is operating on the same range as the unfiltered vis.data.
@@ -197,6 +200,8 @@ def deviation_from_overall(
         List of filters from the Vis
     msr_attribute : str
         The attribute name of the measure value of the chart
+    exclude_nan : bool
+        Whether to include/exclude NaN values as part of the deviation calculation
 
     Returns
     -------
@@ -204,8 +209,13 @@ def deviation_from_overall(
         Score describing how different the vis is from the overall vis
     """
     v_filter_size = get_filtered_size(filter_specs, ldf)
-    v_size = len(vis.data)
-    v_filter = vis.data[msr_attribute]
+
+    if exclude_nan:
+        vdata = vis.data.dropna()
+    else:
+        vdata = vis.data
+    v_size = len(vdata)
+    v_filter = vdata[msr_attribute]
     total = v_filter.sum()
     v_filter = v_filter / total  # normalize by total to get ratio
     if total == 0:
@@ -217,8 +227,11 @@ def deviation_from_overall(
     # Remove filters, keep only attribute intent
     unfiltered_vis._inferred_intent = utils.get_attrs_specs(vis._inferred_intent)
     lux.config.executor.execute([unfiltered_vis], ldf)
-
-    v = unfiltered_vis.data[msr_attribute]
+    if exclude_nan:
+        uv = unfiltered_vis.data.dropna()
+    else:
+        uv = unfiltered_vis.data
+    v = uv[msr_attribute]
     v = v / v.sum()
     assert len(v) == len(v_filter), "Data for filtered and unfiltered vis have unequal length."
     sig = v_filter_size / v_size  # significance factor
@@ -230,8 +243,8 @@ def deviation_from_overall(
         dimList = vis.get_attr_by_data_model("dimension")
 
         # use Pandas rank function to calculate rank positions for each category
-        v_rank = unfiltered_vis.data.rank()
-        v_filter_rank = vis.data.rank()
+        v_rank = uv.rank()
+        v_filter_rank = vdata.rank()
         # go through and count the number of ranking changes between the filtered and unfiltered data
         numCategories = ldf.cardinality[dimList[0].attribute]
         for r in range(0, numCategories - 1):
@@ -267,12 +280,16 @@ def unevenness(vis: Vis, ldf: LuxDataFrame, measure_lst: list, dimension_lst: list) -> int:
     """
     v = vis.data[measure_lst[0].attribute]
     v = v / v.sum()  # normalize by total to get ratio
+    v = v.fillna(0)  # Some bar values may be NaN
    C = ldf.cardinality[dimension_lst[0].attribute]
     D = (0.9) ** C  # cardinality-based discounting factor
     v_flat = pd.Series([1 / C] * len(v))
     if is_datetime(v):
         v = v.astype("int")
-    return D * euclidean(v, v_flat)
+    try:
+        return D * euclidean(v, v_flat)
+    except ValueError:
+        return 0.01
 
 
 def mutual_information(v_x: list, v_y: list) -> int:
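At its core, deviation_from_overall reduces to an L2 distance between two normalized bar profiles, scaled by a significance factor; dropping NaNs first keeps the two profiles the same length so the distance is well defined. A self-contained sketch of that arithmetic (the bar heights and the stand-in for sig are illustrative, not taken from the patch):

    import numpy as np

    overall = np.array([25.0, 1.2, 1.3, 1.4, 1.5, 0.1])   # unfiltered bar heights
    filtered = np.array([10.4, 0.2, 0.2, 0.3, 0.3, 0.4])  # bar heights under a filter

    v = overall / overall.sum()          # normalize to ratios
    v_filter = filtered / filtered.sum()
    sig = 0.4  # illustrative stand-in for v_filter_size / v_size (filtered row fraction)
    print(sig * np.linalg.norm(v - v_filter))  # the deviation score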
diff --git a/lux/processor/Validator.py b/lux/processor/Validator.py
index 9be2f5de..c72dc63b 100644
--- a/lux/processor/Validator.py
+++ b/lux/processor/Validator.py
@@ -19,6 +19,7 @@ from lux.utils.date_utils import is_datetime_series, is_datetime_string
 import warnings
 import lux
+import lux.utils.utils
 
 
 class Validator:
@@ -80,15 +81,19 @@ def validate_clause(clause):
             else:
                 warn_msg = f"\n- The input attribute '{clause.attribute}' does not exist in the DataFrame. \n Please check your input intent for typos."
             if clause.value and clause.attribute and clause.filter_op == "=":
-                series = ldf[clause.attribute]
-                if not is_datetime_series(series):
-                    if isinstance(clause.value, list):
-                        vals = clause.value
-                    else:
-                        vals = [clause.value]
-                    for val in vals:
-                        if val not in series.values:
-                            warn_msg = f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame."
+                import math
+
+                # Skip check for NaN filter values
+                if not lux.utils.utils.like_nan(clause.value):
+                    series = ldf[clause.attribute]
+                    if not is_datetime_series(series):
+                        if isinstance(clause.value, list):
+                            vals = clause.value
+                        else:
+                            vals = [clause.value]
+                        for val in vals:
+                            if val not in series.values:
+                                warn_msg = f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame."
             return warn_msg
 
         warn_msg = ""
diff --git a/lux/utils/utils.py b/lux/utils/utils.py
index 4c289b65..1d32f6e5 100644
--- a/lux/utils/utils.py
+++ b/lux/utils/utils.py
@@ -89,3 +89,12 @@ def check_if_id_like(df, attribute):
     else:
         # TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even)
         return high_cardinality and (attribute_contain_id or almost_all_vals_unique)
+
+
+def like_nan(val):
+    if isinstance(val, str):
+        return val.lower() == "nan"
+    elif isinstance(val, float) or isinstance(val, int):
+        import math
+
+        return math.isnan(val)
diff --git a/lux/vislib/altair/LineChart.py b/lux/vislib/altair/LineChart.py
index 002beefb..18538d51 100644
--- a/lux/vislib/altair/LineChart.py
+++ b/lux/vislib/altair/LineChart.py
@@ -39,7 +39,8 @@ def initialize_chart(self):
         self.tooltip = False  # tooltip looks weird for line chart
         x_attr = self.vis.get_attr_by_channel("x")[0]
         y_attr = self.vis.get_attr_by_channel("y")[0]
-
+        # Remove NaNs only for Line Charts (offsets axis range)
+        self.data = self.data.dropna(subset=[x_attr.attribute, y_attr.attribute])
         self.code += "import altair as alt\n"
         self.code += "import pandas._libs.tslibs.timestamps\n"
         self.code += "from pandas._libs.tslibs.timestamps import Timestamp\n"
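A quick check of like_nan's intended contract; note that it implicitly returns None for unsupported types such as lists, which every caller in this patch treats as falsy:

    from lux.utils.utils import like_nan  # available once this patch is applied

    print(like_nan("NaN"))         # True  -- case-insensitive string match
    print(like_nan(float("nan")))  # True  -- math.isnan on numerics
    print(like_nan(3.0))           # False
    print(like_nan([1, 2]))        # None  -- falls through, falsy in boolean context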
diff --git a/tests/test_interestingness.py b/tests/test_interestingness.py
index eceadcde..a13ee406 100644
--- a/tests/test_interestingness.py
+++ b/tests/test_interestingness.py
@@ -277,3 +277,37 @@ def test_interestingness_0_2_1(global_var):
     df._repr_html_()
     # check that top recommended Generalize graph score is not none
     assert interestingness(df.recommendation["Generalize"][0], df) != None
+
+
+def test_interestingness_deviation_nan():
+    import numpy as np
+
+    dataset = [
+        {"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1},
+        {"date": np.nan, "category": "C", "value": 0.2},
+        {"date": np.nan, "category": "B", "value": 0.2},
+        {"date": np.nan, "category": "F", "value": 0.3},
+        {"date": np.nan, "category": "E", "value": 0.3},
+        {"date": np.nan, "category": "D", "value": 0.4},
+        {"date": np.nan, "category": "A", "value": 10.4},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1},
+    ]
+    test = pd.DataFrame(dataset)
+    from lux.vis.Vis import Vis
+
+    vis = Vis(["date", "value", "category=A"], test)
+    vis2 = Vis(["date", "value", "category=B"], test)
+    from lux.interestingness.interestingness import interestingness
+
+    smaller_diff_score = interestingness(vis, test)
+    bigger_diff_score = interestingness(vis2, test)
+    assert np.isclose(smaller_diff_score, 0.29, rtol=0.1)
+    assert np.isclose(bigger_diff_score, 0.94, rtol=0.1)
+    assert smaller_diff_score < bigger_diff_score
diff --git a/tests/test_nan.py b/tests/test_nan.py
index f91f7c0c..96918af0 100644
--- a/tests/test_nan.py
+++ b/tests/test_nan.py
@@ -27,3 +27,85 @@ def test_nan_column(global_var):
     for visList in df.recommendation.keys():
         for vis in df.recommendation[visList]:
             assert vis.get_attr_by_attr_name("Geography") == []
+
+
+def test_nan_data_type_detection():
+    import numpy as np
+
+    dataset = [
+        {"fully_nan": np.nan, "some_nan": 3.0, "some_nan2": np.nan},
+        {"fully_nan": np.nan, "some_nan": 15.0, "some_nan2": 3.0},
+        {"fully_nan": np.nan, "some_nan": np.nan, "some_nan2": 3.0},
+        {"fully_nan": np.nan, "some_nan": 7.0, "some_nan2": 0.0},
+        {"fully_nan": np.nan, "some_nan": 2.0, "some_nan2": 2.0},
+        {"fully_nan": np.nan, "some_nan": 3.0, "some_nan2": np.nan},
+        {"fully_nan": np.nan, "some_nan": 1.0, "some_nan2": 1.0},
+        {"fully_nan": np.nan, "some_nan": 1.0, "some_nan2": 1.0},
+        {"fully_nan": np.nan, "some_nan": 2.0, "some_nan2": 0.0},
+        {"fully_nan": np.nan, "some_nan": 11.0, "some_nan2": 0.0},
+    ]
+    test = pd.DataFrame(dataset)
+    test.maintain_metadata()
+    assert test.data_type["nominal"] == [
+        "fully_nan",
+        "some_nan",
+        "some_nan2",
+    ], "Categorical columns containing NaNs should be treated as nominal data type"
+    nona_test = test.dropna(subset=["some_nan"])
+    nona_test.maintain_metadata()
+    assert nona_test.data_type["nominal"] == [
+        "fully_nan",
+        "some_nan",
+        "some_nan2",
+    ], "Categorical float columns without NaNs should still be categorical, even after dropping NaNs"
+
+
+def test_apply_nan_filter():
+    from lux.vis.Vis import Vis
+
+    import numpy as np
+
+    dataset = [
+        {"fully_nan": np.nan, "some_nan": 3.0, "some_nan2": np.nan},
+        {"fully_nan": np.nan, "some_nan": 15.0, "some_nan2": 3.0},
+        {"fully_nan": np.nan, "some_nan": np.nan, "some_nan2": 3.0},
+        {"fully_nan": np.nan, "some_nan": 7.0, "some_nan2": 0.0},
+        {"fully_nan": np.nan, "some_nan": 2.0, "some_nan2": 2.0},
+        {"fully_nan": np.nan, "some_nan": 3.0, "some_nan2": np.nan},
+        {"fully_nan": np.nan, "some_nan": 1.0, "some_nan2": 1.0},
+        {"fully_nan": np.nan, "some_nan": 1.0, "some_nan2": 1.0},
+        {"fully_nan": np.nan, "some_nan": 2.0, "some_nan2": 0.0},
+        {"fully_nan": np.nan, "some_nan": 11.0, "some_nan2": 0.0},
+    ]
+    test = pd.DataFrame(dataset)
+
+    vis = Vis(["some_nan", "some_nan2=nan"], test)
+    vis._repr_html_()
+    assert vis.mark == "bar"
+
+
+def test_nan_series_occurence():
+    from lux.core.series import LuxSeries
+    from math import nan
+
+    dvalues = {
+        1: " dummy ",
+        2: " dummy ",
+        3: nan,
+        4: " dummy ",
+        5: nan,
+        6: " dummy ",
+        7: " dummy ",
+        8: nan,
+        9: " dummy ",
+        10: nan,
+        11: " dummy ",
+        12: nan,
+        13: nan,
+        14: " dummy ",
+        15: " dummy ",
+    }
+    nan_series = LuxSeries(dvalues)
+    ldf = pd.DataFrame(nan_series, columns=["col"])
+    ldf._repr_html_()
+    assert ldf.recommendation["Occurrence"][0].mark == "bar"
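For reference, the "some_nan2=nan" intent exercised in test_apply_nan_filter resolves through the new NaN branch of apply_filter; the equivalent plain-pandas operations are:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"some_nan2": [np.nan, 3.0, 3.0, 0.0]})

    print(df[df["some_nan2"].isna()])   # "some_nan2=nan"  -> rows where the value is missing
    print(df[~df["some_nan2"].isna()])  # "some_nan2!=nan" -> rows where it is present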
diff --git a/tests/test_type.py b/tests/test_type.py
index 4c53656a..4dbac971 100644
--- a/tests/test_type.py
+++ b/tests/test_type.py
@@ -154,3 +154,35 @@ def test_check_college():
         "MedianFamilyIncome": "quantitative",
         "MedianEarnings": "quantitative",
     }
+
+
+def test_float_categorical():
+    values = [
+        {"A": 6.0, "B": 1.0, "C": 1.0, "D": 3.0, "E": 2.0, "F": 5.0},
+        {"A": 5.0, "B": 2.0, "C": 2.0, "D": 2.0, "E": 2.0, "F": 3.0},
+        {"A": 3.0, "B": 6.0, "C": 3.0, "D": 3.0, "E": 2.0, "F": 5.0},
+        {"A": 6.0, "B": 3.0, "C": 3.0, "D": 2.0, "E": 2.0, "F": 2.0},
+        {"A": 7.0, "B": 4.0, "C": 2.0, "D": 2.0, "E": 2.0, "F": 4.0},
+        {"A": 5.0, "B": 3.0, "C": 6.0, "D": 3.0, "E": 3.0, "F": 4.0},
+        {"A": 3.0, "B": 4.0, "C": 3.0, "D": 6.0, "E": 5.0, "F": 5.0},
+        {"A": 3.0, "B": 3.0, "C": 2.0, "D": 2.0, "E": 4.0, "F": 5.0},
+        {"A": 3.0, "B": 2.0, "C": 2.0, "D": 2.0, "E": 2.0, "F": 4.0},
+        {"A": 1.0, "B": 2.0, "C": 2.0, "D": 2.0, "E": 2.0, "F": 6.0},
+        {"A": 3.0, "B": 3.0, "C": 2.0, "D": 3.0, "E": 3.0, "F": 5.0},
+        {"A": 7.0, "B": 1.0, "C": 1.0, "D": 2.0, "E": 2.0, "F": 3.0},
+        {"A": 6.0, "B": 2.0, "C": 2.0, "D": 2.0, "E": 2.0, "F": 3.0},
+        {"A": 2.0, "B": 3.0, "C": 2.0, "D": 3.0, "E": 3.0, "F": 4.0},
+        {"A": 6.0, "B": 2.0, "C": 3.0, "D": 3.0, "E": 3.0, "F": 5.0},
+    ]
+    df = pd.DataFrame(values)
+    df.maintain_metadata()
+    assert df.data_type["nominal"] == [
+        "A",
+        "B",
+        "C",
+        "D",
+        "E",
+        "F",
+    ], "Float column should be detected as categorical"
+    for x in list(df.dtypes):
+        assert x == "float64", "Source dataframe preserved as float dtype"