Making Lux more robust with missing values and NaN (#179) (#180)
* improve datetime warning message with starter templates

* Handling NaN value errors
* skipping validator check for NaN filter values
* adding special case for PandasExecutor to map filter NaN to isna()
* fixing unevenness metric when bar values are NaN
* eliminate 1-cardinality filters in Filter action (since equal to overall)
* fixed bug where deviation arrays had unequal lengths when NaN present

* Handling NaN filter and data type
* fixed data type detection for int columns coerced to float when they contain NaN
* added test for applying NaN filter

* Ensure that LuxSeries is displayed when there is NaN
* ensure that NaNs are not dropped in groupbys
* exclude NaN values in deviation calculation
* fix unnamed series issue
* improved debugging message for LuxSeries

* Override pd.Series with LuxSeries

* Fixes for type checking and line charts with NaNs
* exclude NaN for line charts to prevent large axis offsets
* improved type checking for float columns that no longer contain NaN
* fixed and improved deviation calculation test

* added float categorical test
dorisjlee committed Dec 21, 2020
1 parent b7635c0 commit e08460b
Showing 11 changed files with 254 additions and 44 deletions.
2 changes: 1 addition & 1 deletion lux/action/filter.py
@@ -102,7 +102,7 @@ def get_complementary_ops(fltr_op):
     categorical_vars = []
     for col in list(ldf.columns):
         # if cardinality is not too high, and attribute is not one of the X,Y (specified) column
-        if ldf.cardinality[col] < 30 and col not in column_spec_attr:
+        if 1 < ldf.cardinality[col] < 30 and col not in column_spec_attr:
            categorical_vars.append(col)
     for cat in categorical_vars:
         unique_values = ldf.unique_values[cat]
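Note: the new lower bound drops 1-cardinality columns because a filter on a single-valued attribute selects every row, so the filtered vis is identical to the overall vis. A minimal sketch with hypothetical data:

    import pandas as pd

    df = pd.DataFrame({"country": ["US", "US", "US"], "sales": [1, 2, 3]})
    assert df[df["country"] == "US"].equals(df)  # the filter adds no information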
4 changes: 4 additions & 0 deletions lux/core/__init__.py
@@ -14,19 +14,23 @@

 import pandas as pd
 from .frame import LuxDataFrame
+from .series import LuxSeries

 global originalDF
 # Keep variable scope of original pandas df
 originalDF = pd.core.frame.DataFrame
+originalSeries = pd.core.series.Series


 def setOption(overridePandas=True):
     if overridePandas:
         pd.DataFrame = (
             pd.io.json._json.DataFrame
         ) = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = LuxDataFrame
+        pd.Series = LuxSeries
     else:
         pd.DataFrame = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = originalDF
+        pd.Series = originalSeries


 setOption(overridePandas=True)
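A quick sanity check of the override (hypothetical session; importing lux runs setOption(overridePandas=True) at module load):

    import lux  # activates the override
    import pandas as pd

    s = pd.Series([1.0, float("nan"), 3.0])
    print(type(s).__name__)  # "LuxSeries": pd.Series now resolves to the Lux subclass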
21 changes: 17 additions & 4 deletions lux/core/series.py
@@ -15,6 +15,7 @@
 import pandas as pd
 import lux
 import warnings
+import traceback


 class LuxSeries(pd.Series):
@@ -56,13 +57,24 @@ def f(*args, **kwargs):
         f._get_axis_number = super(LuxSeries, self)._get_axis_number
         return f

+    def to_pandas(self):
+        import lux.core
+
+        return lux.core.originalSeries(self, copy=False)
+
+    def display_pandas(self):
+        return self.to_pandas()
+
     def __repr__(self):
         from IPython.display import display
         from IPython.display import clear_output
         import ipywidgets as widgets
         from lux.core.frame import LuxDataFrame

         series_repr = super(LuxSeries, self).__repr__()
+        # Default column name 0 causes errors
+        if self.name is None:
+            self.name = " "
         ldf = LuxDataFrame(self)

         try:
@@ -137,12 +149,13 @@ def on_button_clicked(b):

         except (KeyboardInterrupt, SystemExit):
             raise
-        except:
+        except Exception:
             warnings.warn(
                 "\nUnexpected error in rendering Lux widget and recommendations. "
-                "Falling back to Pandas display.\n\n"
-                "Please report this issue on Github: https://github.com/lux-org/lux/issues ",
+                "Falling back to Pandas display.\n"
+                "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n",
                 stacklevel=2,
             )
-            print(series_repr)
+            warnings.warn(traceback.format_exc())
+            display(self.display_pandas())
             return ""
55 changes: 34 additions & 21 deletions lux/executor/PandasExecutor.py
@@ -159,22 +159,26 @@ def execute_aggregate(vis: Vis, isFiltered=True):

         if has_color:
             vis._vis_data = (
-                vis.data.groupby([groupby_attr.attribute, color_attr.attribute])
+                vis.data.groupby([groupby_attr.attribute, color_attr.attribute], dropna=False)
                 .count()
                 .reset_index()
             )
             vis._vis_data = vis.data.rename(columns={"index": "Record"})
             vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, "Record"]]
         else:
-            vis._vis_data = vis.data.groupby(groupby_attr.attribute).count().reset_index()
+            vis._vis_data = (
+                vis.data.groupby(groupby_attr.attribute, dropna=False).count().reset_index()
+            )
             vis._vis_data = vis.data.rename(columns={"index": "Record"})
             vis._vis_data = vis.data[[groupby_attr.attribute, "Record"]]
     else:
         # if color is specified, need to group by groupby_attr and color_attr
         if has_color:
-            groupby_result = vis.data.groupby([groupby_attr.attribute, color_attr.attribute])
+            groupby_result = vis.data.groupby(
+                [groupby_attr.attribute, color_attr.attribute], dropna=False
+            )
         else:
-            groupby_result = vis.data.groupby(groupby_attr.attribute)
+            groupby_result = vis.data.groupby(groupby_attr.attribute, dropna=False)
         groupby_result = groupby_result.agg(agg_func)
         intermediate = groupby_result.reset_index()
         vis._vis_data = intermediate.__finalize__(vis.data)
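Note: dropna=False in DataFrame.groupby requires pandas >= 1.1 and keeps NaN as its own group instead of silently dropping those rows, which is what previously made bars disappear. A minimal sketch with hypothetical data:

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"category": ["A", np.nan, "A"], "value": [1, 2, 3]})
    print(df.groupby("category").count())                # NaN rows silently dropped
    print(df.groupby("category", dropna=False).count())  # NaN kept as its own group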
@@ -225,6 +229,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
             assert (
                 len(list(vis.data[groupby_attr.attribute])) == N_unique_vals
             ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
+        vis._vis_data = vis.data.dropna(subset=[measure_attr.attribute])
         vis._vis_data = vis.data.sort_values(by=groupby_attr.attribute, ascending=True)
         vis._vis_data = vis.data.reset_index()
         vis._vis_data = vis.data.drop(columns="index")
@@ -298,6 +303,16 @@ def apply_filter(df: pd.DataFrame, attribute: str, op: str, val: object) -> pd.DataFrame:
         df: pandas.DataFrame
             Dataframe resulting from the filter operation
         """
+        # Handling NaN filter values
+        if utils.like_nan(val):
+            if op != "=" and op != "!=":
+                warnings.warn("Filter on NaN must be used with equality operations (i.e., `=` or `!=`)")
+            else:
+                if op == "=":
+                    return df[df[attribute].isna()]
+                elif op == "!=":
+                    return df[~df[attribute].isna()]
+        # Applying filter in regular, non-NaN cases
         if op == "=":
             return df[df[attribute] == val]
         elif op == "<":
@@ -380,7 +395,12 @@ def compute_data_type(self, ldf: LuxDataFrame):
             elif str(attr).lower() in temporal_var_list:
                 ldf.data_type_lookup[attr] = "temporal"
             elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
-                ldf.data_type_lookup[attr] = "quantitative"
+                # int columns get coerced into floats if they contain NaN
+                convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
+                if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20:
+                    ldf.data_type_lookup[attr] = "nominal"
+                else:
+                    ldf.data_type_lookup[attr] = "quantitative"
             elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]):
                 # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values
                 if ldf.pre_aggregated:
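A sketch of the coercion check on a hypothetical column: convert_dtypes re-infers a nullable integer dtype for float columns whose non-missing values are all integral, which flags columns that are float only because NaN forced the cast:

    import pandas as pd

    s = pd.Series([1, 2, None])  # dtype float64: NaN forces the int -> float coercion
    s.convert_dtypes().dtype     # Int64, the nullable integer dtype
    pd.api.types.is_integer_dtype(s.convert_dtypes())  # True, so low-cardinality columns become nominal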
@@ -413,24 +433,17 @@ def compute_data_type(self, ldf: LuxDataFrame):
         for attr in ldf.columns:
             if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]):
                 non_datetime_attrs.append(attr)
+        warn_msg = ""
         if len(non_datetime_attrs) == 1:
-            warnings.warn(
-                f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
-                "In order to display visualizations for this attribute accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
-                "Please consider converting this attribute using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
-                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
-                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
-                stacklevel=2,
-            )
+            warn_msg += f"\nLux detects that the attribute '{non_datetime_attrs[0]}' may be temporal.\n"
         elif len(non_datetime_attrs) > 1:
-            warnings.warn(
-                f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
-                "In order to display visualizations for these attributes accurately, temporal attributes should be converted to Pandas Datetime objects.\n\n"
-                "Please consider converting these attributes using the pd.to_datetime function and providing a 'format' parameter to specify datetime format of the attribute.\n"
-                "For example, you can convert the 'month' attribute in a dataset to Datetime type via the following command:\n\n\t df['month'] = pd.to_datetime(df['month'], format='%m')\n\n"
-                "See more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html\n",
-                stacklevel=2,
-            )
+            warn_msg += f"\nLux detects that attributes {non_datetime_attrs} may be temporal.\n"
+        if len(non_datetime_attrs) > 0:
+            warn_msg += "To display visualizations for these attributes accurately, please convert temporal attributes to Pandas Datetime objects using the pd.to_datetime function and provide a 'format' parameter to specify the datetime format of the attribute.\nFor example, you can convert a year-only attribute (e.g., 1998, 1971, 1982) to Datetime type by specifying the `format` as '%Y'.\n\nHere is a starter template that you can use for converting the temporal fields:\n"
+            for attr in non_datetime_attrs:
+                warn_msg += f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='<replace-with-datetime-format>')\n"
+            warn_msg += "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html"
+            warnings.warn(warn_msg, stacklevel=2)

     def _is_datetime_string(self, series):
         if len(series) > 100:
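For instance, the starter template printed by the warning, filled in for a hypothetical year-only column:

    df["year"] = pd.to_datetime(df["year"], format="%Y")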
33 changes: 25 additions & 8 deletions lux/interestingness/interestingness.py
@@ -73,6 +73,7 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int:
     if n_dim == 1 and (n_msr == 0 or n_msr == 1):
         if v_size < 2:
             return -1
+
         if n_filter == 0:
             return unevenness(vis, ldf, measure_lst, dimension_lst)
         elif n_filter == 1:
@@ -184,7 +185,9 @@ def weighted_correlation(x, y, w):
     return weighted_cov(x, y, w) / np.sqrt(weighted_cov(x, x, w) * weighted_cov(y, y, w))


-def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
+def deviation_from_overall(
+    vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str, exclude_nan: bool = True
+) -> int:
     """
     Difference in bar chart/histogram shape from overall chart
     Note: this function assumes that the filtered vis.data is operating on the same range as the unfiltered vis.data.
@@ -197,15 +200,22 @@ def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
         List of filters from the Vis
     msr_attribute : str
         The attribute name of the measure value of the chart
+    exclude_nan: bool
+        Whether to include/exclude NaN values as part of the deviation calculation
     Returns
     -------
     int
         Score describing how different the vis is from the overall vis
     """
     v_filter_size = get_filtered_size(filter_specs, ldf)
-    v_size = len(vis.data)
-    v_filter = vis.data[msr_attribute]
+    if exclude_nan:
+        vdata = vis.data.dropna()
+    else:
+        vdata = vis.data
+    v_size = len(vdata)
+    v_filter = vdata[msr_attribute]
     total = v_filter.sum()
     v_filter = v_filter / total  # normalize by total to get ratio
     if total == 0:
@@ -217,8 +227,11 @@ def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
     # Remove filters, keep only attribute intent
     unfiltered_vis._inferred_intent = utils.get_attrs_specs(vis._inferred_intent)
     lux.config.executor.execute([unfiltered_vis], ldf)
-
-    v = unfiltered_vis.data[msr_attribute]
+    if exclude_nan:
+        uv = unfiltered_vis.data.dropna()
+    else:
+        uv = unfiltered_vis.data
+    v = uv[msr_attribute]
     v = v / v.sum()
     assert len(v) == len(v_filter), "Data for filtered and unfiltered vis have unequal length."
     sig = v_filter_size / v_size  # significance factor
@@ -230,8 +243,8 @@ def deviation_from_overall(vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str) -> int:
         dimList = vis.get_attr_by_data_model("dimension")

         # use Pandas rank function to calculate rank positions for each category
-        v_rank = unfiltered_vis.data.rank()
-        v_filter_rank = vis.data.rank()
+        v_rank = uv.rank()
+        v_filter_rank = vdata.rank()
         # go through and count the number of ranking changes between the filtered and unfiltered data
         numCategories = ldf.cardinality[dimList[0].attribute]
         for r in range(0, numCategories - 1):
@@ -267,12 +280,16 @@ def unevenness(vis: Vis, ldf: LuxDataFrame, measure_lst: list, dimension_lst: list) -> int:
     """
     v = vis.data[measure_lst[0].attribute]
     v = v / v.sum()  # normalize by total to get ratio
+    v = v.fillna(0)  # Some bar values may be NaN
     C = ldf.cardinality[dimension_lst[0].attribute]
     D = (0.9) ** C  # cardinality-based discounting factor
     v_flat = pd.Series([1 / C] * len(v))
     if is_datetime(v):
         v = v.astype("int")
-    return D * euclidean(v, v_flat)
+    try:
+        return D * euclidean(v, v_flat)
+    except (ValueError):
+        return 0.01


 def mutual_information(v_x: list, v_y: list) -> int:
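Context for the try/except: scipy's euclidean raises ValueError when the bar vector and the flat reference vector have different lengths, which can happen when NaN groups make the number of bars disagree with the recorded cardinality. A minimal reproduction (hypothetical values):

    from scipy.spatial.distance import euclidean

    euclidean([0.2, 0.8], [1 / 3, 1 / 3, 1 / 3])  # ValueError: unequal-length vectors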
23 changes: 14 additions & 9 deletions lux/processor/Validator.py
@@ -19,6 +19,7 @@
 from lux.utils.date_utils import is_datetime_series, is_datetime_string
 import warnings
 import lux
+import lux.utils.utils


 class Validator:
@@ -80,15 +81,19 @@ def validate_clause(clause):
             else:
                 warn_msg = f"\n- The input attribute '{clause.attribute}' does not exist in the DataFrame. \n Please check your input intent for typos."
         if clause.value and clause.attribute and clause.filter_op == "=":
-            series = ldf[clause.attribute]
-            if not is_datetime_series(series):
-                if isinstance(clause.value, list):
-                    vals = clause.value
-                else:
-                    vals = [clause.value]
-                for val in vals:
-                    if val not in series.values:
-                        warn_msg = f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame."
+            import math
+
+            # Skip check for NaN filter values
+            if not lux.utils.utils.like_nan(clause.value):
+                series = ldf[clause.attribute]
+                if not is_datetime_series(series):
+                    if isinstance(clause.value, list):
+                        vals = clause.value
+                    else:
+                        vals = [clause.value]
+                    for val in vals:
+                        if val not in series.values:
+                            warn_msg = f"\n- The input value '{val}' does not exist for the attribute '{clause.attribute}' for the DataFrame."
         return warn_msg

     warn_msg = ""
9 changes: 9 additions & 0 deletions lux/utils/utils.py
@@ -89,3 +89,12 @@ def check_if_id_like(df, attribute):
     else:
         # TODO: Could probably add some type of entropy measure (since the binned id fields are usually very even)
         return high_cardinality and (attribute_contain_id or almost_all_vals_unique)
+
+
+def like_nan(val):
+    if isinstance(val, str):
+        return val.lower() == "nan"
+    elif isinstance(val, float) or isinstance(val, int):
+        import math
+
+        return math.isnan(val)
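Example behavior of the new helper (illustrative values):

    like_nan("NaN")         # True, case-insensitive string match
    like_nan(float("nan"))  # True
    like_nan(25)            # False
    like_nan([1, 2])        # None, which is falsy at the call sites above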
3 changes: 2 additions & 1 deletion lux/vislib/altair/LineChart.py
@@ -39,7 +39,8 @@ def initialize_chart(self):
         self.tooltip = False  # tooltip looks weird for line chart
         x_attr = self.vis.get_attr_by_channel("x")[0]
         y_attr = self.vis.get_attr_by_channel("y")[0]
-
+        # Remove NaNs only for line charts (NaN rows offset the axis range)
+        self.data = self.data.dropna(subset=[x_attr.attribute, y_attr.attribute])
         self.code += "import altair as alt\n"
         self.code += "import pandas._libs.tslibs.timestamps\n"
         self.code += "from pandas._libs.tslibs.timestamps import Timestamp\n"
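A sketch of why the drop matters (hypothetical frame): rows with NaN in either channel cannot form a line segment, yet they previously skewed the inferred axis range, so they are removed before the Altair chart is built:

    import pandas as pd

    df = pd.DataFrame(
        {"date": pd.to_datetime(["2020-01-01", None, "2020-01-03"]), "value": [1.0, 2.0, None]}
    )
    df.dropna(subset=["date", "value"])  # keeps only rows usable as line-chart points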
34 changes: 34 additions & 0 deletions tests/test_interestingness.py
@@ -277,3 +277,37 @@ def test_interestingness_0_2_1(global_var):
     df._repr_html_()
     # check that top recommended Generalize graph score is not none
     assert interestingness(df.recommendation["Generalize"][0], df) != None
+
+
+def test_interestingness_deviation_nan():
+    import numpy as np
+
+    dataset = [
+        {"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5},
+        {"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1},
+        {"date": np.nan, "category": "C", "value": 0.2},
+        {"date": np.nan, "category": "B", "value": 0.2},
+        {"date": np.nan, "category": "F", "value": 0.3},
+        {"date": np.nan, "category": "E", "value": 0.3},
+        {"date": np.nan, "category": "D", "value": 0.4},
+        {"date": np.nan, "category": "A", "value": 10.4},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0},
+        {"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1},
+    ]
+    test = pd.DataFrame(dataset)
+    from lux.vis.Vis import Vis
+
+    vis = Vis(["date", "value", "category=A"], test)
+    vis2 = Vis(["date", "value", "category=B"], test)
+    from lux.interestingness.interestingness import interestingness
+
+    smaller_diff_score = interestingness(vis, test)
+    bigger_diff_score = interestingness(vis2, test)
+    assert np.isclose(smaller_diff_score, 0.29, rtol=0.1)
+    assert np.isclose(bigger_diff_score, 0.94, rtol=0.1)
+    assert smaller_diff_score < bigger_diff_score