Skip to content

Commit

Permalink
[WIP] update type inference for string columns (#343)
Browse files Browse the repository at this point in the history
* update type inference for string columns

* #249 example working with histograms on NaN columns, added test

* rewrote is_numeric_nan_column in a more optimized way

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
  • Loading branch information
Moh-Yakoub and dorisjlee committed Apr 10, 2021
1 parent 952d3c5 commit bab48ff
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 14 deletions.
44 changes: 30 additions & 14 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from lux.executor.Executor import Executor
from lux.utils import utils
from lux.utils.date_utils import is_datetime_series
from lux.utils.utils import check_import_lux_widget, check_if_id_like
from lux.utils.utils import check_import_lux_widget, check_if_id_like, is_numeric_nan_column
import warnings
import lux

Expand Down Expand Up @@ -97,7 +97,7 @@ def execute(vislist: VisList, ldf: LuxDataFrame):
if vis.mark == "bar" or vis.mark == "line" or vis.mark == "geographical":
PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed)
elif vis.mark == "histogram":
PandasExecutor.execute_binning(vis)
PandasExecutor.execute_binning(ldf, vis)
elif vis.mark == "scatter":
HBIN_START = 5000
if lux.config.heatmap and len(ldf) > HBIN_START:
Expand Down Expand Up @@ -259,7 +259,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
vis._vis_data = vis._vis_data.drop(columns="index")

@staticmethod
def execute_binning(vis: Vis):
def execute_binning(ldf, vis: Vis):
"""
Binning of data points for generating histograms
Expand All @@ -278,16 +278,22 @@ def execute_binning(vis: Vis):

bin_attribute = list(filter(lambda x: x.bin_size != 0, vis._inferred_intent))[0]
bin_attr = bin_attribute.attribute
if not np.isnan(vis.data[bin_attr]).all():
# np.histogram breaks if array contain NaN
series = vis.data[bin_attr].dropna()
# TODO: binning runs for the name attribute. The name attribute has datatype quantitative, which is wrong.
counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
# TODO: Should vis.data be a LuxDataFrame or a Pandas DataFrame?
binned_result = np.array([bin_start, counts]).T
vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])
series = vis.data[bin_attr]

if series.hasnans:
ldf._message.add_unique(
f"The column <code>{bin_attr}</code> contains missing values, not shown in the displayed histogram.",
priority=100,
)
series = series.dropna()
if pd.api.types.is_object_dtype(series):
series = series.astype("float", errors="ignore")

counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
binned_result = np.array([bin_start, counts]).T
vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])

@staticmethod
def execute_filter(vis: Vis):
Expand Down Expand Up @@ -440,7 +446,17 @@ def compute_data_type(self, ldf: LuxDataFrame):
ldf._data_type[attr] = "id"
# Eliminate this clause because a single NaN value can cause the dtype to be object
elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
if check_if_id_like(ldf, attr):
# Check first if it's castable to float after removing NaN
is_numeric_nan, series = is_numeric_nan_column(ldf[attr])
if is_numeric_nan:
# int columns get coerced into floats if they contain NaN
ldf._data_type[attr] = "quantitative"
# min max was not computed since object type, so recompute here
ldf._min_max[attr] = (
series.min(),
series.max(),
)
elif check_if_id_like(ldf, attr):
ldf._data_type[attr] = "id"
else:
ldf._data_type[attr] = "nominal"
Expand Down
12 changes: 12 additions & 0 deletions lux/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,3 +125,15 @@ def matplotlib_setup(w, h):
ax.spines["right"].set_color("#dddddd")
ax.spines["top"].set_color("#dddddd")
return fig, ax


def is_numeric_nan_column(series):
    """
    Check whether an object-dtype column is numeric once missing values are removed.

    Parameters
    ----------
    series : pandas.Series
        Column to inspect.

    Returns
    -------
    tuple of (bool, pandas.Series)
        ``(True, converted)`` when ``series`` has object dtype and all of its
        non-missing values can be cast to float; ``converted`` is the float
        series with missing values dropped.
        ``(False, series)`` otherwise. A non-object input is returned
        untouched; an object input that cannot be cast is returned with its
        missing values dropped.
    """
    # Only object columns are candidates; every other dtype is reported as-is.
    if series.dtype != object:
        return False, series

    # Missing values are what force an otherwise numeric column to object
    # dtype, so remove them before attempting the cast.
    if series.hasnans:
        series = series.dropna()

    try:
        return True, series.astype("float")
    except (TypeError, ValueError, OverflowError):
        # At least one remaining value is not representable as a float.
        return False, series
1 change: 1 addition & 0 deletions requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,4 @@ Sphinx>=3.0.2
sphinx-rtd-theme>=0.4.3
xlrd
black
lxml
27 changes: 27 additions & 0 deletions tests/test_nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,30 @@ def test_nan_series_occurence():
ldf = pd.DataFrame(nan_series, columns=["col"])
ldf._ipython_display_()
assert ldf.recommendation["Occurrence"][0].mark == "bar"


def test_numeric_with_nan():
    """
    Numeric columns that were read as strings and contain NaN should be
    typed as quantitative, shown as histograms, and trigger a missing-value
    warning only while NaN values are actually present.
    """
    df = pd.read_html(
        "https://archive.ics.uci.edu/ml/datasets.php?format=&task=&att=&area=&numAtt=&numIns=&type=&sort=nameUp&view=table"
    )[5]
    # The first row of the scraped table is used as the column labels,
    # then dropped from the data.
    df.columns = df.loc[0]
    df = df.loc[1:]
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    assert (
        df.data_type["# Instances"] == "quantitative"
    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
    assert (
        df.data_type["# Attributes"] == "quantitative"
    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
    a = df[["# Instances", "# Attributes"]]
    a._ipython_display_()
    assert (
        len(a.recommendation["Distribution"]) == 2
    ), "Testing a numeric columns with NaN, check that histograms are displayed"
    assert "contains missing values" in a._message.to_html(), "Warning message for NaN displayed"
    a = a.dropna()
    a._ipython_display_()
    assert (
        len(a.recommendation["Distribution"]) == 2
    ), "Example where dtype might be off after dropna(), check if histograms are still displayed"
    # The previous check (`"" in ...`) was always true; assert the warning is
    # really absent once the NaN rows have been dropped.
    assert (
        "contains missing values" not in a._message.to_html()
    ), "No warning message for NaN should be displayed"

0 comments on commit bab48ff

Please sign in to comment.