Skip to content

Commit

Permalink
Various bug fixes in test and data type inference code (#349)
Browse files Browse the repository at this point in the history
* remove expensive convert_dtypes call in float inference
* bring back cardinality calculation on all float columns
* nominal detection now applies even for floats with < 20 rows
  • Loading branch information
dorisjlee committed Apr 8, 2021
1 parent 1a72332 commit 4a04723
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 24 deletions.
16 changes: 4 additions & 12 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,13 +422,8 @@ def compute_data_type(self, ldf: LuxDataFrame):
elif self._is_geographical_attribute(ldf[attr]):
ldf._data_type[attr] = "geographical"
elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
# int columns get coerced into floats if they contain NaN
convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
if (
convertible2int
and ldf.cardinality[attr] != len(ldf)
and (len(ldf[attr].convert_dtypes().unique() < 20))
):

if ldf.cardinality[attr] != len(ldf) and (ldf.cardinality[attr] < 20):
ldf._data_type[attr] = "nominal"
else:
ldf._data_type[attr] = "quantitative"
Expand Down Expand Up @@ -525,11 +520,8 @@ def compute_stats(self, ldf: LuxDataFrame):
else:
attribute_repr = attribute

if ldf.dtypes[attribute] != "float64" or ldf[attribute].isnull().values.any():
ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute])
else:
ldf.cardinality[attribute_repr] = 999 # special value for non-numeric attribute
ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])

if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype(
ldf.dtypes[attribute]
Expand Down
21 changes: 12 additions & 9 deletions tests/test_interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -273,25 +273,28 @@ def test_interestingness_deviation_nan():
import numpy as np

dataset = [
{"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0},
{"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2},
{"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3},
{"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4},
{"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5},
{"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1},
{"date": "2017-08-25", "category": "A", "value": 25.0},
{"date": "2017-08-25", "category": "B", "value": 1.2},
{"date": "2017-08-25", "category": "C", "value": 1.3},
{"date": "2017-08-25", "category": "D", "value": 1.4},
{"date": "2017-08-25", "category": "E", "value": 1.5},
{"date": "2017-08-25", "category": "F", "value": 0.1},
{"date": np.nan, "category": "C", "value": 0.2},
{"date": np.nan, "category": "B", "value": 0.2},
{"date": np.nan, "category": "F", "value": 0.3},
{"date": np.nan, "category": "E", "value": 0.3},
{"date": np.nan, "category": "D", "value": 0.4},
{"date": np.nan, "category": "A", "value": 10.4},
{"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5},
{"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0},
{"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1},
{"date": "2017-07-25", "category": "A", "value": 15.5},
{"date": "2017-07-25", "category": "F", "value": 1.0},
{"date": "2017-07-25", "category": "B", "value": 0.1},
]
test = pd.DataFrame(dataset)
from lux.vis.Vis import Vis

test["date"] = pd.to_datetime(test["date"], format="%Y-%M-%d")
test.set_data_type({"value": "quantitative"})

vis = Vis(["date", "value", "category=A"], test)
vis2 = Vis(["date", "value", "category=B"], test)
from lux.interestingness.interestingness import interestingness
Expand Down
6 changes: 3 additions & 3 deletions tests/test_pandas_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def test_transform(global_var):
df["Year"] = pd.to_datetime(df["Year"], format="%Y")
new_df = df.iloc[:, 1:].groupby("Origin").transform(sum)
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Occurrence"]
assert len(new_df.cardinality) == 7


Expand Down Expand Up @@ -409,7 +409,7 @@ def test_loc(global_var):
assert len(new_df.cardinality) == 6
new_df = df.loc[0:10, "Displacement":"Horsepower"]
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
assert len(new_df.cardinality) == 2
import numpy as np

Expand Down Expand Up @@ -438,7 +438,7 @@ def test_iloc(global_var):
assert len(new_df.cardinality) == 6
new_df = df.iloc[0:11, 3:5]
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
assert len(new_df.cardinality) == 2
import numpy as np

Expand Down

0 comments on commit 4a04723

Please sign in to comment.