From c1944a23892b91529987118fd3d36a9725347c60 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Wed, 6 Jan 2021 12:02:37 +0800 Subject: [PATCH 1/4] bugfix for describe and convert_dtypes --- lux/core/frame.py | 2 +- lux/core/series.py | 16 ++++++++-------- lux/executor/PandasExecutor.py | 18 +++++------------- tests/test_nan.py | 2 ++ tests/test_pandas.py | 14 ++++++++++++++ 5 files changed, 30 insertions(+), 22 deletions(-) diff --git a/lux/core/frame.py b/lux/core/frame.py index e4ed9e3e..8546c168 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -155,7 +155,7 @@ def _set_item(self, key, value): def _infer_structure(self): # If the dataframe is very small and the index column is not a range index, then it is likely that this is an aggregated data is_multi_index_flag = self.index.nlevels != 1 - not_int_index_flag = self.index.dtype != "int64" + not_int_index_flag = not pd.api.types.is_integer_dtype(self.index) small_df_flag = len(self) < 100 self.pre_aggregated = (is_multi_index_flag or not_int_index_flag) and small_df_flag if "Number of Records" in self.columns: diff --git a/lux/core/series.py b/lux/core/series.py index aea13d0c..aebcabbd 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -45,14 +45,14 @@ def _constructor(self): def _constructor_expanddim(self): from lux.core.frame import LuxDataFrame - def f(*args, **kwargs): - df = LuxDataFrame(*args, **kwargs) - for attr in self._metadata: - df.__dict__[attr] = getattr(self, attr, None) - return df - - f._get_axis_number = super(LuxSeries, self)._get_axis_number - return f + # def f(*args, **kwargs): + # df = LuxDataFrame(*args, **kwargs) + # for attr in self._metadata: + # df.__dict__[attr] = getattr(self, attr, None) + # return df + + # f._get_axis_number = super(LuxSeries, self)._get_axis_number + return LuxDataFrame def to_pandas(self): import lux.core diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 9708d8eb..56422866 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -428,9 +428,7 @@ def compute_data_type(self, ldf: LuxDataFrame): ldf.data_type[attr] = "temporal" else: ldf.data_type[attr] = "nominal" - # for attr in list(df.dtypes[df.dtypes=="int64"].keys()): - # if self.cardinality[attr]>50: - if ldf.index.dtype != "int64" and ldf.index.name: + if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name: ldf.data_type[ldf.index.name] = "nominal" non_datetime_attrs = [] @@ -489,21 +487,15 @@ def compute_stats(self, ldf: LuxDataFrame): ldf.unique_values[attribute_repr] = list(ldf[attribute_repr].unique()) ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr]) - # commenting this optimization out to make sure I can filter by cardinality when showing recommended vis - - # if ldf.dtypes[attribute] != "float64":# and not pd.api.types.is_datetime64_ns_dtype(self.dtypes[attribute]): - # ldf.unique_values[attribute_repr] = list(ldf[attribute].unique()) - # ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute]) - # else: - # ldf.cardinality[attribute_repr] = 999 # special value for non-numeric attribute - - if ldf.dtypes[attribute] == "float64" or ldf.dtypes[attribute] == "int64": + if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype( + ldf.dtypes[attribute] + ): ldf._min_max[attribute_repr] = ( ldf[attribute].min(), ldf[attribute].max(), ) - if ldf.index.dtype != "int64": + if not pd.api.types.is_integer_dtype(ldf.index): index_column_name = ldf.index.name ldf.unique_values[index_column_name] = list(ldf.index) ldf.cardinality[index_column_name] = len(ldf.index) diff --git a/tests/test_nan.py b/tests/test_nan.py index b2d28fed..1701215f 100644 --- a/tests/test_nan.py +++ b/tests/test_nan.py @@ -22,11 +22,13 @@ def test_nan_column(global_var): df = pytest.college_df + old_geo = df["Geography"] df["Geography"] = np.nan df._repr_html_() for visList in df.recommendation.keys(): for vis in df.recommendation[visList]: assert vis.get_attr_by_attr_name("Geography") == [] + df["Geography"] = old_geo def test_nan_data_type_detection(): diff --git a/tests/test_pandas.py b/tests/test_pandas.py index b43cc1f9..34f68605 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -44,3 +44,17 @@ def test_head_tail(global_var): "Lux is visualizing the previous version of the dataframe before you applied tail." in df._message.to_html() ) + + +def test_describe(global_var): + df = pytest.college_df + summary = df.describe() + summary._repr_html_() + assert len(summary.recommendation["Column Groups"]) == len(summary.columns) == 10 + + +def test_convert_dtype(global_var): + df = pytest.college_df + cdf = df.convert_dtypes() + cdf._repr_html_() + assert list(cdf.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"] From 5c8b2849d449b1b1e0a7c0b5d57be26c926ae160 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Wed, 6 Jan 2021 12:08:02 +0800 Subject: [PATCH 2/4] added back metadata series test --- tests/test_pandas.py | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 34f68605..e4935fde 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -16,16 +16,17 @@ import pytest import pandas as pd -# def test_df_to_series(): -# # Ensure metadata is kept when going from df to series -# df = pd.read_csv("lux/data/car.csv") -# df._repr_html_() # compute metadata -# assert df.cardinality is not None -# series = df["Weight"] -# assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries." -# assert df["Weight"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series." -# assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." -# assert series.name == "Weight", "Pandas Series original `name` property not retained." +def test_df_to_series(): + # Ensure metadata is kept when going from df to series + df = pd.read_csv("lux/data/car.csv") + df._repr_html_() # compute metadata + assert df.cardinality is not None + series = df["Weight"] + assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries." + print (df["Weight"]._metadata) + assert df["Weight"]._metadata == ['_intent', 'data_type', 'unique_values', 'cardinality', '_rec_info', '_pandas_only', '_min_max', 'plot_config', '_current_vis', '_widget', '_recommendation', '_prev', '_history', '_saved_export', 'name'], "Metadata is lost when going from Dataframe to Series." + assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." + assert series.name == "Weight", "Pandas Series original `name` property not retained." def test_head_tail(global_var): From 49daeecbdc4b09b7e0013b6a9b940d7bb303e716 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Wed, 6 Jan 2021 12:17:39 +0800 Subject: [PATCH 3/4] black --- tests/test_pandas.py | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/tests/test_pandas.py b/tests/test_pandas.py index e4935fde..4b38ae1a 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -16,15 +16,32 @@ import pytest import pandas as pd + def test_df_to_series(): # Ensure metadata is kept when going from df to series df = pd.read_csv("lux/data/car.csv") - df._repr_html_() # compute metadata + df._repr_html_() # compute metadata assert df.cardinality is not None series = df["Weight"] - assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries." - print (df["Weight"]._metadata) - assert df["Weight"]._metadata == ['_intent', 'data_type', 'unique_values', 'cardinality', '_rec_info', '_pandas_only', '_min_max', 'plot_config', '_current_vis', '_widget', '_recommendation', '_prev', '_history', '_saved_export', 'name'], "Metadata is lost when going from Dataframe to Series." + assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." + print(df["Weight"]._metadata) + assert df["Weight"]._metadata == [ + "_intent", + "data_type", + "unique_values", + "cardinality", + "_rec_info", + "_pandas_only", + "_min_max", + "plot_config", + "_current_vis", + "_widget", + "_recommendation", + "_prev", + "_history", + "_saved_export", + "name", + ], "Metadata is lost when going from Dataframe to Series." assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." assert series.name == "Weight", "Pandas Series original `name` property not retained." From 801b469fe5e375916f64a9abf858f0f35ff24fde Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Wed, 6 Jan 2021 15:58:42 +0800 Subject: [PATCH 4/4] default to pandas display when df.dtypes printed --- lux/core/series.py | 4 +++- tests/test_pandas.py | 29 ------------------------ tests/test_series.py | 53 ++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 30 deletions(-) create mode 100644 tests/test_series.py diff --git a/lux/core/series.py b/lux/core/series.py index aebcabbd..1e3c4f8c 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -16,6 +16,7 @@ import lux import warnings import traceback +import numpy as np class LuxSeries(pd.Series): @@ -75,7 +76,8 @@ def __repr__(self): ldf = LuxDataFrame(self) try: - if ldf._pandas_only: + is_dtype_series = all(isinstance(val, np.dtype) for val in self.values) + if ldf._pandas_only or is_dtype_series: print(series_repr) ldf._pandas_only = False else: diff --git a/tests/test_pandas.py b/tests/test_pandas.py index 4b38ae1a..26cd7333 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -17,35 +17,6 @@ import pandas as pd -def test_df_to_series(): - # Ensure metadata is kept when going from df to series - df = pd.read_csv("lux/data/car.csv") - df._repr_html_() # compute metadata - assert df.cardinality is not None - series = df["Weight"] - assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." - print(df["Weight"]._metadata) - assert df["Weight"]._metadata == [ - "_intent", - "data_type", - "unique_values", - "cardinality", - "_rec_info", - "_pandas_only", - "_min_max", - "plot_config", - "_current_vis", - "_widget", - "_recommendation", - "_prev", - "_history", - "_saved_export", - "name", - ], "Metadata is lost when going from Dataframe to Series." - assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." - assert series.name == "Weight", "Pandas Series original `name` property not retained." - - def test_head_tail(global_var): df = pytest.car_df df._repr_html_() diff --git a/tests/test_series.py b/tests/test_series.py new file mode 100644 index 00000000..62a4697f --- /dev/null +++ b/tests/test_series.py @@ -0,0 +1,53 @@ +# Copyright 2019-2020 The Lux Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .context import lux +import pytest +import pandas as pd +import warnings + + +def test_df_to_series(): + # Ensure metadata is kept when going from df to series + df = pd.read_csv("lux/data/car.csv") + df._repr_html_() # compute metadata + assert df.cardinality is not None + series = df["Weight"] + assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." + print(df["Weight"]._metadata) + assert df["Weight"]._metadata == [ + "_intent", + "data_type", + "unique_values", + "cardinality", + "_rec_info", + "_pandas_only", + "_min_max", + "plot_config", + "_current_vis", + "_widget", + "_recommendation", + "_prev", + "_history", + "_saved_export", + "name", + ], "Metadata is lost when going from Dataframe to Series." + assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." + assert series.name == "Weight", "Pandas Series original `name` property not retained." + + +def test_print_dtypes(global_var): + df = pytest.college_df + with warnings.catch_warnings(record=True) as w: + print(df.dtypes) + assert len(w) == 0, "Warning displayed when printing dtypes"