diff --git a/lux/core/frame.py b/lux/core/frame.py index e4ed9e3e..8546c168 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -155,7 +155,7 @@ def _set_item(self, key, value): def _infer_structure(self): # If the dataframe is very small and the index column is not a range index, then it is likely that this is an aggregated data is_multi_index_flag = self.index.nlevels != 1 - not_int_index_flag = self.index.dtype != "int64" + not_int_index_flag = not pd.api.types.is_integer_dtype(self.index) small_df_flag = len(self) < 100 self.pre_aggregated = (is_multi_index_flag or not_int_index_flag) and small_df_flag if "Number of Records" in self.columns: diff --git a/lux/core/series.py b/lux/core/series.py index aea13d0c..1e3c4f8c 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -16,6 +16,7 @@ import lux import warnings import traceback +import numpy as np class LuxSeries(pd.Series): @@ -45,14 +46,14 @@ def _constructor(self): def _constructor_expanddim(self): from lux.core.frame import LuxDataFrame - def f(*args, **kwargs): - df = LuxDataFrame(*args, **kwargs) - for attr in self._metadata: - df.__dict__[attr] = getattr(self, attr, None) - return df + # def f(*args, **kwargs): + # df = LuxDataFrame(*args, **kwargs) + # for attr in self._metadata: + # df.__dict__[attr] = getattr(self, attr, None) + # return df - f._get_axis_number = super(LuxSeries, self)._get_axis_number - return f + # f._get_axis_number = super(LuxSeries, self)._get_axis_number + return LuxDataFrame def to_pandas(self): import lux.core @@ -75,7 +76,8 @@ def __repr__(self): ldf = LuxDataFrame(self) try: - if ldf._pandas_only: + is_dtype_series = all(isinstance(val, np.dtype) for val in self.values) + if ldf._pandas_only or is_dtype_series: print(series_repr) ldf._pandas_only = False else: diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 9708d8eb..56422866 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -428,9 +428,7 @@ def compute_data_type(self, ldf: LuxDataFrame): ldf.data_type[attr] = "temporal" else: ldf.data_type[attr] = "nominal" - # for attr in list(df.dtypes[df.dtypes=="int64"].keys()): - # if self.cardinality[attr]>50: - if ldf.index.dtype != "int64" and ldf.index.name: + if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name: ldf.data_type[ldf.index.name] = "nominal" non_datetime_attrs = [] @@ -489,21 +487,15 @@ def compute_stats(self, ldf: LuxDataFrame): ldf.unique_values[attribute_repr] = list(ldf[attribute_repr].unique()) ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr]) - # commenting this optimization out to make sure I can filter by cardinality when showing recommended vis - - # if ldf.dtypes[attribute] != "float64":# and not pd.api.types.is_datetime64_ns_dtype(self.dtypes[attribute]): - # ldf.unique_values[attribute_repr] = list(ldf[attribute].unique()) - # ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute]) - # else: - # ldf.cardinality[attribute_repr] = 999 # special value for non-numeric attribute - - if ldf.dtypes[attribute] == "float64" or ldf.dtypes[attribute] == "int64": + if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype( + ldf.dtypes[attribute] + ): ldf._min_max[attribute_repr] = ( ldf[attribute].min(), ldf[attribute].max(), ) - if ldf.index.dtype != "int64": + if not pd.api.types.is_integer_dtype(ldf.index): index_column_name = ldf.index.name ldf.unique_values[index_column_name] = list(ldf.index) ldf.cardinality[index_column_name] = len(ldf.index) diff --git a/tests/test_nan.py b/tests/test_nan.py index b2d28fed..1701215f 100644 --- a/tests/test_nan.py +++ b/tests/test_nan.py @@ -22,11 +22,13 @@ def test_nan_column(global_var): df = pytest.college_df + old_geo = df["Geography"] df["Geography"] = np.nan df._repr_html_() for visList in df.recommendation.keys(): for vis in df.recommendation[visList]: assert vis.get_attr_by_attr_name("Geography") == [] + df["Geography"] = old_geo def test_nan_data_type_detection(): diff --git a/tests/test_pandas.py b/tests/test_pandas.py index b43cc1f9..26cd7333 100644 --- a/tests/test_pandas.py +++ b/tests/test_pandas.py @@ -16,17 +16,6 @@ import pytest import pandas as pd -# def test_df_to_series(): -# # Ensure metadata is kept when going from df to series -# df = pd.read_csv("lux/data/car.csv") -# df._repr_html_() # compute metadata -# assert df.cardinality is not None -# series = df["Weight"] -# assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries." -# assert df["Weight"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series." -# assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." -# assert series.name == "Weight", "Pandas Series original `name` property not retained." - def test_head_tail(global_var): df = pytest.car_df @@ -44,3 +33,17 @@ def test_head_tail(global_var): "Lux is visualizing the previous version of the dataframe before you applied tail." in df._message.to_html() ) + + +def test_describe(global_var): + df = pytest.college_df + summary = df.describe() + summary._repr_html_() + assert len(summary.recommendation["Column Groups"]) == len(summary.columns) == 10 + + +def test_convert_dtype(global_var): + df = pytest.college_df + cdf = df.convert_dtypes() + cdf._repr_html_() + assert list(cdf.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"] diff --git a/tests/test_series.py b/tests/test_series.py new file mode 100644 index 00000000..62a4697f --- /dev/null +++ b/tests/test_series.py @@ -0,0 +1,53 @@ +# Copyright 2019-2020 The Lux Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .context import lux +import pytest +import pandas as pd +import warnings + + +def test_df_to_series(): + # Ensure metadata is kept when going from df to series + df = pd.read_csv("lux/data/car.csv") + df._repr_html_() # compute metadata + assert df.cardinality is not None + series = df["Weight"] + assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." + print(df["Weight"]._metadata) + assert df["Weight"]._metadata == [ + "_intent", + "data_type", + "unique_values", + "cardinality", + "_rec_info", + "_pandas_only", + "_min_max", + "plot_config", + "_current_vis", + "_widget", + "_recommendation", + "_prev", + "_history", + "_saved_export", + "name", + ], "Metadata is lost when going from Dataframe to Series." + assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." + assert series.name == "Weight", "Pandas Series original `name` property not retained." + + +def test_print_dtypes(global_var): + df = pytest.college_df + with warnings.catch_warnings(record=True) as w: + print(df.dtypes) + assert len(w) == 0, "Warning displayed when printing dtypes"