From 6a20c47d3f44c0e9d4151bd379e4953ceb7bf68f Mon Sep 17 00:00:00 2001
From: Kunal Agarwal <32151899+westernguy2@users.noreply.github.com>
Date: Sun, 1 Nov 2020 21:56:56 -0800
Subject: [PATCH] Add LuxSeries Implementation (#122)

* add preliminary groupby fixes

* preliminary LuxSeries implementation

* add tests for new Series implementation

* clean up the added code

* minor code changes

* fix issues with Vis with index

* small fixes

* remove comments

* bugfix column group display empty Vis involving groupby index

* bugfix Cylinders not showing up as bar charts

Co-authored-by: Doris Lee
---
 lux/action/column_group.py     | 14 ++++--
 lux/core/frame.py              | 22 +++++----
 lux/core/series.py             | 19 ++++----
 lux/executor/PandasExecutor.py |  2 +-
 lux/processor/Compiler.py      | 20 ++++----
 lux/utils/utils.py             |  2 +-
 tests/test_maintainence.py     |  6 +++
 tests/test_pandas_coverage.py  | 83 ++++++++++++++++------------------
 8 files changed, 90 insertions(+), 78 deletions(-)

diff --git a/lux/action/column_group.py b/lux/action/column_group.py
index 054acda9..ba1b9e5a 100644
--- a/lux/action/column_group.py
+++ b/lux/action/column_group.py
@@ -30,13 +30,17 @@ def column_group(ldf):
         ldf_flat.columns = ldf_flat.columns.format()
     ldf_flat = ldf_flat.reset_index() #use a single shared ldf_flat so that metadata doesn't need to be computed for every vis
     if (ldf.index.nlevels==1):
-        index_column_name = ldf.index.name
+        if ldf.index.name:
+            index_column_name = ldf.index.name
+        else:
+            index_column_name = "index"
         if isinstance(ldf.columns,pd.DatetimeIndex):
             ldf.columns = ldf.columns.to_native_types()
-        for attribute in ldf.columns:
-            vis = Vis([index_column_name,lux.Clause(str(attribute),aggregation=None)],ldf_flat)
-            collection.append(vis)
-        vlst = VisList(collection)
+        for attribute in ldf.columns:
+            if ldf[attribute].dtype!="object" and (attribute!="index"):
+                vis = Vis([lux.Clause(index_column_name, data_type = "nominal", data_model = "dimension", aggregation=None), lux.Clause(str(attribute), data_type = "quantitative", aggregation=None)])
+                collection.append(vis)
+        vlst = VisList(collection,ldf_flat)
     # Note that we are not computing interestingness score here because we want to preserve the arrangement of the aggregated ldf
     recommendation["collection"] = vlst
 
diff --git a/lux/core/frame.py b/lux/core/frame.py
index 112afb53..bd7cea6a 100644
--- a/lux/core/frame.py
+++ b/lux/core/frame.py
@@ -66,12 +66,15 @@ def __init__(self,*args, **kw):
     @property
     def _constructor(self):
         return LuxDataFrame
-    # @property
-    # def _constructor_sliced(self):
-    #     def f(*args, **kwargs):
-    #         # adapted from https://github.com/pandas-dev/pandas/issues/13208#issuecomment-326556232
-    #         return LuxSeries(*args, **kwargs).__finalize__(self, method='inherit')
-    #     return f
+    @property
+    def _constructor_sliced(self):
+        def f(*args, **kwargs):
+            s = LuxSeries(*args, **kwargs)
+            for attr in self._metadata: #propagate metadata
+                s.__dict__[attr] = getattr(self, attr, None)
+            return s
+        return f
+
     @property
     def history(self):
         return self._history
@@ -385,7 +388,7 @@ def maintain_recs(self):
            id_fields_str = id_fields_str[:-2]
            rec_df._message.add(f"{id_fields_str} is not visualized since it resembles an ID field.")
        rec_df._prev = None # reset _prev
-        
+
        if (not hasattr(rec_df,"_recs_fresh") or not rec_df._recs_fresh ): # Check that recs has not yet been computed
            rec_infolist = []
            from lux.action.custom import custom
@@ -400,10 +403,9 @@ def maintain_recs(self):
            if (rec_df.pre_aggregated):
                if (rec_df.columns.name is not None):
                    rec_df._append_rec(rec_infolist, row_group(rec_df))
-               if (rec_df.index.name is not None):
-                   rec_df._append_rec(rec_infolist, column_group(rec_df))
+               rec_df._append_rec(rec_infolist, column_group(rec_df))
            else:
-               if self.recommendation == {}:
+               if rec_df.recommendation == {}:
                    # display conditions for default actions
                    no_vis = lambda ldf: (ldf.current_vis is None) or (ldf.current_vis is not None and len(ldf.current_vis) == 0)
                    one_current_vis = lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) == 1
diff --git a/lux/core/series.py b/lux/core/series.py
index e0179362..ecfd9aa2 100644
--- a/lux/core/series.py
+++ b/lux/core/series.py
@@ -14,11 +14,12 @@
 import pandas as pd
 
 class LuxSeries(pd.Series):
-    # _metadata = ['name','_intent','data_type_lookup','data_type',
-    #     'data_model_lookup','data_model','unique_values','cardinality',
-    #     'min_max','plot_config', '_current_vis','_widget', '_recommendation']
+    _metadata = ['_intent','data_type_lookup','data_type',
+        'data_model_lookup','data_model','unique_values','cardinality','_rec_info', '_pandas_only',
+        '_min_max','plot_config', '_current_vis','_widget', '_recommendation','_prev','_history', '_saved_export']
     def __init__(self,*args, **kw):
         super(LuxSeries, self).__init__(*args, **kw)
+
     @property
     def _constructor(self):
         return LuxSeries
@@ -26,8 +27,10 @@ def _constructor(self):
     @property
     def _constructor_expanddim(self):
         from lux.core.frame import LuxDataFrame
-        # def f(*args, **kwargs):
-        #     # adapted from https://github.com/pandas-dev/pandas/issues/13208#issuecomment-326556232
-        #     return LuxDataFrame(*args, **kwargs).__finalize__(self, method='inherit')
-        # return f
-        return LuxDataFrame
\ No newline at end of file
+        def f(*args, **kwargs):
+            df = LuxDataFrame(*args, **kwargs)
+            for attr in self._metadata:
+                df.__dict__[attr] = getattr(self, attr, None)
+            return df
+        f._get_axis_number = super(LuxSeries, self)._get_axis_number
+        return f
\ No newline at end of file
diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py
index 6924886d..a8d17d36 100644
--- a/lux/executor/PandasExecutor.py
+++ b/lux/executor/PandasExecutor.py
@@ -293,7 +293,7 @@ def execute_2D_binning(vis: Vis):
             result = result.dropna()
         else:
             groups = vis._vis_data.groupby(['xBin','yBin'])[x_attr.attribute]
-            result = groups.agg("count").reset_index() # .agg in this line throws SettingWithCopyWarning
+            result = groups.agg("count").reset_index(name=x_attr.attribute) # .agg in this line throws SettingWithCopyWarning
         result = result.rename(columns={x_attr.attribute:"count"})
         result = result[result["count"]!=0]
 
diff --git a/lux/processor/Compiler.py b/lux/processor/Compiler.py
index 8b513c24..17edb97a 100644
--- a/lux/processor/Compiler.py
+++ b/lux/processor/Compiler.py
@@ -144,12 +144,12 @@ def populate_data_type_model(ldf, vis_collection) -> VisList:
             clause.description = ""
             # TODO: Note that "and not is_datetime_string(clause.attribute))" is a temporary hack and breaks the `test_row_column_group` example
             if (clause.attribute!="" and clause.attribute!="Record"):# and not is_datetime_string(clause.attribute):
-                # if (clause.data_type == ""):
-                clause.data_type = ldf.data_type_lookup[clause.attribute]
+                if (clause.data_type == ""):
+                    clause.data_type = ldf.data_type_lookup[clause.attribute]
                 if (clause.data_type=="id"):
                     clause.data_type = "nominal"
-                # if (clause.data_model == ""):
-                clause.data_model = ldf.data_model_lookup[clause.attribute]
+                if (clause.data_model == ""):
+                    clause.data_model = ldf.data_model_lookup[clause.attribute]
             if (clause.value!=""):
                 if (vis.title == ""): #If user provided title for Vis, then don't override.
                     if(isinstance(clause.value,np.datetime64)):
@@ -277,11 +277,12 @@ def line_or_bar(ldf, dimension:Clause, measure:Clause):
                     dimension = d1
                     color_attr = d2
             # Colored Bar/Line chart with Count as default measure
-            if (nmsr == 0):
-                vis._inferred_intent.append(count_col)
-            measure = vis.get_attr_by_data_model("measure")[0]
-            vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
-            auto_channel["color"] = color_attr
+            if not ldf.pre_aggregated:
+                if (nmsr == 0 and not ldf.pre_aggregated):
+                    vis._inferred_intent.append(count_col)
+                measure = vis.get_attr_by_data_model("measure")[0]
+                vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
+                auto_channel["color"] = color_attr
         elif (ndim == 0 and nmsr == 2):
             # Scatterplot
             vis._mark = "scatter"
@@ -316,7 +317,6 @@ def line_or_bar(ldf, dimension:Clause, measure:Clause):
         if (auto_channel!={}):
             vis = Compiler.enforce_specified_channel(vis, auto_channel)
             vis._inferred_intent.extend(filters) # add back the preserved filters
-
     @staticmethod
     def enforce_specified_channel(vis: Vis, auto_channel: Dict[str, str]):
         """
diff --git a/lux/utils/utils.py b/lux/utils/utils.py
index 670c37b7..e28d931b 100644
--- a/lux/utils/utils.py
+++ b/lux/utils/utils.py
@@ -54,7 +54,7 @@ def check_if_id_like(df,attribute):
     import re
     # Strong signals
     high_cardinality = df.cardinality[attribute]>500 # so that aggregated reset_index fields don't get misclassified
-    attribute_contain_id = re.search(r'id',attribute) is not None
+    attribute_contain_id = re.search(r'id',str(attribute)) is not None
     almost_all_vals_unique = df.cardinality[attribute] >=0.98* len(df)
     is_string = pd.api.types.is_string_dtype(df[attribute])
     if (is_string):
diff --git a/tests/test_maintainence.py b/tests/test_maintainence.py
index a3e4da99..797d3462 100644
--- a/tests/test_maintainence.py
+++ b/tests/test_maintainence.py
@@ -51,10 +51,16 @@ def test_metadata_new_df_operation():
 def test_metadata_column_group_reset_df():
     df = pd.read_csv("lux/data/car.csv")
     assert not hasattr(df,"_metadata_fresh")
+    df['Year'] = pd.to_datetime(df['Year'], format='%Y')
+    assert hasattr(df,"_metadata_fresh")
     result = df.groupby("Cylinders").mean()
     assert not hasattr(result,"_metadata_fresh")
     result._repr_html_() # Note that this should trigger two compute metadata (one for df, and one for an intermediate df.reset_index used to feed inside created Vis)
     assert result._metadata_fresh==True, "Failed to maintain metadata after display df"
+
+    colgroup_recs = result.recommendation["Column Groups"]
+    assert len(colgroup_recs) == 5
+    for rec in colgroup_recs: assert rec.mark=="bar", "Column Group not displaying bar charts"
 
 def test_recs_inplace_operation():
     df = pd.read_csv("lux/data/car.csv")
diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py
index 88d117d7..c4d47616 100644
--- a/tests/test_pandas_coverage.py
+++ b/tests/test_pandas_coverage.py
@@ -125,15 +125,14 @@ def test_cut():
     df = pd.read_csv("lux/data/car.csv")
     df["Weight"] = pd.cut(df["Weight"], bins = [0, 2500, 7500, 10000], labels = ["small", "medium", "large"])
     df._repr_html_()
 
-# def test_groupby_agg_very_small():
+def test_groupby_agg_very_small():
 
-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df["Year"] = pd.to_datetime(df["Year"], format='%Y')
-#     new_df = df.groupby("Origin").agg(sum).reset_index()
-#     new_df._repr_html_()
-#     assert list(new_df.recommendation.keys() ) == ['Column Groups']
-#     assert len(new_df.cardinality) == 7
+    df = pd.read_csv("lux/data/car.csv")
+    df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+    new_df = df.groupby("Origin").agg(sum).reset_index()
+    new_df._repr_html_()
+    assert list(new_df.recommendation.keys() ) == ['Column Groups']
+    assert len(new_df.cardinality) == 7
 # def test_groupby_multi_index():
 #     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
@@ -355,39 +354,37 @@ def compare_vis(vis1, vis2):
 ################
 # Series Tests #
 ################
 
-# TODO: These will all fail right now since LuxSeries isn't implemented yet
-# def test_df_to_series():
-#     # Ensure metadata is kept when going from df to series
-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df._repr_html_() # compute metadata
-#     assert df.cardinality is not None
-#     series = df["Weight"]
-#     assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
-#     assert df["Weight"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', 'plot_config', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series."
-#     assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
-#     assert series.name == "Weight", "Pandas Series original `name` property not retained."
+def test_df_to_series():
+    # Ensure metadata is kept when going from df to series
+    df = pd.read_csv("lux/data/car.csv")
+    df._repr_html_() # compute metadata
+    assert df.cardinality is not None
+    series = df["Weight"]
+    assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
+    df["Weight"]._metadata
+    assert df["Weight"]._metadata == ['_intent','data_type_lookup','data_type','data_model_lookup','data_model','unique_values','cardinality','_rec_info','_pandas_only','_min_max','plot_config','_current_vis','_widget','_recommendation','_prev','_history','_saved_export'], "Metadata is lost when going from Dataframe to Series."
+    assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
+    assert series.name == "Weight", "Pandas Series original `name` property not retained."
 
-# def test_value_counts():
-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df._repr_html_() # compute metadata
-#     assert df.cardinality is not None
-#     series = df["Weight"]
-#     series.value_counts()
-#     assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
-#     assert df["Weight"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', 'plot_config', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series."
-#     assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
-#     assert series.name == "Weight", "Pandas Series original `name` property not retained."
+def test_value_counts():
+    df = pd.read_csv("lux/data/car.csv")
+    df._repr_html_() # compute metadata
+    assert df.cardinality is not None
+    series = df["Weight"]
+    series.value_counts()
+    assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
+    assert df["Weight"]._metadata == ['_intent','data_type_lookup','data_type','data_model_lookup','data_model','unique_values','cardinality','_rec_info','_pandas_only','_min_max','plot_config','_current_vis','_widget','_recommendation','_prev','_history','_saved_export'], "Metadata is lost when going from Dataframe to Series."
+    assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
+    assert series.name == "Weight", "Pandas Series original `name` property not retained."
 
-# def test_str_replace():
-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df._repr_html_() # compute metadata
-#     assert df.cardinality is not None
-#     series = df["Brand"].str.replace("chevrolet", "chevy")
-#     assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
-#     assert df["Brand"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', 'plot_config', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series."
-#     assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
-#     assert series.name == "Brand", "Pandas Series original `name` property not retained."
+def test_str_replace():
+    url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+    df = pd.read_csv(url)
+    df._repr_html_() # compute metadata
+    assert df.cardinality is not None
+    series = df["Brand"].str.replace("chevrolet", "chevy")
+    assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
+    assert df["Brand"]._metadata == ['_intent','data_type_lookup','data_type','data_model_lookup','data_model','unique_values','cardinality','_rec_info','_pandas_only','_min_max','plot_config','_current_vis','_widget','_recommendation','_prev','_history','_saved_export'], "Metadata is lost when going from Dataframe to Series."
+    assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
+    assert series.name == "Brand", "Pandas Series original `name` property not retained."