Add LuxSeries Implementation (#122)
* add preliminary groupby fixes

* preliminary LuxSeries implementation

* add tests for new Series implementation

* clean up the added code

* minor code changes

* fix issues with Vis with index

* small fixes

* remove comments

* bugfix: column group displaying an empty Vis for visualizations involving a groupby index

* bugfix: Cylinders not showing up as bar charts

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
westernguy2 and dorisjlee committed Nov 2, 2020
1 parent f740e89 commit 6a20c47
Showing 8 changed files with 90 additions and 78 deletions.
14 changes: 9 additions & 5 deletions lux/action/column_group.py
@@ -30,13 +30,17 @@ def column_group(ldf):
         ldf_flat.columns = ldf_flat.columns.format()
     ldf_flat = ldf_flat.reset_index() #use a single shared ldf_flat so that metadata doesn't need to be computed for every vis
     if (ldf.index.nlevels==1):
-        index_column_name = ldf.index.name
+        if ldf.index.name:
+            index_column_name = ldf.index.name
+        else:
+            index_column_name = "index"
         if isinstance(ldf.columns,pd.DatetimeIndex):
             ldf.columns = ldf.columns.to_native_types()
-        for attribute in ldf.columns:
-            vis = Vis([index_column_name,lux.Clause(str(attribute),aggregation=None)],ldf_flat)
-            collection.append(vis)
-        vlst = VisList(collection)
+        for attribute in ldf.columns:
+            if ldf[attribute].dtype!="object" and (attribute!="index"):
+                vis = Vis([lux.Clause(index_column_name, data_type = "nominal", data_model = "dimension", aggregation=None), lux.Clause(str(attribute), data_type = "quantitative", aggregation=None)])
+                collection.append(vis)
+        vlst = VisList(collection,ldf_flat)
     # Note that we are not computing interestingness score here because we want to preserve the arrangement of the aggregated ldf

     recommendation["collection"] = vlst
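
A minimal usage sketch of the behavior this change targets (illustrative, not part of the diff; it mirrors the updated test in tests/test_maintainence.py below): a groupby result should now surface Column Groups rendered as bar charts keyed on the groupby index, skipping object-dtype columns and the placeholder "index" column.

import pandas as pd
import lux  # assumes lux is installed and the bundled car.csv sample is available at this path

df = pd.read_csv("lux/data/car.csv")
grouped = df.groupby("Cylinders").mean()    # pre-aggregated frame with "Cylinders" as its index
grouped._repr_html_()                       # triggers recommendation computation
for vis in grouped.recommendation["Column Groups"]:
    assert vis.mark == "bar"                # each numeric column is plotted against the index
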
22 changes: 12 additions & 10 deletions lux/core/frame.py
@@ -66,12 +66,15 @@ def __init__(self,*args, **kw):
     @property
     def _constructor(self):
         return LuxDataFrame
-    # @property
-    # def _constructor_sliced(self):
-    #     def f(*args, **kwargs):
-    #         # adapted from https://github.com/pandas-dev/pandas/issues/13208#issuecomment-326556232
-    #         return LuxSeries(*args, **kwargs).__finalize__(self, method='inherit')
-    #     return f
+    @property
+    def _constructor_sliced(self):
+        def f(*args, **kwargs):
+            s = LuxSeries(*args, **kwargs)
+            for attr in self._metadata: #propagate metadata
+                s.__dict__[attr] = getattr(self, attr, None)
+            return s
+        return f
+
     @property
     def history(self):
         return self._history
@@ -385,7 +388,7 @@ def maintain_recs(self):
             id_fields_str = id_fields_str[:-2]
             rec_df._message.add(f"{id_fields_str} is not visualized since it resembles an ID field.")
         rec_df._prev = None # reset _prev
-
+
         if (not hasattr(rec_df,"_recs_fresh") or not rec_df._recs_fresh ): # Check that recs has not yet been computed
             rec_infolist = []
             from lux.action.custom import custom
@@ -400,10 +403,9 @@ def maintain_recs(self):
             if (rec_df.pre_aggregated):
                 if (rec_df.columns.name is not None):
                     rec_df._append_rec(rec_infolist, row_group(rec_df))
-                if (rec_df.index.name is not None):
-                    rec_df._append_rec(rec_infolist, column_group(rec_df))
+                rec_df._append_rec(rec_infolist, column_group(rec_df))
             else:
-                if self.recommendation == {}:
+                if rec_df.recommendation == {}:
                     # display conditions for default actions
                     no_vis = lambda ldf: (ldf.current_vis is None) or (ldf.current_vis is not None and len(ldf.current_vis) == 0)
                     one_current_vis = lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) == 1
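
For context, a simplified sketch of the pandas subclassing hook used above (illustrative only: MiniFrame and MiniSeries are made-up names, and the exact slicing internals differ between pandas versions): _constructor_sliced returns a factory that copies the parent frame's _metadata attributes onto every Series sliced out of it.

import pandas as pd

class MiniSeries(pd.Series):
    _metadata = ["_intent"]

    @property
    def _constructor(self):
        return MiniSeries

class MiniFrame(pd.DataFrame):
    _metadata = ["_intent"]

    @property
    def _constructor(self):
        return MiniFrame

    @property
    def _constructor_sliced(self):
        def f(*args, **kwargs):
            s = MiniSeries(*args, **kwargs)
            for attr in self._metadata:   # propagate metadata to the sliced Series
                s.__dict__[attr] = getattr(self, attr, None)
            return s
        return f

df = MiniFrame({"a": [1, 2, 3]})
df._intent = ["a"]
assert df["a"]._intent == ["a"]           # the slice carries the frame's metadata
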
19 changes: 11 additions & 8 deletions lux/core/series.py
@@ -14,20 +14,23 @@
 import pandas as pd
 class LuxSeries(pd.Series):
-    # _metadata = ['name','_intent','data_type_lookup','data_type',
-    #     'data_model_lookup','data_model','unique_values','cardinality',
-    #     'min_max','plot_config', '_current_vis','_widget', '_recommendation']
+    _metadata = ['_intent','data_type_lookup','data_type',
+        'data_model_lookup','data_model','unique_values','cardinality','_rec_info', '_pandas_only',
+        '_min_max','plot_config', '_current_vis','_widget', '_recommendation','_prev','_history', '_saved_export']
     def __init__(self,*args, **kw):
         super(LuxSeries, self).__init__(*args, **kw)

     @property
     def _constructor(self):
         return LuxSeries

     @property
     def _constructor_expanddim(self):
         from lux.core.frame import LuxDataFrame
-        # def f(*args, **kwargs):
-        #     # adapted from https://github.com/pandas-dev/pandas/issues/13208#issuecomment-326556232
-        #     return LuxDataFrame(*args, **kwargs).__finalize__(self, method='inherit')
-        # return f
-        return LuxDataFrame
+        def f(*args, **kwargs):
+            df = LuxDataFrame(*args, **kwargs)
+            for attr in self._metadata:
+                df.__dict__[attr] = getattr(self, attr, None)
+            return df
+        f._get_axis_number = super(LuxSeries, self)._get_axis_number
+        return f
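
A quick usage sketch of what the two constructor hooks provide (it mirrors the un-commented Series tests in tests/test_pandas_coverage.py below; the .to_frame() lines are an extra illustration of _constructor_expanddim and are not asserted by this commit's tests):

import pandas as pd
import lux

df = pd.read_csv("lux/data/car.csv")
df._repr_html_()                                   # compute metadata on the frame
series = df["Weight"]                              # built via LuxDataFrame._constructor_sliced
assert isinstance(series, lux.core.series.LuxSeries)
assert series.name == "Weight"                     # the pandas `name` property is retained
frame_again = series.to_frame()                    # goes through LuxSeries._constructor_expanddim
assert isinstance(frame_again, lux.core.frame.LuxDataFrame)
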
2 changes: 1 addition & 1 deletion lux/executor/PandasExecutor.py
@@ -293,7 +293,7 @@ def execute_2D_binning(vis: Vis):
             result = result.dropna()
         else:
             groups = vis._vis_data.groupby(['xBin','yBin'])[x_attr.attribute]
-            result = groups.agg("count").reset_index() # .agg in this line throws SettingWithCopyWarning
+            result = groups.agg("count").reset_index(name=x_attr.attribute) # .agg in this line throws SettingWithCopyWarning
         result = result.rename(columns={x_attr.attribute:"count"})
         result = result[result["count"]!=0]

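
An illustrative, standalone sketch of the aggregation pattern touched above (toy data, not taken from the commit): counting a grouped Series and materializing the count under an explicit column name before renaming it.

import pandas as pd

toy = pd.DataFrame({"xBin": [0, 0, 1], "yBin": [0, 1, 1], "Horsepower": [130, 165, 150]})
groups = toy.groupby(["xBin", "yBin"])["Horsepower"]
result = groups.agg("count").reset_index(name="Horsepower")   # name= pins the label of the value column
result = result.rename(columns={"Horsepower": "count"})
result = result[result["count"] != 0]
print(result)
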
20 changes: 10 additions & 10 deletions lux/processor/Compiler.py
@@ -144,12 +144,12 @@ def populate_data_type_model(ldf, vis_collection) -> VisList:
                clause.description = ""
            # TODO: Note that "and not is_datetime_string(clause.attribute))" is a temporary hack and breaks the `test_row_column_group` example
            if (clause.attribute!="" and clause.attribute!="Record"):# and not is_datetime_string(clause.attribute):
-               # if (clause.data_type == ""):
-               clause.data_type = ldf.data_type_lookup[clause.attribute]
+               if (clause.data_type == ""):
+                   clause.data_type = ldf.data_type_lookup[clause.attribute]
                if (clause.data_type=="id"):
                    clause.data_type = "nominal"
-               # if (clause.data_model == ""):
-               clause.data_model = ldf.data_model_lookup[clause.attribute]
+               if (clause.data_model == ""):
+                   clause.data_model = ldf.data_model_lookup[clause.attribute]
            if (clause.value!=""):
                if (vis.title == ""): #If user provided title for Vis, then don't override.
                    if(isinstance(clause.value,np.datetime64)):
@@ -277,11 +277,12 @@ def line_or_bar(ldf, dimension:Clause, measure:Clause):
                    dimension = d1
                    color_attr = d2
                # Colored Bar/Line chart with Count as default measure
-               if (nmsr == 0):
-                   vis._inferred_intent.append(count_col)
-               measure = vis.get_attr_by_data_model("measure")[0]
-               vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
-               auto_channel["color"] = color_attr
+               if not ldf.pre_aggregated:
+                   if (nmsr == 0 and not ldf.pre_aggregated):
+                       vis._inferred_intent.append(count_col)
+                   measure = vis.get_attr_by_data_model("measure")[0]
+                   vis._mark, auto_channel = line_or_bar(ldf, dimension, measure)
+                   auto_channel["color"] = color_attr
            elif (ndim == 0 and nmsr == 2):
                # Scatterplot
                vis._mark = "scatter"
@@ -316,7 +317,6 @@ def line_or_bar(ldf, dimension:Clause, measure:Clause):
        if (auto_channel!={}):
            vis = Compiler.enforce_specified_channel(vis, auto_channel)
        vis._inferred_intent.extend(filters) # add back the preserved filters
-
    @staticmethod
    def enforce_specified_channel(vis: Vis, auto_channel: Dict[str, str]):
        """
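
A brief usage sketch of what the Compiler change enables (illustrative; it mirrors the Vis construction added in lux/action/column_group.py above): clauses that arrive with an explicit data_type or data_model are now respected instead of being overwritten by the lookup tables.

import pandas as pd
import lux
from lux.vis.Vis import Vis

df = pd.read_csv("lux/data/car.csv")
flat = df.groupby("Cylinders")[["Horsepower", "Weight"]].mean().reset_index()  # index promoted back to a column
vis = Vis([lux.Clause("Cylinders", data_type="nominal", data_model="dimension", aggregation=None),
           lux.Clause("Horsepower", data_type="quantitative", aggregation=None)],
          flat)  # the pre-set types survive Compiler.populate_data_type_model
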
2 changes: 1 addition & 1 deletion lux/utils/utils.py
@@ -54,7 +54,7 @@ def check_if_id_like(df,attribute):
     import re
     # Strong signals
     high_cardinality = df.cardinality[attribute]>500 # so that aggregated reset_index fields don't get misclassified
-    attribute_contain_id = re.search(r'id',attribute) is not None
+    attribute_contain_id = re.search(r'id',str(attribute)) is not None
     almost_all_vals_unique = df.cardinality[attribute] >=0.98* len(df)
     is_string = pd.api.types.is_string_dtype(df[attribute])
     if (is_string):
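
The one-line change above matters because column labels are not always strings (e.g., integer or Timestamp labels coming out of reset_index on an aggregated frame), and re.search raises a TypeError on non-string input. A tiny illustration with a hypothetical label:

import re

attribute = 2020                                              # a non-string column label
contains_id = re.search(r"id", str(attribute)) is not None    # str() avoids "expected string or bytes-like object"
assert contains_id is False
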
6 changes: 6 additions & 0 deletions tests/test_maintainence.py
@@ -51,10 +51,16 @@ def test_metadata_new_df_operation():
 def test_metadata_column_group_reset_df():
     df = pd.read_csv("lux/data/car.csv")
     assert not hasattr(df,"_metadata_fresh")
+    df['Year'] = pd.to_datetime(df['Year'], format='%Y')
+    assert hasattr(df,"_metadata_fresh")
     result = df.groupby("Cylinders").mean()
     assert not hasattr(result,"_metadata_fresh")
     result._repr_html_() # Note that this should trigger two compute metadata (one for df, and one for an intermediate df.reset_index used to feed inside created Vis)
     assert result._metadata_fresh==True, "Failed to maintain metadata after display df"
+
+    colgroup_recs = result.recommendation["Column Groups"]
+    assert len(colgroup_recs) == 5
+    for rec in colgroup_recs: assert rec.mark=="bar", "Column Group not displaying bar charts"

 def test_recs_inplace_operation():
     df = pd.read_csv("lux/data/car.csv")
83 changes: 40 additions & 43 deletions tests/test_pandas_coverage.py
@@ -125,15 +125,14 @@ def test_cut():
     df = pd.read_csv("lux/data/car.csv")
     df["Weight"] = pd.cut(df["Weight"], bins = [0, 2500, 7500, 10000], labels = ["small", "medium", "large"])
     df._repr_html_()
-# def test_groupby_agg_very_small():
+def test_groupby_agg_very_small():

-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df["Year"] = pd.to_datetime(df["Year"], format='%Y')
-#     new_df = df.groupby("Origin").agg(sum).reset_index()
-#     new_df._repr_html_()
-#     assert list(new_df.recommendation.keys() ) == ['Column Groups']
-#     assert len(new_df.cardinality) == 7
+    df = pd.read_csv("lux/data/car.csv")
+    df["Year"] = pd.to_datetime(df["Year"], format='%Y')
+    new_df = df.groupby("Origin").agg(sum).reset_index()
+    new_df._repr_html_()
+    assert list(new_df.recommendation.keys() ) == ['Column Groups']
+    assert len(new_df.cardinality) == 7

 # def test_groupby_multi_index():
 #     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
@@ -355,39 +354,37 @@ def compare_vis(vis1, vis2):
 # Series Tests #
 ################

-# TODO: These will all fail right now since LuxSeries isn't implemented yet
-# def test_df_to_series():
-#     # Ensure metadata is kept when going from df to series
-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df._repr_html_() # compute metadata
-#     assert df.cardinality is not None
-#     series = df["Weight"]
-#     assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
-#     assert df["Weight"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', 'plot_config', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series."
-#     assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
-#     assert series.name == "Weight", "Pandas Series original `name` property not retained."
-
-# def test_value_counts():
-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df._repr_html_() # compute metadata
-#     assert df.cardinality is not None
-#     series = df["Weight"]
-#     series.value_counts()
-#     assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
-#     assert df["Weight"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', 'plot_config', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series."
-#     assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
-#     assert series.name == "Weight", "Pandas Series original `name` property not retained."
-
-# def test_str_replace():
-#     url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
-#     df = pd.read_csv(url)
-#     df._repr_html_() # compute metadata
-#     assert df.cardinality is not None
-#     series = df["Brand"].str.replace("chevrolet", "chevy")
-#     assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
-#     assert df["Brand"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', 'plot_config', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series."
-#     assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
-#     assert series.name == "Brand", "Pandas Series original `name` property not retained."
+def test_df_to_series():
+    # Ensure metadata is kept when going from df to series
+    df = pd.read_csv("lux/data/car.csv")
+    df._repr_html_() # compute metadata
+    assert df.cardinality is not None
+    series = df["Weight"]
+    assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
+    df["Weight"]._metadata
+    assert df["Weight"]._metadata == ['_intent','data_type_lookup','data_type','data_model_lookup','data_model','unique_values','cardinality','_rec_info','_pandas_only','_min_max','plot_config','_current_vis','_widget','_recommendation','_prev','_history','_saved_export'], "Metadata is lost when going from Dataframe to Series."
+    assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
+    assert series.name == "Weight", "Pandas Series original `name` property not retained."
+
+def test_value_counts():
+    df = pd.read_csv("lux/data/car.csv")
+    df._repr_html_() # compute metadata
+    assert df.cardinality is not None
+    series = df["Weight"]
+    series.value_counts()
+    assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
+    assert df["Weight"]._metadata == ['_intent','data_type_lookup','data_type','data_model_lookup','data_model','unique_values','cardinality','_rec_info','_pandas_only','_min_max','plot_config','_current_vis','_widget','_recommendation','_prev','_history','_saved_export'], "Metadata is lost when going from Dataframe to Series."
+    assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
+    assert series.name == "Weight", "Pandas Series original `name` property not retained."
+
+def test_str_replace():
+    url = 'https://github.com/lux-org/lux-datasets/blob/master/data/cars.csv?raw=true'
+    df = pd.read_csv(url)
+    df._repr_html_() # compute metadata
+    assert df.cardinality is not None
+    series = df["Brand"].str.replace("chevrolet", "chevy")
+    assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries."
+    assert df["Brand"]._metadata == ['_intent','data_type_lookup','data_type','data_model_lookup','data_model','unique_values','cardinality','_rec_info','_pandas_only','_min_max','plot_config','_current_vis','_widget','_recommendation','_prev','_history','_saved_export'], "Metadata is lost when going from Dataframe to Series."
+    assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series."
+    assert series.name == "Brand", "Pandas Series original `name` property not retained."
