Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Interestingness Scoring for Colored Bar and Line charts #59

Merged
merged 7 commits into from
Aug 14, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,11 @@ def execute_aggregate(view: Vis,isFiltered = True):
for col in columns[1:]:
view.data[col] = view.data[col].fillna(0)
assert len(list(view.data[groupby_attr.attribute])) == len(all_attr_vals), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`."
#need to compute the statistics and metadata for the view's data if no new rows were added
else:
if view.data.cardinality is None and has_color:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is there a reason why we need to recompute the metadata here?

view.data.compute_stats()
view.data.compute_dataset_metadata()
view.data = view.data.sort_values(by=groupby_attr.attribute, ascending=True)
view.data = view.data.reset_index()
view.data = view.data.drop(columns="index")
Expand Down
25 changes: 23 additions & 2 deletions lux/interestingness/interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,8 +90,29 @@ def interestingness(vis:Vis ,ldf:LuxDataFrame) -> int:
elif (n_msr == 3):
return 0.1
# colored line and barchart cases
elif ((vis.mark == "line" or vis.mark == "bar") and n_dim == 2):
return 0.2
elif (vis.mark == "line" and n_dim == 2):
return 0.15
elif (vis.mark == "bar" and n_dim == 2):
from scipy.stats import chi2_contingency
measure_column = vis.get_attr_by_data_model("measure")[0].attribute
dimension_columns = vis.get_attr_by_data_model("dimension")

groupby_column = dimension_columns[0].attribute
color_column = dimension_columns[1].attribute

contingency_table = []
groupby_cardinality = vis.data.cardinality[groupby_column]
groupby_unique_vals = vis.data.unique_values[groupby_column]
for c in range(0, groupby_cardinality):
contingency_table.append(vis.data[vis.data[groupby_column] == groupby_unique_vals[c]][measure_column])
score = 0.12
#A ValueError is raised if an entire column of the contingency table is 0, which can happen if an applied filter results in
#a category having no counts
try:
score = min(0.13, chi2_contingency(contingency_table)[0])
except ValueError:
pass
return(score)
# Default
else:
return -1
Expand Down
16 changes: 16 additions & 0 deletions tests/test_interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,22 @@ def test_interestingness_1_1_1():
#check for top recommended Filter graph score is not none
assert interestingness(df.recommendation['Filter'][0],df) != None

def test_interestingness_1_2_0():
    """Check the interestingness score of a Vis with `Name` on y and `Cylinders` on color.

    Builds the Vis from two clauses over the cars dataset, refreshes it
    against the dataframe, and asserts the resulting score.
    """
    from lux.vis.Vis import Clause, Vis
    from lux.interestingness.interestingness import interestingness

    df = pd.read_csv("lux/data/car.csv")
    y_clause = Clause(attribute="Name", channel="y")
    color_clause = Clause(attribute="Cylinders", channel="color")

    new_vis = Vis([y_clause, color_clause])
    new_vis.refresh_source(df)

    # NOTE(review): 0.13 is presumably the upper bound that interestingness()
    # applies to the chi-square score for colored bar charts -- confirm
    # against lux/interestingness/interestingness.py.
    assert interestingness(new_vis, df) == 0.13

def test_interestingness_0_2_0():
df = pd.read_csv("lux/data/car.csv")
df["Year"] = pd.to_datetime(df["Year"], format='%Y')
Expand Down