From 2ea9ed11fd3db87a9d0d37a5c750180d533f5e36 Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Sat, 26 Dec 2020 06:47:34 -0800 Subject: [PATCH 01/28] Similarity as a default action (#182) * similarity formatting fixed * added another similarity test case; fixed bug where colored heatmap dimension is temporal (invalidate all 2 msr 1 temporal case) * filter and similarity together * filter and similarity together * remove filter * black line length * file reorg and clean; change sim metric Co-authored-by: Caitlyn Chen Co-authored-by: Doris Lee --- lux/action/filter.py | 26 ++++++- lux/interestingness/interestingness.py | 19 ++++++ lux/{action => interestingness}/similarity.py | 67 +------------------ lux/processor/Compiler.py | 38 +++++++---- tests/test_action.py | 45 +++++++++++++ tests/test_interestingness.py | 8 +-- 6 files changed, 114 insertions(+), 89 deletions(-) rename lux/{action => interestingness}/similarity.py (66%) diff --git a/lux/action/filter.py b/lux/action/filter.py index af9a495b..70d85e0e 100644 --- a/lux/action/filter.py +++ b/lux/action/filter.py @@ -18,6 +18,7 @@ from lux.vis.VisList import VisList from lux.processor.Compiler import Compiler from lux.utils import utils +from lux.utils.utils import get_filter_specs def filter(ldf): @@ -112,9 +113,28 @@ def get_complementary_ops(fltr_op): new_spec.append(new_filter) temp_vis = Vis(new_spec) output.append(temp_vis) + if ( + ldf.current_vis is not None + and len(ldf.current_vis) == 1 + and ldf.current_vis[0].mark == "line" + and len(get_filter_specs(ldf.intent)) > 0 + ): + recommendation = { + "action": "Similarity", + "description": "Show other charts that are visually similar to the Current vis.", + } + last = get_filter_specs(ldf.intent)[-1] + output = ldf.intent.copy()[0:-1] + # array of possible values for attribute + arr = ldf[last.attribute].unique().tolist() + output.append(lux.Clause(last.attribute, last.attribute, arr)) vlist = lux.vis.VisList.VisList(output, ldf) - for vis in vlist: - 
vis.score = interestingness(vis, ldf) + vlist_copy = lux.vis.VisList.VisList(output, ldf) + for i in range(len(vlist_copy)): + vlist[i].score = interestingness(vlist_copy[i], ldf) vlist = vlist.topK(15) - recommendation["collection"] = vlist + if recommendation["action"] == "Similarity": + recommendation["collection"] = vlist[1:] + else: + recommendation["collection"] = vlist return recommendation diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index 0c94757e..dd9615a6 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -22,6 +22,9 @@ from pandas.api.types import is_datetime64_any_dtype as is_datetime from scipy.spatial.distance import euclidean import lux +from lux.utils.utils import get_filter_specs +from lux.interestingness.similarity import preprocess, euclidean_dist +from lux.vis.VisList import VisList def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: @@ -68,6 +71,22 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: dimension_lst = vis.get_attr_by_data_model("dimension") measure_lst = vis.get_attr_by_data_model("measure") v_size = len(vis.data) + + if ( + n_dim == 1 + and (n_msr == 0 or n_msr == 1) + and ldf.current_vis is not None + and vis.get_attr_by_channel("y")[0].data_type == "quantitative" + and len(ldf.current_vis) == 1 + and ldf.current_vis[0].mark == "line" + and len(get_filter_specs(ldf.intent)) > 0 + ): + query_vc = VisList(ldf.current_vis, ldf) + query_vis = query_vc[0] + preprocess(query_vis) + preprocess(vis) + return 1 - euclidean_dist(query_vis, vis) + # Line/Bar Chart # print("r:", n_record, "m:", n_msr, "d:",n_dim) if n_dim == 1 and (n_msr == 0 or n_msr == 1): diff --git a/lux/action/similarity.py b/lux/interestingness/similarity.py similarity index 66% rename from lux/action/similarity.py rename to lux/interestingness/similarity.py index 174a4d43..8d810909 100644 --- a/lux/action/similarity.py +++ b/lux/interestingness/similarity.py 
@@ -17,70 +17,7 @@ import math import numpy as np from lux.vis.VisList import VisList - - -def similar_pattern(ldf, intent, topK=-1): - """ - Generates visualizations with similar patterns to a query visualization. - - Parameters - ---------- - ldf : lux.core.frame - LuxDataFrame with underspecified intent. - - intent: list[lux.Clause] - intent for specifying the visual query for the similarity search. - - topK: int - number of visual recommendations to return. - - Returns - ------- - recommendations : Dict[str,obj] - object with a collection of visualizations that result from the Similarity action - """ - row_specs = list(filter(lambda x: x.value != "", intent)) - if len(row_specs) == 1: - search_space_vc = VisList(ldf.current_vis.collection.copy(), ldf) - - query_vc = VisList(intent, ldf) - query_vis = query_vc[0] - preprocess(query_vis) - # for loop to create assign euclidean distance - recommendation = { - "action": "Similarity", - "description": "Show other charts that are visually similar to the Current vis.", - } - for vis in search_space_vc: - preprocess(vis) - vis.score = euclidean_dist(query_vis, vis) - search_space_vc.normalize_score(invert_order=True) - if topK != -1: - search_space_vc = search_space_vc.topK(topK) - recommendation["collection"] = search_space_vc - return recommendation - else: - print("Query needs to have 1 row value") - - -def aggregate(vis): - """ - Aggregates data values on the y axis so that the vis is a time series - - Parameters - ---------- - vis : lux.vis.Vis - vis that represents the candidate visualization - Returns - ------- - None - """ - if vis.get_attr_by_channel("x") and vis.get_attr_by_channel("y"): - - xAxis = vis.get_attr_by_channel("x")[0].attribute - yAxis = vis.get_attr_by_channel("y")[0].attribute - - vis.data = vis.data[[xAxis, yAxis]].groupby(xAxis, as_index=False).agg({yAxis: "mean"}).copy() +from lux.utils.utils import get_filter_specs def interpolate(vis, length): @@ -204,6 +141,4 @@ def preprocess(vis): 
------- None """ - aggregate(vis) - interpolate(vis, 100) normalize(vis) diff --git a/lux/processor/Compiler.py b/lux/processor/Compiler.py index b07f52ce..1f155197 100644 --- a/lux/processor/Compiler.py +++ b/lux/processor/Compiler.py @@ -18,6 +18,7 @@ from lux.core.frame import LuxDataFrame from lux.vis.VisList import VisList from lux.utils import date_utils +from lux.utils import utils import pandas as pd import numpy as np import warnings @@ -179,12 +180,25 @@ def populate_data_type_model(ldf, vlist): else: chart_title = clause.value vis.title = f"{clause.attribute} {clause.filter_op} {chart_title}" + vis._ndim = 0 + vis._nmsr = 0 + + for clause in vis._inferred_intent: + if clause.value == "": + if clause.data_model == "dimension": + vis._ndim += 1 + elif clause.data_model == "measure" and clause.attribute != "Record": + vis._nmsr += 1 @staticmethod def remove_all_invalid(vis_collection: VisList) -> VisList: """ Given an expanded vis list, remove all visualizations that are invalid. - Currently, the invalid visualizations are ones that contain two of the same attribute, no more than two temporal attributes, or overlapping attributes (same filter attribute and visualized attribute). 
+ Currently, the invalid visualizations are ones that do not contain: + - two of the same attribute, + - more than two temporal attributes, + - no overlapping attributes (same filter attribute and visualized attribute), + - more than 1 temporal attribute with 2 or more measures Parameters ---------- vis_collection : list[lux.vis.Vis] @@ -203,7 +217,11 @@ def remove_all_invalid(vis_collection: VisList) -> VisList: if clause.data_type == "temporal": num_temporal_specs += 1 all_distinct_specs = 0 == len(vis._inferred_intent) - len(attribute_set) - if num_temporal_specs < 2 and all_distinct_specs: + if ( + num_temporal_specs < 2 + and all_distinct_specs + and not (vis._nmsr == 2 and num_temporal_specs == 1) + ): new_vc.append(vis) # else: # warnings.warn("\nThere is more than one duplicate attribute specified in the intent.\nPlease check your intent specification again.") @@ -235,17 +253,11 @@ def determine_encoding(ldf: LuxDataFrame, vis: Vis): https://doi.org/10.1109/TVCG.2007.70594 """ # Count number of measures and dimensions - ndim = 0 - nmsr = 0 - filters = [] - for clause in vis._inferred_intent: - if clause.value == "": - if clause.data_model == "dimension": - ndim += 1 - elif clause.data_model == "measure" and clause.attribute != "Record": - nmsr += 1 - else: # preserve to add back to _inferred_intent later - filters.append(clause) + ndim = vis._ndim + nmsr = vis._nmsr + # preserve to add back to _inferred_intent later + filters = utils.get_filter_specs(vis._inferred_intent) + # Helper function (TODO: Move this into utils) def line_or_bar(ldf, dimension: Clause, measure: Clause): dim_type = dimension.data_type diff --git a/tests/test_action.py b/tests/test_action.py index f3f35a21..bbe161e0 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -197,3 +197,48 @@ def test_year_filter_value(global_var): "T00:00:00.000000000" not in vis.to_Altair() ), "Year filter title contains extraneous string, not displayed as summarized string" df.clear_intent() + + 
+def test_similarity(global_var): + df = pytest.car_df + df["Year"] = pd.to_datetime(df["Year"], format="%Y") + df.set_intent( + [ + lux.Clause("Year", channel="x"), + lux.Clause("Displacement", channel="y"), + lux.Clause("Origin=USA"), + ] + ) + df._repr_html_() + assert len(df.recommendation["Similarity"]) == 2 + ranked_list = df.recommendation["Similarity"] + + japan_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("Origin")[0].value == "Japan", ranked_list) + )[0] + europe_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("Origin")[0].value == "Europe", ranked_list) + )[0] + assert japan_vis.score > europe_vis.score + df.clear_intent() + + +def test_similarity2(): + df = pd.read_csv( + "https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/real_estate_tutorial.csv" + ) + + df["Month"] = pd.to_datetime(df["Month"], format="%m") + df["Year"] = pd.to_datetime(df["Year"], format="%Y") + + df.intent = [lux.Clause("Year"), lux.Clause("PctForeclosured"), lux.Clause("City=Crofton")] + + ranked_list = df.recommendation["Similarity"] + + morrisville_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Morrisville", ranked_list) + )[0] + watertown_vis = list( + filter(lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Watertown", ranked_list) + )[0] + assert morrisville_vis.score > watertown_vis.score diff --git a/tests/test_interestingness.py b/tests/test_interestingness.py index a13ee406..15b8e74d 100644 --- a/tests/test_interestingness.py +++ b/tests/test_interestingness.py @@ -226,7 +226,6 @@ def test_interestingness_0_2_0(global_var): assert interestingness(df.recommendation["Enhance"][0], df) != None rank1 = -1 rank2 = -1 - rank3 = -1 for f in range(0, len(df.recommendation["Enhance"])): if ( str(df.recommendation["Enhance"][f]._inferred_intent[2].attribute) == "Origin" @@ -238,12 +237,7 @@ def test_interestingness_0_2_0(global_var): and str(df.recommendation["Enhance"][f].mark) == "scatter" ): rank2 
= f - if ( - str(df.recommendation["Enhance"][f]._inferred_intent[2].attribute) == "Year" - and str(df.recommendation["Enhance"][f].mark) == "scatter" - ): - rank3 = f - assert rank1 < rank2 and rank1 < rank3 and rank2 < rank3 + assert rank1 < rank2 # check that top recommended filter graph score is not none and that ordering makes intuitive sense assert interestingness(df.recommendation["Filter"][0], df) != None From a2d6de9898a5f71b74d17bd340265e17b8733371 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Sun, 27 Dec 2020 12:42:53 +0800 Subject: [PATCH 02/28] bump numpy min version for travis --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index 9b7c8867..cfe869d7 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,6 @@ scipy>=1.3.3 altair>=4.0.0 +numpy>=1.16.5 pandas>=1.1.0 scikit-learn>=0.22 # Install only to use SQLExecutor From 3c190a5910330a03017769664b614df781f01866 Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Sun, 27 Dec 2020 04:00:11 -0800 Subject: [PATCH 03/28] Special character issue (#184) * rename col * broken * fixed period replacement bug * add tests * refine tests * refine tests * remove cols * fix tests * add agg * fixed tests * clean up PR Co-authored-by: Caitlyn Chen Co-authored-by: Doris Lee --- lux/utils/utils.py | 4 + lux/vislib/altair/AltairRenderer.py | 5 ++ lux/vislib/altair/BarChart.py | 38 ++++++--- lux/vislib/altair/Heatmap.py | 19 ++++- lux/vislib/altair/Histogram.py | 16 +++- lux/vislib/altair/LineChart.py | 32 +++++--- lux/vislib/altair/ScatterChart.py | 18 +++- tests/test_columns.py | 123 ++++++++++++++++++++++++++++ 8 files changed, 222 insertions(+), 33 deletions(-) create mode 100644 tests/test_columns.py diff --git a/lux/utils/utils.py b/lux/utils/utils.py index 1d32f6e5..e19afcf4 100644 --- a/lux/utils/utils.py +++ b/lux/utils/utils.py @@ -58,10 +58,14 @@ def check_import_lux_widget(): def get_agg_title(clause): if clause.aggregation is None: + if 
len(clause.attribute) > 25: + return clause.attribute[:15] + "..." + clause.attribute[-10:] return f"{clause.attribute}" elif clause.attribute == "Record": return f"Number of Records" else: + if len(clause.attribute) > 15: + return f"{clause._aggregation_name.capitalize()} of {clause.attribute[:15]}..." return f"{clause._aggregation_name.capitalize()} of {clause.attribute}" diff --git a/lux/vislib/altair/AltairRenderer.py b/lux/vislib/altair/AltairRenderer.py index 701f3c0e..2957cd17 100644 --- a/lux/vislib/altair/AltairRenderer.py +++ b/lux/vislib/altair/AltairRenderer.py @@ -66,6 +66,11 @@ def create_vis(self, vis, standalone=True): vis.data[attr].iloc[0], pd.Interval ): vis.data[attr] = vis.data[attr].astype(str) + if "." in attr: + attr_clause = vis.get_attr_by_attr_name(attr)[0] + # Suppress special character ".", not displayable in Altair + # attr_clause.attribute = attr_clause.attribute.replace(".", "") + vis._vis_data = vis.data.rename(columns={attr: attr.replace(".", "")}) if vis.mark == "histogram": chart = Histogram(vis) elif vis.mark == "bar": diff --git a/lux/vislib/altair/BarChart.py b/lux/vislib/altair/BarChart.py index 99e9b1fd..91e17b29 100644 --- a/lux/vislib/altair/BarChart.py +++ b/lux/vislib/altair/BarChart.py @@ -40,6 +40,17 @@ def initialize_chart(self): x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] + x_attr_abv = x_attr.attribute + y_attr_abv = y_attr.attribute + + if len(x_attr.attribute) > 25: + x_attr_abv = x_attr.attribute[:15] + "..." + x_attr.attribute[-10:] + if len(y_attr.attribute) > 25: + y_attr_abv = y_attr.attribute[:15] + "..." 
+ y_attr.attribute[-10:] + + x_attr.attribute = x_attr.attribute.replace(".", "") + y_attr.attribute = y_attr.attribute.replace(".", "") + if x_attr.data_model == "measure": agg_title = get_agg_title(x_attr) measure_attr = x_attr.attribute @@ -47,17 +58,17 @@ def initialize_chart(self): y_attr_field = alt.Y( y_attr.attribute, type=y_attr.data_type, - axis=alt.Axis(labelOverlap=True), + axis=alt.Axis(labelOverlap=True, title=y_attr_abv), ) - x_attr_field = alt.X(x_attr.attribute, type=x_attr.data_type, title=agg_title) - y_attr_field_code = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(labelOverlap=True))" - x_attr_field_code = ( - f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', title='{agg_title}')" + x_attr_field = alt.X( + x_attr.attribute, type=x_attr.data_type, title=agg_title, axis=alt.Axis(title=agg_title) ) + y_attr_field_code = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{y_attr_abv}'))" + x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{agg_title}'))" if y_attr.sort == "ascending": y_attr_field.sort = "-x" - y_attr_field_code = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(labelOverlap=True), sort ='-x')" + y_attr_field_code = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{y_attr_abv}'), sort ='-x')" else: agg_title = get_agg_title(y_attr) measure_attr = y_attr.attribute @@ -65,19 +76,19 @@ def initialize_chart(self): x_attr_field = alt.X( x_attr.attribute, type=x_attr.data_type, - axis=alt.Axis(labelOverlap=True), + axis=alt.Axis(labelOverlap=True, title=x_attr_abv), ) - x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', axis=alt.Axis(labelOverlap=True))" - y_attr_field = alt.Y(y_attr.attribute, type=y_attr.data_type, title=agg_title) - y_attr_field_code = ( - f"alt.Y('{y_attr.attribute}', 
type= '{y_attr.data_type}', title='{agg_title}')" + x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{x_attr_abv}'))" + y_attr_field = alt.Y( + y_attr.attribute, type=y_attr.data_type, title=agg_title, axis=alt.Axis(title=agg_title) ) + y_attr_field_code = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{agg_title}'))" if x_attr.sort == "ascending": x_attr_field.sort = "-y" - x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', axis=alt.Axis(labelOverlap=True),sort='-y')" + x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{x_attr_abv}'),sort='-y')" k = 10 self._topkcode = "" - n_bars = len(self.data[bar_attr].unique()) + n_bars = len(self.data.iloc[:, 0].unique()) if n_bars > k: # Truncating to only top k remaining_bars = n_bars - k self.data = self.data.nlargest(k, measure_attr) @@ -101,6 +112,7 @@ def initialize_chart(self): chart = chart + text\n""" chart = alt.Chart(self.data).mark_bar().encode(y=y_attr_field, x=x_attr_field) + # TODO: tooltip messes up the count() bar charts # Can not do interactive whenever you have default count measure otherwise output strange error (Javascript Error: Cannot read property 'length' of undefined) # chart = chart.interactive() # If you want to enable Zooming and Panning diff --git a/lux/vislib/altair/Heatmap.py b/lux/vislib/altair/Heatmap.py index 87c97b13..4432de56 100644 --- a/lux/vislib/altair/Heatmap.py +++ b/lux/vislib/altair/Heatmap.py @@ -39,6 +39,17 @@ def initialize_chart(self): x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] + x_attr_abv = x_attr.attribute + y_attr_abv = y_attr.attribute + + if len(x_attr.attribute) > 25: + x_attr_abv = x_attr.attribute[:15] + "..." 
+ x_attr.attribute[-10:] + if len(y_attr.attribute) > 25: + y_attr_abv = y_attr.attribute[:15] + "..." + y_attr.attribute[-10:] + + x_attr.attribute = x_attr.attribute.replace(".", "") + y_attr.attribute = y_attr.attribute.replace(".", "") + chart = ( alt.Chart(self.data) .mark_rect() @@ -46,14 +57,14 @@ def initialize_chart(self): x=alt.X( "xBinStart", type="quantitative", - axis=alt.Axis(title=x_attr.attribute), + axis=alt.Axis(title=x_attr_abv), bin=alt.BinParams(binned=True), ), x2=alt.X2("xBinEnd"), y=alt.Y( "yBinStart", type="quantitative", - axis=alt.Axis(title=y_attr.attribute), + axis=alt.Axis(title=y_attr_abv), bin=alt.BinParams(binned=True), ), y2=alt.Y2("yBinEnd"), @@ -79,9 +90,9 @@ def initialize_chart(self): self.code += f"visData = pd.DataFrame({str(self.data.to_dict())})\n" self.code += f""" chart = alt.Chart(visData).mark_rect().encode( - x=alt.X('xBinStart', type='quantitative', axis=alt.Axis(title='{x_attr.attribute}'), bin = alt.BinParams(binned=True)), + x=alt.X('xBinStart', type='quantitative', axis=alt.Axis(title='{x_attr_abv}'), bin = alt.BinParams(binned=True)), x2=alt.X2('xBinEnd'), - y=alt.Y('yBinStart', type='quantitative', axis=alt.Axis(title='{y_attr.attribute}'), bin = alt.BinParams(binned=True)), + y=alt.Y('yBinStart', type='quantitative', axis=alt.Axis(title='{y_attr_abv}'), bin = alt.BinParams(binned=True)), y2=alt.Y2('yBinEnd'), opacity = alt.Opacity('count',type='quantitative',scale=alt.Scale(type="log"),legend=None) ) diff --git a/lux/vislib/altair/Histogram.py b/lux/vislib/altair/Histogram.py index fdcaaabc..38e578ab 100644 --- a/lux/vislib/altair/Histogram.py +++ b/lux/vislib/altair/Histogram.py @@ -38,9 +38,17 @@ def initialize_chart(self): self.tooltip = False measure = self.vis.get_attr_by_data_model("measure", exclude_record=True)[0] msr_attr = self.vis.get_attr_by_channel(measure.channel)[0] + + msr_attr_abv = msr_attr.attribute + + if len(msr_attr.attribute) > 17: + msr_attr_abv = msr_attr.attribute[:10] + "..." 
+ msr_attr.attribute[-7:] + x_min = self.vis.min_max[msr_attr.attribute][0] x_max = self.vis.min_max[msr_attr.attribute][1] + msr_attr.attribute = msr_attr.attribute.replace(".", "") + x_range = abs(max(self.vis.data[msr_attr.attribute]) - min(self.vis.data[msr_attr.attribute])) plot_range = abs(x_max - x_min) markbar = x_range / plot_range * 12 @@ -55,7 +63,7 @@ def initialize_chart(self): title=f"{msr_attr.attribute} (binned)", bin=alt.Bin(binned=True), type=msr_attr.data_type, - axis=alt.Axis(labelOverlap=True), + axis=alt.Axis(labelOverlap=True, title=f"{msr_attr_abv} (binned)"), scale=alt.Scale(domain=(x_min, x_max)), ), alt.Y("Number of Records", type="quantitative"), @@ -71,7 +79,7 @@ def initialize_chart(self): msr_attr.attribute, title=f"{msr_attr.attribute} (binned)", bin=alt.Bin(binned=True), - axis=alt.Axis(labelOverlap=True), + axis=alt.Axis(labelOverlap=True, title=f"{msr_attr_abv} (binned)"), scale=alt.Scale(domain=(x_min, x_max)), ), ) @@ -86,14 +94,14 @@ def initialize_chart(self): if measure.channel == "x": self.code += f""" chart = alt.Chart(visData).mark_bar(size={markbar}).encode( - alt.X('{msr_attr.attribute}', title='{msr_attr.attribute} (binned)',bin=alt.Bin(binned=True), type='{msr_attr.data_type}', axis=alt.Axis(labelOverlap=True), scale=alt.Scale(domain=({x_min}, {x_max}))), + alt.X('{msr_attr.attribute}', title='{msr_attr.attribute} (binned)',bin=alt.Bin(binned=True), type='{msr_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{msr_attr_abv} (binned)'), scale=alt.Scale(domain=({x_min}, {x_max}))), alt.Y("Number of Records", type="quantitative") ) """ elif measure.channel == "y": self.code += f""" chart = alt.Chart(visData).mark_bar(size={markbar}).encode( - alt.Y('{msr_attr.attribute}', title='{msr_attr.attribute} (binned)',bin=alt.Bin(binned=True), type='{msr_attr.data_type}', axis=alt.Axis(labelOverlap=True), scale=alt.Scale(domain=({x_min}, {x_max}))), + alt.Y('{msr_attr.attribute}', title='{msr_attr.attribute} 
(binned)',bin=alt.Bin(binned=True), type='{msr_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{msr_attr_abv} (binned)'), scale=alt.Scale(domain=({x_min}, {x_max}))), alt.X("Number of Records", type="quantitative") ) """ diff --git a/lux/vislib/altair/LineChart.py b/lux/vislib/altair/LineChart.py index 18538d51..54b28c46 100644 --- a/lux/vislib/altair/LineChart.py +++ b/lux/vislib/altair/LineChart.py @@ -39,6 +39,18 @@ def initialize_chart(self): self.tooltip = False # tooltip looks weird for line chart x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] + + x_attr_abv = x_attr.attribute + y_attr_abv = y_attr.attribute + + if len(x_attr.attribute) > 25: + x_attr_abv = x_attr.attribute[:15] + "..." + x_attr.attribute[-10:] + if len(y_attr.attribute) > 25: + y_attr_abv = y_attr.attribute[:15] + "..." + y_attr.attribute[-10:] + + x_attr.attribute = x_attr.attribute.replace(".", "") + y_attr.attribute = y_attr.attribute.replace(".", "") + # Remove NaNs only for Line Charts (offsets axis range) self.data = self.data.dropna(subset=[x_attr.attribute, y_attr.attribute]) self.code += "import altair as alt\n" @@ -48,20 +60,20 @@ def initialize_chart(self): if y_attr.data_model == "measure": agg_title = get_agg_title(y_attr) - x_attr_spec = alt.X(x_attr.attribute, type=x_attr.data_type) - y_attr_spec = alt.Y(y_attr.attribute, type=y_attr.data_type, title=agg_title) - x_attr_field_code = f"alt.X('{x_attr.attribute}', type = '{x_attr.data_type}')" - y_attr_fieldCode = ( - f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', title='{agg_title}')" + x_attr_spec = alt.X(x_attr.attribute, type=x_attr.data_type, axis=alt.Axis(title=x_attr_abv)) + y_attr_spec = alt.Y( + y_attr.attribute, type=y_attr.data_type, title=agg_title, axis=alt.Axis(title=y_attr_abv) ) + x_attr_field_code = f"alt.X('{x_attr.attribute}', type = '{x_attr.data_type}', axis=alt.Axis(title='{x_attr_abv}'))" + y_attr_fieldCode = f"alt.Y('{y_attr.attribute}', 
type= '{y_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{y_attr_abv}')" else: agg_title = get_agg_title(x_attr) - x_attr_spec = alt.X(x_attr.attribute, type=x_attr.data_type, title=agg_title) - y_attr_spec = alt.Y(y_attr.attribute, type=y_attr.data_type) - x_attr_field_code = ( - f"alt.X('{x_attr.attribute}', type = '{x_attr.data_type}', title='{agg_title}')" + x_attr_spec = alt.X( + x_attr.attribute, type=x_attr.data_type, title=agg_title, axis=alt.Axis(title=x_attr_abv) ) - y_attr_fieldCode = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}')" + y_attr_spec = alt.Y(y_attr.attribute, type=y_attr.data_type, axis=alt.Axis(title=y_attr_abv)) + x_attr_field_code = f"alt.X('{x_attr.attribute}', type = '{x_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{x_attr_abv}')" + y_attr_fieldCode = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(title='{u_attr_abv}')" chart = alt.Chart(self.data).mark_line().encode(x=x_attr_spec, y=y_attr_spec) chart = chart.interactive() # Enable Zooming and Panning diff --git a/lux/vislib/altair/ScatterChart.py b/lux/vislib/altair/ScatterChart.py index 583291d0..da645cda 100644 --- a/lux/vislib/altair/ScatterChart.py +++ b/lux/vislib/altair/ScatterChart.py @@ -37,12 +37,24 @@ def __repr__(self): def initialize_chart(self): x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] + + x_attr_abv = x_attr.attribute + y_attr_abv = y_attr.attribute + + if len(x_attr.attribute) > 25: + x_attr_abv = x_attr.attribute[:15] + "..." + x_attr.attribute[-10:] + if len(y_attr.attribute) > 25: + y_attr_abv = y_attr.attribute[:15] + "..." 
+ y_attr.attribute[-10:] + x_min = self.vis.min_max[x_attr.attribute][0] x_max = self.vis.min_max[x_attr.attribute][1] y_min = self.vis.min_max[y_attr.attribute][0] y_max = self.vis.min_max[y_attr.attribute][1] + x_attr.attribute = x_attr.attribute.replace(".", "") + y_attr.attribute = y_attr.attribute.replace(".", "") + chart = ( alt.Chart(self.data) .mark_circle() @@ -51,11 +63,13 @@ def initialize_chart(self): x_attr.attribute, scale=alt.Scale(domain=(x_min, x_max)), type=x_attr.data_type, + axis=alt.Axis(title=x_attr_abv), ), y=alt.Y( y_attr.attribute, scale=alt.Scale(domain=(y_min, y_max)), type=y_attr.data_type, + axis=alt.Axis(title=y_attr_abv), ), ) ) @@ -71,8 +85,8 @@ def initialize_chart(self): dfname = "placeholder_variable" self.code += f""" chart = alt.Chart({dfname}).mark_circle().encode( - x=alt.X('{x_attr.attribute}',scale=alt.Scale(domain=({x_min}, {x_max})),type='{x_attr.data_type}'), - y=alt.Y('{y_attr.attribute}',scale=alt.Scale(domain=({y_min}, {y_max})),type='{y_attr.data_type}') + x=alt.X('{x_attr.attribute}',scale=alt.Scale(domain=({x_min}, {x_max})),type='{x_attr.data_type}', axis=alt.Axis(title='{x_attr_abv}')), + y=alt.Y('{y_attr.attribute}',scale=alt.Scale(domain=({y_min}, {y_max})),type='{y_attr.data_type}', axis=alt.Axis(title='{y_attr_abv}')) ) chart = chart.configure_mark(tooltip=alt.TooltipContent('encoding')) # Setting tooltip as non-null chart = chart.interactive() # Enable Zooming and Panning diff --git a/tests/test_columns.py b/tests/test_columns.py new file mode 100644 index 00000000..6216b471 --- /dev/null +++ b/tests/test_columns.py @@ -0,0 +1,123 @@ +# Copyright 2019-2020 The Lux Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .context import lux +import pytest +import pandas as pd + +from lux.vis.Vis import Vis + + +def test_special_char(): + dataset = [ + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 5}, + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 3}, + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 6}, + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 7}, + {"special.char": 1, "normal": 2}, + {"special.char": 3, "normal": 10}, + {"special.char": 1, "normal": 1}, + {"special.char": 5, "normal": 2}, + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 2}, + {"special.char": 1, "normal": 2}, + ] + test = pd.DataFrame(dataset) + + from lux.vis.Vis import Vis + + # TODO: add assert that checks that the bar chart is rendered correctly in Altair + vis = Vis(["special.char"], test) + assert vis.mark == "bar" + assert vis.intent == ["special.char"] + assert vis.get_attr_by_channel("x")[0].attribute == "Record" + assert vis.get_attr_by_channel("y")[0].attribute == "special.char" + vis = vis.to_Altair() + assert ( + "alt.Y('specialchar', type= 'nominal', axis=alt.Axis(labelOverlap=True, title='special.char'))" + in vis + ) + assert ( + "alt.X('Record', type= 'quantitative', title='Number of Records', axis=alt.Axis(title='Number of Records')" + in vis + ) + # Checking that this works even when there are multiple "." 
in column + test = test.rename(columns={"special.char": "special..char.."}) + # TODO: add assert that checks that the bar chart is rendered correctly in Altair + vis = Vis(["special..char.."], test) + assert vis.mark == "bar" + assert vis.intent == ["special..char.."] + assert vis.get_attr_by_channel("x")[0].attribute == "Record" + assert vis.get_attr_by_channel("y")[0].attribute == "special..char.." + vis = vis.to_Altair() + assert ( + "alt.Y('specialchar', type= 'nominal', axis=alt.Axis(labelOverlap=True, title='special..char..')" + in vis + ) + assert ( + "alt.X('Record', type= 'quantitative', title='Number of Records', axis=alt.Axis(title='Number of Records')" + in vis + ) + + +long_var = "Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum." + + +def test_abbrev_bar(): + dataset = [ + {long_var: 1, "normal": 3}, + {long_var: 1, "normal": 3}, + {long_var: 1, "normal": 2}, + {long_var: 1, "normal": 4}, + ] + test = pd.DataFrame(dataset) + vis = Vis([long_var, "normal"], test).to_Altair() + assert "axis=alt.Axis(labelOverlap=True, title='Lorem ipsum dol...t laborum.')" in vis + + +def test_abbrev_histogram(): + dataset = [ + {long_var: 1}, + {long_var: 0}, + ] + test = pd.DataFrame(dataset) + vis = Vis([long_var], test).to_Altair() + assert "axis=alt.Axis(labelOverlap=True, title='Lorem ipsu...aborum. 
(binned)')" in vis + + +def test_abbrev_scatter(): + dataset = [ + {long_var: 1, "normal": 3}, + ] + test = pd.DataFrame(dataset) + vis = Vis([long_var, "normal"], test).to_Altair() + assert "axis=alt.Axis(title='Lorem ipsum dol...t laborum.')" in vis + + +def test_abbrev_agg(): + dataset = [ + {"normal": "USA", long_var: 3}, + {"normal": "Europe", long_var: 3}, + {"normal": "USA", long_var: 2}, + {"normal": "Europe", long_var: 4}, + ] + test = pd.DataFrame(dataset) + vis = Vis([long_var, "normal"], test).to_Altair() + assert "axis=alt.Axis(title='Mean of Lorem ipsum dol...')" in vis From 42b89af2f7b30e138c35fef11809f9404638ef7f Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Mon, 28 Dec 2020 08:52:13 +0800 Subject: [PATCH 04/28] Colored bar interestingness bug (#189) * rewrote chi2 contingency with pd.crosstab * catching KeyError issue with chi2 contingency * padding interestingness with warning instead of error * interestingness now reuses ndim and nmsr computed in Compiler * bug fix for parser with int values * improve Vis repr to better display inferred intent when data is absent but fully compiled intent (all clauses) --- lux/interestingness/interestingness.py | 238 ++++++++++++------------- lux/processor/Parser.py | 8 +- lux/vis/Vis.py | 77 ++++---- 3 files changed, 158 insertions(+), 165 deletions(-) diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index dd9615a6..fb3a3b13 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -25,6 +25,7 @@ from lux.utils.utils import get_filter_specs from lux.interestingness.similarity import preprocess, euclidean_dist from lux.vis.VisList import VisList +import warnings def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: @@ -46,134 +47,123 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: if vis.data is None or len(vis.data) == 0: return -1 # raise Exception("Vis.data needs to be populated before interestingness can be 
computed. Run Executor.execute(vis,ldf).") - - n_dim = 0 - n_msr = 0 - - filter_specs = utils.get_filter_specs(vis._inferred_intent) - vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent) - - record_attrs = list( - filter( - lambda x: x.attribute == "Record" and x.data_model == "measure", - vis_attrs_specs, - ) - ) - n_record = len(record_attrs) - for clause in vis_attrs_specs: - if clause.attribute != "Record": - if clause.data_model == "dimension": - n_dim += 1 - if clause.data_model == "measure": - n_msr += 1 - n_filter = len(filter_specs) - attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"] - dimension_lst = vis.get_attr_by_data_model("dimension") - measure_lst = vis.get_attr_by_data_model("measure") - v_size = len(vis.data) - - if ( - n_dim == 1 - and (n_msr == 0 or n_msr == 1) - and ldf.current_vis is not None - and vis.get_attr_by_channel("y")[0].data_type == "quantitative" - and len(ldf.current_vis) == 1 - and ldf.current_vis[0].mark == "line" - and len(get_filter_specs(ldf.intent)) > 0 - ): - query_vc = VisList(ldf.current_vis, ldf) - query_vis = query_vc[0] - preprocess(query_vis) - preprocess(vis) - return 1 - euclidean_dist(query_vis, vis) - - # Line/Bar Chart - # print("r:", n_record, "m:", n_msr, "d:",n_dim) - if n_dim == 1 and (n_msr == 0 or n_msr == 1): - if v_size < 2: - return -1 - - if n_filter == 0: - return unevenness(vis, ldf, measure_lst, dimension_lst) - elif n_filter == 1: - return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute) - # Histogram - elif n_dim == 0 and n_msr == 1: - if v_size < 2: - return -1 - if n_filter == 0 and "Number of Records" in vis.data: - if "Number of Records" in vis.data: - v = vis.data["Number of Records"] - return skewness(v) - elif n_filter == 1 and "Number of Records" in vis.data: - return deviation_from_overall(vis, ldf, filter_specs, "Number of Records") - return -1 - # Scatter Plot - elif n_dim == 0 and n_msr == 2: - if v_size < 10: - return 
-1 - if vis.mark == "heatmap": - return weighted_correlation(vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"]) - if n_filter == 1: - v_filter_size = get_filtered_size(filter_specs, vis.data) - sig = v_filter_size / v_size - else: - sig = 1 - return sig * monotonicity(vis, attr_specs) - # Scatterplot colored by Dimension - elif n_dim == 1 and n_msr == 2: - if v_size < 10: + try: + filter_specs = utils.get_filter_specs(vis._inferred_intent) + vis_attrs_specs = utils.get_attrs_specs(vis._inferred_intent) + n_dim = vis._ndim + n_msr = vis._nmsr + n_filter = len(filter_specs) + attr_specs = [clause for clause in vis_attrs_specs if clause.attribute != "Record"] + dimension_lst = vis.get_attr_by_data_model("dimension") + measure_lst = vis.get_attr_by_data_model("measure") + v_size = len(vis.data) + + if ( + n_dim == 1 + and (n_msr == 0 or n_msr == 1) + and ldf.current_vis is not None + and vis.get_attr_by_channel("y")[0].data_type == "quantitative" + and len(ldf.current_vis) == 1 + and ldf.current_vis[0].mark == "line" + and len(get_filter_specs(ldf.intent)) > 0 + ): + query_vc = VisList(ldf.current_vis, ldf) + query_vis = query_vc[0] + preprocess(query_vis) + preprocess(vis) + return 1 - euclidean_dist(query_vis, vis) + + # Line/Bar Chart + # print("r:", n_record, "m:", n_msr, "d:",n_dim) + if n_dim == 1 and (n_msr == 0 or n_msr == 1): + if v_size < 2: + return -1 + + if n_filter == 0: + return unevenness(vis, ldf, measure_lst, dimension_lst) + elif n_filter == 1: + return deviation_from_overall(vis, ldf, filter_specs, measure_lst[0].attribute) + # Histogram + elif n_dim == 0 and n_msr == 1: + if v_size < 2: + return -1 + if n_filter == 0 and "Number of Records" in vis.data: + if "Number of Records" in vis.data: + v = vis.data["Number of Records"] + return skewness(v) + elif n_filter == 1 and "Number of Records" in vis.data: + return deviation_from_overall(vis, ldf, filter_specs, "Number of Records") return -1 - color_attr = 
vis.get_attr_by_channel("color")[0].attribute + # Scatter Plot + elif n_dim == 0 and n_msr == 2: + if v_size < 10: + return -1 + if vis.mark == "heatmap": + return weighted_correlation( + vis.data["xBinStart"], vis.data["yBinStart"], vis.data["count"] + ) + if n_filter == 1: + v_filter_size = get_filtered_size(filter_specs, vis.data) + sig = v_filter_size / v_size + else: + sig = 1 + return sig * monotonicity(vis, attr_specs) + # Scatterplot colored by Dimension + elif n_dim == 1 and n_msr == 2: + if v_size < 10: + return -1 + color_attr = vis.get_attr_by_channel("color")[0].attribute + + C = ldf.cardinality[color_attr] + if C < 40: + return 1 / C + else: + return -1 + # Scatterplot colored by dimension + elif n_dim == 1 and n_msr == 2: + return 0.2 + # Scatterplot colored by measure + elif n_msr == 3: + return 0.1 + # colored line and barchart cases + elif vis.mark == "line" and n_dim == 2: + return 0.15 + # for colored bar chart, scoring based on Chi-square test for independence score. 
+ # gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users + elif vis.mark == "bar" and n_dim == 2: + from scipy.stats import chi2_contingency + + measure_column = vis.get_attr_by_data_model("measure")[0].attribute + dimension_columns = vis.get_attr_by_data_model("dimension") + + groupby_column = dimension_columns[0].attribute + color_column = dimension_columns[1].attribute + + contingency_tbl = pd.crosstab( + vis.data[groupby_column], + vis.data[color_column], + values=vis.data[measure_column], + aggfunc=sum, + ) - C = ldf.cardinality[color_attr] - if C < 40: - return 1 / C + try: + color_cardinality = ldf.cardinality[color_column] + groupby_cardinality = ldf.cardinality[groupby_column] + # scale down score based on number of categories + chi2_score = chi2_contingency(contingency_tbl)[0] * 0.9 ** ( + color_cardinality + groupby_cardinality + ) + score = min(0.10, chi2_score) + except (ValueError, KeyError): + # ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in a category having no counts + score = -1 + return score + # Default else: return -1 - # Scatterplot colored by dimension - elif n_dim == 1 and n_msr == 2: - return 0.2 - # Scatterplot colored by measure - elif n_msr == 3: - return 0.1 - # colored line and barchart cases - elif vis.mark == "line" and n_dim == 2: - return 0.15 - # for colored bar chart, scoring based on Chi-square test for independence score. 
- # gives higher scores to colored bar charts with fewer total categories as these charts are easier to read and thus more useful for users - elif vis.mark == "bar" and n_dim == 2: - from scipy.stats import chi2_contingency - - measure_column = vis.get_attr_by_data_model("measure")[0].attribute - dimension_columns = vis.get_attr_by_data_model("dimension") - - groupby_column = dimension_columns[0].attribute - color_column = dimension_columns[1].attribute - - contingency_table = [] - groupby_cardinality = ldf.cardinality[groupby_column] - groupby_unique_vals = ldf.unique_values[groupby_column] - for c in range(0, groupby_cardinality): - contingency_table.append( - vis.data[vis.data[groupby_column] == groupby_unique_vals[c]][measure_column] - ) - score = 0.12 - # ValueError results if an entire column of the contingency table is 0, can happen if an applied filter results in - # a category having no counts - - try: - color_cardinality = ldf.cardinality[color_column] - # scale down score based on number of categories - chi2_score = chi2_contingency(contingency_table)[0] * 0.9 ** ( - color_cardinality + groupby_cardinality - ) - score = min(0.10, chi2_score) - except ValueError: - pass - return score - # Default - else: + except: + # Supress interestingness related issues + warnings.warn(f"An error occurred when computing interestingness for: {vis}") return -1 diff --git a/lux/processor/Parser.py b/lux/processor/Parser.py index 065d420e..a6538e09 100644 --- a/lux/processor/Parser.py +++ b/lux/processor/Parser.py @@ -95,7 +95,9 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]: # TODO: Move validation check to Validator # if ((clause.description in list(ldf.columns)) or clause.description == "?"):# if clause.description in the list of attributes # clause.description contain ">","<". 
or "=" - if any(ext in [">", "<", "=", "!="] for ext in clause.description): + if type(clause.description) == str and any( + ext in [">", "<", "=", "!="] for ext in clause.description + ): # then parse it and assign to clause.attribute, clause.filter_op, clause.values clause.filter_op = re.findall(r"/.*/|>|=|<|>=|<=|!=", clause.description)[0] split_description = clause.description.split(clause.filter_op) @@ -107,7 +109,7 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]: clause.attribute = clause.description elif type(clause.description) == list: clause.attribute = clause.description - # else: # then it is probably a value - # clause.values = clause.description + else: # then it is probably a value + clause.value = clause.description return intent # ldf._intent = intent diff --git a/lux/vis/Vis.py b/lux/vis/Vis.py index 980162dc..9b2fc455 100644 --- a/lux/vis/Vis.py +++ b/lux/vis/Vis.py @@ -24,12 +24,10 @@ class Vis: """ def __init__(self, intent, source=None, title="", score=0.0): - self._intent = intent # This is the user's original intent to Vis - self._inferred_intent = intent # This is the re-written, expanded version of user's original intent (include inferred vis info) - self._source = source # This is the original data that is attached to the Vis - self._vis_data = ( - None # This is the data that represents the Vis (e.g., selected, aggregated, binned) - ) + self._intent = intent # user's original intent to Vis + self._inferred_intent = intent # re-written, expanded version of user's original intent + self._source = source # original data attached to the Vis + self._vis_data = None # processed data for Vis (e.g., selected, aggregated, binned) self._code = None self._mark = "" self._min_max = {} @@ -39,39 +37,42 @@ def __init__(self, intent, source=None, title="", score=0.0): self.refresh_source(self._source) def __repr__(self): - if self._source is None: - return f"" - filter_intents = None - channels, additional_channels = [], [] - for clause 
in self._inferred_intent: - - if hasattr(clause, "value"): - if clause.value != "": - filter_intents = clause - if hasattr(clause, "attribute"): - if clause.attribute != "": - if clause.aggregation != "" and clause.aggregation is not None: - attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")" - elif clause.bin_size > 0: - attribute = "BIN(" + clause.attribute + ")" - else: - attribute = clause.attribute - if clause.channel == "x": - channels.insert(0, [clause.channel, attribute]) - elif clause.channel == "y": - channels.insert(1, [clause.channel, attribute]) - elif clause.channel != "": - additional_channels.append([clause.channel, attribute]) - - channels.extend(additional_channels) - str_channels = "" - for channel in channels: - str_channels += channel[0] + ": " + channel[1] + ", " - - if filter_intents: - return f"" + all_clause = all([isinstance(unit, lux.Clause) for unit in self._inferred_intent]) + if all_clause: + filter_intents = None + channels, additional_channels = [], [] + for clause in self._inferred_intent: + + if hasattr(clause, "value"): + if clause.value != "": + filter_intents = clause + if hasattr(clause, "attribute"): + if clause.attribute != "": + if clause.aggregation != "" and clause.aggregation is not None: + attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")" + elif clause.bin_size > 0: + attribute = "BIN(" + clause.attribute + ")" + else: + attribute = clause.attribute + if clause.channel == "x": + channels.insert(0, [clause.channel, attribute]) + elif clause.channel == "y": + channels.insert(1, [clause.channel, attribute]) + elif clause.channel != "": + additional_channels.append([clause.channel, attribute]) + + channels.extend(additional_channels) + str_channels = "" + for channel in channels: + str_channels += channel[0] + ": " + channel[1] + ", " + + if filter_intents: + return f"" + else: + return f"" else: - return f"" + # When Vis not compiled (e.g., when self._source not populated), 
print original intent + return f"" @property def data(self): From a06d417aa3cd9b8ab071935405bbb4be97b5f0fb Mon Sep 17 00:00:00 2001 From: Kunal Agarwal <32151899+westernguy2@users.noreply.github.com> Date: Wed, 30 Dec 2020 17:59:34 -0800 Subject: [PATCH 05/28] Add sampling parameters as a global config (#192) * update export tutorial to add explanation for standalone argument * minor fixes and remove cell output in notebooks * added contributing doc * fix bugs and uncomment some tests * remove raise warning * remove unnecessary import * split up rename test into two parts * fix setting warning, fix data_type bugs and add relevant tests * remove ordinal data type * add test for small dataframe resetting index * add loc and iloc tests * fix attribute access directly to dataframe * add small changes to code * added test for qcut and cut * add check if dtype is Interval * added qcut test * fix Record KeyError * add tests * take care of reset_index case * small edits * add data_model to column_group Clause * small edits for row_group * fixes to row group * add config for start and cap for samples * finish sampling config and tests * black formatting * add documentation for sampling config * remove small added issues * minor changes to docs * implement heatmap flag and add tests * black formatting and documentation edits Co-authored-by: Doris Lee --- doc/source/guide/FAQ.rst | 17 +++++++ doc/source/reference/config.rst | 28 +++++++++++ lux/_config/config.py | 87 +++++++++++++++++++++++++++++++++ lux/executor/PandasExecutor.py | 12 +++-- tests/test_config.py | 35 +++++++++++++ 5 files changed, 174 insertions(+), 5 deletions(-) diff --git a/doc/source/guide/FAQ.rst b/doc/source/guide/FAQ.rst index 61c9800b..203f015c 100644 --- a/doc/source/guide/FAQ.rst +++ b/doc/source/guide/FAQ.rst @@ -64,6 +64,23 @@ How do I turn off Lux? To display only the Pandas view of the dataframe, print the dataframe by doing :code:`df.to_pandas()`. 
To turn off Lux completely, remove the :code:`import lux` statement and restart your Jupyter notebook. +How do I disable sampling and have Lux visualize the full dataset? +"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" + Lux displays a warning saying "Large dataframe detected: Lux is only visualizing a random sample". If you would like to disable sampling, you can run: + + .. code-block:: python + + lux.config.sampling = False + + Note that if you have already loaded your data in and printed the visualizations, you would need to reinitialize the Dataframe by setting the config before loading in your data, as such: + + .. code-block:: python + + lux.config.sampling = False + df = pd.read_csv("...") + + If you want to fine-tune the sampling parameters, you can edit :code:`lux.config.sampling_start` and :code:`lux.config.sampling_cap`. See `this page `_ for more details. + Troubleshooting Tips -------------------- diff --git a/doc/source/reference/config.rst b/doc/source/reference/config.rst index acbd211d..a1474dc5 100644 --- a/doc/source/reference/config.rst +++ b/doc/source/reference/config.rst @@ -44,3 +44,31 @@ If you try to set the default_display to anything other than 'lux' or 'pandas,' :align: center :alt: Retrieves a single attribute from Lux's Action Manager using its defined id. +Change the sampling parameters of Lux +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +To speed up the visualization processing, by default, Lux performs random sampling on datasets with more than 10000 rows. For datasets over 30000 rows, Lux will randomly sample 30000 rows from the dataset. + +If we want to change these parameters, we can set the `sampling_start` and `sampling_cap` via `lux.config` to change the default form of output. The `sampling_start` is by default set to 10000 and the `sampling_cap` is by default set to 30000. In the following block, we increase these sampling bounds. + +.. 
code-block:: python + + lux.config.sampling_start = 20000 + lux.config.sampling_cap = 40000 + +If we want Lux to use the full dataset in the visualization, we can also disable sampling altogether (but note that this may result in long processing times). Below is an example of disabling sampling: + +.. code-block:: python + + lux.config.sampling = False + +Disable the use of heatmaps for large datasets +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In addition to sampling, Lux replaces scatter plots with heatmaps for datasets with over 5000 rows to speed up the visualization process. + +We can disable this feature and revert back to using a scatter plot by running the following code block (but note that this may result in long processing times). + +.. code-block:: python + + lux.config.heatmap = False diff --git a/lux/_config/config.py b/lux/_config/config.py index 10e21762..21eace4f 100644 --- a/lux/_config/config.py +++ b/lux/_config/config.py @@ -155,6 +155,93 @@ def __init__(self): self.plot_config = None self.SQLconnection = "" self.executor = None + self._sampling_start = 10000 + self._sampling_cap = 30000 + self._sampling_flag = True + self._heatmap_flag = True + + @property + def sampling_cap(self): + return self._sampling_cap + + @sampling_cap.setter + def sampling_cap(self, sample_number: int) -> None: + """ + Parameters + ---------- + sample_number : int + Cap on the number of rows to sample. Must be larger than _sampling_start + """ + if type(sample_number) == int: + assert sample_number >= self._sampling_start + self._sampling_cap = sample_number + else: + warnings.warn( + "The cap on the number samples must be an integer.", + stacklevel=2, + ) + + @property + def sampling_start(self): + return self._sampling_start + + @sampling_start.setter + def sampling_start(self, sample_number: int) -> None: + """ + Parameters + ---------- + sample_number : int + Number of rows required to begin sampling.
Must be smaller or equal to _sampling_cap + + """ + if type(sample_number) == int: + assert sample_number <= self._sampling_cap + self._sampling_start = sample_number + else: + warnings.warn( + "The sampling starting point must be an integer.", + stacklevel=2, + ) + + @property + def sampling(self): + return self._sampling_flag + + @sampling.setter + def sampling(self, sample_flag: bool) -> None: + """ + Parameters + ---------- + sample_flag : bool + Whether or not sampling will occur. + """ + if type(sample_flag) == bool: + self._sampling_flag = sample_flag + else: + warnings.warn( + "The flag for sampling must be a boolean.", + stacklevel=2, + ) + + @property + def heatmap(self): + return self._heatmap_flag + + @heatmap.setter + def heatmap(self, heatmap_flag: bool) -> None: + """ + Parameters + ---------- + heatmap_flag : bool + Whether or not a heatmap will be used instead of a scatter plot. + """ + if type(heatmap_flag) == bool: + self._heatmap_flag = heatmap_flag + else: + warnings.warn( + "The flag for enabling/disabling heatmaps must be a boolean.", + stacklevel=2, + ) @property def default_display(self): diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 4d055f72..5aa7b02d 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -40,17 +40,19 @@ def __repr__(self): @staticmethod def execute_sampling(ldf: LuxDataFrame): # General Sampling for entire dataframe - SAMPLE_START = 10000 - SAMPLE_CAP = 30000 + SAMPLE_FLAG = lux.config.sampling + SAMPLE_START = lux.config.sampling_start + SAMPLE_CAP = lux.config.sampling_cap SAMPLE_FRAC = 0.75 - if len(ldf) > SAMPLE_CAP: + + if SAMPLE_FLAG and len(ldf) > SAMPLE_CAP: if ldf._sampled is None: # memoize unfiltered sample df ldf._sampled = ldf.sample(n=SAMPLE_CAP, random_state=1) ldf._message.add_unique( f"Large dataframe detected: Lux is only visualizing a random sample capped at {SAMPLE_CAP} rows.", priority=99, ) - elif len(ldf) > SAMPLE_START: + elif 
SAMPLE_FLAG and len(ldf) > SAMPLE_START: if ldf._sampled is None: # memoize unfiltered sample df ldf._sampled = ldf.sample(frac=SAMPLE_FRAC, random_state=1) ldf._message.add_unique( @@ -99,7 +101,7 @@ def execute(vislist: VisList, ldf: LuxDataFrame): PandasExecutor.execute_binning(vis) elif vis.mark == "scatter": HBIN_START = 5000 - if len(ldf) > HBIN_START: + if lux.config.heatmap and len(ldf) > HBIN_START: vis._postbin = True ldf._message.add_unique( f"Large scatterplots detected: Lux is automatically binning scatterplots to heatmaps.", diff --git a/tests/test_config.py b/tests/test_config.py index ed8b4fd1..b5906c6b 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -196,6 +196,41 @@ def change_color_make_transparent_add_title(chart): assert title_addition in exported_code_str +def test_sampling_flag_config(): + df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv") + df._repr_html_() + assert df.recommendation["Correlation"][0].data.shape[0] == 30000 + lux.config.sampling = False + df = df.copy() + df._repr_html_() + assert df.recommendation["Correlation"][0].data.shape[0] == 48895 + lux.config.sampling = True + + +def test_sampling_parameters_config(): + df = pd.read_csv("lux/data/car.csv") + df._repr_html_() + assert df.recommendation["Correlation"][0].data.shape[0] == 392 + lux.config.sampling_start = 50 + lux.config.sampling_cap = 100 + df = pd.read_csv("lux/data/car.csv") + df._repr_html_() + assert df.recommendation["Correlation"][0].data.shape[0] == 100 + lux.config.sampling_cap = 30000 + lux.config.sampling_start = 10000 + + +def test_heatmap_flag_config(): + df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv") + df._repr_html_() + assert df.recommendation["Correlation"][0]._postbin + lux.config.heatmap = False + df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv") + df = df.copy() + assert not 
df.recommendation["Correlation"][0]._postbin + lux.config.heatmap = True + + # TODO: This test does not pass in pytest but is working in Jupyter notebook. # def test_plot_setting(global_var): # df = pytest.car_df From 7f7a9054d235363cadd94c3f2492eb39300a1668 Mon Sep 17 00:00:00 2001 From: jinimukh <46768380+jinimukh@users.noreply.github.com> Date: Sat, 2 Jan 2021 06:06:43 -0800 Subject: [PATCH 06/28] Coalesce all data_type attributes of frame into one (#185) * coalesce data_types into data_type_lookup * black reformat * changed to better variable names * lux not defined error * fixed * black format --- lux/action/filter.py | 4 +- lux/action/univariate.py | 4 +- lux/core/frame.py | 41 ++++++------------- lux/core/series.py | 3 -- lux/executor/Executor.py | 22 ++++++++-- lux/executor/PandasExecutor.py | 44 ++++++++------------ lux/processor/Compiler.py | 13 ++++-- lux/utils/date_utils.py | 8 +++- tests/test_dates.py | 5 ++- tests/test_nan.py | 6 ++- tests/test_pandas_coverage.py | 75 +++++++++++++++++++--------------- tests/test_performance.py | 2 +- tests/test_type.py | 34 +++++++-------- 13 files changed, 133 insertions(+), 128 deletions(-) diff --git a/lux/action/filter.py b/lux/action/filter.py index 70d85e0e..e8833b0f 100644 --- a/lux/action/filter.py +++ b/lux/action/filter.py @@ -45,7 +45,7 @@ def filter(ldf): # get unique values for all categorical values specified and creates corresponding filters fltr = filters[0] - if ldf.data_type_lookup[fltr.attribute] == "nominal": + if ldf.data_type[fltr.attribute] == "nominal": recommendation = { "action": "Filter", "description": f"Changing the

{fltr.attribute}

filter to an alternative value.", @@ -60,7 +60,7 @@ def filter(ldf): new_spec.append(new_filter) temp_vis = Vis(new_spec) output.append(temp_vis) - elif ldf.data_type_lookup[fltr.attribute] == "quantitative": + elif ldf.data_type[fltr.attribute] == "quantitative": recommendation = { "action": "Filter", "description": f"Changing the

{fltr.attribute}

filter to an alternative inequality operation.", diff --git a/lux/action/univariate.py b/lux/action/univariate.py index 8f8cd1ac..030a6f03 100644 --- a/lux/action/univariate.py +++ b/lux/action/univariate.py @@ -48,9 +48,7 @@ def univariate(ldf, *args): possible_attributes = [ c for c in ldf.columns - if ldf.data_type_lookup[c] == "quantitative" - and ldf.cardinality[c] > 5 - and c != "Number of Records" + if ldf.data_type[c] == "quantitative" and ldf.cardinality[c] > 5 and c != "Number of Records" ] intent = [lux.Clause(possible_attributes)] intent.extend(filter_specs) diff --git a/lux/core/frame.py b/lux/core/frame.py index 7eab2ed1..e4ed9e3e 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -21,6 +21,8 @@ from lux.utils.message import Message from lux.utils.utils import check_import_lux_widget from typing import Dict, Union, List, Callable + +# from lux.executor.Executor import * import warnings import traceback import lux @@ -35,10 +37,7 @@ class LuxDataFrame(pd.DataFrame): _metadata = [ "_intent", "_inferred_intent", - "data_type_lookup", "data_type", - "data_model_lookup", - "data_model", "unique_values", "cardinality", "_rec_info", @@ -77,10 +76,7 @@ def __init__(self, *args, **kw): self._message = Message() self._pandas_only = False # Metadata - self.data_type_lookup = None self.data_type = None - self.data_model_lookup = None - self.data_model = None self.unique_values = None self.cardinality = None self._min_max = None @@ -126,10 +122,7 @@ def expire_recs(self): def expire_metadata(self): # Set metadata as null self._metadata_fresh = False - self.data_type_lookup = None self.data_type = None - self.data_model_lookup = None - self.data_model = None self.unique_values = None self.cardinality = None self._min_max = None @@ -294,15 +287,11 @@ def compute_SQL_dataset_metadata(self): self.get_SQL_attributes() for attr in list(self.columns): self[attr] = None - self.data_type_lookup = {} self.data_type = {} #####NOTE: since we aren't expecting users to 
do much data processing with the SQL database, should we just keep this ##### in the initialization and do it just once self.compute_SQL_data_type() self.compute_SQL_stats() - self.data_model_lookup = {} - self.data_model = {} - self.compute_data_model() def compute_SQL_stats(self): # precompute statistics @@ -312,7 +301,7 @@ def compute_SQL_stats(self): self.get_SQL_unique_values() # self.get_SQL_cardinality() for attribute in self.columns: - if self.data_type_lookup[attribute] == "quantitative": + if self.data_type[attribute] == "quantitative": self._min_max[attribute] = ( self[attribute].min(), self[attribute].max(), @@ -349,7 +338,7 @@ def get_SQL_unique_values(self): self.unique_values = unique_vals def compute_SQL_data_type(self): - data_type_lookup = {} + data_type = {} sql_dtypes = {} self.get_SQL_cardinality() if "." in self.table_name: @@ -362,11 +351,9 @@ def compute_SQL_data_type(self): datatype = list(pd.read_sql(query, lux.config.SQLconnection)["data_type"])[0] sql_dtypes[attr] = datatype - data_type = {"quantitative": [], "nominal": [], "temporal": []} for attr in list(self.columns): if str(attr).lower() in ["month", "year"]: - data_type_lookup[attr] = "temporal" - data_type["temporal"].append(attr) + data_type[attr] = "temporal" elif sql_dtypes[attr] in [ "character", "character varying", @@ -374,8 +361,7 @@ def compute_SQL_data_type(self): "uuid", "text", ]: - data_type_lookup[attr] = "nominal" - data_type["nominal"].append(attr) + data_type[attr] = "nominal" elif sql_dtypes[attr] in [ "integer", "real", @@ -384,15 +370,11 @@ def compute_SQL_data_type(self): "serial", ]: if self.cardinality[attr] < 13: - data_type_lookup[attr] = "nominal" - data_type["nominal"].append(attr) + data_type[attr] = "nominal" else: - data_type_lookup[attr] = "quantitative" - data_type["quantitative"].append(attr) + data_type[attr] = "quantitative" elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]: - data_type_lookup[attr] = "temporal" - 
data_type["temporal"].append(attr) - self.data_type_lookup = data_type_lookup + data_type[attr] = "temporal" self.data_type = data_type def _append_rec(self, rec_infolist, recommendations: Dict): @@ -419,8 +401,9 @@ def maintain_recs(self): rec_df._message = Message() # Add warning message if there exist ID fields id_fields_str = "" - if len(rec_df.data_type["id"]) > 0: - for id_field in rec_df.data_type["id"]: + inverted_data_type = lux.config.executor.invert_data_type(rec_df.data_type) + if len(inverted_data_type["id"]) > 0: + for id_field in inverted_data_type["id"]: id_fields_str += f"{id_field}, " id_fields_str = id_fields_str[:-2] rec_df._message.add(f"{id_fields_str} is not visualized since it resembles an ID field.") diff --git a/lux/core/series.py b/lux/core/series.py index 0ba805ce..aea13d0c 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -21,10 +21,7 @@ class LuxSeries(pd.Series): _metadata = [ "_intent", - "data_type_lookup", "data_type", - "data_model_lookup", - "data_model", "unique_values", "cardinality", "_rec_info", diff --git a/lux/executor/Executor.py b/lux/executor/Executor.py index 972f6fb6..c93ea9b4 100644 --- a/lux/executor/Executor.py +++ b/lux/executor/Executor.py @@ -51,9 +51,9 @@ def compute_stats(self): def compute_data_type(self): return NotImplemented - @staticmethod - def compute_data_model(self): - return NotImplemented + # @staticmethod + # def compute_data_model(self): + # return NotImplemented def mapping(self, rmap): group_map = {} @@ -67,3 +67,19 @@ def reverseMapping(self, map): for val in map[valKey]: reverse_map[val] = valKey return reverse_map + + def invert_data_type(self, data_type): + return self.mapping(data_type) + + def compute_data_model(self, data_type): + data_type_inverted = self.invert_data_type(data_type) + data_model = { + "measure": data_type_inverted["quantitative"], + "dimension": data_type_inverted["nominal"] + + data_type_inverted["temporal"] + + data_type_inverted["id"], + } + return data_model 
+ + def compute_data_model_lookup(self, data_type): + return self.reverseMapping(self.compute_data_model(data_type)) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 5aa7b02d..fe2542b7 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -376,12 +376,8 @@ def execute_2D_binning(vis: Vis): ############ Metadata: data type, model ############# ####################################################### def compute_dataset_metadata(self, ldf: LuxDataFrame): - ldf.data_type_lookup = {} ldf.data_type = {} self.compute_data_type(ldf) - ldf.data_model_lookup = {} - ldf.data_model = {} - self.compute_data_model(ldf) def compute_data_type(self, ldf: LuxDataFrame): from pandas.api.types import is_datetime64_any_dtype as is_datetime @@ -389,51 +385,50 @@ def compute_data_type(self, ldf: LuxDataFrame): for attr in list(ldf.columns): temporal_var_list = ["month", "year", "day", "date", "time"] if is_datetime(ldf[attr]): - ldf.data_type_lookup[attr] = "temporal" + ldf.data_type[attr] = "temporal" elif self._is_datetime_string(ldf[attr]): - ldf.data_type_lookup[attr] = "temporal" + ldf.data_type[attr] = "temporal" elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp): - ldf.data_type_lookup[attr] = "temporal" + ldf.data_type[attr] = "temporal" elif str(attr).lower() in temporal_var_list: - ldf.data_type_lookup[attr] = "temporal" + ldf.data_type[attr] = "temporal" elif pd.api.types.is_float_dtype(ldf.dtypes[attr]): # int columns gets coerced into floats if contain NaN convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes()) if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20: - ldf.data_type_lookup[attr] = "nominal" + ldf.data_type[attr] = "nominal" else: - ldf.data_type_lookup[attr] = "quantitative" + ldf.data_type[attr] = "quantitative" elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]): # See if integer value is quantitative or nominal by checking if the 
ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values if ldf.pre_aggregated: if ldf.cardinality[attr] == len(ldf): - ldf.data_type_lookup[attr] = "nominal" + ldf.data_type[attr] = "nominal" if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20: - ldf.data_type_lookup[attr] = "nominal" + ldf.data_type[attr] = "nominal" else: - ldf.data_type_lookup[attr] = "quantitative" + ldf.data_type[attr] = "quantitative" if check_if_id_like(ldf, attr): - ldf.data_type_lookup[attr] = "id" + ldf.data_type[attr] = "id" # Eliminate this clause because a single NaN value can cause the dtype to be object elif pd.api.types.is_string_dtype(ldf.dtypes[attr]): if check_if_id_like(ldf, attr): - ldf.data_type_lookup[attr] = "id" + ldf.data_type[attr] = "id" else: - ldf.data_type_lookup[attr] = "nominal" + ldf.data_type[attr] = "nominal" # check if attribute is any type of datetime dtype elif is_datetime_series(ldf.dtypes[attr]): - ldf.data_type_lookup[attr] = "temporal" + ldf.data_type[attr] = "temporal" else: - ldf.data_type_lookup[attr] = "nominal" + ldf.data_type[attr] = "nominal" # for attr in list(df.dtypes[df.dtypes=="int64"].keys()): # if self.cardinality[attr]>50: if ldf.index.dtype != "int64" and ldf.index.name: - ldf.data_type_lookup[ldf.index.name] = "nominal" - ldf.data_type = self.mapping(ldf.data_type_lookup) + ldf.data_type[ldf.index.name] = "nominal" non_datetime_attrs = [] for attr in ldf.columns: - if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]): + if ldf.data_type[attr] == "temporal" and not is_datetime(ldf[attr]): non_datetime_attrs.append(attr) warn_msg = "" if len(non_datetime_attrs) == 1: @@ -470,13 +465,6 @@ def _is_datetime_string(self, series): return True return False - def compute_data_model(self, ldf: LuxDataFrame): - ldf.data_model = { - "measure": ldf.data_type["quantitative"], - "dimension": ldf.data_type["nominal"] + ldf.data_type["temporal"] + ldf.data_type["id"], - } - 
ldf.data_model_lookup = self.reverseMapping(ldf.data_model) - def compute_stats(self, ldf: LuxDataFrame): # precompute statistics ldf.unique_values = {} diff --git a/lux/processor/Compiler.py b/lux/processor/Compiler.py index 1f155197..3131cce3 100644 --- a/lux/processor/Compiler.py +++ b/lux/processor/Compiler.py @@ -159,6 +159,8 @@ def populate_data_type_model(ldf, vlist): # TODO: copy might not be neccesary from lux.utils.date_utils import is_datetime_string + data_model_lookup = lux.config.executor.compute_data_model_lookup(ldf.data_type) + for vis in vlist: for clause in vis._inferred_intent: if clause.description == "?": @@ -167,11 +169,11 @@ def populate_data_type_model(ldf, vlist): # and not is_datetime_string(clause.attribute): if clause.attribute != "" and clause.attribute != "Record": if clause.data_type == "": - clause.data_type = ldf.data_type_lookup[clause.attribute] + clause.data_type = ldf.data_type[clause.attribute] if clause.data_type == "id": clause.data_type = "nominal" if clause.data_model == "": - clause.data_model = ldf.data_model_lookup[clause.attribute] + clause.data_model = data_model_lookup[clause.attribute] if clause.value != "": # If user provided title for Vis, then don't override. 
if vis.title == "": @@ -439,6 +441,9 @@ def populate_wildcard_options(_inferred_intent: List[Clause], ldf: LuxDataFrame) import copy from lux.utils.utils import convert_to_list + inverted_data_type = lux.config.executor.invert_data_type(ldf.data_type) + data_model = lux.config.executor.compute_data_model(ldf.data_type) + intent = {"attributes": [], "filters": []} for clause in _inferred_intent: spec_options = [] @@ -446,9 +451,9 @@ def populate_wildcard_options(_inferred_intent: List[Clause], ldf: LuxDataFrame) if clause.attribute == "?": options = set(list(ldf.columns)) # all attributes if clause.data_type != "": - options = options.intersection(set(ldf.data_type[clause.data_type])) + options = options.intersection(set(inverted_data_type[clause.data_type])) if clause.data_model != "": - options = options.intersection(set(ldf.data_model[clause.data_model])) + options = options.intersection(set(data_model[clause.data_model])) options = list(options) else: options = convert_to_list(clause.attribute) diff --git a/lux/utils/date_utils.py b/lux/utils/date_utils.py index d3ed03ae..66086f1c 100644 --- a/lux/utils/date_utils.py +++ b/lux/utils/date_utils.py @@ -13,6 +13,7 @@ # limitations under the License. 
import pandas as pd +import lux def date_formatter(time_stamp, ldf): @@ -38,10 +39,13 @@ def date_formatter(time_stamp, ldf): date_str: str A reformatted version of the time_stamp according to granularity """ + + inverted_data_type = lux.config.executor.invert_data_type(ldf.data_type) + # TODO: method for data_type_lookup to data_type datetime = pd.to_datetime(time_stamp) - if ldf.data_type["temporal"]: + if inverted_data_type["temporal"]: # assumes only one temporal column, may need to change this function to recieve multiple temporal columns in the future - date_column = ldf[ldf.data_type["temporal"][0]] + date_column = ldf[inverted_data_type["temporal"][0]] granularity = compute_date_granularity(date_column) date_str = "" diff --git a/tests/test_dates.py b/tests/test_dates.py index 6d4c6407..ce859f5d 100644 --- a/tests/test_dates.py +++ b/tests/test_dates.py @@ -96,7 +96,7 @@ def test_refresh_inplace(): ) with pytest.warns(UserWarning, match="Lux detects that the attribute 'date' may be temporal."): df._repr_html_() - assert df.data_type_lookup["date"] == "temporal" + assert df.data_type["date"] == "temporal" from lux.vis.Vis import Vis @@ -104,7 +104,8 @@ def test_refresh_inplace(): df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d") df.maintain_metadata() - assert df.data_type["temporal"][0] == "date" + inverted_data_type = lux.config.executor.invert_data_type(df.data_type) + assert inverted_data_type["temporal"][0] == "date" vis.refresh_source(df) assert vis.mark == "line" diff --git a/tests/test_nan.py b/tests/test_nan.py index 96918af0..b2d28fed 100644 --- a/tests/test_nan.py +++ b/tests/test_nan.py @@ -46,14 +46,16 @@ def test_nan_data_type_detection(): ] test = pd.DataFrame(dataset) test.maintain_metadata() - assert test.data_type["nominal"] == [ + inverted_data_type = lux.config.executor.invert_data_type(test.data_type) + assert inverted_data_type["nominal"] == [ "fully_nan", "some_nan", "some_nan2", ], "Categorical columns containing NaNs should 
be treated as nominal data type" nona_test = test.dropna(subset=["some_nan"]) nona_test.maintain_metadata() - assert nona_test.data_type["nominal"] == [ + inverted_data_type = lux.config.executor.invert_data_type(nona_test.data_type) + assert inverted_data_type["nominal"] == [ "fully_nan", "some_nan", "some_nan2", diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py index f5977da5..21b257a2 100644 --- a/tests/test_pandas_coverage.py +++ b/tests/test_pandas_coverage.py @@ -41,23 +41,32 @@ def test_rename_inplace(global_var): # new_df is the old dataframe (df) with the new column name changed inplace new_df, df = df, new_df - assert df.data_type_lookup != new_df.data_type_lookup + assert df.data_type != new_df.data_type - assert df.data_type_lookup["Name"] == new_df.data_type_lookup["Car Name"] + assert df.data_type["Name"] == new_df.data_type["Car Name"] - assert df.data_type != new_df.data_type + inverted_data_type = lux.config.executor.invert_data_type(df.data_type) + new_inverted_data_type = lux.config.executor.invert_data_type(new_df.data_type) + + assert inverted_data_type != new_inverted_data_type + + assert inverted_data_type["nominal"][0] == "Name" + assert new_inverted_data_type["nominal"][0] == "Car Name" + + data_model_lookup = lux.config.executor.compute_data_model_lookup(df.data_type) + new_data_model_lookup = lux.config.executor.compute_data_model_lookup(new_df.data_type) - assert df.data_type["nominal"][0] == "Name" - assert new_df.data_type["nominal"][0] == "Car Name" + assert data_model_lookup != new_data_model_lookup - assert df.data_model_lookup != new_df.data_model_lookup + assert data_model_lookup["Name"] == new_data_model_lookup["Car Name"] - assert df.data_model_lookup["Name"] == new_df.data_model_lookup["Car Name"] + data_model = lux.config.executor.compute_data_model(df.data_type) + new_data_model = lux.config.executor.compute_data_model(new_df.data_type) - assert df.data_model != new_df.data_model + assert data_model 
!= new_data_model - assert df.data_model["dimension"][0] == "Name" - assert new_df.data_model["dimension"][0] == "Car Name" + assert data_model["dimension"][0] == "Name" + assert new_data_model["dimension"][0] == "Car Name" assert list(df.unique_values.values()) == list(new_df.unique_values.values()) assert list(df.cardinality.values()) == list(new_df.cardinality.values()) @@ -71,23 +80,32 @@ def test_rename(global_var): df._repr_html_() new_df = df.rename(columns={"Name": "Car Name"}, inplace=False) new_df._repr_html_() - assert df.data_type_lookup != new_df.data_type_lookup + assert df.data_type != new_df.data_type - assert df.data_type_lookup["Name"] == new_df.data_type_lookup["Car Name"] + assert df.data_type["Name"] == new_df.data_type["Car Name"] - assert df.data_type != new_df.data_type + inverted_data_type = lux.config.executor.invert_data_type(df.data_type) + new_inverted_data_type = lux.config.executor.invert_data_type(new_df.data_type) + + assert inverted_data_type != new_inverted_data_type + + assert inverted_data_type["nominal"][0] == "Name" + assert new_inverted_data_type["nominal"][0] == "Car Name" + + data_model_lookup = lux.config.executor.compute_data_model_lookup(df.data_type) + new_data_model_lookup = lux.config.executor.compute_data_model_lookup(new_df.data_type) - assert df.data_type["nominal"][0] == "Name" - assert new_df.data_type["nominal"][0] == "Car Name" + assert data_model_lookup != new_data_model_lookup - assert df.data_model_lookup != new_df.data_model_lookup + assert data_model_lookup["Name"] == new_data_model_lookup["Car Name"] - assert df.data_model_lookup["Name"] == new_df.data_model_lookup["Car Name"] + data_model = lux.config.executor.compute_data_model(df.data_type) + new_data_model = lux.config.executor.compute_data_model(new_df.data_type) - assert df.data_model != new_df.data_model + assert data_model != new_data_model - assert df.data_model["dimension"][0] == "Name" - assert new_df.data_model["dimension"][0] == "Car Name" + 
assert data_model["dimension"][0] == "Name" + assert new_data_model["dimension"][0] == "Car Name" assert list(df.unique_values.values()) == list(new_df.unique_values.values()) assert list(df.cardinality.values()) == list(new_df.cardinality.values()) @@ -305,7 +323,7 @@ def test_change_dtype(global_var): "Occurrence", "Temporal", ] - assert len(df.data_type_lookup) == 10 + assert len(df.data_type) == 10 def test_get_dummies(global_var): @@ -319,7 +337,7 @@ def test_get_dummies(global_var): "Occurrence", "Temporal", ] - assert len(new_df.data_type_lookup) == 339 + assert len(new_df.data_type) == 339 def test_drop(global_var): @@ -502,10 +520,7 @@ def test_df_to_series(global_var): df["Weight"]._metadata assert df["Weight"]._metadata == [ "_intent", - "data_type_lookup", "data_type", - "data_model_lookup", - "data_model", "unique_values", "cardinality", "_rec_info", @@ -533,10 +548,7 @@ def test_value_counts(global_var): assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." assert df["Weight"]._metadata == [ "_intent", - "data_type_lookup", "data_type", - "data_model_lookup", - "data_model", "unique_values", "cardinality", "_rec_info", @@ -563,10 +575,7 @@ def test_str_replace(global_var): assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." 
assert df["Brand"]._metadata == [ "_intent", - "data_type_lookup", "data_type", - "data_model_lookup", - "data_model", "unique_values", "cardinality", "_rec_info", @@ -600,7 +609,7 @@ def test_read_json(global_var): "Occurrence", "Temporal", ] - assert len(df.data_type_lookup) == 10 + assert len(df.data_type) == 10 def test_read_sas(global_var): @@ -608,4 +617,4 @@ def test_read_sas(global_var): df = pd.read_sas(url, format="sas7bdat") df._repr_html_() assert list(df.recommendation.keys()) == ["Correlation", "Distribution", "Temporal"] - assert len(df.data_type_lookup) == 6 + assert len(df.data_type) == 6 diff --git a/tests/test_performance.py b/tests/test_performance.py index 4e557075..256d45fb 100644 --- a/tests/test_performance.py +++ b/tests/test_performance.py @@ -38,7 +38,7 @@ def test_q1_performance_census(global_var): delta2 < 0.15 < delta ), "Subsequent display of recommendations on Census dataset took a total of {delta2:0.4f} seconds, longer than expected." - assert df.data_type_lookup == { + assert df.data_type == { "age": "quantitative", "workclass": "nominal", "fnlwgt": "quantitative", diff --git a/tests/test_type.py b/tests/test_type.py index 4dbac971..ac6472fc 100644 --- a/tests/test_type.py +++ b/tests/test_type.py @@ -21,15 +21,15 @@ def test_check_cars(): df = pd.read_csv("lux/data/car.csv") df.maintain_metadata() - assert df.data_type_lookup["Name"] == "nominal" - assert df.data_type_lookup["MilesPerGal"] == "quantitative" - assert df.data_type_lookup["Cylinders"] == "nominal" - assert df.data_type_lookup["Displacement"] == "quantitative" - assert df.data_type_lookup["Horsepower"] == "quantitative" - assert df.data_type_lookup["Weight"] == "quantitative" - assert df.data_type_lookup["Acceleration"] == "quantitative" - assert df.data_type_lookup["Year"] == "temporal" - assert df.data_type_lookup["Origin"] == "nominal" + assert df.data_type["Name"] == "nominal" + assert df.data_type["MilesPerGal"] == "quantitative" + assert 
df.data_type["Cylinders"] == "nominal" + assert df.data_type["Displacement"] == "quantitative" + assert df.data_type["Horsepower"] == "quantitative" + assert df.data_type["Weight"] == "quantitative" + assert df.data_type["Acceleration"] == "quantitative" + assert df.data_type["Year"] == "temporal" + assert df.data_type["Origin"] == "nominal" def test_check_int_id(): @@ -37,7 +37,8 @@ def test_check_int_id(): "https://github.com/lux-org/lux-datasets/blob/master/data/instacart_sample.csv?raw=true" ) df._repr_html_() - assert len(df.data_type["id"]) == 3 + inverted_data_type = lux.config.executor.invert_data_type(df.data_type) + assert len(inverted_data_type["id"]) == 3 assert ( "order_id, product_id, user_id is not visualized since it resembles an ID field." in df._message.to_html() @@ -57,7 +58,7 @@ def test_check_hpi(): df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/hpi.csv?raw=true") df.maintain_metadata() - assert df.data_type_lookup == { + assert df.data_type == { "HPIRank": "quantitative", "Country": "nominal", "SubRegion": "nominal", @@ -77,7 +78,7 @@ def test_check_hpi(): def test_check_airbnb(): df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/airbnb_nyc.csv?raw=true") df.maintain_metadata() - assert df.data_type_lookup == { + assert df.data_type == { "id": "id", "name": "nominal", "host_id": "id", @@ -111,7 +112,7 @@ def test_check_datetime(): } ) df.maintain_metadata() - assert df.data_type_lookup == { + assert df.data_type == { "a": "temporal", "b": "temporal", "c": "temporal", @@ -126,7 +127,7 @@ def test_check_datetime(): def test_check_stock(): df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/stocks.csv?raw=true") df.maintain_metadata() - assert df.data_type_lookup == { + assert df.data_type == { "symbol": "nominal", "monthdate": "temporal", "price": "quantitative", @@ -136,7 +137,7 @@ def test_check_stock(): def test_check_college(): df = 
pd.read_csv("lux/data/college.csv") df.maintain_metadata() - assert df.data_type_lookup == { + assert df.data_type == { "Name": "nominal", "PredominantDegree": "nominal", "HighestDegree": "nominal", @@ -176,7 +177,8 @@ def test_float_categorical(): ] df = pd.DataFrame(values) df.maintain_metadata() - assert df.data_type["nominal"] == [ + inverted_data_type = lux.config.executor.invert_data_type(df.data_type) + assert inverted_data_type["nominal"] == [ "A", "B", "C", From 6fc9876dec8c45f58bc1d2477afacca4b717d1aa Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Tue, 5 Jan 2021 08:51:08 +0800 Subject: [PATCH 07/28] Update CONTRIBUTING.md --- CONTRIBUTING.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index faf3872e..a6598459 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -1,4 +1,4 @@ -Lux is a project undergoing active development. If you are interested in contributing to Lux, the open tasks on [GitHub issues](https://github.com/lux-org/lux/issues), esp. issues labelled with the tag [`easy`](https://github.com/lux-org/lux/labels/easy), are good places for newcomers to contribute. This guide contains information on the workflow for contributing to the Lux codebase. For more information on the Lux architecture, see this [documentation page](https://lux-api.readthedocs.io/en/latest/source/advanced/architecture.html). For any additional questions and issues, please post on the [Slack channel](http://lux-project.slack.com/). +Lux is a project undergoing active development. If you are interested in contributing to Lux, the open tasks on [GitHub issues](https://github.com/lux-org/lux/issues), esp. issues labelled with the tag [`easy`](https://github.com/lux-org/lux/labels/easy), are good places for newcomers to contribute. This guide contains information on the workflow for contributing to the Lux codebase. 
For more information on the Lux architecture, see this [documentation page](https://lux-api.readthedocs.io/en/latest/source/advanced/architecture.html). # Setting up Build and Installation Process @@ -14,6 +14,7 @@ You can install Lux by building from the source code in your fork directly: ```bash cd lux/ pip install --user -r requirements.txt +pip install --user -r requirements-dev.txt python setup.py install ``` @@ -36,7 +37,7 @@ lux/ ``` # Code Formatting -In order to keep our codebase clean and readible, we are using PEP8 guidelines. To help us maintain and check code style, we are using [black](https://github.com/psf/black). Simply run `black .` before commiting. Failure to do so may fail the tests run on Travis. This package should have been installed for you. +In order to keep our codebase clean and readible, we are using PEP8 guidelines. To help us maintain and check code style, we are using [black](https://github.com/psf/black). Simply run `black .` before commiting. Failure to do so may fail the tests run on Travis. This package should have been installed for you as part of [requirements-dev](https://github.com/lux-org/lux/blob/master/requirements-dev.txt). # Running the Test Suite @@ -64,11 +65,11 @@ Once the pull request is submitted, the maintainer will get notified and review # Building Documentation -To build the documentation in HTML, you can run this command locally in the `doc/` folder: +Lux uses [Sphinx](https://www.sphinx-doc.org/en/master/) to generate the documentations, which contains both the docstring and the written documentation in the `doc/` folder. To build the documentation in HTML, you can run this command locally in the `doc/` folder: ```bash make html ``` -This generates all the HTML documentation files in `doc/_build/html/`. The configuration file `conf.py` contains information related to Sphinx settings. The Sphinx documentations are written as ReStructuredText (`*.rst` files) and mostly stored in the `source/` folder. 
The documentation inside `source/reference` is auto-generated by Sphinx. The repository is linked with ReadTheDocs, which triggers the build for the latest documentation based on the most recent commit. As a result, we do not commit anything inside `doc/_build` in the Github repository. +This generates all the HTML documentation files in `doc/_build/html/`. The configuration file `conf.py` contains information related to Sphinx settings. The Sphinx documentations are written as ReStructuredText (`*.rst` files) and mostly stored in the `source/` folder. The documentation inside `source/reference` is auto-generated by Sphinx. The repository is linked with [ReadTheDocs](https://readthedocs.org/projects/lux-api/), which triggers the build for the latest documentation based on the most recent commit. As a result, we do not commit anything inside `doc/_build` in the Github repository. From e7572ed74b33a4e701809e32efbf31f166dae2fa Mon Sep 17 00:00:00 2001 From: thyneb19 Date: Tue, 5 Jan 2021 01:27:46 -0800 Subject: [PATCH 08/28] Bug Fix: User-provided Index causes KeyError in Pandas Execution (#191) * Moved Executor Parameters to Global Config * Black formatting * Moved table_name parameter to frame.py. Removed executor_type parameter executor_type parameter no longer necessary to maintain * Fixed reference to table_name parameter table_name is now a parameter within frame.py * Adjusted Functions to Set SQL Connection Moved set_SQL_connection function to config. Added set_SQL_table function within frame.py to let users specify which database table will be associated with their dataframe * Update SQLExecutor name parameter * Fix Executor Reference Update current_vis() to reference lux.config.executor * Update frame.py * Moved set functions to global config * Fixed Index Issue in Pandas Executor Issue caused when user sets an index. 
The Pandas Executor was not correctly renaming this new index column to Record in execute_aggregate() * Added tests for set_index functions * Black formatting * Update Pandas Executor to handle NA values Readded missing dropna parameter within execute_aggregate() groupby function call * Updated Pandas Coverage Tests Commented out set_index case which has not been addressed yet * Black Formatting * Update to Pandas Executor Index Handling Cleaned up how execute_aggregate renames index columns. Now retrieves the index name from vis.data instead of filtering out non-index columns. Created separate test function for when user specifies an index in read_csv. Co-authored-by: 19thyneb Co-authored-by: Doris Lee --- lux/executor/PandasExecutor.py | 15 ++++++--- tests/test_pandas_coverage.py | 56 ++++++++++++++++++++++++++++++++++ 2 files changed, 67 insertions(+), 4 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index fe2542b7..9708d8eb 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -156,22 +156,29 @@ def execute_aggregate(vis: Vis, isFiltered=True): color_cardinality = 1 if measure_attr != "": if measure_attr.attribute == "Record": + # need to get the index name so that we can rename the index column to "Record" + # if there is no index, default to "index" + index_name = vis.data.index.name + if index_name == None: + index_name = "index" + vis._vis_data = vis.data.reset_index() # if color is specified, need to group by groupby_attr and color_attr - if has_color: vis._vis_data = ( vis.data.groupby([groupby_attr.attribute, color_attr.attribute], dropna=False) .count() .reset_index() + .rename(columns={index_name: "Record"}) ) - vis._vis_data = vis.data.rename(columns={"index": "Record"}) vis._vis_data = vis.data[[groupby_attr.attribute, color_attr.attribute, "Record"]] else: vis._vis_data = ( - vis.data.groupby(groupby_attr.attribute, dropna=False).count().reset_index() +
vis.data.groupby(groupby_attr.attribute, dropna=False) + .count() + .reset_index() + .rename(columns={index_name: "Record"}) ) - vis._vis_data = vis.data.rename(columns={"index": "Record"}) vis._vis_data = vis.data[[groupby_attr.attribute, "Record"]] else: # if color is specified, need to group by groupby_attr and color_attr diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py index 21b257a2..bb7000b2 100644 --- a/tests/test_pandas_coverage.py +++ b/tests/test_pandas_coverage.py @@ -505,6 +505,62 @@ def compare_vis(vis1, vis2): assert vis1.score == vis2.score +def test_index(global_var): + # testing set_index and reset_index functions + # setting a column as an index should remove it from the dataframe's column list + # and change the dataframe's index name parameter + df = pd.read_csv("lux/data/car.csv") + df["Year"] = pd.to_datetime(df["Year"], format="%Y") + + df = df.set_index(["Name"]) + # if this assert fails, then the index column has not properly been removed from the dataframe's column and registered as an index + assert "Name" not in df.columns and df.index.name == "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + df = df.reset_index() + assert "Name" in df.columns and df.index.name != "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + + df.set_index(["Name"], inplace=True) + assert "Name" not in df.columns and df.index.name == "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + df.reset_index(inplace=True) + assert "Name" in df.columns and df.index.name != "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + + df = df.set_index(["Name"]) + assert "Name" not in df.columns and df.index.name == "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + df = df.reset_index(drop=True) + assert "Name" not in df.columns and df.index.name != "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + + +def test_index_col(global_var): + df = pd.read_csv("lux/data/car.csv", 
index_col="Name") + # if this assert fails, then the index column has not properly been removed from the dataframe's column and registered as an index + assert "Name" not in df.columns and df.index.name == "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + df = df.reset_index() + assert "Name" in df.columns and df.index.name != "Name" + df._repr_html_() + assert len(df.recommendation) > 0 + + # this case is not yet addressed, need to have a check that eliminates bar charts with duplicate column names + # df = df.set_index(["Name"], drop=False) + # assert "Name" not in df.columns and df.index.name == "Name" + # df._repr_html_() + # assert len(df.recommendation) > 0 + # df = df.reset_index(drop=True) + # assert "Name" not in df.columns and df.index.name != "Name" + + ################ # Series Tests # ################ From 13c4d09751e949f8a08bc4c5a3e4985d6fcd8b4f Mon Sep 17 00:00:00 2001 From: Caitlyn Chen Date: Tue, 5 Jan 2021 22:36:00 -0800 Subject: [PATCH 09/28] Initialize Config once only during __init__ (#194) * basic matplotlib chart example * migrate register default action to init * config class * move actions * fixed tests * changes * alright * fix plot_config * black reformat * black reformat Co-authored-by: Doris Lee Co-authored-by: Caitlyn Chen Co-authored-by: Ujjaini Mukhopadhyay --- lux/__init__.py | 17 ++- lux/_config/__init__.py | 11 +- lux/_config/config.py | 198 +++++++------------------ lux/action/custom.py | 12 +- lux/action/default.py | 28 ++++ lux/action/filter.py | 2 +- lux/action/row_group.py | 8 +- lux/core/frame.py | 37 +---- lux/interestingness/interestingness.py | 6 +- lux/vislib/altair/BarChart.py | 10 +- lux/vislib/altair/LineChart.py | 10 +- tests/test_action.py | 26 +++- tests/test_config.py | 34 +++-- tests/test_pandas_coverage.py | 8 +- 14 files changed, 182 insertions(+), 225 deletions(-) create mode 100644 lux/action/default.py diff --git a/lux/__init__.py b/lux/__init__.py index e86c556a..4b135ace 100644 --- 
a/lux/__init__.py +++ b/lux/__init__.py @@ -17,11 +17,12 @@ from lux.core.frame import LuxDataFrame from ._version import __version__, version_info from lux._config import config -from lux._config.config import ( - register_action, - remove_action, - actions, - update_actions, - config, - warning_format, -) +from lux._config.config import warning_format + +from lux._config import Config + +config = Config() + +from lux.action.default import register_default_actions + +register_default_actions() diff --git a/lux/_config/__init__.py b/lux/_config/__init__.py index b56ee044..fdebec12 100644 --- a/lux/_config/__init__.py +++ b/lux/_config/__init__.py @@ -1,9 +1,4 @@ from lux._config import config -from lux._config.config import ( - register_action, - remove_action, - actions, - update_actions, - config, - warning_format, -) +from lux._config.config import warning_format + +from .config import Config diff --git a/lux/_config/config.py b/lux/_config/config.py index 21eace4f..5c1a6e2c 100644 --- a/lux/_config/config.py +++ b/lux/_config/config.py @@ -5,148 +5,10 @@ from collections import namedtuple from typing import Any, Callable, Dict, Iterable, List, Optional import warnings +import lux RegisteredOption = namedtuple("RegisteredOption", "name action display_condition args") -# holds registered option metadata -_registered_actions: Dict[str, RegisteredOption] = {} -# flags whether or not an action has been registered or removed and should be re-rendered by frame.py -update_actions: Dict[str, bool] = {} -update_actions["flag"] = False - - -class OptionError(AttributeError, KeyError): - """ - Exception for pandas.options, backwards compatible with KeyError - checks - """ - - -def _get_action(pat: str, silent: bool = False): - return _registered_actions[pat] - - -class DictWrapper: - def __init__(self, d: Dict[str, Any], prefix: str = ""): - object.__setattr__(self, "d", d) - object.__setattr__(self, "prefix", prefix) - - def __init__(self, d: Dict[str, RegisteredOption], 
prefix: str = ""): - object.__setattr__(self, "d", d) - object.__setattr__(self, "prefix", prefix) - - def __getattr__(self, name: str): - """ - Gets a specific registered action by id - - Parameters - ---------- - name : str - the name of the action - Return - ------- - DictWrapper object for the action - """ - prefix = object.__getattribute__(self, "prefix") - if prefix: - prefix += "." - prefix += name - try: - v = object.__getattribute__(self, "d")[name] - except KeyError as err: - raise OptionError("No such option") from err - if isinstance(v, dict): - return DictWrapper(v, prefix) - else: - return _get_action(prefix) - - def __getactions__(self): - """ - Gathers all currently registered actions in a list of DictWrapper - - Return - ------- - List of DictWrapper objects that are registered - """ - l = [] - for name in self.__dir__(): - l.append(self.__getattr__(name)) - return l - - def __len__(self): - return len(list(self.d.keys())) - - def __dir__(self) -> Iterable[str]: - return list(self.d.keys()) - - -actions = DictWrapper(_registered_actions) - - -def register_action( - name: str = "", - action: Callable[[Any], Any] = None, - display_condition: Optional[Callable[[Any], Any]] = None, - *args, -) -> None: - """ - Registers the provided action globally in lux - - Parameters - ---------- - name : str - the name of the action - action : Callable[[Any], Any] - the function used to generate the recommendations - display_condition : Callable[[Any], Any] - the function to check whether or not the function should be applied - args: Any - any additional arguments the function may require - """ - if action: - is_callable(action) - - if display_condition: - is_callable(display_condition) - _registered_actions[name] = RegisteredOption( - name=name, action=action, display_condition=display_condition, args=args - ) - update_actions["flag"] = True - - -def remove_action(name: str = "") -> None: - """ - Removes the provided action globally in lux - - Parameters - 
---------- - name : str - the name of the action to remove - """ - if name not in _registered_actions: - raise ValueError(f"Option '{name}' has not been registered") - - del _registered_actions[name] - update_actions["flag"] = True - - -def is_callable(obj) -> bool: - """ - Parameters - ---------- - obj: Any - the object to be checked - - Returns - ------- - validator : bool - returns True if object is callable - raises ValueError otherwise. - """ - if not callable(obj): - raise ValueError("Value must be a callable") - return True - class Config: def __init__(self): @@ -155,6 +17,11 @@ def __init__(self): self.plot_config = None self.SQLconnection = "" self.executor = None + # holds registered option metadata + self.actions: Dict[str, RegisteredOption] = {} + # flags whether or not an action has been registered or removed and should be re-rendered by frame.py + self.update_actions: Dict[str, bool] = {} + self.update_actions["flag"] = False self._sampling_start = 10000 self._sampling_cap = 30000 self._sampling_flag = True @@ -266,6 +133,56 @@ def default_display(self, type: str) -> None: stacklevel=2, ) + def _get_action(self, pat: str, silent: bool = False): + return lux.actions[pat] + + def register_action( + self, + name: str = "", + action: Callable[[Any], Any] = None, + display_condition: Optional[Callable[[Any], Any]] = None, + *args, + ) -> None: + """ + Registers the provided action globally in lux + + Parameters + ---------- + name : str + the name of the action + action : Callable[[Any], Any] + the function used to generate the recommendations + display_condition : Callable[[Any], Any] + the function to check whether or not the function should be applied + args: Any + any additional arguments the function may require + """ + if action: + if not callable(action): + raise ValueError("Action must be a callable") + if display_condition: + if not callable(display_condition): + raise ValueError("Display condition must be a callable") + self.actions[name] = 
RegisteredOption( + name=name, action=action, display_condition=display_condition, args=args + ) + self.update_actions["flag"] = True + + def remove_action(self, name: str = "") -> None: + """ + Removes the provided action globally in lux + + Parameters + ---------- + name : str + the name of the action to remove + """ + if name not in self.actions: + raise ValueError(f"Option '{name}' has not been registered") + + del self.actions[name] + self.update_actions["flag"] = True + def set_SQL_connection(self, connection): self.SQLconnection = connection @@ -291,8 +208,5 @@ def set_SQL_connection(self, connection): self.SQLconnection = connection -config = Config() - - def warning_format(message, category, filename, lineno, file=None, line=None): return "%s:%s: %s:%s\n" % (filename, lineno, category.__name__, message) diff --git a/lux/action/custom.py b/lux/action/custom.py index 25539159..2f0183a4 100644 --- a/lux/action/custom.py +++ b/lux/action/custom.py @@ -63,16 +63,16 @@ def custom_actions(ldf): recommendations : Dict[str,obj] object with a collection of visualizations that were previously registered. 
""" - if lux.actions.__len__() > 0: + if len(lux.config.actions) > 0: recommendations = [] - for action_name in lux.actions.__dir__(): - display_condition = lux.actions.__getattr__(action_name).display_condition + for action_name in lux.config.actions.keys(): + display_condition = lux.config.actions[action_name].display_condition if display_condition is None or (display_condition is not None and display_condition(ldf)): - args = lux.actions.__getattr__(action_name).args + args = lux.config.actions[action_name].args if args: - recommendation = lux.actions.__getattr__(action_name).action(ldf, args) + recommendation = lux.config.actions[action_name].action(ldf, args) else: - recommendation = lux.actions.__getattr__(action_name).action(ldf) + recommendation = lux.config.actions[action_name].action(ldf) recommendations.append(recommendation) return recommendations else: diff --git a/lux/action/default.py b/lux/action/default.py new file mode 100644 index 00000000..b075232d --- /dev/null +++ b/lux/action/default.py @@ -0,0 +1,28 @@ +def register_default_actions(): + import lux + from lux.action.custom import custom + from lux.action.correlation import correlation + from lux.action.univariate import univariate + from lux.action.enhance import enhance + from lux.action.filter import add_filter + from lux.action.generalize import generalize + + print("Register default actions") + # display conditions for default actions + no_vis = lambda ldf: (ldf.current_vis is None) or ( + ldf.current_vis is not None and len(ldf.current_vis) == 0 + ) + one_current_vis = lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) == 1 + multiple_current_vis = lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) > 1 + + # globally register default actions + lux.config.register_action("correlation", correlation, no_vis) + lux.config.register_action("distribution", univariate, no_vis, "quantitative") + lux.config.register_action("occurrence", univariate, no_vis, "nominal") + 
lux.config.register_action("temporal", univariate, no_vis, "temporal") + + lux.config.register_action("Enhance", enhance, one_current_vis) + lux.config.register_action("Filter", add_filter, one_current_vis) + lux.config.register_action("Generalize", generalize, one_current_vis) + + lux.config.register_action("Custom", custom, multiple_current_vis) diff --git a/lux/action/filter.py b/lux/action/filter.py index e8833b0f..c6def0dc 100644 --- a/lux/action/filter.py +++ b/lux/action/filter.py @@ -21,7 +21,7 @@ from lux.utils.utils import get_filter_specs -def filter(ldf): +def add_filter(ldf): """ Iterates over all possible values of a categorical variable and generates visualizations where each categorical value filters the data. diff --git a/lux/action/row_group.py b/lux/action/row_group.py index 01ab2a32..db27ab86 100644 --- a/lux/action/row_group.py +++ b/lux/action/row_group.py @@ -45,7 +45,13 @@ def row_group(ldf): # rowdf.cardinality["index"]=len(rowdf) # if isinstance(ldf.columns,pd.DatetimeIndex): # rowdf.data_type_lookup[dim_name]="temporal" - vis = Vis([dim_name, lux.Clause(row.name, data_model="measure", aggregation=None)], rowdf) + vis = Vis( + [ + dim_name, + lux.Clause(row.name, data_model="measure", aggregation=None), + ], + rowdf, + ) collection.append(vis) vlst = VisList(collection) # Note that we are not computing interestingness score here because we want to preserve the arrangement of the aggregated data diff --git a/lux/core/frame.py b/lux/core/frame.py index e4ed9e3e..a41db6be 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -384,7 +384,7 @@ def _append_rec(self, rec_infolist, recommendations: Dict): def maintain_recs(self): # `rec_df` is the dataframe to generate the recommendations on # check to see if globally defined actions have been registered/removed - if lux.update_actions["flag"] == True: + if lux.config.update_actions["flag"] == True: self._recs_fresh = False show_prev = False # flag indicating whether rec_df is showing previous 
df or current self if self._prev is not None: @@ -412,50 +412,23 @@ def maintain_recs(self): # Check that recs has not yet been computed if not hasattr(rec_df, "_recs_fresh") or not rec_df._recs_fresh: rec_infolist = [] - from lux.action.custom import custom - from lux.action.custom import custom_actions - from lux.action.correlation import correlation - from lux.action.univariate import univariate - from lux.action.enhance import enhance - from lux.action.filter import filter - from lux.action.generalize import generalize from lux.action.row_group import row_group from lux.action.column_group import column_group + # TODO: Rewrite these as register action inside default actions if rec_df.pre_aggregated: if rec_df.columns.name is not None: rec_df._append_rec(rec_infolist, row_group(rec_df)) rec_df._append_rec(rec_infolist, column_group(rec_df)) else: - if rec_df._recommendation == {}: - # display conditions for default actions - no_vis = lambda ldf: (ldf.current_vis is None) or ( - ldf.current_vis is not None and len(ldf.current_vis) == 0 - ) - one_current_vis = ( - lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) == 1 - ) - multiple_current_vis = ( - lambda ldf: ldf.current_vis is not None and len(ldf.current_vis) > 1 - ) - - # globally register default actions - lux.register_action("correlation", correlation, no_vis) - lux.register_action("distribution", univariate, no_vis, "quantitative") - lux.register_action("occurrence", univariate, no_vis, "nominal") - lux.register_action("temporal", univariate, no_vis, "temporal") - - lux.register_action("Enhance", enhance, one_current_vis) - lux.register_action("Filter", filter, one_current_vis) - lux.register_action("Generalize", generalize, one_current_vis) - - lux.register_action("Custom", custom, multiple_current_vis) + # if rec_df._recommendation == {}: + from lux.action.custom import custom_actions # generate vis from globally registered actions and append to dataframe custom_action_collection = 
custom_actions(rec_df) for rec in custom_action_collection: rec_df._append_rec(rec_infolist, rec) - lux.update_actions["flag"] = False + lux.config.update_actions["flag"] = False # Store _rec_info into a more user-friendly dictionary form rec_df._recommendation = {} diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index fb3a3b13..cf81045e 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -195,7 +195,11 @@ def weighted_correlation(x, y, w): def deviation_from_overall( - vis: Vis, ldf: LuxDataFrame, filter_specs: list, msr_attribute: str, exclude_nan: bool = True + vis: Vis, + ldf: LuxDataFrame, + filter_specs: list, + msr_attribute: str, + exclude_nan: bool = True, ) -> int: """ Difference in bar chart/histogram shape from overall chart diff --git a/lux/vislib/altair/BarChart.py b/lux/vislib/altair/BarChart.py index 91e17b29..66c36d02 100644 --- a/lux/vislib/altair/BarChart.py +++ b/lux/vislib/altair/BarChart.py @@ -61,7 +61,10 @@ def initialize_chart(self): axis=alt.Axis(labelOverlap=True, title=y_attr_abv), ) x_attr_field = alt.X( - x_attr.attribute, type=x_attr.data_type, title=agg_title, axis=alt.Axis(title=agg_title) + x_attr.attribute, + type=x_attr.data_type, + title=agg_title, + axis=alt.Axis(title=agg_title), ) y_attr_field_code = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{y_attr_abv}'))" x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{agg_title}'))" @@ -80,7 +83,10 @@ def initialize_chart(self): ) x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{x_attr_abv}'))" y_attr_field = alt.Y( - y_attr.attribute, type=y_attr.data_type, title=agg_title, axis=alt.Axis(title=agg_title) + y_attr.attribute, + type=y_attr.data_type, + title=agg_title, + axis=alt.Axis(title=agg_title), 
) y_attr_field_code = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{agg_title}'))" if x_attr.sort == "ascending": diff --git a/lux/vislib/altair/LineChart.py b/lux/vislib/altair/LineChart.py index 54b28c46..c711127a 100644 --- a/lux/vislib/altair/LineChart.py +++ b/lux/vislib/altair/LineChart.py @@ -62,14 +62,20 @@ def initialize_chart(self): agg_title = get_agg_title(y_attr) x_attr_spec = alt.X(x_attr.attribute, type=x_attr.data_type, axis=alt.Axis(title=x_attr_abv)) y_attr_spec = alt.Y( - y_attr.attribute, type=y_attr.data_type, title=agg_title, axis=alt.Axis(title=y_attr_abv) + y_attr.attribute, + type=y_attr.data_type, + title=agg_title, + axis=alt.Axis(title=y_attr_abv), ) x_attr_field_code = f"alt.X('{x_attr.attribute}', type = '{x_attr.data_type}', axis=alt.Axis(title='{x_attr_abv}'))" y_attr_fieldCode = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{y_attr_abv}')" else: agg_title = get_agg_title(x_attr) x_attr_spec = alt.X( - x_attr.attribute, type=x_attr.data_type, title=agg_title, axis=alt.Axis(title=x_attr_abv) + x_attr.attribute, + type=x_attr.data_type, + title=agg_title, + axis=alt.Axis(title=x_attr_abv), ) y_attr_spec = alt.Y(y_attr.attribute, type=y_attr.data_type, axis=alt.Axis(title=y_attr_abv)) x_attr_field_code = f"alt.X('{x_attr.attribute}', type = '{x_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{x_attr_abv}')" diff --git a/tests/test_action.py b/tests/test_action.py index bbe161e0..97aa732c 100644 --- a/tests/test_action.py +++ b/tests/test_action.py @@ -214,10 +214,16 @@ def test_similarity(global_var): ranked_list = df.recommendation["Similarity"] japan_vis = list( - filter(lambda vis: vis.get_attr_by_attr_name("Origin")[0].value == "Japan", ranked_list) + filter( + lambda vis: vis.get_attr_by_attr_name("Origin")[0].value == "Japan", + ranked_list, + ) )[0] europe_vis = list( - filter(lambda vis: 
vis.get_attr_by_attr_name("Origin")[0].value == "Europe", ranked_list) + filter( + lambda vis: vis.get_attr_by_attr_name("Origin")[0].value == "Europe", + ranked_list, + ) )[0] assert japan_vis.score > europe_vis.score df.clear_intent() @@ -231,14 +237,24 @@ def test_similarity2(): df["Month"] = pd.to_datetime(df["Month"], format="%m") df["Year"] = pd.to_datetime(df["Year"], format="%Y") - df.intent = [lux.Clause("Year"), lux.Clause("PctForeclosured"), lux.Clause("City=Crofton")] + df.intent = [ + lux.Clause("Year"), + lux.Clause("PctForeclosured"), + lux.Clause("City=Crofton"), + ] ranked_list = df.recommendation["Similarity"] morrisville_vis = list( - filter(lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Morrisville", ranked_list) + filter( + lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Morrisville", + ranked_list, + ) )[0] watertown_vis = list( - filter(lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Watertown", ranked_list) + filter( + lambda vis: vis.get_attr_by_attr_name("City")[0].value == "Watertown", + ranked_list, + ) )[0] assert morrisville_vis.score > watertown_vis.score diff --git a/tests/test_config.py b/tests/test_config.py index b5906c6b..644c4628 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -42,9 +42,9 @@ def contain_horsepower(df): return False if validator: - lux.register_action("bars", random_categorical, contain_horsepower) + lux.config.register_action("bars", random_categorical, contain_horsepower) else: - lux.register_action("bars", random_categorical) + lux.config.register_action("bars", random_categorical) return df @@ -93,8 +93,8 @@ def test_no_validator(): def test_invalid_function(global_var): df = pd.read_csv("lux/data/car.csv") - with pytest.raises(ValueError, match="Value must be a callable"): - lux.register_action("bars", "not a Callable") + with pytest.raises(ValueError, match="Action must be a callable"): + lux.config.register_action("bars", "not a Callable") def 
test_invalid_validator(global_var): @@ -112,8 +112,8 @@ def random_categorical(ldf): "collection": vlist, } - with pytest.raises(ValueError, match="Value must be a callable"): - lux.register_action("bars", random_categorical, "not a Callable") + with pytest.raises(ValueError, match="Display condition must be a callable"): + lux.config.register_action("bars", random_categorical, "not a Callable") def test_remove_action(): @@ -128,7 +128,7 @@ def test_remove_action(): len(df.recommendation["bars"]) > 0, "Bars should be rendered after it has been registered with correct intent.", ) - lux.remove_action("bars") + lux.config.remove_action("bars") df._repr_html_() assert ( "bars" not in df.recommendation, @@ -138,29 +138,29 @@ def test_remove_action(): def test_remove_invalid_action(global_var): - df = pytest.car_df + df = pd.read_csv("lux/data/car.csv") with pytest.raises(ValueError, match="Option 'bars' has not been registered"): - lux.remove_action("bars") + lux.config.remove_action("bars") +# TODO: This test does not pass in pytest but is working in Jupyter notebook. 
def test_remove_default_actions(global_var): - # df = pytest.car_df - df = pd.read_csv("lux/data/car.csv") + df = pytest.car_df df._repr_html_() - lux.remove_action("distribution") + lux.config.remove_action("distribution") df._repr_html_() assert "Distribution" not in df.recommendation - lux.remove_action("occurrence") + lux.config.remove_action("occurrence") df._repr_html_() assert "Occurrence" not in df.recommendation - lux.remove_action("temporal") + lux.config.remove_action("temporal") df._repr_html_() assert "Temporal" not in df.recommendation - lux.remove_action("correlation") + lux.config.remove_action("correlation") df._repr_html_() assert "Correlation" not in df.recommendation @@ -179,6 +179,10 @@ def test_remove_default_actions(global_var): assert len(df.recommendation["bars"]) > 0 df.clear_intent() + from lux.action.default import register_default_actions + + register_default_actions() + def test_set_default_plot_config(): def change_color_make_transparent_add_title(chart): diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py index bb7000b2..e861769b 100644 --- a/tests/test_pandas_coverage.py +++ b/tests/test_pandas_coverage.py @@ -171,7 +171,8 @@ def test_groupby_agg_big(global_var): assert len(new_df.cardinality) == 8 year_vis = list( filter( - lambda vis: vis.get_attr_by_attr_name("Year") != [], new_df.recommendation["Column Groups"] + lambda vis: vis.get_attr_by_attr_name("Year") != [], + new_df.recommendation["Column Groups"], ) )[0] assert year_vis.mark == "bar" @@ -179,7 +180,10 @@ def test_groupby_agg_big(global_var): new_df = new_df.T new_df._repr_html_() year_vis = list( - filter(lambda vis: vis.get_attr_by_attr_name("Year") != [], new_df.recommendation["Row Groups"]) + filter( + lambda vis: vis.get_attr_by_attr_name("Year") != [], + new_df.recommendation["Row Groups"], + ) )[0] assert year_vis.mark == "bar" assert year_vis.get_attr_by_channel("x")[0].attribute == "Year" From 778146813380109b134590395df46d9d7e97aec2 Mon 
Sep 17 00:00:00 2001 From: Doris Lee Date: Wed, 6 Jan 2021 17:32:04 +0800 Subject: [PATCH 10/28] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8958e96c..38098805 100644 --- a/README.md +++ b/README.md @@ -40,10 +40,10 @@ import lux import pandas as pd ``` -Then, Lux can be used as-is, without modifying any of your existing Pandas code. Here, we use Pandas's [read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html) command to load in a [dataset of colleges](https://collegescorecard.ed.gov/data/documentation/) and their properties. +Then, Lux can be used as-is, without modifying any of your existing Pandas code. Here, we use Pandas's [read_csv](https://pandas.pydata.org/pandas-docs/stable/generated/pandas.read_csv.html) command to load in a [dataset of colleges](https://github.com/lux-org/lux-datasets/blob/master/data/college.csv) and their properties. ```python - df = pd.read_csv("college.csv") + df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/college.csv") df ``` From d485c719ce312a1f601bc5f6c5bc78433881196c Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Thu, 7 Jan 2021 13:57:35 +0800 Subject: [PATCH 11/28] Series Bugfix for describe and convert_dtypes (#197) * bugfix for describe and convert_dtypes * added back metadata series test * black * default to pandas display when df.dtypes printed --- lux/core/frame.py | 2 +- lux/core/series.py | 18 +++++++----- lux/executor/PandasExecutor.py | 18 ++++-------- tests/test_nan.py | 2 ++ tests/test_pandas.py | 25 +++++++++------- tests/test_series.py | 53 ++++++++++++++++++++++++++++++++++ 6 files changed, 85 insertions(+), 33 deletions(-) create mode 100644 tests/test_series.py diff --git a/lux/core/frame.py b/lux/core/frame.py index a41db6be..08401ac6 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -155,7 +155,7 @@ def _set_item(self, key, value): def 
_infer_structure(self): # If the dataframe is very small and the index column is not a range index, then it is likely that this is an aggregated data is_multi_index_flag = self.index.nlevels != 1 - not_int_index_flag = self.index.dtype != "int64" + not_int_index_flag = not pd.api.types.is_integer_dtype(self.index) small_df_flag = len(self) < 100 self.pre_aggregated = (is_multi_index_flag or not_int_index_flag) and small_df_flag if "Number of Records" in self.columns: diff --git a/lux/core/series.py b/lux/core/series.py index aea13d0c..1e3c4f8c 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -16,6 +16,7 @@ import lux import warnings import traceback +import numpy as np class LuxSeries(pd.Series): @@ -45,14 +46,14 @@ def _constructor(self): def _constructor_expanddim(self): from lux.core.frame import LuxDataFrame - def f(*args, **kwargs): - df = LuxDataFrame(*args, **kwargs) - for attr in self._metadata: - df.__dict__[attr] = getattr(self, attr, None) - return df + # def f(*args, **kwargs): + # df = LuxDataFrame(*args, **kwargs) + # for attr in self._metadata: + # df.__dict__[attr] = getattr(self, attr, None) + # return df - f._get_axis_number = super(LuxSeries, self)._get_axis_number - return f + # f._get_axis_number = super(LuxSeries, self)._get_axis_number + return LuxDataFrame def to_pandas(self): import lux.core @@ -75,7 +76,8 @@ def __repr__(self): ldf = LuxDataFrame(self) try: - if ldf._pandas_only: + is_dtype_series = all(isinstance(val, np.dtype) for val in self.values) + if ldf._pandas_only or is_dtype_series: print(series_repr) ldf._pandas_only = False else: diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 9708d8eb..56422866 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -428,9 +428,7 @@ def compute_data_type(self, ldf: LuxDataFrame): ldf.data_type[attr] = "temporal" else: ldf.data_type[attr] = "nominal" - # for attr in list(df.dtypes[df.dtypes=="int64"].keys()): - # if 
self.cardinality[attr]>50: - if ldf.index.dtype != "int64" and ldf.index.name: + if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name: ldf.data_type[ldf.index.name] = "nominal" non_datetime_attrs = [] @@ -489,21 +487,15 @@ def compute_stats(self, ldf: LuxDataFrame): ldf.unique_values[attribute_repr] = list(ldf[attribute_repr].unique()) ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr]) - # commenting this optimization out to make sure I can filter by cardinality when showing recommended vis - - # if ldf.dtypes[attribute] != "float64":# and not pd.api.types.is_datetime64_ns_dtype(self.dtypes[attribute]): - # ldf.unique_values[attribute_repr] = list(ldf[attribute].unique()) - # ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute]) - # else: - # ldf.cardinality[attribute_repr] = 999 # special value for non-numeric attribute - - if ldf.dtypes[attribute] == "float64" or ldf.dtypes[attribute] == "int64": + if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype( + ldf.dtypes[attribute] + ): ldf._min_max[attribute_repr] = ( ldf[attribute].min(), ldf[attribute].max(), ) - if ldf.index.dtype != "int64": + if not pd.api.types.is_integer_dtype(ldf.index): index_column_name = ldf.index.name ldf.unique_values[index_column_name] = list(ldf.index) ldf.cardinality[index_column_name] = len(ldf.index) diff --git a/tests/test_nan.py b/tests/test_nan.py index b2d28fed..1701215f 100644 --- a/tests/test_nan.py +++ b/tests/test_nan.py @@ -22,11 +22,13 @@ def test_nan_column(global_var): df = pytest.college_df + old_geo = df["Geography"] df["Geography"] = np.nan df._repr_html_() for visList in df.recommendation.keys(): for vis in df.recommendation[visList]: assert vis.get_attr_by_attr_name("Geography") == [] + df["Geography"] = old_geo def test_nan_data_type_detection(): diff --git a/tests/test_pandas.py b/tests/test_pandas.py index b43cc1f9..26cd7333 100644 --- a/tests/test_pandas.py +++ 
b/tests/test_pandas.py @@ -16,17 +16,6 @@ import pytest import pandas as pd -# def test_df_to_series(): -# # Ensure metadata is kept when going from df to series -# df = pd.read_csv("lux/data/car.csv") -# df._repr_html_() # compute metadata -# assert df.cardinality is not None -# series = df["Weight"] -# assert isinstance(series,lux.core.series.LuxSeries), "Derived series is type LuxSeries." -# assert df["Weight"]._metadata == ['name','_intent', 'data_type_lookup', 'data_type', 'data_model_lookup', 'data_model', 'unique_values', 'cardinality', 'min_max', '_current_vis', '_widget', '_recommendation'], "Metadata is lost when going from Dataframe to Series." -# assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." -# assert series.name == "Weight", "Pandas Series original `name` property not retained." - def test_head_tail(global_var): df = pytest.car_df @@ -44,3 +33,17 @@ def test_head_tail(global_var): "Lux is visualizing the previous version of the dataframe before you applied tail." in df._message.to_html() ) + + +def test_describe(global_var): + df = pytest.college_df + summary = df.describe() + summary._repr_html_() + assert len(summary.recommendation["Column Groups"]) == len(summary.columns) == 10 + + +def test_convert_dtype(global_var): + df = pytest.college_df + cdf = df.convert_dtypes() + cdf._repr_html_() + assert list(cdf.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"] diff --git a/tests/test_series.py b/tests/test_series.py new file mode 100644 index 00000000..62a4697f --- /dev/null +++ b/tests/test_series.py @@ -0,0 +1,53 @@ +# Copyright 2019-2020 The Lux Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from .context import lux +import pytest +import pandas as pd +import warnings + + +def test_df_to_series(): + # Ensure metadata is kept when going from df to series + df = pd.read_csv("lux/data/car.csv") + df._repr_html_() # compute metadata + assert df.cardinality is not None + series = df["Weight"] + assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." + print(df["Weight"]._metadata) + assert df["Weight"]._metadata == [ + "_intent", + "data_type", + "unique_values", + "cardinality", + "_rec_info", + "_pandas_only", + "_min_max", + "plot_config", + "_current_vis", + "_widget", + "_recommendation", + "_prev", + "_history", + "_saved_export", + "name", + ], "Metadata is lost when going from Dataframe to Series." + assert df.cardinality is not None, "Metadata is lost when going from Dataframe to Series." + assert series.name == "Weight", "Pandas Series original `name` property not retained." 
+ + +def test_print_dtypes(global_var): + df = pytest.college_df + with warnings.catch_warnings(record=True) as w: + print(df.dtypes) + assert len(w) == 0, "Warning displayed when printing dtypes" From 459b4bf35325cff9fcbb0ce401e007b88778f9f9 Mon Sep 17 00:00:00 2001 From: jinimukh <46768380+jinimukh@users.noreply.github.com> Date: Thu, 7 Jan 2021 01:31:20 -0800 Subject: [PATCH 12/28] Update Lux Docs (#195) * add black to travis * reformat all code and adjust test * remove .idea * fix contributing doc * small change in contributing * update * reformat, update command to fix version * remove dev dependencies * first pass -- inline comments * _config/config.py * delete test notebook * action * line length 105 * executor * interestingness * processor * vislib * tests, travis, CONTRIBUTING * .format () changed * replace tabs with escape chars * update using black * more rewrites and merges into single line * update pyproject.toml and makefile * coalesce data_types into data_type_lookup * black reformat * changed to better variable names * lux not defined error * fixed * black format * config doc updated * fix link for executor * more links * fixed overview * more links fixed * pandas methods no longer included * updates to some docstrings * black reformat * minor fixes * minor fix Co-authored-by: Doris Lee --- doc/conf.py | 2 +- doc/source/advanced/architecture.rst | 2 +- doc/source/advanced/date.rst | 2 +- doc/source/advanced/interestingness.rst | 33 +-- doc/source/getting_started/overview.rst | 18 +- doc/source/guide/FAQ.rst | 29 ++- doc/source/guide/intent.rst | 5 +- doc/source/guide/style.rst | 2 + doc/source/reference/API.rst | 22 +- doc/source/reference/config.rst | 48 +++- .../gen/lux._config.config.Config.rst | 34 +++ .../gen/lux.core.frame.LuxDataFrame.rst | 4 +- .../gen/lux.core.series.LuxSeries.rst | 234 ++++++++++++++++++ .../gen/lux.executor.Executor.Executor.rst | 2 + ...executor.PandasExecutor.PandasExecutor.rst | 2 + 
.../lux.executor.SQLExecutor.SQLExecutor.rst | 2 + .../reference/gen/lux.history.event.Event.rst | 22 ++ .../gen/lux.history.history.Event.rst | 22 ++ .../gen/lux.history.history.History.rst | 23 ++ doc/source/reference/gen/lux.vis.Vis.Vis.rst | 2 +- doc/source/reference/lux._config.config.rst | 9 + doc/source/reference/lux.action.rst | 19 +- doc/source/reference/lux.core.rst | 15 +- doc/source/reference/lux.rst | 1 - doc/source/reference/lux.utils.rst | 8 - lux/_config/config.py | 56 ++++- lux/core/frame.py | 77 +++--- lux/core/series.py | 18 +- 28 files changed, 580 insertions(+), 133 deletions(-) create mode 100644 doc/source/reference/gen/lux._config.config.Config.rst create mode 100644 doc/source/reference/gen/lux.core.series.LuxSeries.rst create mode 100644 doc/source/reference/gen/lux.history.event.Event.rst create mode 100644 doc/source/reference/gen/lux.history.history.Event.rst create mode 100644 doc/source/reference/gen/lux.history.history.History.rst create mode 100644 doc/source/reference/lux._config.config.rst diff --git a/doc/conf.py b/doc/conf.py index 03862439..cafb3786 100644 --- a/doc/conf.py +++ b/doc/conf.py @@ -63,7 +63,7 @@ "sphinx_automodapi.automodsumm", ] -autodoc_default_flags = ["members", "inherited-members"] +autodoc_default_flags = ["members", "no-undoc-members"] autodoc_member_order = "groupwise" autosummary_generate = True numpydoc_show_class_members = False diff --git a/doc/source/advanced/architecture.rst b/doc/source/advanced/architecture.rst index f257cb34..0b2c51f4 100644 --- a/doc/source/advanced/architecture.rst +++ b/doc/source/advanced/architecture.rst @@ -80,4 +80,4 @@ Number of Dimensions Number of Measures Mark Type Executor ---------- The data executor populates each Vis with a subset of the dataframe based on the specified intent. -You can learn more about executors in Lux `here `_. \ No newline at end of file +You can learn more about executors in Lux `here `_. 
\ No newline at end of file diff --git a/doc/source/advanced/date.rst b/doc/source/advanced/date.rst index 6adf4028..901ecf84 100644 --- a/doc/source/advanced/date.rst +++ b/doc/source/advanced/date.rst @@ -98,7 +98,7 @@ Below we look at an example stocks dataset that also has `date` field with each .. code-block:: python - df = pd.read_csv("../../lux/data/stocks.csv") + df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/stocks.csv?raw=true") df.dtypes diff --git a/doc/source/advanced/interestingness.rst b/doc/source/advanced/interestingness.rst index 0827ab65..8581a8ea 100644 --- a/doc/source/advanced/interestingness.rst +++ b/doc/source/advanced/interestingness.rst @@ -1,24 +1,24 @@ -********************** +******************************* Interestingness Scoring -********************** +******************************* In Lux, recommended visualizations are scored and ranked based on their statistical properties. Lux uses various standard metrics for determining how interesting a visualization is. The choice of an interestingness metric is dependent on the chart type, as shown in the following table. -+----------------+---------+------------------------------------------------------------------+ -| Chart Type | Filter? | Function | -+================+=========+==================================================================+ -| Bar/Line Chart | ✔ | :func:`lux.interestingness.interestingness.unevenness` | -| +---------+------------------------------------------------------------------+ ++----------------+---------+--------------------------------------------------------------------+ +| Chart Type | Filter? 
| Function | ++================+=========+====================================================================+ +| Bar/Line Chart | ✔ | :func:`lux.interestingness.interestingness.unevenness` | +| +---------+--------------------------------------------------------------------+ | | X | :func:`lux.interestingness.interestingness.deviation_from_overall` | -+----------------+---------+------------------------------------------------------------------+ -| Histogram | ✔ | :func:`lux.interestingness.interestingness.skewness` | -| +---------+------------------------------------------------------------------+ ++----------------+---------+--------------------------------------------------------------------+ +| Histogram | ✔ | :func:`lux.interestingness.interestingness.skewness` | +| +---------+--------------------------------------------------------------------+ | | X | :func:`lux.interestingness.interestingness.deviation_from_overall` | -+----------------+---------+------------------------------------------------------------------+ -| Scatterplot | ✔/X | :func:`lux.interestingness.interestingness.monotonicity` | -+----------------+---------+------------------------------------------------------------------+ ++----------------+---------+--------------------------------------------------------------------+ +| Scatterplot | ✔/X | :func:`lux.interestingness.interestingness.monotonicity` | ++----------------+---------+--------------------------------------------------------------------+ Bar Chart Interestingness ========================= @@ -30,7 +30,7 @@ Bar charts without filters: Unevenness A chart is scored higher if it is more uneven, indicating high variation in the individual bar values in the chart. The score is computed based -on the difference between the value of the bar chart .. math::`V` and the flat uniform distribution .. math::`V_{flat}`. +on the difference between the value of the bar chart :math:`V` and the flat uniform distribution :math:`V_{flat}`. 
The difference is captured via the Euclidean distance (L2 norm). @@ -42,6 +42,7 @@ The difference is captured via the Euclidean distance (L2 norm). .. Example: "Occurrence" recommendation .. _barWithFilter: + Bar charts with filters: Deviation from Overall ----------------------------------------------- @@ -77,6 +78,7 @@ The skewness is computed based on `scipy.stats.skew `_] [`Binder `_] +.. note:: You can follow along this tutorial in a Jupyter notebook. [`Github `_] [`Binder `_] This tutorial provides an overview of how you can use Lux in your data exploration workflow. @@ -25,8 +25,7 @@ Lux preserves the Pandas dataframe semantics -- which means that you can apply a df = pd.read_csv("lux/data/college.csv") -Lux is built on the philosophy that generating useful visualizations should be as simple as printing out a dataframe. -When you print out the dataframe in the notebook, you should see the default Pandas table display with an additional Toggle button. +To visualize your dataframe in Lux, simply print out the dataframe. You should see the default Pandas table display with an additional toggle button. .. code-block:: python @@ -37,7 +36,7 @@ When you print out the dataframe in the notebook, you should see the default Pan :align: center :alt: click on toggle, scroll on Correlation -By clicking on the Toggle button, you can now explore the data visually through Lux. You should see three tabs of visualizations recommended to you. +By clicking on the Toggle button, you can now explore the data visually through Lux. You should see several categories of visualizations recommended to you by browsing through the different tabs. .. image:: ../../../../lux-resources/doc_img/overview-2.gif :width: 700 @@ -75,7 +74,7 @@ As shown in the example above, by default, we display three types of actions sho :alt: Example of even and uneven category distributions -Refer to :doc:`this page <../advanced/action>` for details on different types of action in Lux. 
+Refer to :doc:`this page <../reference/lux.action>` for details on different types of action in Lux. Expressing Analysis Interest and Goals with User `Intent` ---------------------------------------------------------- @@ -111,7 +110,7 @@ You can specify a variety of things that you might be interested in, for example df.intent = ["MedianEarnings", "FundingModel=Public"] df -For more advance use of intent, refer to :doc:`this page <../getting_started/intent>` on how to specify the intent. +For more advanced use of intent, refer to :doc:`this page <../guide/intent>` on how to specify the intent. Steering Recommendations via User Intent ---------------------------------------- @@ -129,7 +128,7 @@ Given the updated intent, additional actions (Enhance and Filter) are generated. - {MedianEarnings, **AverageCost**} - {MedianEarnings, **AverageFacultySalary**}. -.. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/overview-4.png +.. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/overview-4.png?raw=true :width: 700 :align: center :alt: screenshot of Enhance @@ -140,10 +139,7 @@ Given the updated intent, additional actions (Enhance and Filter) are generated. - {MedianEarnings, **Region=Southeast**} - {MedianEarnings, **Region=Great Lakes**}. -.. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/overview-5.png +.. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/overview-5.png?raw=true :width: 700 :align: center :alt: screenshot of Filter - - -.. Lux is built on the principle that users should always be able to visualize and explore anything they specify, without having to think about how the visualization should look like. 
diff --git a/doc/source/guide/FAQ.rst b/doc/source/guide/FAQ.rst index 203f015c..c99d29ed 100644 --- a/doc/source/guide/FAQ.rst +++ b/doc/source/guide/FAQ.rst @@ -12,38 +12,38 @@ Note that you must perform :code:`import lux` before you load in or create the d What if my data is stored in a relational database? """""""""""""""""""""""""""""""""""""""""""""""""""""""" - Lux has `some limited support `_ for SQL (currently only tested for Postgres). We are actively working on extending Lux to databases. If you are interested in using this feature, please `contact us `_ for more information. + Lux has `some limited support `__ for SQL (currently only tested for Postgres). We are actively working on extending Lux to databases. If you are interested in using this feature, please `contact us `_ for more information. What do I do with date-related attributes in my dataset? """""""""""""""""""""""""""""""""""""""""""""""""""""""" - Lux supports a variety of temporal data types in Pandas. For more information on how to handle temporal data in Lux, refer to `the datetime guide `_. + Lux supports a variety of temporal data types in Pandas. For more information on how to handle temporal data in Lux, refer to `the datetime guide `__. How do I access all of the current recommendations shown in my widget? """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - The recommendations for Lux can be accessed via the :code:`recommendation` property of the dataframe (e.g., df.recommendation). + The recommendations for Lux can be accessed via the :code:`recommendation` property of the dataframe (e.g., :code:`df.recommendation`). How do I set the Lux widgets to show up on default? """""""""""""""""""""""""""""""""""""""""""""""""""""""" - By default, we show the Pandas display and users can use the toggle button to switch to the Lux display. 
The `default_display` property allows users to change the setting so that the Lux widget is set as the default view for future operations on the specified dataframe: + By default, we show the Pandas display and users can use the toggle button to switch to the Lux display. The :code:`default_display` property allows users to change the setting so that the Lux widget is set as the default view for future operations: .. code-block:: python - df.config.default_display = "lux" + lux.config.default_display = "lux" To switch back to Pandas as the default display: .. code-block:: python - df.config.default_display = "pandas" + lux.config.default_display = "pandas" I want to change the opacity of my chart, add title, change chart font size, etc. How do I modify chart settings? """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - To add custom plot settings to the recommendations, you can set the global :code:`plot_config` property. See `this tutorial `_ on how to configure chart properties. Lux currently only support chart modifications in Altair. + To add custom plot settings to the recommendations, you can set the :code:`lux.config.plot_config` property. See `this tutorial `__ on how to configure chart properties. Lux currently only supports chart modifications in Altair. How do I change aggregation functions, binning, or axis channels to non-default values? """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" To change the aggregation function to be something that is not average or set an attribute to display on the x-axis instead of y-axis, you can override the default values in the :code:`lux.Clause` specification. - To override automatically inferred properties, you can specify additional arguements inside `lux.Clause` to set the value of the Clause properties. See `this page `_ for more details. 
+ To override automatically inferred properties, you can specify additional arguments inside :py:class:`lux.vis.Clause` to set the value of the Clause properties. See `this page `__ for more details. I want to look at the default recommendations that were recommended to me, how can I get the dataframe to display those? """"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" @@ -66,7 +66,7 @@ How do I turn off Lux? How do I disable sampling and have Lux visualize the full dataset? """""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" - Lux displays a warning saying "Large dataframe detected: Lux is only visualizing a random sample". If you would like to disable sampling, you can run: + When visualizing large datasets, Lux may display a warning stating "`Large dataframe detected: Lux is only visualizing a random sample`". If you would like to disable sampling, you can run: .. code-block:: python @@ -79,12 +79,12 @@ How do I disable sampling and have Lux visualize the full dataset? lux.config.sampling = False df = pd.read_csv("...") - If you want to fine-tune the sampling parameters, you can edit :code:`lux.config.sampling_start` and :code:`lux.config.sampling_cap`. See `this page `_ for more details. + If you want to fine-tune the sampling parameters, you can edit :code:`lux.config.sampling_start` and :code:`lux.config.sampling_cap`. See `this page `__ for more details. Troubleshooting Tips -------------------- -To troubleshoot your Lux installation, we recommend cloning `this repo `_ and using one of the `demo notebooks `_ to test out Lux. +To troubleshoot your Lux installation, we recommend cloning `this repo `__ and using one of the `demo notebooks `__ to test out Lux. The Lux Jupyter widget does not show up when I print a dataframe. 
""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""" @@ -96,9 +96,10 @@ The Lux Jupyter widget does not show up when I print a dataframe. - Validating: OK - If you are able to import lux successfully and you do not see the "Toggle button" when you print the dataframe, it may be possible that Lux is not compatible with your browser. Lux is compatible with Google Chrome, but have not been extensively tested on Safari or Firefox. - - If you recieve the error message :code:`A Jupyter widget could not be displayed because the widget state could not be found.` This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the appropriate cells.`, you may want to restart the notebook and rerun the cell. + - If you recieve the error message :code:`A Jupyter widget could not be displayed because the widget state could not be found.` This could happen if the kernel storing the widget is no longer available, or if the widget state was not saved in the notebook. You may be able to create the widget by running the particular cell again. If this doesn't work, then you may want try restarting the notebook and rerun the cell. - If you receive the error message :code:`ModuleNotFoundError: No module named 'luxwidget'`, it is possible that your luxwidget and lux-api versions are not in sync. The latest version of lux-api requires luxwidget v0.1 or above. Try running the following code: - If you receive the error message :code:`PermissionError: [Errno 13] Permission denied.` during the execution of the command :code:`jupyter nbextension install --py luxwidget`, then you can add the flag :code:`--user` (:code:`jupyter nbextension enable --py --user luxwidget`). + - Alternatively, if none of the above works. You can try creating a fresh virtual environment and follow the `quick install instructions `_. .. 
code-block:: bash @@ -112,8 +113,6 @@ The Lux Jupyter widget does not show up when I print a dataframe. jupyter nbextension install --py luxwidget jupyter nbextension enable --py luxwidget - - Alternatively, you can also try creating a fresh virtual environment and follow the `quick install instructions `_. I'm not able to export my visualizations via the :code:`exported` property. @@ -140,7 +139,7 @@ I'm not able to export my visualizations via the :code:`exported` property. I have an issue that is not addressed by any of the FAQs. """""""""""""""""""""""""""""""""""""""""""""""""""""""""" -Please submit a `Github Issue `_ or ask a question on `Slack `_. +Please submit a `Github Issue `__ or ask a question on `Slack `__. .. Not Currently Supported .. - What do I do if I want to change the data type of an attribute? diff --git a/doc/source/guide/intent.rst b/doc/source/guide/intent.rst index eba6c287..0a1a8e59 100644 --- a/doc/source/guide/intent.rst +++ b/doc/source/guide/intent.rst @@ -107,8 +107,9 @@ Note that since there are three different visualizations that is generated based :alt: add screenshot You can specify to Lux that you are interested in learning more about colleges in New England. - In the resulting Filter action, we see that Lux suggests visualizations in other `Region`s as recommendations. - + + In the resulting Filter action, we see that Lux suggests visualizations in other `Region` as recommendations. + .. code-block:: python df.intent = ["Region=New England"] diff --git a/doc/source/guide/style.rst b/doc/source/guide/style.rst index 516f1e5a..58a0fa71 100644 --- a/doc/source/guide/style.rst +++ b/doc/source/guide/style.rst @@ -115,4 +115,6 @@ We want to decrease the opacity of scatterplots, but keep the opacity for the ot :width: 700 :align: center +.. note:: For now, if the visualization has already been rendered before, you will need to run `df.expire_recs()` to see the updated visualization. 
+ We can modify the scatterplot setting, without changing the settings for the other chart types. diff --git a/doc/source/reference/API.rst b/doc/source/reference/API.rst index 68b31def..42659464 100644 --- a/doc/source/reference/API.rst +++ b/doc/source/reference/API.rst @@ -4,17 +4,35 @@ API **** +Core Lux Objects +----------------- + +.. autosummary:: + :toctree: gen + :nosignatures: + + lux.core.frame.LuxDataFrame + lux.core.series.LuxSeries + +Configuration Options +---------------------- + +.. autosummary:: + :toctree: gen + :nosignatures: + + lux._config.config.Config + Basic API Interface ------------------- .. autosummary:: :toctree: gen :nosignatures: - + lux.vis.Vis.Vis lux.vis.VisList.VisList lux.vis.Vis.Clause - lux.core.frame.LuxDataFrame Advanced Internals (Dev) ------------------------- diff --git a/doc/source/reference/config.rst b/doc/source/reference/config.rst index a1474dc5..7b85b687 100644 --- a/doc/source/reference/config.rst +++ b/doc/source/reference/config.rst @@ -2,23 +2,22 @@ Configuration Settings *********************** -In Lux, users can customize various global settings to configure the behavior of Lux through :py:mod:`lux.config.Config`. This page documents some of the configurations that you can apply in Lux. +In Lux, users can customize various global settings to configure the behavior of Lux through :py:class:`lux.config.Config`. This page documents some of the configurations that you can apply in Lux. Change the default display of Lux ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -We can set the `default_display` of the global class 'Config' to change the default form of output. In the following block, we set it to 'lux,' therefore the VisList will display first. +We can set the :code:`default_display` to change whether the Pandas table or Lux widget is displayed by default. In the following block, we set the default display to 'lux', therefore the Lux widget will display first. .. 
code-block:: python - lux.config.default_display = "lux" # Set Lux as default display + lux.config.default_display = "lux" df .. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/display-1.png?raw=true :width: 700 :align: center - :alt: Retrieves a single attribute from Lux's Action Manager using its defined id. We can set the default_display back to 'pandas,' which would allow for the dataframe object to display first. You can still toggle to Lux/Pandas respectively using the 'Toggle' button. @@ -30,7 +29,6 @@ We can set the default_display back to 'pandas,' which would allow for the dataf .. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/display-2.png?raw=true :width: 700 :align: center - :alt: Retrieves a single attribute from Lux's Action Manager using its defined id. If you try to set the default_display to anything other than 'lux' or 'pandas,' a warning will be shown, and the display will default to the previous setting. @@ -42,7 +40,6 @@ If you try to set the default_display to anything other than 'lux' or 'pandas,' .. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/display-3.png?raw=true :width: 700 :align: center - :alt: Retrieves a single attribute from Lux's Action Manager using its defined id. Change the sampling parameters of Lux ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -72,3 +69,42 @@ We can disable this feature and revert back to using a scatter plot by running t .. code-block:: python lux.config.heatmap = False + + +Default Renderer +~~~~~~~~~~~~~~~~~ + +Charts in Lux are rendered using `Altair `__. We are working on supporting plotting via `matplotlib `__ and other plotting libraries. + +To change the default renderer, run the following code block: + +.. code-block:: python + + lux.config.renderer = "matplotlib" + +Plot Configurations +~~~~~~~~~~~~~~~~~~~ + +Altair supports plot configurations to be applied on top of the generated graphs. 
To set a default plot configuration, first write a function that can take in a `chart` and returns a `chart`. For example: + +.. code-block:: python + + def change_color_add_title(chart): + chart = chart.configure_mark(color="green") # change mark color to green + chart.title = "Custom Title" # add title to chart + return chart + +Then, set the `plot_config` to this function so that this function is applied to every plot generated. + +.. code-block:: python + + lux.config.plot_config = change_color_add_title + +The above results in the following changes: + +.. image:: https://github.com/lux-org/lux-resources/blob/master/doc_img/style-2.png?raw=true + :width: 600 + :align: center + +See `this page `__ for more details. + diff --git a/doc/source/reference/gen/lux._config.config.Config.rst b/doc/source/reference/gen/lux._config.config.Config.rst new file mode 100644 index 00000000..0000b36f --- /dev/null +++ b/doc/source/reference/gen/lux._config.config.Config.rst @@ -0,0 +1,34 @@ +lux.\_config.config.Config +========================== + +.. currentmodule:: lux._config.config + +.. autoclass:: Config + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~Config.__init__ + ~Config.set_SQL_connection + ~Config.set_executor_type + + + + + + .. rubric:: Attributes + + .. 
autosummary:: + + ~Config.default_display + ~Config.heatmap + ~Config.sampling + ~Config.sampling_cap + ~Config.sampling_start + + \ No newline at end of file diff --git a/doc/source/reference/gen/lux.core.frame.LuxDataFrame.rst b/doc/source/reference/gen/lux.core.frame.LuxDataFrame.rst index 71259efb..600daf83 100644 --- a/doc/source/reference/gen/lux.core.frame.LuxDataFrame.rst +++ b/doc/source/reference/gen/lux.core.frame.LuxDataFrame.rst @@ -171,11 +171,11 @@ lux.core.frame.LuxDataFrame ~LuxDataFrame.rsub ~LuxDataFrame.rtruediv ~LuxDataFrame.sample + ~LuxDataFrame.save_as_html ~LuxDataFrame.select_dtypes ~LuxDataFrame.sem - ~LuxDataFrame.set_SQL_connection + ~LuxDataFrame.set_SQL_table ~LuxDataFrame.set_axis - ~LuxDataFrame.set_executor_type ~LuxDataFrame.set_index ~LuxDataFrame.set_intent ~LuxDataFrame.set_intent_as_vis diff --git a/doc/source/reference/gen/lux.core.series.LuxSeries.rst b/doc/source/reference/gen/lux.core.series.LuxSeries.rst new file mode 100644 index 00000000..0f50d3e4 --- /dev/null +++ b/doc/source/reference/gen/lux.core.series.LuxSeries.rst @@ -0,0 +1,234 @@ +lux.core.series.LuxSeries +========================= + +.. currentmodule:: lux.core.series + +.. autoclass:: LuxSeries + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. 
autosummary:: + + ~LuxSeries.__init__ + ~LuxSeries.abs + ~LuxSeries.add + ~LuxSeries.add_prefix + ~LuxSeries.add_suffix + ~LuxSeries.agg + ~LuxSeries.aggregate + ~LuxSeries.align + ~LuxSeries.all + ~LuxSeries.any + ~LuxSeries.append + ~LuxSeries.apply + ~LuxSeries.argmax + ~LuxSeries.argmin + ~LuxSeries.argsort + ~LuxSeries.asfreq + ~LuxSeries.asof + ~LuxSeries.astype + ~LuxSeries.at_time + ~LuxSeries.autocorr + ~LuxSeries.backfill + ~LuxSeries.between + ~LuxSeries.between_time + ~LuxSeries.bfill + ~LuxSeries.bool + ~LuxSeries.clip + ~LuxSeries.combine + ~LuxSeries.combine_first + ~LuxSeries.compare + ~LuxSeries.convert_dtypes + ~LuxSeries.copy + ~LuxSeries.corr + ~LuxSeries.count + ~LuxSeries.cov + ~LuxSeries.cummax + ~LuxSeries.cummin + ~LuxSeries.cumprod + ~LuxSeries.cumsum + ~LuxSeries.describe + ~LuxSeries.diff + ~LuxSeries.display_pandas + ~LuxSeries.div + ~LuxSeries.divide + ~LuxSeries.divmod + ~LuxSeries.dot + ~LuxSeries.drop + ~LuxSeries.drop_duplicates + ~LuxSeries.droplevel + ~LuxSeries.dropna + ~LuxSeries.duplicated + ~LuxSeries.eq + ~LuxSeries.equals + ~LuxSeries.ewm + ~LuxSeries.expanding + ~LuxSeries.explode + ~LuxSeries.factorize + ~LuxSeries.ffill + ~LuxSeries.fillna + ~LuxSeries.filter + ~LuxSeries.first + ~LuxSeries.first_valid_index + ~LuxSeries.floordiv + ~LuxSeries.ge + ~LuxSeries.get + ~LuxSeries.groupby + ~LuxSeries.gt + ~LuxSeries.head + ~LuxSeries.hist + ~LuxSeries.idxmax + ~LuxSeries.idxmin + ~LuxSeries.infer_objects + ~LuxSeries.interpolate + ~LuxSeries.isin + ~LuxSeries.isna + ~LuxSeries.isnull + ~LuxSeries.item + ~LuxSeries.items + ~LuxSeries.iteritems + ~LuxSeries.keys + ~LuxSeries.kurt + ~LuxSeries.kurtosis + ~LuxSeries.last + ~LuxSeries.last_valid_index + ~LuxSeries.le + ~LuxSeries.lt + ~LuxSeries.mad + ~LuxSeries.map + ~LuxSeries.mask + ~LuxSeries.max + ~LuxSeries.mean + ~LuxSeries.median + ~LuxSeries.memory_usage + ~LuxSeries.min + ~LuxSeries.mod + ~LuxSeries.mode + ~LuxSeries.mul + ~LuxSeries.multiply + ~LuxSeries.ne + 
~LuxSeries.nlargest + ~LuxSeries.notna + ~LuxSeries.notnull + ~LuxSeries.nsmallest + ~LuxSeries.nunique + ~LuxSeries.pad + ~LuxSeries.pct_change + ~LuxSeries.pipe + ~LuxSeries.pop + ~LuxSeries.pow + ~LuxSeries.prod + ~LuxSeries.product + ~LuxSeries.quantile + ~LuxSeries.radd + ~LuxSeries.rank + ~LuxSeries.ravel + ~LuxSeries.rdiv + ~LuxSeries.rdivmod + ~LuxSeries.reindex + ~LuxSeries.reindex_like + ~LuxSeries.rename + ~LuxSeries.rename_axis + ~LuxSeries.reorder_levels + ~LuxSeries.repeat + ~LuxSeries.replace + ~LuxSeries.resample + ~LuxSeries.reset_index + ~LuxSeries.rfloordiv + ~LuxSeries.rmod + ~LuxSeries.rmul + ~LuxSeries.rolling + ~LuxSeries.round + ~LuxSeries.rpow + ~LuxSeries.rsub + ~LuxSeries.rtruediv + ~LuxSeries.sample + ~LuxSeries.searchsorted + ~LuxSeries.sem + ~LuxSeries.set_axis + ~LuxSeries.shift + ~LuxSeries.skew + ~LuxSeries.slice_shift + ~LuxSeries.sort_index + ~LuxSeries.sort_values + ~LuxSeries.squeeze + ~LuxSeries.std + ~LuxSeries.sub + ~LuxSeries.subtract + ~LuxSeries.sum + ~LuxSeries.swapaxes + ~LuxSeries.swaplevel + ~LuxSeries.tail + ~LuxSeries.take + ~LuxSeries.to_clipboard + ~LuxSeries.to_csv + ~LuxSeries.to_dict + ~LuxSeries.to_excel + ~LuxSeries.to_frame + ~LuxSeries.to_hdf + ~LuxSeries.to_json + ~LuxSeries.to_latex + ~LuxSeries.to_list + ~LuxSeries.to_markdown + ~LuxSeries.to_numpy + ~LuxSeries.to_pandas + ~LuxSeries.to_period + ~LuxSeries.to_pickle + ~LuxSeries.to_sql + ~LuxSeries.to_string + ~LuxSeries.to_timestamp + ~LuxSeries.to_xarray + ~LuxSeries.tolist + ~LuxSeries.transform + ~LuxSeries.transpose + ~LuxSeries.truediv + ~LuxSeries.truncate + ~LuxSeries.tshift + ~LuxSeries.tz_convert + ~LuxSeries.tz_localize + ~LuxSeries.unique + ~LuxSeries.unstack + ~LuxSeries.update + ~LuxSeries.value_counts + ~LuxSeries.var + ~LuxSeries.view + ~LuxSeries.where + ~LuxSeries.xs + + + + + + .. rubric:: Attributes + + .. 
autosummary:: + + ~LuxSeries.T + ~LuxSeries.array + ~LuxSeries.at + ~LuxSeries.attrs + ~LuxSeries.axes + ~LuxSeries.dtype + ~LuxSeries.dtypes + ~LuxSeries.empty + ~LuxSeries.hasnans + ~LuxSeries.iat + ~LuxSeries.iloc + ~LuxSeries.index + ~LuxSeries.is_monotonic + ~LuxSeries.is_monotonic_decreasing + ~LuxSeries.is_monotonic_increasing + ~LuxSeries.is_unique + ~LuxSeries.loc + ~LuxSeries.name + ~LuxSeries.nbytes + ~LuxSeries.ndim + ~LuxSeries.shape + ~LuxSeries.size + ~LuxSeries.values + + \ No newline at end of file diff --git a/doc/source/reference/gen/lux.executor.Executor.Executor.rst b/doc/source/reference/gen/lux.executor.Executor.Executor.rst index 71e7c0d2..c45473c6 100644 --- a/doc/source/reference/gen/lux.executor.Executor.Executor.rst +++ b/doc/source/reference/gen/lux.executor.Executor.Executor.rst @@ -15,12 +15,14 @@ lux.executor.Executor.Executor ~Executor.__init__ ~Executor.compute_data_model + ~Executor.compute_data_model_lookup ~Executor.compute_data_type ~Executor.compute_stats ~Executor.execute ~Executor.execute_aggregate ~Executor.execute_binning ~Executor.execute_filter + ~Executor.invert_data_type ~Executor.mapping ~Executor.reverseMapping diff --git a/doc/source/reference/gen/lux.executor.PandasExecutor.PandasExecutor.rst b/doc/source/reference/gen/lux.executor.PandasExecutor.PandasExecutor.rst index 83997a90..a65b633e 100644 --- a/doc/source/reference/gen/lux.executor.PandasExecutor.PandasExecutor.rst +++ b/doc/source/reference/gen/lux.executor.PandasExecutor.PandasExecutor.rst @@ -16,6 +16,7 @@ lux.executor.PandasExecutor.PandasExecutor ~PandasExecutor.__init__ ~PandasExecutor.apply_filter ~PandasExecutor.compute_data_model + ~PandasExecutor.compute_data_model_lookup ~PandasExecutor.compute_data_type ~PandasExecutor.compute_dataset_metadata ~PandasExecutor.compute_stats @@ -25,6 +26,7 @@ lux.executor.PandasExecutor.PandasExecutor ~PandasExecutor.execute_binning ~PandasExecutor.execute_filter ~PandasExecutor.execute_sampling + 
~PandasExecutor.invert_data_type ~PandasExecutor.mapping ~PandasExecutor.reverseMapping diff --git a/doc/source/reference/gen/lux.executor.SQLExecutor.SQLExecutor.rst b/doc/source/reference/gen/lux.executor.SQLExecutor.SQLExecutor.rst index c6c3f631..f5ddf2ec 100644 --- a/doc/source/reference/gen/lux.executor.SQLExecutor.SQLExecutor.rst +++ b/doc/source/reference/gen/lux.executor.SQLExecutor.SQLExecutor.rst @@ -15,12 +15,14 @@ lux.executor.SQLExecutor.SQLExecutor ~SQLExecutor.__init__ ~SQLExecutor.compute_data_model + ~SQLExecutor.compute_data_model_lookup ~SQLExecutor.compute_data_type ~SQLExecutor.compute_stats ~SQLExecutor.execute ~SQLExecutor.execute_aggregate ~SQLExecutor.execute_binning ~SQLExecutor.execute_filter + ~SQLExecutor.invert_data_type ~SQLExecutor.mapping ~SQLExecutor.reverseMapping diff --git a/doc/source/reference/gen/lux.history.event.Event.rst b/doc/source/reference/gen/lux.history.event.Event.rst new file mode 100644 index 00000000..6674286a --- /dev/null +++ b/doc/source/reference/gen/lux.history.event.Event.rst @@ -0,0 +1,22 @@ +lux.history.event.Event +======================= + +.. currentmodule:: lux.history.event + +.. autoclass:: Event + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~Event.__init__ + + + + + + \ No newline at end of file diff --git a/doc/source/reference/gen/lux.history.history.Event.rst b/doc/source/reference/gen/lux.history.history.Event.rst new file mode 100644 index 00000000..2b03f4e3 --- /dev/null +++ b/doc/source/reference/gen/lux.history.history.Event.rst @@ -0,0 +1,22 @@ +lux.history.history.Event +========================= + +.. currentmodule:: lux.history.history + +.. autoclass:: Event + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. 
autosummary:: + + ~Event.__init__ + + + + + + \ No newline at end of file diff --git a/doc/source/reference/gen/lux.history.history.History.rst b/doc/source/reference/gen/lux.history.history.History.rst new file mode 100644 index 00000000..e10bd1af --- /dev/null +++ b/doc/source/reference/gen/lux.history.history.History.rst @@ -0,0 +1,23 @@ +lux.history.history.History +=========================== + +.. currentmodule:: lux.history.history + +.. autoclass:: History + + + .. automethod:: __init__ + + + .. rubric:: Methods + + .. autosummary:: + + ~History.__init__ + ~History.append_event + + + + + + \ No newline at end of file diff --git a/doc/source/reference/gen/lux.vis.Vis.Vis.rst b/doc/source/reference/gen/lux.vis.Vis.Vis.rst index 39e2983b..dc79967d 100644 --- a/doc/source/reference/gen/lux.vis.Vis.Vis.rst +++ b/doc/source/reference/gen/lux.vis.Vis.Vis.rst @@ -22,10 +22,10 @@ lux.vis.Vis.Vis ~Vis.refresh_source ~Vis.remove_column_from_spec ~Vis.remove_filter_from_spec - ~Vis.to_code ~Vis.set_intent ~Vis.to_Altair ~Vis.to_VegaLite + ~Vis.to_code diff --git a/doc/source/reference/lux._config.config.rst b/doc/source/reference/lux._config.config.rst new file mode 100644 index 00000000..73f977f0 --- /dev/null +++ b/doc/source/reference/lux._config.config.rst @@ -0,0 +1,9 @@ +lux.config.config package +=================================== + +lux._config.config.Config module +--------------------------------- + +.. automodule:: lux._config.config.config + :members: + diff --git a/doc/source/reference/lux.action.rst b/doc/source/reference/lux.action.rst index 52ff79fe..b29055ab 100644 --- a/doc/source/reference/lux.action.rst +++ b/doc/source/reference/lux.action.rst @@ -10,7 +10,7 @@ lux.action.column\_group module .. automodule:: lux.action.column_group :members: :undoc-members: - :show-inheritance: + lux.action.correlation module ----------------------------- @@ -18,7 +18,7 @@ lux.action.correlation module .. 
automodule:: lux.action.correlation :members: :undoc-members: - :show-inheritance: + lux.action.custom module ------------------------ @@ -26,7 +26,6 @@ lux.action.custom module .. automodule:: lux.action.custom :members: :undoc-members: - :show-inheritance: lux.action.enhance module ------------------------- @@ -34,7 +33,6 @@ lux.action.enhance module .. automodule:: lux.action.enhance :members: :undoc-members: - :show-inheritance: lux.action.filter module ------------------------ @@ -42,7 +40,6 @@ lux.action.filter module .. automodule:: lux.action.filter :members: :undoc-members: - :show-inheritance: lux.action.generalize module ---------------------------- @@ -50,7 +47,6 @@ lux.action.generalize module .. automodule:: lux.action.generalize :members: :undoc-members: - :show-inheritance: lux.action.row\_group module ---------------------------- @@ -58,15 +54,6 @@ lux.action.row\_group module .. automodule:: lux.action.row_group :members: :undoc-members: - :show-inheritance: - -lux.action.similarity module ----------------------------- - -.. automodule:: lux.action.similarity - :members: - :undoc-members: - :show-inheritance: lux.action.univariate module ---------------------------- @@ -74,7 +61,6 @@ lux.action.univariate module .. automodule:: lux.action.univariate :members: :undoc-members: - :show-inheritance: Module contents @@ -83,4 +69,3 @@ Module contents .. automodule:: lux.action :members: :undoc-members: - :show-inheritance: diff --git a/doc/source/reference/lux.core.rst b/doc/source/reference/lux.core.rst index 93a8e7bf..f1d40000 100644 --- a/doc/source/reference/lux.core.rst +++ b/doc/source/reference/lux.core.rst @@ -9,14 +9,13 @@ lux.core.frame module .. automodule:: lux.core.frame :members: - :undoc-members: - :show-inheritance: + :exclude-members: head, describe, info, tail + -Module contents ---------------- - -.. automodule:: lux.core +lux.core.series module +----------------------- + +.. 
automodule:: lux.core.series :members: - :undoc-members: - :show-inheritance: + diff --git a/doc/source/reference/lux.rst b/doc/source/reference/lux.rst index 472a6abd..52732221 100644 --- a/doc/source/reference/lux.rst +++ b/doc/source/reference/lux.rst @@ -22,4 +22,3 @@ Module contents .. automodule:: lux :members: :undoc-members: - :show-inheritance: diff --git a/doc/source/reference/lux.utils.rst b/doc/source/reference/lux.utils.rst index 965d9dd1..5a3177fb 100644 --- a/doc/source/reference/lux.utils.rst +++ b/doc/source/reference/lux.utils.rst @@ -12,14 +12,6 @@ lux.utils.date\_utils module :undoc-members: :show-inheritance: -lux.utils.renderjson module ---------------------------- - -.. automodule:: lux.utils.renderjson - :members: - :undoc-members: - :show-inheritance: - lux.utils.utils module ---------------------- diff --git a/lux/_config/config.py b/lux/_config/config.py index 5c1a6e2c..419f9909 100644 --- a/lux/_config/config.py +++ b/lux/_config/config.py @@ -11,6 +11,10 @@ class Config: + """ + Class for Lux configurations applied globally across entire session + """ + def __init__(self): self._default_display = "pandas" self.renderer = "altair" @@ -29,6 +33,12 @@ def __init__(self): @property def sampling_cap(self): + """ + Parameters + ---------- + sample_number : int + Cap on the number of rows to sample. Must be larger than _sampling_start + """ return self._sampling_cap @sampling_cap.setter @@ -37,7 +47,7 @@ def sampling_cap(self, sample_number: int) -> None: Parameters ---------- sample_number : int - Cap on the number of rows to sample. Must be larger than _sampling_start + Cap on the number of rows to sample. Must be larger than _sampling_start """ if type(sample_number) == int: assert sample_number >= self._sampling_start @@ -50,6 +60,13 @@ def sampling_cap(self, sample_number: int) -> None: @property def sampling_start(self): + """ + Parameters + ---------- + sample_number : int + Number of rows required to begin sampling. 
Must be smaller or equal to _sampling_cap + + """ return self._sampling_start @sampling_start.setter @@ -58,7 +75,7 @@ def sampling_start(self, sample_number: int) -> None: Parameters ---------- sample_number : int - Number of rows required to begin sampling. Must be smaller or equal to _sampling_cap + Number of rows required to begin sampling. Must be smaller or equal to _sampling_cap """ if type(sample_number) == int: @@ -72,6 +89,12 @@ def sampling_start(self, sample_number: int) -> None: @property def sampling(self): + """ + Parameters + ---------- + sample_flag : bool + Whether or not sampling will occur. + """ return self._sampling_flag @sampling.setter @@ -80,7 +103,7 @@ def sampling(self, sample_flag: bool) -> None: Parameters ---------- sample_flag : bool - Whether or not sampling will occur. + Whether or not sampling will occur. """ if type(sample_flag) == bool: self._sampling_flag = sample_flag @@ -92,6 +115,12 @@ def sampling(self, sample_flag: bool) -> None: @property def heatmap(self): + """ + Parameters + ---------- + heatmap_flag : bool + Whether or not a heatmap will be used instead of a scatter plot. + """ return self._heatmap_flag @heatmap.setter @@ -100,7 +129,7 @@ def heatmap(self, heatmap_flag: bool) -> None: Parameters ---------- heatmap_flag : bool - Whether or not a heatmap will be used instead of a scatter plot. + Whether or not a heatmap will be used instead of a scatter plot. 
""" if type(heatmap_flag) == bool: self._heatmap_flag = heatmap_flag @@ -112,6 +141,13 @@ def heatmap(self, heatmap_flag: bool) -> None: @property def default_display(self): + """ + Set the widget display to show Pandas by default or Lux by default + Parameters + ---------- + type : str + Default display type, can take either the string `lux` or `pandas` (regardless of capitalization) + """ return self._default_display @default_display.setter @@ -121,7 +157,7 @@ def default_display(self, type: str) -> None: Parameters ---------- type : str - Default display type, can take either the string `lux` or `pandas` (regardless of capitalization) + Default display type, can take either the string `lux` or `pandas` (regardless of capitalization) """ if type.lower() == "lux": self._default_display = "lux" @@ -184,6 +220,13 @@ def remove_action(self, name: str = "") -> None: self.update_actions["flag"] = True def set_SQL_connection(self, connection): + """ + Sets SQL connection to a database + + Parameters: + connection : SQLAlchemy connectable, str, or sqlite3 connection + For more information, `see here `__ + """ self.SQLconnection = connection def set_executor_type(self, exe): @@ -204,9 +247,6 @@ def set_executor_type(self, exe): self.executor = PandasExecutor() - def set_SQL_connection(self, connection): - self.SQLconnection = connection - def warning_format(message, category, filename, lineno, file=None, line=None): return "%s:%s: %s:%s\n" % (filename, lineno, category.__name__, message) diff --git a/lux/core/frame.py b/lux/core/frame.py index 08401ac6..a91c8802 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -112,6 +112,9 @@ def maintain_metadata(self): self._metadata_fresh = True def expire_recs(self): + """ + Expires and resets all recommendations + """ self._recs_fresh = False self._recommendation = {} self.current_vis = None @@ -120,7 +123,9 @@ def expire_recs(self): self._sampled = None def expire_metadata(self): - # Set metadata as null + """ + Expire all 
saved metadata to trigger a recomputation the next time the data is required. + """ self._metadata_fresh = False self.data_type = None self.unique_values = None @@ -166,6 +171,19 @@ def _infer_structure(self): @property def intent(self): + """ + Main function to set the intent of the dataframe. + The intent input goes through the parser, so that the string inputs are parsed into a lux.Clause object. + + Parameters + ---------- + intent : List[str,Clause] + intent list, can be a mix of string shorthand or a lux.Clause object + + Notes + ----- + :doc:`../guide/intent` + """ return self._intent @intent.setter @@ -187,19 +205,6 @@ def clear_intent(self): self.expire_recs() def set_intent(self, intent: List[Union[str, Clause]]): - """ - Main function to set the intent of the dataframe. - The intent input goes through the parser, so that the string inputs are parsed into a lux.Clause object. - - Parameters - ---------- - intent : List[str,Clause] - intent list, can be a mix of string shorthand or a lux.Clause object - - Notes - ----- - :doc:`../guide/clause` - """ self.expire_recs() self._intent = intent self._parse_validate_compile_intent() @@ -226,11 +231,12 @@ def copy_intent(self): def set_intent_as_vis(self, vis: Vis): """ - Set intent of the dataframe as the Vis + Set intent of the dataframe based on the intent of a Vis Parameters ---------- vis : Vis + Input Vis object """ self.expire_recs() self._intent = vis._inferred_intent @@ -461,7 +467,9 @@ def exported(self) -> Union[Dict[str, VisList], VisList]: ----- Convert the _selectedVisIdxs dictionary into a programmable VisList Example _selectedVisIdxs : - {'Correlation': [0, 2], 'Occurrence': [1]} + + {'Correlation': [0, 2], 'Occurrence': [1]} + indicating the 0th and 2nd vis from the `Correlation` tab is selected, and the 1st vis from the `Occurrence` tab is selected. 
Returns @@ -653,27 +661,38 @@ def render_widget(self, renderer: str = "altair", input_current_vis=""): Generate a LuxWidget based on the LuxDataFrame Structure of widgetJSON: + { - 'current_vis': {}, - 'recommendation': [ - { - 'action': 'Correlation', - 'description': "some description", - 'vspec': [ - {Vega-Lite spec for vis 1}, - {Vega-Lite spec for vis 2}, - ... - ] - }, - ... repeat for other actions - ] + + 'current_vis': {}, + 'recommendation': [ + + { + + 'action': 'Correlation', + 'description': "some description", + 'vspec': [ + + {Vega-Lite spec for vis 1}, + {Vega-Lite spec for vis 2}, + ... + + ] + + }, + ... repeat for other actions + + ] + } + Parameters ---------- renderer : str, optional Choice of visualization rendering library, by default "altair" input_current_vis : lux.LuxDataFrame, optional User-specified current vis to override default Current Vis, by default + """ check_import_lux_widget() import luxwidget diff --git a/lux/core/series.py b/lux/core/series.py index 1e3c4f8c..3a4068d3 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -20,6 +20,10 @@ class LuxSeries(pd.Series): + """ + A subclass of pd.Series that supports all 1-D Series operations + """ + _metadata = [ "_intent", "data_type", @@ -55,14 +59,18 @@ def _constructor_expanddim(self): # f._get_axis_number = super(LuxSeries, self)._get_axis_number return LuxDataFrame - def to_pandas(self): + def to_pandas(self) -> pd.Series: + """ + Convert Lux Series to Pandas Series + + Returns + ------- + pd.Series + """ import lux.core return lux.core.originalSeries(self, copy=False) - def display_pandas(self): - return self.to_pandas() - def __repr__(self): from IPython.display import display from IPython.display import clear_output @@ -156,5 +164,5 @@ def on_button_clicked(b): stacklevel=2, ) warnings.warn(traceback.format_exc()) - display(self.display_pandas()) + display(self.to_pandas()) return "" From 3393b9ff5c240c624250a900f4aa68ec7caa994c Mon Sep 17 00:00:00 2001 From: Doris Lee 
Date: Thu, 7 Jan 2021 22:11:24 +0800 Subject: [PATCH 13/28] Supporting dataframe with integer columns (#203) * bugfix for describe and convert_dtypes * added back metadata series test * black * default to pandas display when df.dtypes printed * various fixes to support int columns --- lux/action/enhance.py | 4 +- lux/action/filter.py | 2 +- lux/action/generalize.py | 6 +-- lux/executor/PandasExecutor.py | 6 +-- lux/processor/Parser.py | 59 +++++++++++++++-------------- lux/processor/Validator.py | 12 +++--- lux/utils/utils.py | 15 ++++---- lux/vis/Clause.py | 20 +++++----- lux/vis/Vis.py | 11 ++++-- lux/vis/VisList.py | 23 +++++------ lux/vislib/altair/AltairChart.py | 14 ++++++- lux/vislib/altair/AltairRenderer.py | 11 +++--- lux/vislib/altair/BarChart.py | 31 +++++++-------- lux/vislib/altair/Heatmap.py | 14 ++++--- lux/vislib/altair/Histogram.py | 18 +++++---- lux/vislib/altair/LineChart.py | 30 +++++++++------ lux/vislib/altair/ScatterChart.py | 20 +++++----- tests/test_columns.py | 10 +++++ 18 files changed, 172 insertions(+), 134 deletions(-) diff --git a/lux/action/enhance.py b/lux/action/enhance.py index 0370b2ce..94a4ea60 100644 --- a/lux/action/enhance.py +++ b/lux/action/enhance.py @@ -37,8 +37,8 @@ def enhance(ldf): # Collect variables that already exist in the intent attr_specs = list(filter(lambda x: x.value == "" and x.attribute != "Record", ldf._intent)) fltr_str = [fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters] - attr_str = [clause.attribute for clause in attr_specs] - intended_attrs = '

' + ", ".join(attr_str + fltr_str) + "

" + attr_str = [str(clause.attribute) for clause in attr_specs] + intended_attrs = f'

{", ".join(attr_str + fltr_str)}

' if len(attr_specs) == 1: recommendation = { "action": "Enhance", diff --git a/lux/action/filter.py b/lux/action/filter.py index c6def0dc..6b27b843 100644 --- a/lux/action/filter.py +++ b/lux/action/filter.py @@ -91,7 +91,7 @@ def get_complementary_ops(fltr_op): else: intended_attrs = ", ".join( [ - clause.attribute + str(clause.attribute) for clause in ldf._intent if clause.value == "" and clause.attribute != "Record" ] diff --git a/lux/action/generalize.py b/lux/action/generalize.py index d95bcb26..91b83239 100644 --- a/lux/action/generalize.py +++ b/lux/action/generalize.py @@ -42,8 +42,8 @@ def generalize(ldf): filters = utils.get_filter_specs(ldf._intent) fltr_str = [fltr.attribute + fltr.filter_op + str(fltr.value) for fltr in filters] - attr_str = [clause.attribute for clause in attributes] - intended_attrs = '

' + ", ".join(attr_str + fltr_str) + "

" + attr_str = [str(clause.attribute) for clause in attributes] + intended_attrs = f'

{", ".join(attr_str + fltr_str)}

' recommendation = { "action": "Generalize", @@ -66,7 +66,7 @@ def generalize(ldf): temp_vis.remove_column_from_spec(column, remove_first=True) excluded_columns.append(column) output.append(temp_vis) - elif type(columns) == str: + else: if columns not in excluded_columns: temp_vis = Vis(ldf.copy_intent(), score=1) temp_vis.remove_column_from_spec(columns, remove_first=True) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 56422866..cb8a8ce7 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -90,11 +90,11 @@ def execute(vislist: VisList, ldf: LuxDataFrame): # Select relevant data based on attribute information attributes = set([]) for clause in vis._inferred_intent: - if clause.attribute: - if clause.attribute != "Record": - attributes.add(clause.attribute) + if clause.attribute != "Record": + attributes.add(clause.attribute) # TODO: Add some type of cap size on Nrows ? vis._vis_data = vis.data[list(attributes)] + if vis.mark == "bar" or vis.mark == "line": PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed) elif vis.mark == "histogram": diff --git a/lux/processor/Parser.py b/lux/processor/Parser.py index a6538e09..090c619d 100644 --- a/lux/processor/Parser.py +++ b/lux/processor/Parser.py @@ -46,7 +46,6 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]: ) import re - # intent = ldf.get_context() new_context = [] # checks for and converts users' string inputs into lux specifications for clause in intent: @@ -59,37 +58,40 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]: valid_values.append(v) temp_spec = Clause(attribute=valid_values) new_context.append(temp_spec) - elif isinstance(clause, str): - # case where user specifies a filter - if "=" in clause: - eqInd = clause.index("=") - var = clause[0:eqInd] - if "|" in clause: - values = clause[eqInd + 1 :].split("|") - for v in values: - # if v in ldf.unique_values[var]: #TODO: Move validation check to Validator 
- valid_values.append(v) + elif isinstance(clause, Clause): + new_context.append(clause) + else: + if isinstance(clause, str): + # case where user specifies a filter + if "=" in clause: + eqInd = clause.index("=") + var = clause[0:eqInd] + if "|" in clause: + values = clause[eqInd + 1 :].split("|") + for v in values: + # if v in ldf.unique_values[var]: #TODO: Move validation check to Validator + valid_values.append(v) + else: + valid_values = clause[eqInd + 1 :] + # if var in list(ldf.columns): #TODO: Move validation check to Validator + temp_spec = Clause(attribute=var, filter_op="=", value=valid_values) + new_context.append(temp_spec) + # case where user specifies a variable else: - valid_values = clause[eqInd + 1 :] - # if var in list(ldf.columns): #TODO: Move validation check to Validator - temp_spec = Clause(attribute=var, filter_op="=", value=valid_values) - new_context.append(temp_spec) - # case where user specifies a variable + if "|" in clause: + values = clause.split("|") + for v in values: + # if v in list(ldf.columns): #TODO: Move validation check to Validator + valid_values.append(v) + else: + valid_values = clause + temp_spec = Clause(attribute=valid_values) + new_context.append(temp_spec) else: - if "|" in clause: - values = clause.split("|") - for v in values: - # if v in list(ldf.columns): #TODO: Move validation check to Validator - valid_values.append(v) - else: - valid_values = clause - temp_spec = Clause(attribute=valid_values) + temp_spec = Clause(attribute=clause) new_context.append(temp_spec) - elif type(clause) is Clause: - new_context.append(clause) - intent = new_context - # ldf._intent = new_context + intent = new_context for clause in intent: if clause.description: # TODO: Move validation check to Validator @@ -112,4 +114,3 @@ def parse(intent: List[Union[Clause, str]]) -> List[Clause]: else: # then it is probably a value clause.value = clause.description return intent - # ldf._intent = intent diff --git a/lux/processor/Validator.py 
b/lux/processor/Validator.py index c72dc63b..2550ac31 100644 --- a/lux/processor/Validator.py +++ b/lux/processor/Validator.py @@ -57,9 +57,7 @@ def validate_intent(intent: List[Clause], ldf: LuxDataFrame) -> None: def validate_clause(clause): warn_msg = "" - if not ( - (clause.attribute and clause.attribute == "?") or (clause.value and clause.value == "?") - ): + if not (clause.attribute == "?" or clause.value == "?" or clause.attribute == ""): if isinstance(clause.attribute, list): for attr in clause.attribute: if attr not in list(ldf.columns): @@ -69,7 +67,9 @@ def validate_clause(clause): else: if clause.attribute != "Record": # we don't value check datetime since datetime can take filter values that don't exactly match the exact TimeStamp representation - if clause.attribute and not is_datetime_string(clause.attribute): + if isinstance(clause.attribute, str) and not is_datetime_string( + clause.attribute + ): if not clause.attribute in list(ldf.columns): search_val = clause.attribute match_attr = False @@ -80,9 +80,7 @@ def validate_clause(clause): warn_msg = f"\n- The input '{search_val}' looks like a value that belongs to the '{match_attr}' attribute. \n Please specify the value fully, as something like {match_attr}={search_val}." else: warn_msg = f"\n- The input attribute '{clause.attribute}' does not exist in the DataFrame. \n Please check your input intent for typos." 
- if clause.value and clause.attribute and clause.filter_op == "=": - import math - + if clause.value != "" and clause.attribute != "" and clause.filter_op == "=": # Skip check for NaN filter values if not lux.utils.utils.like_nan(clause.value): series = ldf[clause.attribute] diff --git a/lux/utils/utils.py b/lux/utils/utils.py index e19afcf4..3ae4503d 100644 --- a/lux/utils/utils.py +++ b/lux/utils/utils.py @@ -57,16 +57,17 @@ def check_import_lux_widget(): def get_agg_title(clause): + attr = str(clause.attribute) if clause.aggregation is None: - if len(clause.attribute) > 25: - return clause.attribute[:15] + "..." + clause.attribute[-10:] - return f"{clause.attribute}" - elif clause.attribute == "Record": + if len(attr) > 25: + return attr[:15] + "..." + attr[-10:] + return f"{attr}" + elif attr == "Record": return f"Number of Records" else: - if len(clause.attribute) > 15: - return f"{clause._aggregation_name.capitalize()} of {clause.attribute[:15]}..." - return f"{clause._aggregation_name.capitalize()} of {clause.attribute}" + if len(attr) > 15: + return f"{clause._aggregation_name.capitalize()} of {attr[:15]}..." 
+ return f"{clause._aggregation_name.capitalize()} of {attr}" def check_if_id_like(df, attribute): diff --git a/lux/vis/Clause.py b/lux/vis/Clause.py index ca7efd76..fcaf71d3 100644 --- a/lux/vis/Clause.py +++ b/lux/vis/Clause.py @@ -116,7 +116,7 @@ def to_string(self): if isinstance(self.attribute, list): clauseStr = "|".join(self.attribute) elif self.value == "": - clauseStr = self.attribute + clauseStr = str(self.attribute) else: clauseStr = f"{self.attribute}{self.filter_op}{self.value}" return clauseStr @@ -126,23 +126,23 @@ def __repr__(self): if self.description != "": attributes.append(f" description: {self.description}") if self.channel != "": - attributes.append(" channel: " + self.channel) - if len(self.attribute) != 0: - attributes.append(" attribute: " + str(self.attribute)) + attributes.append(f" channel: {self.channel}") + if self.attribute != "": + attributes.append(f" attribute: {str(self.attribute)}") if self.filter_op != "=": attributes.append(f" filter_op: {str(self.filter_op)}") if self.aggregation != "" and self.aggregation is not None: attributes.append(" aggregation: " + self._aggregation_name) if self.value != "" or len(self.value) != 0: - attributes.append(" value: " + str(self.value)) + attributes.append(f" value: {str(self.value)}") if self.data_model != "": - attributes.append(" data_model: " + self.data_model) + attributes.append(f" data_model: {self.data_model}") if len(self.data_type) != 0: - attributes.append(" data_type: " + str(self.data_type)) - if self.bin_size != None: - attributes.append(" bin_size: " + str(self.bin_size)) + attributes.append(f" data_type: {str(self.data_type)}") + if self.bin_size != 0: + attributes.append(f" bin_size: {str(self.bin_size)}") if len(self.exclude) != 0: - attributes.append(" exclude: " + str(self.exclude)) + attributes.append(f" exclude: {str(self.exclude)}") attributes[0] = " 0: - attribute = "BIN(" + clause.attribute + ")" + attribute = f"BIN({clause.attribute})" else: attribute = 
clause.attribute if clause.channel == "x": @@ -64,7 +64,7 @@ def __repr__(self): channels.extend(additional_channels) str_channels = "" for channel in channels: - str_channels += channel[0] + ": " + channel[1] + ", " + str_channels += f"{channel[0]}: {channel[1]}, " if filter_intents: return f"" @@ -324,5 +324,8 @@ def check_not_vislist_intent(self): for i in range(len(self._intent)): clause = self._intent[i] - if type(clause) != Clause and ("|" in clause or type(clause) == list or "?" in clause): + if isinstance(clause, str): + if "|" in clause or "?" in clause: + raise TypeError(syntaxMsg) + if isinstance(clause, list): raise TypeError(syntaxMsg) diff --git a/lux/vis/VisList.py b/lux/vis/VisList.py index 9e74961a..a346e6cc 100644 --- a/lux/vis/VisList.py +++ b/lux/vis/VisList.py @@ -133,16 +133,17 @@ def __repr__(self): for vis in self._collection: filter_intents = None for clause in vis._inferred_intent: + attr = str(clause.attribute) if clause.value != "": filter_intents = clause if clause.aggregation != "" and clause.aggregation is not None: - attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")" + attribute = clause._aggregation_name.upper() + f"({attr})" elif clause.bin_size > 0: - attribute = "BIN(" + clause.attribute + ")" + attribute = f"BIN({attr})" else: - attribute = clause.attribute - + attribute = attr + attribute = str(attribute) if clause.channel == "x" and len(x_channel) < len(attribute): x_channel = attribute if clause.channel == "y" and len(y_channel) < len(attribute): @@ -151,9 +152,9 @@ def __repr__(self): largest_mark = len(vis.mark) if ( filter_intents - and len(str(filter_intents.value)) + len(filter_intents.attribute) > largest_filter + and len(str(filter_intents.value)) + len(str(filter_intents.attribute)) > largest_filter ): - largest_filter = len(str(filter_intents.value)) + len(filter_intents.attribute) + largest_filter = len(str(filter_intents.value)) + len(str(filter_intents.attribute)) vis_repr = [] 
largest_x_length = len(x_channel) largest_y_length = len(y_channel) @@ -164,16 +165,16 @@ def __repr__(self): y_channel = "" additional_channels = [] for clause in vis._inferred_intent: + attr = str(clause.attribute) if clause.value != "": filter_intents = clause if clause.aggregation != "" and clause.aggregation is not None and vis.mark != "scatter": - attribute = clause._aggregation_name.upper() + "(" + clause.attribute + ")" + attribute = clause._aggregation_name.upper() + f"({attr})" elif clause.bin_size > 0: - attribute = "BIN(" + clause.attribute + ")" + attribute = f"BIN({attr})" else: - attribute = clause.attribute - + attribute = attr if clause.channel == "x": x_channel = attribute.ljust(largest_x_length) elif clause.channel == "y": @@ -197,7 +198,7 @@ def __repr__(self): if filter_intents: aligned_filter = ( " -- [" - + filter_intents.attribute + + str(filter_intents.attribute) + filter_intents.filter_op + str(filter_intents.value) + "]" diff --git a/lux/vislib/altair/AltairChart.py b/lux/vislib/altair/AltairChart.py index de4830f7..77fef1ec 100644 --- a/lux/vislib/altair/AltairChart.py +++ b/lux/vislib/altair/AltairChart.py @@ -87,7 +87,7 @@ def encode_color(self): timeUnit = compute_date_granularity(self.vis.data[color_attr_name]) self.chart = self.chart.encode( color=alt.Color( - color_attr_name, + str(color_attr_name), type=color_attr_type, timeUnit=timeUnit, title=color_attr_name, @@ -95,7 +95,9 @@ def encode_color(self): ) self.code += f"chart = chart.encode(color=alt.Color('{color_attr_name}',type='{color_attr_type}',timeUnit='{timeUnit}',title='{color_attr_name}'))" else: - self.chart = self.chart.encode(color=alt.Color(color_attr_name, type=color_attr_type)) + self.chart = self.chart.encode( + color=alt.Color(str(color_attr_name), type=color_attr_type) + ) self.code += f"chart = chart.encode(color=alt.Color('{color_attr_name}',type='{color_attr_type}'))\n" elif len(color_attr) > 1: raise ValueError( @@ -111,3 +113,11 @@ def add_title(self): def 
initialize_chart(self): return NotImplemented + + @classmethod + def sanitize_dataframe(self, df): + for attr in df.columns: + # Altair can not visualize non-string columns + # convert all non-string columns in to strings + df = df.rename(columns={attr: str(attr)}) + return df diff --git a/lux/vislib/altair/AltairRenderer.py b/lux/vislib/altair/AltairRenderer.py index 2957cd17..080dd8a2 100644 --- a/lux/vislib/altair/AltairRenderer.py +++ b/lux/vislib/altair/AltairRenderer.py @@ -66,11 +66,12 @@ def create_vis(self, vis, standalone=True): vis.data[attr].iloc[0], pd.Interval ): vis.data[attr] = vis.data[attr].astype(str) - if "." in attr: - attr_clause = vis.get_attr_by_attr_name(attr)[0] - # Suppress special character ".", not displayable in Altair - # attr_clause.attribute = attr_clause.attribute.replace(".", "") - vis._vis_data = vis.data.rename(columns={attr: attr.replace(".", "")}) + if isinstance(attr, str): + if "." in attr: + attr_clause = vis.get_attr_by_attr_name(attr)[0] + # Suppress special character ".", not displayable in Altair + # attr_clause.attribute = attr_clause.attribute.replace(".", "") + vis._vis_data = vis.data.rename(columns={attr: attr.replace(".", "")}) if vis.mark == "histogram": chart = Histogram(vis) elif vis.mark == "bar": diff --git a/lux/vislib/altair/BarChart.py b/lux/vislib/altair/BarChart.py index 66c36d02..30b8fcfe 100644 --- a/lux/vislib/altair/BarChart.py +++ b/lux/vislib/altair/BarChart.py @@ -40,28 +40,28 @@ def initialize_chart(self): x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] - x_attr_abv = x_attr.attribute - y_attr_abv = y_attr.attribute + x_attr_abv = str(x_attr.attribute) + y_attr_abv = str(y_attr.attribute) - if len(x_attr.attribute) > 25: + if len(x_attr_abv) > 25: x_attr_abv = x_attr.attribute[:15] + "..." + x_attr.attribute[-10:] - if len(y_attr.attribute) > 25: + if len(y_attr_abv) > 25: y_attr_abv = y_attr.attribute[:15] + "..." 
+ y_attr.attribute[-10:] - - x_attr.attribute = x_attr.attribute.replace(".", "") - y_attr.attribute = y_attr.attribute.replace(".", "") + if isinstance(x_attr.attribute, str): + x_attr.attribute = x_attr.attribute.replace(".", "") + if isinstance(y_attr.attribute, str): + y_attr.attribute = y_attr.attribute.replace(".", "") if x_attr.data_model == "measure": agg_title = get_agg_title(x_attr) measure_attr = x_attr.attribute - bar_attr = y_attr.attribute y_attr_field = alt.Y( - y_attr.attribute, + str(y_attr.attribute), type=y_attr.data_type, axis=alt.Axis(labelOverlap=True, title=y_attr_abv), ) x_attr_field = alt.X( - x_attr.attribute, + str(x_attr.attribute), type=x_attr.data_type, title=agg_title, axis=alt.Axis(title=agg_title), @@ -75,15 +75,14 @@ def initialize_chart(self): else: agg_title = get_agg_title(y_attr) measure_attr = y_attr.attribute - bar_attr = x_attr.attribute x_attr_field = alt.X( - x_attr.attribute, + str(x_attr.attribute), type=x_attr.data_type, axis=alt.Axis(labelOverlap=True, title=x_attr_abv), ) x_attr_field_code = f"alt.X('{x_attr.attribute}', type= '{x_attr.data_type}', axis=alt.Axis(labelOverlap=True, title='{x_attr_abv}'))" y_attr_field = alt.Y( - y_attr.attribute, + str(y_attr.attribute), type=y_attr.data_type, title=agg_title, axis=alt.Axis(title=agg_title), @@ -95,9 +94,11 @@ def initialize_chart(self): k = 10 self._topkcode = "" n_bars = len(self.data.iloc[:, 0].unique()) + if n_bars > k: # Truncating to only top k remaining_bars = n_bars - k - self.data = self.data.nlargest(k, measure_attr) + self.data = self.data.nlargest(k, columns=measure_attr) + self.data = AltairChart.sanitize_dataframe(self.data) self.text = alt.Chart(self.data).mark_text( x=155, y=142, @@ -116,7 +117,7 @@ def initialize_chart(self): text=f"+ {remaining_bars} more ..." 
) chart = chart + text\n""" - + self.data = AltairChart.sanitize_dataframe(self.data) chart = alt.Chart(self.data).mark_bar().encode(y=y_attr_field, x=x_attr_field) # TODO: tooltip messes up the count() bar charts diff --git a/lux/vislib/altair/Heatmap.py b/lux/vislib/altair/Heatmap.py index 4432de56..f83a3bbb 100644 --- a/lux/vislib/altair/Heatmap.py +++ b/lux/vislib/altair/Heatmap.py @@ -39,16 +39,18 @@ def initialize_chart(self): x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] - x_attr_abv = x_attr.attribute - y_attr_abv = y_attr.attribute + x_attr_abv = str(x_attr.attribute) + y_attr_abv = str(y_attr.attribute) - if len(x_attr.attribute) > 25: + if len(x_attr_abv) > 25: x_attr_abv = x_attr.attribute[:15] + "..." + x_attr.attribute[-10:] - if len(y_attr.attribute) > 25: + if len(y_attr_abv) > 25: y_attr_abv = y_attr.attribute[:15] + "..." + y_attr.attribute[-10:] - x_attr.attribute = x_attr.attribute.replace(".", "") - y_attr.attribute = y_attr.attribute.replace(".", "") + if isinstance(x_attr.attribute, str): + x_attr.attribute = x_attr.attribute.replace(".", "") + if isinstance(y_attr.attribute, str): + y_attr.attribute = y_attr.attribute.replace(".", "") chart = ( alt.Chart(self.data) diff --git a/lux/vislib/altair/Histogram.py b/lux/vislib/altair/Histogram.py index 38e578ab..60c2d999 100644 --- a/lux/vislib/altair/Histogram.py +++ b/lux/vislib/altair/Histogram.py @@ -38,28 +38,30 @@ def initialize_chart(self): self.tooltip = False measure = self.vis.get_attr_by_data_model("measure", exclude_record=True)[0] msr_attr = self.vis.get_attr_by_channel(measure.channel)[0] + msr_attr_abv = str(msr_attr.attribute) - msr_attr_abv = msr_attr.attribute - - if len(msr_attr.attribute) > 17: - msr_attr_abv = msr_attr.attribute[:10] + "..." + msr_attr.attribute[-7:] + if len(msr_attr_abv) > 17: + msr_attr_abv = msr_attr_abv[:10] + "..." 
+ msr_attr_abv[-7:] x_min = self.vis.min_max[msr_attr.attribute][0] x_max = self.vis.min_max[msr_attr.attribute][1] - msr_attr.attribute = msr_attr.attribute.replace(".", "") + if isinstance(msr_attr.attribute, str): + msr_attr.attribute = msr_attr.attribute.replace(".", "") - x_range = abs(max(self.vis.data[msr_attr.attribute]) - min(self.vis.data[msr_attr.attribute])) + colval = self.vis.data[msr_attr.attribute] + x_range = abs(max(colval) - min(colval)) plot_range = abs(x_max - x_min) markbar = x_range / plot_range * 12 + self.data = AltairChart.sanitize_dataframe(self.data) if measure.channel == "x": chart = ( alt.Chart(self.data) .mark_bar(size=markbar) .encode( alt.X( - msr_attr.attribute, + str(msr_attr.attribute), title=f"{msr_attr.attribute} (binned)", bin=alt.Bin(binned=True), type=msr_attr.data_type, @@ -76,7 +78,7 @@ def initialize_chart(self): .encode( x=alt.X("Number of Records", type="quantitative"), y=alt.Y( - msr_attr.attribute, + str(msr_attr.attribute), title=f"{msr_attr.attribute} (binned)", bin=alt.Bin(binned=True), axis=alt.Axis(labelOverlap=True, title=f"{msr_attr_abv} (binned)"), diff --git a/lux/vislib/altair/LineChart.py b/lux/vislib/altair/LineChart.py index c711127a..ae589030 100644 --- a/lux/vislib/altair/LineChart.py +++ b/lux/vislib/altair/LineChart.py @@ -40,16 +40,18 @@ def initialize_chart(self): x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] - x_attr_abv = x_attr.attribute - y_attr_abv = y_attr.attribute + x_attr_abv = str(x_attr.attribute) + y_attr_abv = str(y_attr.attribute) - if len(x_attr.attribute) > 25: + if len(x_attr_abv) > 25: x_attr_abv = x_attr.attribute[:15] + "..." + x_attr.attribute[-10:] - if len(y_attr.attribute) > 25: + if len(y_attr_abv) > 25: y_attr_abv = y_attr.attribute[:15] + "..." 
+ y_attr.attribute[-10:] - x_attr.attribute = x_attr.attribute.replace(".", "") - y_attr.attribute = y_attr.attribute.replace(".", "") + if isinstance(x_attr.attribute, str): + x_attr.attribute = x_attr.attribute.replace(".", "") + if isinstance(y_attr.attribute, str): + y_attr.attribute = y_attr.attribute.replace(".", "") # Remove NaNs only for Line Charts (offsets axis range) self.data = self.data.dropna(subset=[x_attr.attribute, y_attr.attribute]) @@ -60,9 +62,11 @@ def initialize_chart(self): if y_attr.data_model == "measure": agg_title = get_agg_title(y_attr) - x_attr_spec = alt.X(x_attr.attribute, type=x_attr.data_type, axis=alt.Axis(title=x_attr_abv)) + x_attr_spec = alt.X( + str(x_attr.attribute), type=x_attr.data_type, axis=alt.Axis(title=x_attr_abv) + ) y_attr_spec = alt.Y( - y_attr.attribute, + str(y_attr.attribute), type=y_attr.data_type, title=agg_title, axis=alt.Axis(title=y_attr_abv), @@ -72,15 +76,17 @@ def initialize_chart(self): else: agg_title = get_agg_title(x_attr) x_attr_spec = alt.X( - x_attr.attribute, + str(x_attr.attribute), type=x_attr.data_type, title=agg_title, axis=alt.Axis(title=x_attr_abv), ) - y_attr_spec = alt.Y(y_attr.attribute, type=y_attr.data_type, axis=alt.Axis(title=y_attr_abv)) + y_attr_spec = alt.Y( + str(y_attr.attribute), type=y_attr.data_type, axis=alt.Axis(title=y_attr_abv) + ) x_attr_field_code = f"alt.X('{x_attr.attribute}', type = '{x_attr.data_type}', title='{agg_title}', axis=alt.Axis(title='{x_attr_abv}')" - y_attr_fieldCode = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(title='{u_attr_abv}')" - + y_attr_fieldCode = f"alt.Y('{y_attr.attribute}', type= '{y_attr.data_type}', axis=alt.Axis(title='{y_attr_abv}')" + self.data = AltairChart.sanitize_dataframe(self.data) chart = alt.Chart(self.data).mark_line().encode(x=x_attr_spec, y=y_attr_spec) chart = chart.interactive() # Enable Zooming and Panning self.code += f""" diff --git a/lux/vislib/altair/ScatterChart.py 
b/lux/vislib/altair/ScatterChart.py index da645cda..21ae39ab 100644 --- a/lux/vislib/altair/ScatterChart.py +++ b/lux/vislib/altair/ScatterChart.py @@ -38,12 +38,12 @@ def initialize_chart(self): x_attr = self.vis.get_attr_by_channel("x")[0] y_attr = self.vis.get_attr_by_channel("y")[0] - x_attr_abv = x_attr.attribute - y_attr_abv = y_attr.attribute + x_attr_abv = str(x_attr.attribute) + y_attr_abv = str(y_attr.attribute) - if len(x_attr.attribute) > 25: + if len(x_attr_abv) > 25: x_attr_abv = x_attr.attribute[:15] + "..." + x_attr.attribute[-10:] - if len(y_attr.attribute) > 25: + if len(y_attr_abv) > 25: y_attr_abv = y_attr.attribute[:15] + "..." + y_attr.attribute[-10:] x_min = self.vis.min_max[x_attr.attribute][0] @@ -52,21 +52,23 @@ def initialize_chart(self): y_min = self.vis.min_max[y_attr.attribute][0] y_max = self.vis.min_max[y_attr.attribute][1] - x_attr.attribute = x_attr.attribute.replace(".", "") - y_attr.attribute = y_attr.attribute.replace(".", "") - + if isinstance(x_attr.attribute, str): + x_attr.attribute = x_attr.attribute.replace(".", "") + if isinstance(y_attr.attribute, str): + y_attr.attribute = y_attr.attribute.replace(".", "") + self.data = AltairChart.sanitize_dataframe(self.data) chart = ( alt.Chart(self.data) .mark_circle() .encode( x=alt.X( - x_attr.attribute, + str(x_attr.attribute), scale=alt.Scale(domain=(x_min, x_max)), type=x_attr.data_type, axis=alt.Axis(title=x_attr_abv), ), y=alt.Y( - y_attr.attribute, + str(y_attr.attribute), scale=alt.Scale(domain=(y_min, y_max)), type=y_attr.data_type, axis=alt.Axis(title=y_attr_abv), diff --git a/tests/test_columns.py b/tests/test_columns.py index 6216b471..19db44b0 100644 --- a/tests/test_columns.py +++ b/tests/test_columns.py @@ -121,3 +121,13 @@ def test_abbrev_agg(): test = pd.DataFrame(dataset) vis = Vis([long_var, "normal"], test).to_Altair() assert "axis=alt.Axis(title='Mean of Lorem ipsum dol...')" in vis + + +def test_int_columns(global_var): + df = 
pd.read_csv("lux/data/college.csv") + df.columns = range(len(df.columns)) + assert list(df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"] + df.intent = [8, 3] + assert list(df.recommendation.keys()) == ["Enhance", "Filter", "Generalize"] + df.intent = [0] + assert list(df.recommendation.keys()) == ["Enhance", "Filter"] From 9dc095816a71bf4e7be4dd4b9a5fd74cf8b3d35e Mon Sep 17 00:00:00 2001 From: Kunal Agarwal <32151899+westernguy2@users.noreply.github.com> Date: Fri, 8 Jan 2021 04:12:53 -0800 Subject: [PATCH 14/28] Override Pandas DataFrames created from I/O pandas operations (#207) * update export tutorial to add explanation for standalone argument * minor fixes and remove cell output in notebooks * added contributing doc * fix bugs and uncomment some tests * remove raise warning * remove unnecessary import * split up rename test into two parts * fix setting warning, fix data_type bugs and add relevant tests * remove ordinal data type * add test for small dataframe resetting index * add loc and iloc tests * fix attribute access directly to dataframe * add small changes to code * added test for qcut and cut * add check if dtype is Interval * added qcut test * fix Record KeyError * add tests * take care of reset_index case * small edits * add data_model to column_group Clause * small edits for row_group * fixes to row group * add config for start and cap for samples * finish sampling config and tests * black formatting * add documentation for sampling config * remove small added issues * minor changes to docs * implement heatmap flag and add tests * black formatting and documentation edits * add pd.io equalities for DataFrames Co-authored-by: Doris Lee --- lux/core/__init__.py | 32 +++++++++++++++++++++++++++++++- 1 file changed, 31 insertions(+), 1 deletion(-) diff --git a/lux/core/__init__.py b/lux/core/__init__.py index 9a13cd20..f3aa83bc 100644 --- a/lux/core/__init__.py +++ b/lux/core/__init__.py @@ -26,7 +26,37 @@ def 
setOption(overridePandas=True): if overridePandas: pd.DataFrame = ( pd.io.json._json.DataFrame - ) = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = LuxDataFrame + ) = ( + pd.io.parsers.DataFrame + ) = ( + pd.io.sql.DataFrame + ) = ( + pd.io.excel.DataFrame + ) = ( + pd.io.formats.DataFrame + ) = ( + pd.io.sas.DataFrame + ) = ( + pd.io.clipboards.DataFrame + ) = ( + pd.io.common.DataFrame + ) = ( + pd.io.feather_format.DataFrame + ) = ( + pd.io.gbq.DataFrame + ) = ( + pd.io.html.DataFrame + ) = ( + pd.io.orc.DataFrame + ) = ( + pd.io.parquet.DataFrame + ) = ( + pd.io.pickle.DataFrame + ) = ( + pd.io.pytables.DataFrame + ) = ( + pd.io.spss.DataFrame + ) = pd.io.stata.DataFrame = pd.io.api.DataFrame = pd.core.frame.DataFrame = LuxDataFrame pd.Series = LuxSeries else: pd.DataFrame = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = originalDF From 623fb51b2932d66a5e4a395964db8046a75000e4 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Sat, 9 Jan 2021 10:09:41 +0800 Subject: [PATCH 15/28] Configuration for topk and sort order (#206) * bugfix for describe and convert_dtypes * added back metadata series test * black * default to pandas display when df.dtypes printed * various fixes to support int columns * skip series vis for df.iterrows series element * config setting for modifying top K and sorting * note about regenerated config --- doc/source/reference/config.rst | 55 ++++++++++++++++++- .../gen/lux._config.config.Config.rst | 4 ++ .../gen/lux.core.series.LuxSeries.rst | 1 - .../reference/gen/lux.vis.VisList.VisList.rst | 3 +- ....vislib.altair.AltairChart.AltairChart.rst | 1 + .../lux.vislib.altair.BarChart.BarChart.rst | 1 + .../lux.vislib.altair.Histogram.Histogram.rst | 1 + .../lux.vislib.altair.LineChart.LineChart.rst | 1 + ...islib.altair.ScatterChart.ScatterChart.rst | 1 + lux/_config/config.py | 53 +++++++++++++++++- lux/action/correlation.py | 3 +- lux/action/enhance.py | 3 +- lux/action/filter.py | 3 +- lux/action/generalize.py | 1 + 
lux/action/univariate.py | 1 - lux/core/series.py | 6 +- lux/vis/VisList.py | 22 +++++--- tests/test_config.py | 41 +++++++++++++- tests/test_series.py | 9 +++ 19 files changed, 188 insertions(+), 22 deletions(-) diff --git a/doc/source/reference/config.rst b/doc/source/reference/config.rst index 7b85b687..14ac5e48 100644 --- a/doc/source/reference/config.rst +++ b/doc/source/reference/config.rst @@ -2,7 +2,28 @@ Configuration Settings *********************** -In Lux, users can customize various global settings to configure the behavior of Lux through :py:class:`lux.config.Config`. This page documents some of the configurations that you can apply in Lux. +In Lux, users can customize various global settings to configure the behavior of Lux through :py:class:`lux.config.Config`. These configurations are applied across all dataframes in the session. This page documents some of the configurations that you can apply in Lux. + +.. note:: + + Lux caches past generated recommendations, so if you have already printed the dataframe in the past, the recommendations would not be regenerated with the new config properties. In order for the config properties to apply, you would need to explicitly expire the recommendations as such: + + .. code-block:: python + + df = pd.read_csv("..") + df # recommendations already generated here + + df.expire_recs() + lux.config.SOME_SETTING = "..." + df # recommendation will be generated again here + + Alternatively, you can place the config settings before you first print out the dataframe for the first time: + + .. code-block:: python + + df = pd.read_csv("..") + lux.config.SOME_SETTING = "..." + df # recommendations generated for the first time with config Change the default display of Lux @@ -108,3 +129,35 @@ The above results in the following changes: See `this page `__ for more details. 
+Modify Sorting and Ranking in Recommendations +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +In Lux, we select a small subset of visualizations to display in each action tab to avoid displaying too many charts at once. +Certain recommendation categories ranks and selects the top K most interesting visualizations to display. +You can modify the sorting order and selection cutoff via :code:`lux.config`. +By default, the recommendations are sorted in a :code:`"descending"` order based on their interestingness score, you can reverse the ordering by setting the sort order as: + +.. code-block:: python + + lux.config.sort = "ascending" + +To turn off the sorting of visualizations based on its score completely and ensure that the visualizations show up in the same order across all dataframes, you can set the sorting as "none": + +.. code-block:: python + + lux.config.sort = "none" + +For recommendation actions that generate a lot of visualizations, we select the cutoff criteria as the top 15 visualizations. If you would like to see only see the top 6 visualizations, you can set: + +.. code-block:: python + + lux.config.topk = 6 + +If you would like to turn off the selection criteria completely and display everything, you can turn off the top K selection by: + +.. code-block:: python + + lux.config.topk = False + +Beware that this may generate large numbers of visualizations (e.g., for 10 quantitative variables, this will generate 45 scatterplots in the Correlation action!) + diff --git a/doc/source/reference/gen/lux._config.config.Config.rst b/doc/source/reference/gen/lux._config.config.Config.rst index 0000b36f..48db70fb 100644 --- a/doc/source/reference/gen/lux._config.config.Config.rst +++ b/doc/source/reference/gen/lux._config.config.Config.rst @@ -14,6 +14,8 @@ lux.\_config.config.Config .. 
autosummary:: ~Config.__init__ + ~Config.register_action + ~Config.remove_action ~Config.set_SQL_connection ~Config.set_executor_type @@ -30,5 +32,7 @@ lux.\_config.config.Config ~Config.sampling ~Config.sampling_cap ~Config.sampling_start + ~Config.sort + ~Config.topk \ No newline at end of file diff --git a/doc/source/reference/gen/lux.core.series.LuxSeries.rst b/doc/source/reference/gen/lux.core.series.LuxSeries.rst index 0f50d3e4..a136115e 100644 --- a/doc/source/reference/gen/lux.core.series.LuxSeries.rst +++ b/doc/source/reference/gen/lux.core.series.LuxSeries.rst @@ -53,7 +53,6 @@ lux.core.series.LuxSeries ~LuxSeries.cumsum ~LuxSeries.describe ~LuxSeries.diff - ~LuxSeries.display_pandas ~LuxSeries.div ~LuxSeries.divide ~LuxSeries.divmod diff --git a/doc/source/reference/gen/lux.vis.VisList.VisList.rst b/doc/source/reference/gen/lux.vis.VisList.VisList.rst index daf9c501..f22b5bae 100644 --- a/doc/source/reference/gen/lux.vis.VisList.VisList.rst +++ b/doc/source/reference/gen/lux.vis.VisList.VisList.rst @@ -14,7 +14,6 @@ lux.vis.VisList.VisList .. 
autosummary:: ~VisList.__init__ - ~VisList.bottomK ~VisList.get ~VisList.map ~VisList.normalize_score @@ -23,8 +22,8 @@ lux.vis.VisList.VisList ~VisList.remove_index ~VisList.set ~VisList.set_intent + ~VisList.showK ~VisList.sort - ~VisList.topK diff --git a/doc/source/reference/gen/lux.vislib.altair.AltairChart.AltairChart.rst b/doc/source/reference/gen/lux.vislib.altair.AltairChart.AltairChart.rst index accd69eb..b2cafeed 100644 --- a/doc/source/reference/gen/lux.vislib.altair.AltairChart.AltairChart.rst +++ b/doc/source/reference/gen/lux.vislib.altair.AltairChart.AltairChart.rst @@ -19,6 +19,7 @@ lux.vislib.altair.AltairChart.AltairChart ~AltairChart.apply_default_config ~AltairChart.encode_color ~AltairChart.initialize_chart + ~AltairChart.sanitize_dataframe diff --git a/doc/source/reference/gen/lux.vislib.altair.BarChart.BarChart.rst b/doc/source/reference/gen/lux.vislib.altair.BarChart.BarChart.rst index 5c4878f8..b55c95b3 100644 --- a/doc/source/reference/gen/lux.vislib.altair.BarChart.BarChart.rst +++ b/doc/source/reference/gen/lux.vislib.altair.BarChart.BarChart.rst @@ -20,6 +20,7 @@ lux.vislib.altair.BarChart.BarChart ~BarChart.apply_default_config ~BarChart.encode_color ~BarChart.initialize_chart + ~BarChart.sanitize_dataframe diff --git a/doc/source/reference/gen/lux.vislib.altair.Histogram.Histogram.rst b/doc/source/reference/gen/lux.vislib.altair.Histogram.Histogram.rst index 47733466..920d6394 100644 --- a/doc/source/reference/gen/lux.vislib.altair.Histogram.Histogram.rst +++ b/doc/source/reference/gen/lux.vislib.altair.Histogram.Histogram.rst @@ -19,6 +19,7 @@ lux.vislib.altair.Histogram.Histogram ~Histogram.apply_default_config ~Histogram.encode_color ~Histogram.initialize_chart + ~Histogram.sanitize_dataframe diff --git a/doc/source/reference/gen/lux.vislib.altair.LineChart.LineChart.rst b/doc/source/reference/gen/lux.vislib.altair.LineChart.LineChart.rst index 3143e2f9..89257108 100644 --- 
a/doc/source/reference/gen/lux.vislib.altair.LineChart.LineChart.rst +++ b/doc/source/reference/gen/lux.vislib.altair.LineChart.LineChart.rst @@ -19,6 +19,7 @@ lux.vislib.altair.LineChart.LineChart ~LineChart.apply_default_config ~LineChart.encode_color ~LineChart.initialize_chart + ~LineChart.sanitize_dataframe diff --git a/doc/source/reference/gen/lux.vislib.altair.ScatterChart.ScatterChart.rst b/doc/source/reference/gen/lux.vislib.altair.ScatterChart.ScatterChart.rst index f7a1d283..be0569f7 100644 --- a/doc/source/reference/gen/lux.vislib.altair.ScatterChart.ScatterChart.rst +++ b/doc/source/reference/gen/lux.vislib.altair.ScatterChart.ScatterChart.rst @@ -19,6 +19,7 @@ lux.vislib.altair.ScatterChart.ScatterChart ~ScatterChart.apply_default_config ~ScatterChart.encode_color ~ScatterChart.initialize_chart + ~ScatterChart.sanitize_dataframe diff --git a/lux/_config/config.py b/lux/_config/config.py index 419f9909..09acb132 100644 --- a/lux/_config/config.py +++ b/lux/_config/config.py @@ -3,9 +3,9 @@ For more resources, see https://github.com/pandas-dev/pandas/blob/master/pandas/_config """ from collections import namedtuple -from typing import Any, Callable, Dict, Iterable, List, Optional -import warnings +from typing import Any, Callable, Dict, Iterable, List, Optional, Union import lux +import warnings RegisteredOption = namedtuple("RegisteredOption", "name action display_condition args") @@ -30,6 +30,55 @@ def __init__(self): self._sampling_cap = 30000 self._sampling_flag = True self._heatmap_flag = True + self._topk = 15 + self._sort = "descending" + + @property + def topk(self): + return self._topk + + @topk.setter + def topk(self, k: Union[int, bool]): + """ + Setting parameter to display top k visualizations in each action + + Parameters + ---------- + k : Union[int,bool] + False: if display all visualizations (no top-k) + k: number of visualizations to display + """ + if isinstance(k, int) or isinstance(k, bool): + self._topk = k + else: + warnings.warn( 
+ "Parameter to lux.config.topk must be an integer or a boolean.", + stacklevel=2, + ) + + @property + def sort(self): + return self._sort + + @sort.setter + def sort(self, flag: Union[str]): + """ + Setting parameter to determine sort order of each action + + Parameters + ---------- + flag : Union[str] + "none", "ascending","descending" + No sorting, sort by ascending order, sort by descending order + """ + flag = flag.lower() + if isinstance(flag, str) and flag in ["none", "ascending", "descending"]: + self._sort = flag + else: + warnings.warn( + "Parameter to lux.config.sort must be one of the following: 'none', 'ascending', or 'descending'.", + stacklevel=2, + ) @property def sampling_cap(self): diff --git a/lux/action/correlation.py b/lux/action/correlation.py index 53cc8540..6d178e84 100644 --- a/lux/action/correlation.py +++ b/lux/action/correlation.py @@ -77,7 +77,8 @@ def correlation(ldf: LuxDataFrame, ignore_transpose: bool = True): if ignore_rec_flag: recommendation["collection"] = [] return recommendation - vlist = vlist.topK(15) + vlist.sort() + vlist = vlist.showK() recommendation["collection"] = vlist return recommendation diff --git a/lux/action/enhance.py b/lux/action/enhance.py index 94a4ea60..c6f240eb 100644 --- a/lux/action/enhance.py +++ b/lux/action/enhance.py @@ -66,6 +66,7 @@ def enhance(ldf): for vis in vlist: vis.score = interestingness(vis, ldf) - vlist = vlist.topK(15) + vlist.sort() + vlist = vlist.showK() recommendation["collection"] = vlist return recommendation diff --git a/lux/action/filter.py b/lux/action/filter.py index 6b27b843..a353d449 100644 --- a/lux/action/filter.py +++ b/lux/action/filter.py @@ -132,7 +132,8 @@ def get_complementary_ops(fltr_op): vlist_copy = lux.vis.VisList.VisList(output, ldf) for i in range(len(vlist_copy)): vlist[i].score = interestingness(vlist_copy[i], ldf) - vlist = vlist.topK(15) + vlist.sort() + vlist = vlist.showK() if recommendation["action"] == "Similarity": recommendation["collection"] = 
vlist[1:] else: diff --git a/lux/action/generalize.py b/lux/action/generalize.py index 91b83239..45e9d0f8 100644 --- a/lux/action/generalize.py +++ b/lux/action/generalize.py @@ -93,5 +93,6 @@ def generalize(ldf): vlist.remove_duplicates() vlist.sort(remove_invalid=True) + vlist._collection = list(filter(lambda x: x.score != -1, vlist._collection)) recommendation["collection"] = vlist return recommendation diff --git a/lux/action/univariate.py b/lux/action/univariate.py index 030a6f03..740f9105 100644 --- a/lux/action/univariate.py +++ b/lux/action/univariate.py @@ -82,7 +82,6 @@ def univariate(ldf, *args): vlist = VisList(intent, ldf) for vis in vlist: vis.score = interestingness(vis, ldf) - # vlist = vlist.topK(15) # Basic visualizations should not be capped vlist.sort() recommendation["collection"] = vlist return recommendation diff --git a/lux/core/series.py b/lux/core/series.py index 3a4068d3..89382660 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -84,8 +84,12 @@ def __repr__(self): ldf = LuxDataFrame(self) try: + # Ignore recommendations when Series a results of: + # 1) Values of the series are of dtype objects (df.dtypes) is_dtype_series = all(isinstance(val, np.dtype) for val in self.values) - if ldf._pandas_only or is_dtype_series: + # 2) Mixed type, often a result of a "row" acting as a series (df.iterrows, df.iloc[0]) + mixed_dtype = len(set([type(val) for val in self.values])) > 1 + if ldf._pandas_only or is_dtype_series or mixed_dtype: print(series_repr) ldf._pandas_only = False else: diff --git a/lux/vis/VisList.py b/lux/vis/VisList.py index a346e6cc..e3bdfa3e 100644 --- a/lux/vis/VisList.py +++ b/lux/vis/VisList.py @@ -233,18 +233,22 @@ def sort(self, remove_invalid=True, descending=True): # remove the items that have invalid (-1) score if remove_invalid: self._collection = list(filter(lambda x: x.score != -1, self._collection)) + if lux.config.sort == "none": + return + elif lux.config.sort == "ascending": + descending = False + elif 
lux.config.sort == "descending": + descending = True # sort in-place by “score” by default if available, otherwise user-specified field to sort by self._collection.sort(key=lambda x: x.score, reverse=descending) - def topK(self, k): - # sort and truncate list to first K items - self.sort(remove_invalid=True) - return VisList(self._collection[:k]) - - def bottomK(self, k): - # sort and truncate list to first K items - self.sort(descending=False, remove_invalid=True) - return VisList(self._collection[:k]) + def showK(self): + k = lux.config.topk + if k == False: + return self + elif isinstance(k, int): + k = abs(k) + return VisList(self._collection[:k]) def normalize_score(self, invert_order=False): max_score = max(list(self.get("score"))) diff --git a/tests/test_config.py b/tests/test_config.py index 644c4628..8a721d97 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -28,7 +28,8 @@ def random_categorical(ldf): vlist = VisList(intent, ldf) for vis in vlist: vis.score = 10 - vlist = vlist.topK(15) + vlist.sort() + vlist = vlist.showK() return { "action": "bars", "description": "Random list of Bar charts", @@ -105,7 +106,8 @@ def random_categorical(ldf): vlist = VisList(intent, ldf) for vis in vlist: vis.score = 10 - vlist = vlist.topK(15) + vlist.sort() + vlist = vlist.showK() return { "action": "bars", "description": "Random list of Bar charts", @@ -235,6 +237,41 @@ def test_heatmap_flag_config(): lux.config.heatmap = True +def test_topk(global_var): + df = pd.read_csv("lux/data/college.csv") + lux.config.topk = False + df._repr_html_() + assert len(df.recommendation["Correlation"]) == 45, "Turn off top K" + lux.config.topk = 20 + df = pd.read_csv("lux/data/college.csv") + df._repr_html_() + assert len(df.recommendation["Correlation"]) == 20, "Show top 20" + for vis in df.recommendation["Correlation"]: + assert vis.score > 0.2 + + +def test_sort(global_var): + df = pd.read_csv("lux/data/college.csv") + lux.config.topk = 15 + df._repr_html_() + assert 
len(df.recommendation["Correlation"]) == 15, "Show top 15" + for vis in df.recommendation["Correlation"]: + assert vis.score > 0.2 + df = pd.read_csv("lux/data/college.csv") + lux.config.sort = "ascending" + df._repr_html_() + assert len(df.recommendation["Correlation"]) == 15, "Show bottom 15" + for vis in df.recommendation["Correlation"]: + assert vis.score < 0.2 + + lux.config.sort = "none" + df = pd.read_csv("lux/data/college.csv") + df._repr_html_() + scorelst = [x.score for x in df.recommendation["Distribution"]] + assert sorted(scorelst) != scorelst, "unsorted setting" + lux.config.sort = "descending" + + # TODO: This test does not pass in pytest but is working in Jupyter notebook. # def test_plot_setting(global_var): # df = pytest.car_df diff --git a/tests/test_series.py b/tests/test_series.py index 62a4697f..6bbbed26 100644 --- a/tests/test_series.py +++ b/tests/test_series.py @@ -51,3 +51,12 @@ def test_print_dtypes(global_var): with warnings.catch_warnings(record=True) as w: print(df.dtypes) assert len(w) == 0, "Warning displayed when printing dtypes" + + +def test_print_iterrow(global_var): + df = pytest.college_df + with warnings.catch_warnings(record=True) as w: + for index, row in df.iterrows(): + print(row) + break + assert len(w) == 0, "Warning displayed when printing iterrow" From e1430df959afdf8a73fa5c875e758a5f67dcb623 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Sat, 9 Jan 2021 12:29:18 +0800 Subject: [PATCH 16/28] Version lock for jupyter-client (#211) * move to single requirements-dev without lux-widget install manually * pin jedi version * pin jupyter-client version * add back old travis and requirement-dev --- .travis.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.travis.yml b/.travis.yml index 6dfca243..9cb0e4cd 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,13 +2,13 @@ language: python python: - "3.7" install: + - pip install jupyter-client==6.1.6 - pip install -r requirements.txt - pip install -r 
requirements-dev.txt - - pip install git+https://github.com/lux-org/lux-widget # command to run tests script: - black --target-version py37 --line-length 105 --check . - python -m pytest tests/*.py - pytest --cov-report term --cov=lux tests/ after_success: - - bash <(curl -s https://codecov.io/bash) + - bash <(curl -s https://codecov.io/bash) \ No newline at end of file From 14c141b76f0dcdfdc852df0760b75572f2a83749 Mon Sep 17 00:00:00 2001 From: jinimukh <46768380+jinimukh@users.noreply.github.com> Date: Sat, 9 Jan 2021 00:17:15 -0800 Subject: [PATCH 17/28] Mixed dtype issue (#205) * coalesce data_types into data_type_lookup * merge fixed * merge conflicts * add warning and suggestion on how to fix * formatting for warnings version * change to internal data * legibility update * test added * update test * test updated * xlrd in dev reqs * black * update link * changes to test logic, minor string format for warning Co-authored-by: Doris Lee --- lux/executor/PandasExecutor.py | 18 ++++++++++++++---- requirements-dev.txt | 1 + tests/test_pandas_coverage.py | 10 ++++++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index cb8a8ce7..f8a0279c 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -238,10 +238,20 @@ def execute_aggregate(vis: Vis, isFiltered=True): assert ( len(list(vis.data[groupby_attr.attribute])) == N_unique_vals ), f"Aggregated data missing values compared to original range of values of `{groupby_attr.attribute}`." 
- vis._vis_data = vis.data.dropna(subset=[measure_attr.attribute]) - vis._vis_data = vis.data.sort_values(by=groupby_attr.attribute, ascending=True) - vis._vis_data = vis.data.reset_index() - vis._vis_data = vis.data.drop(columns="index") + + vis._vis_data = vis._vis_data.dropna(subset=[measure_attr.attribute]) + try: + vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True) + except TypeError: + warnings.warn( + f"\nLux detects that the attribute '{groupby_attr.attribute}' maybe contain mixed type." + + f"\nTo visualize this attribute, you may want to convert the '{groupby_attr.attribute}' into a uniform type as follows:" + + f"\n\tdf['{groupby_attr.attribute}'] = df['{groupby_attr.attribute}'].astype(str)" + ) + vis._vis_data[groupby_attr.attribute] = vis._vis_data[groupby_attr.attribute].astype(str) + vis._vis_data = vis._vis_data.sort_values(by=groupby_attr.attribute, ascending=True) + vis._vis_data = vis._vis_data.reset_index() + vis._vis_data = vis._vis_data.drop(columns="index") @staticmethod def execute_binning(vis: Vis): diff --git a/requirements-dev.txt b/requirements-dev.txt index 0365d5bd..4e0e654d 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -2,4 +2,5 @@ pytest>=5.3.1 pytest-cov>=2.8.1 Sphinx>=3.0.2 sphinx-rtd-theme>=0.4.3 +xlrd black diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py index e861769b..f595bedb 100644 --- a/tests/test_pandas_coverage.py +++ b/tests/test_pandas_coverage.py @@ -15,6 +15,8 @@ from .context import lux import pytest import pandas as pd +import numpy as np +import warnings ################### # DataFrame Tests # @@ -678,3 +680,11 @@ def test_read_sas(global_var): df._repr_html_() assert list(df.recommendation.keys()) == ["Correlation", "Distribution", "Temporal"] assert len(df.data_type) == 6 + + +def test_read_multi_dtype(global_var): + url = "https://github.com/lux-org/lux-datasets/blob/master/data/car-data.xls?raw=true" + df = pd.read_excel(url) + with 
pytest.warns(UserWarning, match="mixed type") as w: + df._repr_html_() + assert "df['Car Type'] = df['Car Type'].astype(str)" in str(w[-1].message) From c135c0123877d5d8df5a148b25dd020a3fdd5227 Mon Sep 17 00:00:00 2001 From: Kunal Agarwal <32151899+westernguy2@users.noreply.github.com> Date: Sat, 9 Jan 2021 00:27:13 -0800 Subject: [PATCH 18/28] Fixes issue where value_counts was not returning LuxSeries (#210) * add series equality and value counts test * black formatting * fix old value counts test instead * minor fix Co-authored-by: Doris Lee --- lux/action/default.py | 1 - lux/core/__init__.py | 2 +- lux/core/series.py | 3 ++- tests/test_pandas_coverage.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lux/action/default.py b/lux/action/default.py index b075232d..de31f2ca 100644 --- a/lux/action/default.py +++ b/lux/action/default.py @@ -7,7 +7,6 @@ def register_default_actions(): from lux.action.filter import add_filter from lux.action.generalize import generalize - print("Register default actions") # display conditions for default actions no_vis = lambda ldf: (ldf.current_vis is None) or ( ldf.current_vis is not None and len(ldf.current_vis) == 0 diff --git a/lux/core/__init__.py b/lux/core/__init__.py index f3aa83bc..b1a69371 100644 --- a/lux/core/__init__.py +++ b/lux/core/__init__.py @@ -57,7 +57,7 @@ def setOption(overridePandas=True): ) = ( pd.io.spss.DataFrame ) = pd.io.stata.DataFrame = pd.io.api.DataFrame = pd.core.frame.DataFrame = LuxDataFrame - pd.Series = LuxSeries + pd.Series = pd.core.series.Series = LuxSeries else: pd.DataFrame = pd.io.parsers.DataFrame = pd.core.frame.DataFrame = originalDF pd.Series = originalSeries diff --git a/lux/core/series.py b/lux/core/series.py index 89382660..b1388bb9 100644 --- a/lux/core/series.py +++ b/lux/core/series.py @@ -88,7 +88,8 @@ def __repr__(self): # 1) Values of the series are of dtype objects (df.dtypes) is_dtype_series = all(isinstance(val, np.dtype) for val in self.values) # 2) 
Mixed type, often a result of a "row" acting as a series (df.iterrows, df.iloc[0]) - mixed_dtype = len(set([type(val) for val in self.values])) > 1 + # Tolerant for NaNs + 1 type + mixed_dtype = len(set([type(val) for val in self.values])) > 2 if ldf._pandas_only or is_dtype_series or mixed_dtype: print(series_repr) ldf._pandas_only = False diff --git a/tests/test_pandas_coverage.py b/tests/test_pandas_coverage.py index f595bedb..43e3ffff 100644 --- a/tests/test_pandas_coverage.py +++ b/tests/test_pandas_coverage.py @@ -607,7 +607,7 @@ def test_value_counts(global_var): assert df.cardinality is not None series = df["Weight"] series.value_counts() - assert isinstance(series, lux.core.series.LuxSeries), "Derived series is type LuxSeries." + assert type(df["Brand"].value_counts()) == lux.core.series.LuxSeries assert df["Weight"]._metadata == [ "_intent", "data_type", From c94265ef73ae44c073b6d232ba2e33f1a159569f Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Sat, 9 Jan 2021 17:56:36 +0800 Subject: [PATCH 19/28] bump version --- lux/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lux/_version.py b/lux/_version.py index bd12c3d5..718a587d 100644 --- a/lux/_version.py +++ b/lux/_version.py @@ -1,5 +1,5 @@ #!/usr/bin/env python # coding: utf-8 -version_info = (0, 2, 1, 2) +version_info = (0, 2, 2) __version__ = ".".join(map(str, version_info)) From a6a929a8188ab841c4471b2051e102628af19e83 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Sat, 9 Jan 2021 18:09:49 +0800 Subject: [PATCH 20/28] update README --- README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/README.md b/README.md index 38098805..062a94db 100644 --- a/README.md +++ b/README.md @@ -157,8 +157,7 @@ To use Lux in [Jupyter Lab](https://github.com/jupyterlab/jupyterlab), activate jupyter labextension install @jupyter-widgets/jupyterlab-manager jupyter labextension install luxwidget ``` - -Note that JupyterLab and VSCode is supported only for lux-widget 
version >=0.1.2, if you have an earlier version, please upgrade to the latest version of [lux-widget](https://pypi.org/project/lux-widget/). Lux currently only works with the Chrome browser. +Lux is only compatible with Jupyter Lab version 2.2.9 and below. Support for the recent [JupyterLab 3](https://blog.jupyter.org/jupyterlab-3-0-is-out-4f58385e25bb) will come soon. Note that JupyterLab and VSCode is supported only for lux-widget version >=0.1.2, if you have an earlier version, please upgrade to the latest version of [lux-widget](https://pypi.org/project/lux-widget/). Lux currently only works with the Chrome browser. If you encounter issues with the installation, please refer to [this page](https://lux-api.readthedocs.io/en/latest/source/guide/FAQ.html#troubleshooting-tips) to troubleshoot the installation. Follow [these instructions](https://lux-api.readthedocs.io/en/latest/source/getting_started/installation.html#manual-installation-dev-setup) to set up Lux for development purposes. 
From f70e1fd4a17640720ab64d146b2d1a32d4233f13 Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Tue, 12 Jan 2021 20:38:55 +0800 Subject: [PATCH 21/28] Performance optimization on recommendation compute (#219) * remove sampling from _is_datetime_string * changed spearman's r to pearson's r for interestingness --- lux/executor/PandasExecutor.py | 7 ++----- lux/interestingness/interestingness.py | 6 +++--- tests/test_config.py | 4 ++-- tests/test_dates.py | 21 +++++++-------------- tests/test_interestingness.py | 12 ------------ 5 files changed, 14 insertions(+), 36 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index f8a0279c..d07859fd 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -457,12 +457,9 @@ def compute_data_type(self, ldf: LuxDataFrame): warn_msg += "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html" warnings.warn(warn_msg, stacklevel=2) - def _is_datetime_string(self, series): - if len(series) > 100: - series = series.sample(100) - + @staticmethod + def _is_datetime_string(series): if series.dtype == object: - not_numeric = False try: pd.to_numeric(series) diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index cf81045e..70605827 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -316,7 +316,7 @@ def mutual_information(v_x: list, v_y: list) -> int: def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> int: """ Monotonicity measures there is a monotonic trend in the scatterplot, whether linear or not. - This score is computed as the square of the Spearman correlation coefficient, which is the Pearson correlation on the ranks of x and y. + This score is computed as the Pearson's correlation on the ranks of x and y. 
See "Graph-Theoretic Scagnostics", Wilkinson et al 2005: https://research.tableau.com/sites/default/files/Wilkinson_Infovis-05.pdf Parameters ---------- @@ -332,7 +332,7 @@ def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> in int Score describing the strength of monotonic relationship in vis """ - from scipy.stats import spearmanr + from scipy.stats import pearsonr msr1 = attr_specs[0].attribute msr2 = attr_specs[1].attribute @@ -347,7 +347,7 @@ def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> in with warnings.catch_warnings(): warnings.filterwarnings("error") try: - score = (spearmanr(v_x, v_y)[0]) ** 2 + score = np.abs(pearsonr(v_x, v_y)[0]) except (RuntimeWarning): # RuntimeWarning: invalid value encountered in true_divide (occurs when v_x and v_y are uniform, stdev in denominator is zero, leading to spearman's correlation as nan), ignore these cases. score = -1 diff --git a/tests/test_config.py b/tests/test_config.py index 8a721d97..ffe7f0fb 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -256,13 +256,13 @@ def test_sort(global_var): df._repr_html_() assert len(df.recommendation["Correlation"]) == 15, "Show top 15" for vis in df.recommendation["Correlation"]: - assert vis.score > 0.2 + assert vis.score > 0.5 df = pd.read_csv("lux/data/college.csv") lux.config.sort = "ascending" df._repr_html_() assert len(df.recommendation["Correlation"]) == 15, "Show bottom 15" for vis in df.recommendation["Correlation"]: - assert vis.score < 0.2 + assert vis.score < 0.35 lux.config.sort = "none" df = pd.read_csv("lux/data/college.csv") diff --git a/tests/test_dates.py b/tests/test_dates.py index ce859f5d..dc530fc7 100644 --- a/tests/test_dates.py +++ b/tests/test_dates.py @@ -59,30 +59,23 @@ def test_period_selection(global_var): def test_period_filter(global_var): ldf = pd.read_csv("lux/data/car.csv") ldf["Year"] = pd.to_datetime(ldf["Year"], format="%Y") - ldf["Year"] = 
pd.DatetimeIndex(ldf["Year"]).to_period(freq="A") - ldf.set_intent([lux.Clause(attribute="Acceleration"), lux.Clause(attribute="Horsepower")]) - - lux.config.executor.execute(ldf.current_vis, ldf) - ldf._repr_html_() + from lux.vis.Vis import Vis - assert isinstance(ldf.recommendation["Filter"][2]._inferred_intent[2].value, pd.Period) + vis = Vis(["Acceleration", "Horsepower", "Year=1972"], ldf) + assert ldf.data_type["Year"] == "temporal" + assert isinstance(vis._inferred_intent[2].value, str) def test_period_to_altair(global_var): - chart = None df = pd.read_csv("lux/data/car.csv") df["Year"] = pd.to_datetime(df["Year"], format="%Y") - df["Year"] = pd.DatetimeIndex(df["Year"]).to_period(freq="A") + from lux.vis.Vis import Vis - df.set_intent([lux.Clause(attribute="Acceleration"), lux.Clause(attribute="Horsepower")]) - - lux.config.executor.execute(df.current_vis, df) - df._repr_html_() - - exported_code = df.recommendation["Filter"][2].to_Altair() + vis = Vis(["Acceleration", "Horsepower", "Year=1972"], df) + exported_code = vis.to_Altair() assert "Year = 1972" in exported_code diff --git a/tests/test_interestingness.py b/tests/test_interestingness.py index 15b8e74d..38d8bfe9 100644 --- a/tests/test_interestingness.py +++ b/tests/test_interestingness.py @@ -241,18 +241,6 @@ def test_interestingness_0_2_0(global_var): # check that top recommended filter graph score is not none and that ordering makes intuitive sense assert interestingness(df.recommendation["Filter"][0], df) != None - rank1 = -1 - rank2 = -1 - rank3 = -1 - for f in range(0, len(df.recommendation["Filter"])): - if "1973" in str(df.recommendation["Filter"][f]._inferred_intent[2].value): - rank1 = f - if "ford" in str(df.recommendation["Filter"][f]._inferred_intent[2].value): - rank2 = f - if str(df.recommendation["Filter"][f]._inferred_intent[2].value) == "USA": - rank3 = f - assert rank1 < rank2 and rank1 < rank3 and rank2 < rank3 - # check that top recommended Generalize graph score is not none 
assert interestingness(df.recommendation["Generalize"][0], df) != None From 23e4a79c4f9c2bd3fdfe3656260081d415a66a98 Mon Sep 17 00:00:00 2001 From: Micah Yong Date: Sat, 16 Jan 2021 02:27:45 -0800 Subject: [PATCH 22/28] Temporal type detection for integers via PandasExecutor._is_datetime_number (#232) * Implement _is_datetime_number, which uses PDs built-in to_datetime to predict if a numeric series holds temporal values * Add test_check_datetime_numeric_values in test_type.py * Format via Black * black and remove print Co-authored-by: Doris Lee --- lux/executor/PandasExecutor.py | 17 ++++++++++++++--- tests/test_type.py | 15 +++++++++++++++ 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index d07859fd..7084c982 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -20,7 +20,6 @@ from lux.utils import utils from lux.utils.date_utils import is_datetime_series from lux.utils.utils import check_import_lux_widget, check_if_id_like -from lux.utils.date_utils import is_datetime_series import warnings import lux @@ -400,7 +399,7 @@ def compute_data_type(self, ldf: LuxDataFrame): from pandas.api.types import is_datetime64_any_dtype as is_datetime for attr in list(ldf.columns): - temporal_var_list = ["month", "year", "day", "date", "time"] + temporal_var_list = ["month", "year", "day", "date", "time", "weekday"] if is_datetime(ldf[attr]): ldf.data_type[attr] = "temporal" elif self._is_datetime_string(ldf[attr]): @@ -409,6 +408,8 @@ def compute_data_type(self, ldf: LuxDataFrame): ldf.data_type[attr] = "temporal" elif str(attr).lower() in temporal_var_list: ldf.data_type[attr] = "temporal" + elif self._is_datetime_number(ldf[attr]): + ldf.data_type[attr] = "temporal" elif pd.api.types.is_float_dtype(ldf.dtypes[attr]): # int columns gets coerced into floats if contain NaN convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes()) @@ -472,11 +473,21 @@ 
def _is_datetime_string(series): datetime_col = pd.to_datetime(series) except Exception as e: return False - if datetime_col is not None: return True return False + @staticmethod + def _is_datetime_number(series): + if series.dtype == int: + try: + temp = series.astype(str) + pd.to_datetime(temp) + return True + except Exception: + return False + return False + def compute_stats(self, ldf: LuxDataFrame): # precompute statistics ldf.unique_values = {} diff --git a/tests/test_type.py b/tests/test_type.py index ac6472fc..badcd023 100644 --- a/tests/test_type.py +++ b/tests/test_type.py @@ -124,6 +124,21 @@ def test_check_datetime(): } +def test_check_datetime_numeric_values(): + car_df = pd.read_csv("lux/data/car.csv") + car_df = car_df.rename(columns={"Year": "blah"}) + car_df.maintain_metadata() + assert car_df.data_type["blah"] == "temporal" + + spotify_df = pd.read_csv( + "https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/spotify.csv" + ) + spotify_df = spotify_df.rename(columns={"year": "blah"}) + spotify_df.maintain_metadata() + assert spotify_df.data_type["blah"] == "temporal" + assert spotify_df.data_type["release_date"] == "temporal" + + def test_check_stock(): df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/stocks.csv?raw=true") df.maintain_metadata() From 3ea3db126f3e3ea2a4a0994ec4b20440f65e034f Mon Sep 17 00:00:00 2001 From: Kunal Agarwal <32151899+westernguy2@users.noreply.github.com> Date: Sat, 16 Jan 2021 06:06:32 -0800 Subject: [PATCH 23/28] Add config parameters for Pandas Fallback (#233) * add series equality and value counts test * black formatting * fix old value counts test instead * add configs for pandas fallbacks * run black * turn off fallback option for tests; fixed interestingness bug causing failed test * black Co-authored-by: Doris Lee --- lux/_config/config.py | 43 ++++++++++++++++++++++++++ lux/core/frame.py | 19 +++++++----- lux/interestingness/interestingness.py | 12 ++++--- tests/context.py 
| 3 ++ tests/test_config.py | 4 +-- 5 files changed, 66 insertions(+), 15 deletions(-) diff --git a/lux/_config/config.py b/lux/_config/config.py index 09acb132..398b04bc 100644 --- a/lux/_config/config.py +++ b/lux/_config/config.py @@ -32,6 +32,8 @@ def __init__(self): self._heatmap_flag = True self._topk = 15 self._sort = "descending" + self._pandas_fallback = True + self._interestingness_fallback = True @property def topk(self): @@ -80,6 +82,47 @@ def sort(self, flag: Union[str]): stacklevel=2, ) + @property + def pandas_fallback(self): + return self._pandas_fallback + + @pandas_fallback.setter + def pandas_fallback(self, fallback: bool) -> None: + """ + Parameters + ---------- + fallback : bool + If an error occurs, whether or not to raise an exception or fallback to default Pandas. + """ + if type(fallback) == bool: + self._pandas_fallback = fallback + else: + warnings.warn( + "The flag for Pandas fallback must be a boolean.", + stacklevel=2, + ) + + @property + def interestingness_fallback(self): + return self._interestingness_fallback + + @interestingness_fallback.setter + def interestingness_fallback(self, fallback: bool) -> None: + """ + Parameters + ---------- + fallback : bool + If an error occurs while calculating interestingness, whether or not + to raise an exception or fallback to default Pandas. + """ + if type(fallback) == bool: + self._interestingness_fallback = fallback + else: + warnings.warn( + "The flag for interestingness fallback must be a boolean.", + stacklevel=2, + ) + @property def sampling_cap(self): """ diff --git a/lux/core/frame.py b/lux/core/frame.py index a91c8802..9da89eca 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -644,14 +644,17 @@ def on_button_clicked(b): except (KeyboardInterrupt, SystemExit): raise except Exception: - warnings.warn( - "\nUnexpected error in rendering Lux widget and recommendations. 
" - "Falling back to Pandas display.\n" - "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n", - stacklevel=2, - ) - warnings.warn(traceback.format_exc()) - display(self.display_pandas()) + if lux.config.pandas_fallback: + warnings.warn( + "\nUnexpected error in rendering Lux widget and recommendations. " + "Falling back to Pandas display.\n" + "Please report the following issue on Github: https://github.com/lux-org/lux/issues \n", + stacklevel=2, + ) + warnings.warn(traceback.format_exc()) + display(self.display_pandas()) + else: + raise def display_pandas(self): return self.to_pandas() diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index 70605827..8dd168a3 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -162,9 +162,12 @@ def interestingness(vis: Vis, ldf: LuxDataFrame) -> int: else: return -1 except: - # Supress interestingness related issues - warnings.warn(f"An error occurred when computing interestingness for: {vis}") - return -1 + if lux.config.interestingness_fallback: + # Supress interestingness related issues + warnings.warn(f"An error occurred when computing interestingness for: {vis}") + return -1 + else: + raise def get_filtered_size(filter_specs, ldf): @@ -339,6 +342,7 @@ def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> in if ignore_identity and msr1 == msr2: # remove if measures are the same return -1 + vis._vis_data = vis.data.dropna() v_x = vis.data[msr1] v_y = vis.data[msr2] @@ -356,5 +360,3 @@ def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> in return -1 else: return score - # import scipy.stats - # return abs(scipy.stats.pearsonr(v_x,v_y)[0]) diff --git a/tests/context.py b/tests/context.py index 42b3eb15..b55d161b 100644 --- a/tests/context.py +++ b/tests/context.py @@ -18,3 +18,6 @@ sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), 
".."))) import lux + +lux.config.interestingness_fallback = False +lux.config.pandas_fallback = False diff --git a/tests/test_config.py b/tests/test_config.py index ffe7f0fb..405f2ac6 100644 --- a/tests/test_config.py +++ b/tests/test_config.py @@ -205,7 +205,7 @@ def change_color_make_transparent_add_title(chart): def test_sampling_flag_config(): df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv") df._repr_html_() - assert df.recommendation["Correlation"][0].data.shape[0] == 30000 + assert df.recommendation["Correlation"][0].data.shape[0] < 48895 lux.config.sampling = False df = df.copy() df._repr_html_() @@ -232,7 +232,7 @@ def test_heatmap_flag_config(): assert df.recommendation["Correlation"][0]._postbin lux.config.heatmap = False df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv") - df = df.copy() + df._repr_html_() assert not df.recommendation["Correlation"][0]._postbin lux.config.heatmap = True From d1b9916ec5038f66dd5348b4243df58c853de08c Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Mon, 18 Jan 2021 06:55:05 +0800 Subject: [PATCH 24/28] PATCH (#233) * ensure dropna does not mutate vis.data --- lux/interestingness/interestingness.py | 6 +++--- tests/test_config.py | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/lux/interestingness/interestingness.py b/lux/interestingness/interestingness.py index 8dd168a3..cdd0f28a 100644 --- a/lux/interestingness/interestingness.py +++ b/lux/interestingness/interestingness.py @@ -342,9 +342,9 @@ def monotonicity(vis: Vis, attr_specs: list, ignore_identity: bool = True) -> in if ignore_identity and msr1 == msr2: # remove if measures are the same return -1 - vis._vis_data = vis.data.dropna() - v_x = vis.data[msr1] - v_y = vis.data[msr2] + vxy = vis.data.dropna() + v_x = vxy[msr1] + v_y = vxy[msr2] import warnings diff --git a/tests/test_config.py b/tests/test_config.py index 405f2ac6..7f3b49ac 100644 --- 
a/tests/test_config.py +++ b/tests/test_config.py @@ -205,7 +205,7 @@ def change_color_make_transparent_add_title(chart): def test_sampling_flag_config(): df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv") df._repr_html_() - assert df.recommendation["Correlation"][0].data.shape[0] < 48895 + assert df.recommendation["Correlation"][0].data.shape[0] == 30000 lux.config.sampling = False df = df.copy() df._repr_html_() From 91d60251c88041b7e183abb9aaf328cfe208e3f5 Mon Sep 17 00:00:00 2001 From: jinimukh <46768380+jinimukh@users.noreply.github.com> Date: Sun, 17 Jan 2021 18:13:21 -0800 Subject: [PATCH 25/28] [PATCH] Make df.data_type a property that accesses df._data_type (#231) * add protected attribute and make property * access private var to avoid looping * fix type --- lux/core/frame.py | 18 +++++++++++------ lux/executor/PandasExecutor.py | 36 +++++++++++++++++----------------- 2 files changed, 30 insertions(+), 24 deletions(-) diff --git a/lux/core/frame.py b/lux/core/frame.py index 9da89eca..d1792cb1 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -37,7 +37,7 @@ class LuxDataFrame(pd.DataFrame): _metadata = [ "_intent", "_inferred_intent", - "data_type", + "_data_type", "unique_values", "cardinality", "_rec_info", @@ -76,7 +76,7 @@ def __init__(self, *args, **kw): self._message = Message() self._pandas_only = False # Metadata - self.data_type = None + self._data_type = None self.unique_values = None self.cardinality = None self._min_max = None @@ -101,6 +101,12 @@ def f(*args, **kwargs): def history(self): return self._history + @property + def data_type(self): + if not self._data_type: + self.maintain_metadata() + return self._data_type + def maintain_metadata(self): # Check that metadata has not yet been computed if not hasattr(self, "_metadata_fresh") or not self._metadata_fresh: @@ -127,7 +133,7 @@ def expire_metadata(self): Expire all saved metadata to trigger a recomputation the next time the data 
is required. """ self._metadata_fresh = False - self.data_type = None + self._data_type = None self.unique_values = None self.cardinality = None self._min_max = None @@ -293,7 +299,7 @@ def compute_SQL_dataset_metadata(self): self.get_SQL_attributes() for attr in list(self.columns): self[attr] = None - self.data_type = {} + self._data_type = {} #####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this ##### in the initialization and do it just once self.compute_SQL_data_type() @@ -307,7 +313,7 @@ def compute_SQL_stats(self): self.get_SQL_unique_values() # self.get_SQL_cardinality() for attribute in self.columns: - if self.data_type[attribute] == "quantitative": + if self._data_type[attribute] == "quantitative": self._min_max[attribute] = ( self[attribute].min(), self[attribute].max(), @@ -381,7 +387,7 @@ def compute_SQL_data_type(self): data_type[attr] = "quantitative" elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]: data_type[attr] = "temporal" - self.data_type = data_type + self._data_type = data_type def _append_rec(self, rec_infolist, recommendations: Dict): if recommendations["collection"] is not None and len(recommendations["collection"]) > 0: diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 7084c982..911d8d3b 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -392,7 +392,7 @@ def execute_2D_binning(vis: Vis): ############ Metadata: data type, model ############# ####################################################### def compute_dataset_metadata(self, ldf: LuxDataFrame): - ldf.data_type = {} + ldf._data_type = {} self.compute_data_type(ldf) def compute_data_type(self, ldf: LuxDataFrame): @@ -401,50 +401,50 @@ def compute_data_type(self, ldf: LuxDataFrame): for attr in list(ldf.columns): temporal_var_list = ["month", "year", "day", "date", "time", "weekday"] if is_datetime(ldf[attr]): - ldf.data_type[attr] = "temporal" + 
ldf._data_type[attr] = "temporal" elif self._is_datetime_string(ldf[attr]): - ldf.data_type[attr] = "temporal" + ldf._data_type[attr] = "temporal" elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp): - ldf.data_type[attr] = "temporal" + ldf._data_type[attr] = "temporal" elif str(attr).lower() in temporal_var_list: - ldf.data_type[attr] = "temporal" + ldf._data_type[attr] = "temporal" elif self._is_datetime_number(ldf[attr]): - ldf.data_type[attr] = "temporal" + ldf._data_type[attr] = "temporal" elif pd.api.types.is_float_dtype(ldf.dtypes[attr]): # int columns gets coerced into floats if contain NaN convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes()) if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20: - ldf.data_type[attr] = "nominal" + ldf._data_type[attr] = "nominal" else: - ldf.data_type[attr] = "quantitative" + ldf._data_type[attr] = "quantitative" elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]): # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values if ldf.pre_aggregated: if ldf.cardinality[attr] == len(ldf): - ldf.data_type[attr] = "nominal" + ldf._data_type[attr] = "nominal" if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20: - ldf.data_type[attr] = "nominal" + ldf._data_type[attr] = "nominal" else: - ldf.data_type[attr] = "quantitative" + ldf._data_type[attr] = "quantitative" if check_if_id_like(ldf, attr): - ldf.data_type[attr] = "id" + ldf._data_type[attr] = "id" # Eliminate this clause because a single NaN value can cause the dtype to be object elif pd.api.types.is_string_dtype(ldf.dtypes[attr]): if check_if_id_like(ldf, attr): - ldf.data_type[attr] = "id" + ldf._data_type[attr] = "id" else: - ldf.data_type[attr] = "nominal" + ldf._data_type[attr] = "nominal" # check if attribute is any type of datetime dtype elif is_datetime_series(ldf.dtypes[attr]): - 
ldf.data_type[attr] = "temporal" + ldf._data_type[attr] = "temporal" else: - ldf.data_type[attr] = "nominal" + ldf._data_type[attr] = "nominal" if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name: - ldf.data_type[ldf.index.name] = "nominal" + ldf._data_type[ldf.index.name] = "nominal" non_datetime_attrs = [] for attr in ldf.columns: - if ldf.data_type[attr] == "temporal" and not is_datetime(ldf[attr]): + if ldf._data_type[attr] == "temporal" and not is_datetime(ldf[attr]): non_datetime_attrs.append(attr) warn_msg = "" if len(non_datetime_attrs) == 1: From dd43cc930dea9b621ad13ff8ebf17cb4cefb159e Mon Sep 17 00:00:00 2001 From: jinimukh <46768380+jinimukh@users.noreply.github.com> Date: Sun, 17 Jan 2021 19:58:55 -0800 Subject: [PATCH 26/28] Explicit data type override via df.set_data_type (#215) * coalesce data_types into data_type_lookup * merge fixed * merge conflicts * implemented and tested * black reformat * added failure tests and changed names * update test * update docs * add protected attribute and make property * access private var to avoid looping * fix type * black Co-authored-by: Doris Lee --- lux/core/frame.py | 40 +++++++++++++++- lux/executor/PandasExecutor.py | 83 +++++++++++++++++++--------------- tests/test_type.py | 35 ++++++++++++++ 3 files changed, 120 insertions(+), 38 deletions(-) diff --git a/lux/core/frame.py b/lux/core/frame.py index d1792cb1..1f346ed9 100644 --- a/lux/core/frame.py +++ b/lux/core/frame.py @@ -53,6 +53,7 @@ class LuxDataFrame(pd.DataFrame): "_message", "_pandas_only", "pre_aggregated", + "_type_override", ] def __init__(self, *args, **kw): @@ -81,6 +82,7 @@ def __init__(self, *args, **kw): self.cardinality = None self._min_max = None self.pre_aggregated = None + self._type_override = {} warnings.formatwarning = lux.warning_format @property @@ -248,6 +250,40 @@ def set_intent_as_vis(self, vis: Vis): self._intent = vis._inferred_intent self._parse_validate_compile_intent() + def set_data_type(self, types: 
dict): + """ + Set the data type for a particular attribute in the dataframe + overriding the automatically-detected type inferred by Lux + + Parameters + ---------- + types: dict + Dictionary that maps attribute/column name to a specified Lux Type. + Possible options: "nominal", "quantitative", "id", and "temporal". + + Example + ---------- + df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/absenteeism.csv") + df.set_data_type({"ID":"id", + "Reason for absence":"nominal"}) + """ + if self._type_override == None: + self._type_override = types + else: + self._type_override = {**self._type_override, **types} + + if not self.data_type: + self.maintain_metadata() + + for attr in types: + if types[attr] not in ["nominal", "quantitative", "id", "temporal"]: + raise ValueError( + f'Invalid data type option specified for {attr}. Please use one of the following supported types: ["nominal", "quantitative", "id", "temporal"]' + ) + self.data_type[attr] = types[attr] + + self.expire_recs() + def to_pandas(self): import lux.core @@ -364,7 +400,9 @@ def compute_SQL_data_type(self): sql_dtypes[attr] = datatype for attr in list(self.columns): - if str(attr).lower() in ["month", "year"]: + if attr in self._type_override: + data_type[attr] = self._type_override[attr] + elif str(attr).lower() in ["month", "year"]: data_type[attr] = "temporal" elif sql_dtypes[attr] in [ "character", diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 911d8d3b..ebbfed1b 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -399,46 +399,53 @@ def compute_data_type(self, ldf: LuxDataFrame): from pandas.api.types import is_datetime64_any_dtype as is_datetime for attr in list(ldf.columns): - temporal_var_list = ["month", "year", "day", "date", "time", "weekday"] - if is_datetime(ldf[attr]): - ldf._data_type[attr] = "temporal" - elif self._is_datetime_string(ldf[attr]): - ldf._data_type[attr] = "temporal" - elif 
isinstance(attr, pd._libs.tslibs.timestamps.Timestamp): - ldf._data_type[attr] = "temporal" - elif str(attr).lower() in temporal_var_list: - ldf._data_type[attr] = "temporal" - elif self._is_datetime_number(ldf[attr]): - ldf._data_type[attr] = "temporal" - elif pd.api.types.is_float_dtype(ldf.dtypes[attr]): - # int columns gets coerced into floats if contain NaN - convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes()) - if convertible2int and ldf.cardinality[attr] != len(ldf) and ldf.cardinality[attr] < 20: - ldf._data_type[attr] = "nominal" - else: - ldf._data_type[attr] = "quantitative" - elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]): - # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values - if ldf.pre_aggregated: - if ldf.cardinality[attr] == len(ldf): + if attr in ldf._type_override: + ldf._data_type[attr] = ldf._type_override[attr] + else: + temporal_var_list = ["month", "year", "day", "date", "time", "weekday"] + if is_datetime(ldf[attr]): + ldf._data_type[attr] = "temporal" + elif self._is_datetime_string(ldf[attr]): + ldf._data_type[attr] = "temporal" + elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp): + ldf._data_type[attr] = "temporal" + elif str(attr).lower() in temporal_var_list: + ldf._data_type[attr] = "temporal" + elif self._is_datetime_number(ldf[attr]): + ldf._data_type[attr] = "temporal" + elif pd.api.types.is_float_dtype(ldf.dtypes[attr]): + # int columns gets coerced into floats if contain NaN + convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes()) + if ( + convertible2int + and ldf.cardinality[attr] != len(ldf) + and ldf.cardinality[attr] < 20 + ): ldf._data_type[attr] = "nominal" - if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20: - ldf._data_type[attr] = "nominal" - else: - ldf._data_type[attr] = "quantitative" - if check_if_id_like(ldf, attr): - 
ldf._data_type[attr] = "id" - # Eliminate this clause because a single NaN value can cause the dtype to be object - elif pd.api.types.is_string_dtype(ldf.dtypes[attr]): - if check_if_id_like(ldf, attr): - ldf._data_type[attr] = "id" + else: + ldf._data_type[attr] = "quantitative" + elif pd.api.types.is_integer_dtype(ldf.dtypes[attr]): + # See if integer value is quantitative or nominal by checking if the ratio of cardinality/data size is less than 0.4 and if there are less than 10 unique values + if ldf.pre_aggregated: + if ldf.cardinality[attr] == len(ldf): + ldf._data_type[attr] = "nominal" + if ldf.cardinality[attr] / len(ldf) < 0.4 and ldf.cardinality[attr] < 20: + ldf._data_type[attr] = "nominal" + else: + ldf._data_type[attr] = "quantitative" + if check_if_id_like(ldf, attr): + ldf._data_type[attr] = "id" + # Eliminate this clause because a single NaN value can cause the dtype to be object + elif pd.api.types.is_string_dtype(ldf.dtypes[attr]): + if check_if_id_like(ldf, attr): + ldf._data_type[attr] = "id" + else: + ldf._data_type[attr] = "nominal" + # check if attribute is any type of datetime dtype + elif is_datetime_series(ldf.dtypes[attr]): + ldf._data_type[attr] = "temporal" else: ldf._data_type[attr] = "nominal" - # check if attribute is any type of datetime dtype - elif is_datetime_series(ldf.dtypes[attr]): - ldf._data_type[attr] = "temporal" - else: - ldf._data_type[attr] = "nominal" if not pd.api.types.is_integer_dtype(ldf.index) and ldf.index.name: ldf._data_type[ldf.index.name] = "nominal" @@ -456,6 +463,8 @@ def compute_data_type(self, ldf: LuxDataFrame): for attr in non_datetime_attrs: warn_msg += f"\tdf['{attr}'] = pd.to_datetime(df['{attr}'], format='')\n" warn_msg += "\nSee more at: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.to_datetime.html" + warn_msg += f"\nIf {attr} is not a temporal attribute, please use override Lux's automatically detected type:" + warn_msg += f"\n\tdf.set_data_type({{'{attr}':'quantitative'}})" 
warnings.warn(warn_msg, stacklevel=2) @staticmethod diff --git a/tests/test_type.py b/tests/test_type.py index badcd023..5a22c517 100644 --- a/tests/test_type.py +++ b/tests/test_type.py @@ -15,6 +15,7 @@ from .context import lux import pytest import pandas as pd +import warnings # Suite of test that checks if data_type inferred correctly by Lux @@ -203,3 +204,37 @@ def test_float_categorical(): ], "Float column should be detected as categorical" for x in list(df.dtypes): assert x == "float64", "Source dataframe preserved as float dtype" + + +def test_set_data_type(): + df = pd.read_csv( + "https://github.com/lux-org/lux-datasets/blob/master/data/real_estate_tutorial.csv?raw=true" + ) + with pytest.warns(UserWarning) as w: + df._repr_html_() + assert "starter template that you can use" in str(w[-1].message) + assert "df.set_data_type" in str(w[-1].message) + + df.set_data_type({"Month": "nominal", "Year": "nominal"}) + assert df.data_type["Month"] == "nominal" + assert df.data_type["Year"] == "nominal" + with warnings.catch_warnings() as w: + warnings.simplefilter("always") + df._repr_html_() + assert not w + + +def test_set_data_type_invalid(): + df = pd.read_csv( + "https://github.com/lux-org/lux-datasets/blob/master/data/real_estate_tutorial.csv?raw=true" + ) + with pytest.raises(ValueError): + df.set_data_type({"Month": "nomnal", "Year": "nomnal"}) + + +def test_set_wrong_data_type(): + df = pd.read_csv( + "https://github.com/lux-org/lux-datasets/blob/master/data/real_estate_tutorial.csv?raw=true" + ) + df.set_data_type({"Year": "quantitative"}) + assert df.data_type["Year"] == "quantitative" From 0defa4472b03b2f76cbce7f12ec79c0fc64d4065 Mon Sep 17 00:00:00 2001 From: 19thyneb Date: Mon, 18 Jan 2021 17:44:15 -0800 Subject: [PATCH 27/28] Updated data_type reference in SQLExecutor --- lux/executor/SQLExecutor.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/lux/executor/SQLExecutor.py b/lux/executor/SQLExecutor.py index 
215d50fe..f6d728a3 100644 --- a/lux/executor/SQLExecutor.py +++ b/lux/executor/SQLExecutor.py @@ -582,7 +582,7 @@ def compute_dataset_metadata(self, ldf: LuxDataFrame): self.get_SQL_attributes(ldf) for attr in list(ldf.columns): ldf[attr] = None - ldf.data_type = {} + ldf._data_type = {} #####NOTE: since we aren't expecting users to do much data processing with the SQL database, should we just keep this ##### in the initialization and do it just once self.compute_data_type(ldf) @@ -639,7 +639,7 @@ def compute_stats(self, ldf: LuxDataFrame): self.get_unique_values(ldf) # ldf.get_cardinality() for attribute in ldf.columns: - if ldf.data_type[attribute] == "quantitative": + if ldf._data_type[attribute] == "quantitative": min_max_query = pandas.read_sql( 'SELECT MIN("{}") as min, MAX("{}") as max FROM {}'.format( attribute, attribute, ldf.table_name @@ -753,9 +753,9 @@ def compute_data_type(self, ldf: LuxDataFrame): if ldf.cardinality[attr] < 13: data_type[attr] = "nominal" elif check_if_id_like(ldf, attr): - ldf.data_type[attr] = "id" + ldf._data_type[attr] = "id" else: data_type[attr] = "quantitative" elif "time" in sql_dtypes[attr] or "date" in sql_dtypes[attr]: data_type[attr] = "temporal" - ldf.data_type = data_type + ldf._data_type = data_type From 02906f9f8943566b756f9787d9f38a43b0671f93 Mon Sep 17 00:00:00 2001 From: 19thyneb Date: Thu, 21 Jan 2021 11:14:56 -0800 Subject: [PATCH 28/28] Update Datetime Numeric Check Changed to handle other pandas and numpy integer types such as int64. 
--- lux/executor/PandasExecutor.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index b67e87db..24fb0223 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -488,7 +488,7 @@ def _is_datetime_string(series): @staticmethod def _is_datetime_number(series): - if series.dtype == int: + if series.dtype == int or "int" in str(series.dtype): try: temp = series.astype(str) pd.to_datetime(temp)