Skip to content

Commit

Permalink
Merge branch 'master' into sql-engine
Browse files Browse the repository at this point in the history
  • Loading branch information
dorisjlee committed Apr 11, 2021
2 parents 40b85b1 + bab48ff commit 2298f13
Show file tree
Hide file tree
Showing 11 changed files with 125 additions and 46 deletions.
5 changes: 4 additions & 1 deletion lux/action/univariate.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,10 +84,13 @@ def univariate(ldf, *args):
examples = f" (e.g., {possible_attributes[0]})"
intent = [lux.Clause("?", data_type="geographical"), lux.Clause("?", data_model="measure")]
intent.extend(filter_specs)
long_description = f"Geographical displays <a href='https://en.wikipedia.org/wiki/Choropleth_map'>choropleths</a> for geographic attribute{examples}, with colors indicating the average measure values. "
if lux.config.plotting_backend == "matplotlib":
long_description += "The map visualizations from the 'Geographical' tab are rendered using <a href='https://altair-viz.github.io/'>Altair</a>. Lux does not currently support geographical maps with Matplotlib. If you would like this feature, please leave us a comment at <a href='https://github.com/lux-org/lux/issues/310'>issue #310</a> to let us know!"
recommendation = {
"action": "Geographical",
"description": "Show choropleth maps of <p class='highlight-descriptor'>geographic</p> attributes",
"long_description": f"Occurence displays choropleths of averages for some geographic attribute{examples}. Visualizations are ranked by diversity of the geographic attribute.",
"long_description": long_description,
}
elif data_type_constraint == "temporal":
intent = [lux.Clause("?", data_type="temporal")]
Expand Down
60 changes: 34 additions & 26 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
from lux.executor.Executor import Executor
from lux.utils import utils
from lux.utils.date_utils import is_datetime_series
from lux.utils.utils import check_import_lux_widget, check_if_id_like
from lux.utils.utils import check_import_lux_widget, check_if_id_like, is_numeric_nan_column
import warnings
import lux

Expand Down Expand Up @@ -97,7 +97,7 @@ def execute(vislist: VisList, ldf: LuxDataFrame):
if vis.mark == "bar" or vis.mark == "line" or vis.mark == "geographical":
PandasExecutor.execute_aggregate(vis, isFiltered=filter_executed)
elif vis.mark == "histogram":
PandasExecutor.execute_binning(vis)
PandasExecutor.execute_binning(ldf, vis)
elif vis.mark == "scatter":
HBIN_START = 5000
if lux.config.heatmap and len(ldf) > HBIN_START:
Expand Down Expand Up @@ -259,7 +259,7 @@ def execute_aggregate(vis: Vis, isFiltered=True):
vis._vis_data = vis._vis_data.drop(columns="index")

@staticmethod
def execute_binning(vis: Vis):
def execute_binning(ldf, vis: Vis):
"""
Binning of data points for generating histograms
Expand All @@ -278,16 +278,22 @@ def execute_binning(vis: Vis):

bin_attribute = list(filter(lambda x: x.bin_size != 0, vis._inferred_intent))[0]
bin_attr = bin_attribute.attribute
if not np.isnan(vis.data[bin_attr]).all():
# np.histogram breaks if array contain NaN
series = vis.data[bin_attr].dropna()
# TODO: binning runs for the name attribute, whose datatype is (incorrectly) quantitative.
counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
# TODO: Should vis.data be a LuxDataFrame or a Pandas DataFrame?
binned_result = np.array([bin_start, counts]).T
vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])
series = vis.data[bin_attr]

if series.hasnans:
ldf._message.add_unique(
f"The column <code>{bin_attr}</code> contains missing values, not shown in the displayed histogram.",
priority=100,
)
series = series.dropna()
if pd.api.types.is_object_dtype(series):
series = series.astype("float", errors="ignore")

counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
binned_result = np.array([bin_start, counts]).T
vis._vis_data = pd.DataFrame(binned_result, columns=[bin_attr, "Number of Records"])

@staticmethod
def execute_filter(vis: Vis):
Expand Down Expand Up @@ -422,13 +428,8 @@ def compute_data_type(self, ldf: LuxDataFrame):
elif self._is_geographical_attribute(ldf[attr]):
ldf._data_type[attr] = "geographical"
elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
# int columns gets coerced into floats if contain NaN
convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
if (
convertible2int
and ldf.cardinality[attr] != len(ldf)
and (len(ldf[attr].convert_dtypes().unique() < 20))
):

if ldf.cardinality[attr] != len(ldf) and (ldf.cardinality[attr] < 20):
ldf._data_type[attr] = "nominal"
else:
ldf._data_type[attr] = "quantitative"
Expand All @@ -445,7 +446,17 @@ def compute_data_type(self, ldf: LuxDataFrame):
ldf._data_type[attr] = "id"
# Eliminate this clause because a single NaN value can cause the dtype to be object
elif pd.api.types.is_string_dtype(ldf.dtypes[attr]):
if check_if_id_like(ldf, attr):
# Check first if it's castable to float after removing NaN
is_numeric_nan, series = is_numeric_nan_column(ldf[attr])
if is_numeric_nan:
# int columns gets coerced into floats if contain NaN
ldf._data_type[attr] = "quantitative"
# min max was not computed since object type, so recompute here
ldf._min_max[attr] = (
series.min(),
series.max(),
)
elif check_if_id_like(ldf, attr):
ldf._data_type[attr] = "id"
else:
ldf._data_type[attr] = "nominal"
Expand Down Expand Up @@ -527,11 +538,8 @@ def compute_stats(self, ldf: LuxDataFrame):
else:
attribute_repr = attribute

if ldf.dtypes[attribute] != "float64" or ldf[attribute].isnull().values.any():
ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute])
else:
ldf.cardinality[attribute_repr] = 999 # special value for non-numeric attribute
ldf.unique_values[attribute_repr] = list(ldf[attribute].unique())
ldf.cardinality[attribute_repr] = len(ldf.unique_values[attribute_repr])

if pd.api.types.is_float_dtype(ldf.dtypes[attribute]) or pd.api.types.is_integer_dtype(
ldf.dtypes[attribute]
Expand Down
12 changes: 12 additions & 0 deletions lux/utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,3 +131,15 @@ def matplotlib_setup(w, h):
ax.spines["right"].set_color("#dddddd")
ax.spines["top"].set_color("#dddddd")
return fig, ax


def is_numeric_nan_column(series):
    """Check whether an object-dtype series is numeric once NaNs are removed.

    Numeric columns that contain NaN can get coerced to ``object`` dtype, so a
    plain dtype check misclassifies them. This helper drops the NaNs and
    attempts a float cast to detect that case.

    Parameters
    ----------
    series : pd.Series
        The column to inspect.

    Returns
    -------
    tuple
        ``(True, cast_series)`` when the NaN-free values cast cleanly to
        float; otherwise ``(False, series)`` where ``series`` is the input
        (NaN-dropped if it was object dtype and had NaNs).
    """
    if series.dtype == object:
        if series.hasnans:
            series = series.dropna()
        try:
            return True, series.astype("float")
        # astype("float") raises ValueError for non-numeric strings and
        # TypeError for uncastable objects; catch only those instead of the
        # previous bare `except Exception as e` (which also bound an unused
        # variable).
        except (ValueError, TypeError):
            return False, series
    else:
        return False, series
2 changes: 1 addition & 1 deletion lux/vislib/altair/Choropleth.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ def __init__(self, dobj):
super().__init__(dobj)

def __repr__(self):
return f"Proportional Symbol Map <{str(self.vis)}>"
return f"Choropleth Map <{str(self.vis)}>"

def initialize_chart(self):
x_attr = self.vis.get_attr_by_channel("x")[0]
Expand Down
3 changes: 3 additions & 0 deletions lux/vislib/matplotlib/MatplotlibRenderer.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
from lux.vislib.matplotlib.LineChart import LineChart
from lux.vislib.matplotlib.Histogram import Histogram
from lux.vislib.matplotlib.Heatmap import Heatmap
from lux.vislib.altair.AltairRenderer import AltairRenderer
import matplotlib.pyplot as plt
from lux.utils.utils import matplotlib_setup

Expand Down Expand Up @@ -81,6 +82,8 @@ def create_vis(self, vis, standalone=True):
chart = LineChart(vis, fig, ax)
elif vis.mark == "heatmap":
chart = Heatmap(vis, fig, ax)
elif vis.mark == "geographical":
return AltairRenderer().create_vis(vis, False)
else:
chart = None
return chart
Expand Down
8 changes: 4 additions & 4 deletions lux/vislib/matplotlib/ScatterChart.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,7 +66,7 @@ def initialize_chart(self):
vals = [unique.index(i) for i in colors]
if color_attr_type == "quantitative":
self.fig, self.ax = matplotlib_setup(7, 5)
set_fig_code = "fig, ax = plt.subplots(7, 5)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(7, 5))\n"
self.ax.scatter(x_pts, y_pts, c=vals, cmap="Blues", alpha=0.5)
plot_code += f"ax.scatter(x_pts, y_pts, c={vals}, cmap='Blues', alpha=0.5)\n"
my_cmap = plt.cm.get_cmap("Blues")
Expand Down Expand Up @@ -96,10 +96,10 @@ def initialize_chart(self):
maxlen = len(unique[i])
if maxlen > 20:
self.fig, self.ax = matplotlib_setup(9, 5)
set_fig_code = "fig, ax = plt.subplots(9, 5)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(9, 5))\n"
else:
self.fig, self.ax = matplotlib_setup(7, 5)
set_fig_code = "fig, ax = plt.subplots(7, 5)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(7, 5))\n"

cmap = "Set1"
if len(unique) > 9:
Expand Down Expand Up @@ -131,7 +131,7 @@ def initialize_chart(self):
fontsize='13')\n"""
plot_code += "scatter.set_alpha(0.5)\n"
else:
set_fig_code = "fig, ax = plt.subplots(4.5, 4)\n"
set_fig_code = "fig, ax = plt.subplots(figsize=(4.5, 4))\n"
self.ax.scatter(x_pts, y_pts, alpha=0.5)
plot_code += f"ax.scatter(x_pts, y_pts, alpha=0.5)\n"
self.ax.set_xlabel(x_attr_abv, fontsize="15")
Expand Down
3 changes: 2 additions & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -6,4 +6,5 @@ xlrd
black
# Install only to use SQLExecutor
psycopg2>=2.8.5
psycopg2-binary>=2.8.5
psycopg2-binary>=2.8.5
lxml
21 changes: 12 additions & 9 deletions tests/test_interestingness.py
Original file line number Diff line number Diff line change
Expand Up @@ -328,25 +328,28 @@ def test_interestingness_deviation_nan():
import numpy as np

dataset = [
{"date": "2017-08-25 09:06:11+00:00", "category": "A", "value": 25.0},
{"date": "2017-08-25 09:06:11+00:00", "category": "B", "value": 1.2},
{"date": "2017-08-25 09:06:11+00:00", "category": "C", "value": 1.3},
{"date": "2017-08-25 09:06:11+00:00", "category": "D", "value": 1.4},
{"date": "2017-08-25 09:06:11+00:00", "category": "E", "value": 1.5},
{"date": "2017-08-25 09:06:11+00:00", "category": "F", "value": 0.1},
{"date": "2017-08-25", "category": "A", "value": 25.0},
{"date": "2017-08-25", "category": "B", "value": 1.2},
{"date": "2017-08-25", "category": "C", "value": 1.3},
{"date": "2017-08-25", "category": "D", "value": 1.4},
{"date": "2017-08-25", "category": "E", "value": 1.5},
{"date": "2017-08-25", "category": "F", "value": 0.1},
{"date": np.nan, "category": "C", "value": 0.2},
{"date": np.nan, "category": "B", "value": 0.2},
{"date": np.nan, "category": "F", "value": 0.3},
{"date": np.nan, "category": "E", "value": 0.3},
{"date": np.nan, "category": "D", "value": 0.4},
{"date": np.nan, "category": "A", "value": 10.4},
{"date": "2017-07-25 15:06:11+00:00", "category": "A", "value": 15.5},
{"date": "2017-07-25 15:06:11+00:00", "category": "F", "value": 1.0},
{"date": "2017-07-25 15:06:11+00:00", "category": "B", "value": 0.1},
{"date": "2017-07-25", "category": "A", "value": 15.5},
{"date": "2017-07-25", "category": "F", "value": 1.0},
{"date": "2017-07-25", "category": "B", "value": 0.1},
]
test = pd.DataFrame(dataset)
from lux.vis.Vis import Vis

test["date"] = pd.to_datetime(test["date"], format="%Y-%M-%d")
test.set_data_type({"value": "quantitative"})

vis = Vis(["date", "value", "category=A"], test)
vis2 = Vis(["date", "value", "category=B"], test)
from lux.interestingness.interestingness import interestingness
Expand Down
27 changes: 27 additions & 0 deletions tests/test_nan.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,3 +113,30 @@ def test_nan_series_occurence():
ldf = pd.DataFrame(nan_series, columns=["col"])
ldf._ipython_display_()
assert ldf.recommendation["Occurrence"][0].mark == "bar"


def test_numeric_with_nan():
    """Numeric columns containing NaN should be typed quantitative and still plotted."""
    df = pd.read_html(
        "https://archive.ics.uci.edu/ml/datasets.php?format=&task=&att=&area=&numAtt=&numIns=&type=&sort=nameUp&view=table"
    )[5]
    # The first scraped row holds the header labels; promote it and drop it.
    df.columns = df.loc[0]
    df = df.loc[1:]
    df["Year"] = pd.to_datetime(df["Year"], format="%Y")
    assert (
        df.data_type["# Instances"] == "quantitative"
    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
    assert (
        df.data_type["# Attributes"] == "quantitative"
    ), "Testing a numeric columns with NaN, check if type can be detected correctly"
    a = df[["# Instances", "# Attributes"]]
    a._ipython_display_()
    assert (
        len(a.recommendation["Distribution"]) == 2
    ), "Testing a numeric columns with NaN, check that histograms are displayed"
    assert "contains missing values" in a._message.to_html(), "Warning message for NaN displayed"
    a = a.dropna()
    a._ipython_display_()
    assert (
        len(a.recommendation["Distribution"]) == 2
    ), "Example where dtype might be off after dropna(), check if histograms are still displayed"
    # BUG FIX: the original `assert "" in a._message.to_html()` was vacuously
    # true (the empty string is a substring of any string). Assert that the
    # NaN warning is actually absent after dropna().
    assert (
        "contains missing values" not in a._message.to_html()
    ), "No warning message for NaN should be displayed"
6 changes: 3 additions & 3 deletions tests/test_pandas_coverage.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,7 +257,7 @@ def test_transform(global_var):
df["Year"] = pd.to_datetime(df["Year"], format="%Y")
new_df = df.iloc[:, 1:].groupby("Origin").transform(sum)
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Occurrence"]
assert len(new_df.cardinality) == 7


Expand Down Expand Up @@ -409,7 +409,7 @@ def test_loc(global_var):
assert len(new_df.cardinality) == 6
new_df = df.loc[0:10, "Displacement":"Horsepower"]
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
assert len(new_df.cardinality) == 2
import numpy as np

Expand Down Expand Up @@ -438,7 +438,7 @@ def test_iloc(global_var):
assert len(new_df.cardinality) == 6
new_df = df.iloc[0:11, 3:5]
new_df._ipython_display_()
assert list(new_df.recommendation.keys()) == ["Distribution", "Occurrence"]
assert list(new_df.recommendation.keys()) == ["Correlation", "Distribution"]
assert len(new_df.cardinality) == 2
import numpy as np

Expand Down
24 changes: 23 additions & 1 deletion tests/test_vis.py
Original file line number Diff line number Diff line change
Expand Up @@ -465,7 +465,7 @@ def test_vegalite_default_actions_registered_2(global_var):
df["magnitude"] = np.random.randint(0, 20, size=len(df))
lux.config.plotting_backend = "vegalite"

# Symbol Map
# Choropleth Map
assert "Geographical" in df.recommendation
assert len(df.recommendation["Geographical"]) > 0

Expand Down Expand Up @@ -499,6 +499,28 @@ def test_matplotlib_default_actions_registered(global_var):
assert len(df.recommendation["Correlation"]) > 0


def test_matplotlib_default_actions_registered_2(global_var):
    """Default actions (choropleth, occurrence, scatter) register under matplotlib."""
    import numpy as np

    airports = pd.read_csv(
        "https://raw.githubusercontent.com/altair-viz/vega_datasets/master/vega_datasets/_data/airports.csv"
    )
    airports["magnitude"] = np.random.randint(0, 20, size=len(airports))
    lux.config.plotting_backend = "matplotlib"

    # Choropleth Map, Occurrence Chart, and Scatter Chart respectively.
    for action in ("Geographical", "Occurrence", "Correlation"):
        assert action in airports.recommendation
        assert len(airports.recommendation[action]) > 0


def test_vegalite_heatmap_flag_config():
df = pd.read_csv("https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/airbnb_nyc.csv")
lux.config.plotting_backend = "vegalite"
Expand Down

0 comments on commit 2298f13

Please sign in to comment.