Skip to content

Commit

Permalink
FIX-#419: Handle incompatibility issues between timedelta64 dtypes wi…
Browse files Browse the repository at this point in the history
…th Altair (#444)

* add test_timedeltas to reproduce #419

* add tests

* handle timedelta64

* make timedelta64 quantitative
  • Loading branch information
cgarciae committed Jan 22, 2022
1 parent 4c95211 commit a7e2cf7
Show file tree
Hide file tree
Showing 4 changed files with 84 additions and 6 deletions.
15 changes: 13 additions & 2 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,13 +18,14 @@
from lux.core.frame import LuxDataFrame
from lux.executor.Executor import Executor
from lux.utils import utils
from lux.utils.date_utils import is_datetime_series
from lux.utils.date_utils import is_datetime_series, is_timedelta64_series, timedelta64_to_float_seconds
from lux.utils.utils import check_import_lux_widget, check_if_id_like, is_numeric_nan_column
import warnings
import lux
from lux.utils.tracing_utils import LuxTracer



class PandasExecutor(Executor):
"""
Given a Vis objects with complete specifications, fetch and process data using Pandas dataframe operations.
Expand Down Expand Up @@ -296,6 +297,9 @@ def execute_binning(ldf: LuxDataFrame, vis: Vis):
if pd.api.types.is_object_dtype(series):
series = series.astype("float", errors="ignore")

if is_timedelta64_series(series):
series = timedelta64_to_float_seconds(series)

counts, bin_edges = np.histogram(series, bins=bin_attribute.bin_size)
# bin_edges of size N+1, so need to compute bin_start as the bin location
bin_start = bin_edges[0:-1]
Expand Down Expand Up @@ -435,7 +439,14 @@ def compute_data_type(self, ldf: LuxDataFrame):
ldf._data_type[attr] = ldf._type_override[attr]
else:
temporal_var_list = ["month", "year", "day", "date", "time", "weekday"]
if is_datetime(ldf[attr]):

if is_timedelta64_series(ldf[attr]):
ldf._data_type[attr] = "quantitative"
ldf._min_max[attr] = (
timedelta64_to_float_seconds(ldf[attr].min()),
timedelta64_to_float_seconds(ldf[attr].max()),
)
elif is_datetime(ldf[attr]):
ldf._data_type[attr] = "temporal"
elif self._is_datetime_string(ldf[attr]):
ldf._data_type[attr] = "temporal"
Expand Down
38 changes: 37 additions & 1 deletion lux/utils/date_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,44 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import re
from typing import Any

import lux
import pandas as pd
import numpy as np

# Pattern matching dtype strings of the form "timedelta64[<unit>]", e.g. "timedelta64[ns]".
# NOTE(review): not referenced in the visible part of this module — confirm it is still needed.
timedelta_re = re.compile(r"^timedelta64\[\w+\]$")


def is_timedelta64_series(series: pd.Series) -> bool:
    """
    Tell whether a Series holds timedelta64 values.

    Parameters
    ----------
    series : pd.Series
        The Series whose dtype is inspected.

    Returns
    -------
    is_timedelta: bool
        True when the dtype of ``series`` is a timedelta64 dtype, False otherwise.
    """
    is_timedelta = pd.api.types.is_timedelta64_dtype(series)
    return is_timedelta


def timedelta64_to_float_seconds(series: pd.Series) -> pd.Series:
    """
    Convert timedelta64 values to floating-point seconds.

    Parameters
    ----------
    series : pd.Series
        A Series of timedelta64 dtype. A scalar ``pd.Timedelta`` (for example
        the result of ``Series.min()``) is accepted as well and yields a float.

    Returns
    -------
    series: pd.Series
        The durations expressed in seconds as float64, with the same index as
        the input. Missing values (``NaT``) become ``NaN``.
    """
    # Divide by a one-second Timedelta instead of reinterpreting the raw int64
    # storage: the division is aware of the column's time unit (pandas >= 2.0
    # can store s/ms/us/ns), maps NaT to NaN rather than to the int64 sentinel,
    # and avoids the deprecated ``Series.view``.
    return series / pd.Timedelta(seconds=1)


def date_formatter(time_stamp, ldf):
Expand Down
15 changes: 12 additions & 3 deletions lux/vislib/altair/AltairChart.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.

import pandas as pd
import re

import lux
import numpy as np
import altair as alt
import pandas as pd
from lux.utils.date_utils import compute_date_granularity
import lux

import altair as alt


class AltairChart:
Expand Down Expand Up @@ -123,11 +126,17 @@ def initialize_chart(self):

@classmethod
def sanitize_dataframe(self, df):
from lux.utils.date_utils import is_timedelta64_series, timedelta64_to_float_seconds

for attr in df.columns:
# Check if dtype is unrecognized by Altair (#247)
if str(df[attr].dtype) == "Float64":
df[attr] = df[attr].astype(np.float64)

# Check for timedelta64[...] dtype
if is_timedelta64_series(df[attr]):
df[attr] = timedelta64_to_float_seconds(df[attr])

# Altair can not visualize non-string columns
# convert all non-string columns in to strings
df = df.rename(columns={attr: str(attr)})
Expand Down
22 changes: 22 additions & 0 deletions tests/test_pandas.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,17 @@ def test_infs():
df._ipython_display_()


def test_timedeltas():
    """Smoke test: a frame with a timedelta64[ms] column can be displayed."""
    row_count = 100_000

    # One float column and the same values reinterpreted as millisecond durations.
    floats = np.random.uniform(0, 10, size=row_count)
    frame = pd.DataFrame({"c1": floats, "c2": floats.astype("timedelta64[ms]")})

    frame._ipython_display_()


def test_datetime_index():
nrows = 10

Expand All @@ -92,6 +103,17 @@ def test_datetime_index():
df._ipython_display_()


def test_interval():
    """Smoke test: a frame with a pd.Interval column can be displayed."""
    row_count = 100_000

    # A single Interval scalar is broadcast against the float column's length.
    span = pd.Interval(left=0, right=row_count)
    floats = np.random.uniform(0, 10, size=row_count)
    frame = pd.DataFrame({"c1": span, "c2": floats})

    frame._ipython_display_()


def test_datetime_index_serialize():
nrows = 100000

Expand Down

0 comments on commit a7e2cf7

Please sign in to comment.