Skip to content

Commit

Permalink
Temporal type detection for integers via PandasExecutor._is_datetime_…
Browse files Browse the repository at this point in the history
…number (#232)

* Implement _is_datetime_number, which uses pandas' built-in to_datetime to predict whether a numeric series holds temporal values

* Add test_check_datetime_numeric_values in test_type.py

* Format via Black

* black and remove print

Co-authored-by: Doris Lee <dorisjunglinlee@gmail.com>
  • Loading branch information
micahtyong and dorisjlee committed Jan 16, 2021
1 parent f70e1fd commit 23e4a79
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 3 deletions.
17 changes: 14 additions & 3 deletions lux/executor/PandasExecutor.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
from lux.utils import utils
from lux.utils.date_utils import is_datetime_series
from lux.utils.utils import check_import_lux_widget, check_if_id_like
from lux.utils.date_utils import is_datetime_series
import warnings
import lux

Expand Down Expand Up @@ -400,7 +399,7 @@ def compute_data_type(self, ldf: LuxDataFrame):
from pandas.api.types import is_datetime64_any_dtype as is_datetime

for attr in list(ldf.columns):
temporal_var_list = ["month", "year", "day", "date", "time"]
temporal_var_list = ["month", "year", "day", "date", "time", "weekday"]
if is_datetime(ldf[attr]):
ldf.data_type[attr] = "temporal"
elif self._is_datetime_string(ldf[attr]):
Expand All @@ -409,6 +408,8 @@ def compute_data_type(self, ldf: LuxDataFrame):
ldf.data_type[attr] = "temporal"
elif str(attr).lower() in temporal_var_list:
ldf.data_type[attr] = "temporal"
elif self._is_datetime_number(ldf[attr]):
ldf.data_type[attr] = "temporal"
elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
# int columns gets coerced into floats if contain NaN
convertible2int = pd.api.types.is_integer_dtype(ldf[attr].convert_dtypes())
Expand Down Expand Up @@ -472,11 +473,21 @@ def _is_datetime_string(series):
datetime_col = pd.to_datetime(series)
except Exception as e:
return False

if datetime_col is not None:
return True
return False

@staticmethod
def _is_datetime_number(series):
    """Return whether an integer series looks like it holds datetime values.

    The check is a heuristic: the values are rendered as strings and handed
    to ``pd.to_datetime``; if parsing succeeds the series is considered
    temporal.

    Parameters
    ----------
    series : pd.Series
        Column to inspect.

    Returns
    -------
    bool
        ``True`` if ``series`` has an integer dtype and ``pd.to_datetime``
        can parse its stringified values, ``False`` otherwise.
    """
    # `series.dtype == int` only matches the platform's default integer width,
    # so int32 / unsigned / nullable integer columns were silently skipped.
    if not pd.api.types.is_integer_dtype(series.dtype):
        return False
    try:
        # Parse the string form: raw integers given to to_datetime are read as
        # epoch offsets, which would "succeed" for almost any number.
        pd.to_datetime(series.astype(str))
    except Exception:
        # Best-effort detection: any parsing failure means "not temporal".
        return False
    return True

def compute_stats(self, ldf: LuxDataFrame):
# precompute statistics
ldf.unique_values = {}
Expand Down
15 changes: 15 additions & 0 deletions tests/test_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,6 +124,21 @@ def test_check_datetime():
}


def test_check_datetime_numeric_values():
    """Columns renamed to "blah" must still be classified as temporal."""
    # Local dataset: rename "Year" so the column name no longer says "year".
    cars = pd.read_csv("lux/data/car.csv").rename(columns={"Year": "blah"})
    cars.maintain_metadata()
    assert cars.data_type["blah"] == "temporal"

    # Remote dataset: same idea for "year"; "release_date" keeps its name.
    spotify_url = "https://raw.githubusercontent.com/lux-org/lux-datasets/master/data/spotify.csv"
    songs = pd.read_csv(spotify_url).rename(columns={"year": "blah"})
    songs.maintain_metadata()
    for column in ("blah", "release_date"):
        assert songs.data_type[column] == "temporal"


def test_check_stock():
df = pd.read_csv("https://github.com/lux-org/lux-datasets/blob/master/data/stocks.csv?raw=true")
df.maintain_metadata()
Expand Down

0 comments on commit 23e4a79

Please sign in to comment.