diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py
index f396c86b..806f47ba 100644
--- a/lux/executor/PandasExecutor.py
+++ b/lux/executor/PandasExecutor.py
@@ -383,12 +383,16 @@ def compute_dataset_metadata(self, ldf: LuxDataFrame):
         self.compute_data_model(ldf)
 
     def compute_data_type(self, ldf: LuxDataFrame):
+        from pandas.api.types import is_datetime64_any_dtype as is_datetime
+
         for attr in list(ldf.columns):
             temporal_var_list = ["month", "year", "day", "date", "time"]
-            if isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
-                # If timestamp, make the dictionary keys the _repr_ (e.g., TimeStamp('2020-04-05 00.000')--> '2020-04-05')
+            if is_datetime(ldf[attr]):
+                ldf.data_type_lookup[attr] = "temporal"
+            elif self._is_datetime_string(ldf[attr]):
+                ldf.data_type_lookup[attr] = "temporal"
+            elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp):
                 ldf.data_type_lookup[attr] = "temporal"
-            # elif any(var in str(attr).lower() for var in temporal_var_list):
             elif str(attr).lower() in temporal_var_list:
                 ldf.data_type_lookup[attr] = "temporal"
             elif pd.api.types.is_float_dtype(ldf.dtypes[attr]):
@@ -425,8 +429,6 @@ def compute_data_type(self, ldf: LuxDataFrame):
             ldf.data_type_lookup[ldf.index.name] = "nominal"
         ldf.data_type = self.mapping(ldf.data_type_lookup)
 
-        from pandas.api.types import is_datetime64_any_dtype as is_datetime
-
         non_datetime_attrs = []
         for attr in ldf.columns:
             if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]):
@@ -450,6 +452,45 @@ def compute_data_type(self, ldf: LuxDataFrame):
                 stacklevel=2,
             )
 
+    def _is_datetime_string(self, series):
+        """
+        Check whether an object-dtype column holds datetime strings.
+
+        Parameters
+        ----------
+        series : pd.Series
+            Column to inspect.
+
+        Returns
+        -------
+        bool
+            True if the column is object-dtype, is not parseable as numeric,
+            and the probed rows are accepted by pd.to_datetime.
+        """
+        # Check the dtype first so non-object columns skip the sampling work.
+        if series.dtype != object:
+            return False
+
+        # Probe at most 100 rows; the fixed seed keeps the inferred type
+        # reproducible across repeated metadata computations.
+        if len(series) > 100:
+            series = series.sample(100, random_state=0)
+
+        # Numeric-looking strings (e.g. "2020") must not be treated as dates.
+        try:
+            pd.to_numeric(series)
+        except Exception:
+            pass  # not numeric: fall through to the datetime probe
+        else:
+            return False
+
+        # Best-effort probe: any parsing failure means "not temporal".
+        try:
+            pd.to_datetime(series)
+        except Exception:
+            return False
+        return True
+
     def compute_data_model(self, ldf: LuxDataFrame):
         ldf.data_model = {
             "measure": ldf.data_type["quantitative"],
diff --git a/tests/test_type.py b/tests/test_type.py
index f71766c0..0738d8bd 100644
--- a/tests/test_type.py
+++ b/tests/test_type.py
@@ -97,13 +97,51 @@ def test_check_airbnb():
         "price": "quantitative",
         "minimum_nights": "quantitative",
         "number_of_reviews": "quantitative",
-        "last_review": "nominal",
+        "last_review": "temporal",
         "reviews_per_month": "quantitative",
         "calculated_host_listings_count": "quantitative",
         "availability_365": "quantitative",
     }
 
 
+def test_check_datetime():
+    df = pd.DataFrame(
+        {
+            "a": ["2020-01-01"],
+            "b": ["20-01-01"],
+            "c": ["20-jan-01"],
+            "d": ["20-january-01"],
+            "e": ["2020 January 01"],
+            "f": ["2020 January 01 00:00:00 pm PT"],
+            "g": ["2020 January 01 13:00:00"],
+            "h": ["2020 January 01 23:59:59 GTC-6"],
+        }
+    )
+    df.maintain_metadata()
+    assert df.data_type_lookup == {
+        "a": "temporal",
+        "b": "temporal",
+        "c": "temporal",
+        "d": "temporal",
+        "e": "temporal",
+        "f": "temporal",
+        "g": "temporal",
+        "h": "temporal",
+    }
+
+
+def test_check_stock():
+    df = pd.read_csv(
+        "https://github.com/lux-org/lux-datasets/blob/master/data/stocks.csv?raw=true"
+    )
+    df.maintain_metadata()
+    assert df.data_type_lookup == {
+        "symbol": "nominal",
+        "monthdate": "temporal",
+        "price": "quantitative",
+    }, "Stock dataset type detection error"
+
+
 def test_check_college():
     df = pd.read_csv("lux/data/college.csv")
     df.maintain_metadata()