From 0ac411dd1558b46f9556b7f87a6017ae9074e380 Mon Sep 17 00:00:00 2001 From: jrdzha Date: Thu, 12 Nov 2020 22:16:09 -0800 Subject: [PATCH 1/4] Updated temporal detection and tests --- lux/executor/PandasExecutor.py | 33 ++++++++++++++++++++++++++++++--- tests/test_type.py | 26 +++++++++++++++++++++++++- 2 files changed, 55 insertions(+), 4 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index f396c86b..225d0c33 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -383,9 +383,15 @@ def compute_dataset_metadata(self, ldf: LuxDataFrame): self.compute_data_model(ldf) def compute_data_type(self, ldf: LuxDataFrame): + from pandas.api.types import is_datetime64_any_dtype as is_datetime + for attr in list(ldf.columns): temporal_var_list = ["month", "year", "day", "date", "time"] - if isinstance(attr, pd._libs.tslibs.timestamps.Timestamp): + if is_datetime(ldf[attr]): + ldf.data_type_lookup[attr] = "temporal" + elif self._is_datetime_string(ldf[attr]): + ldf.data_type_lookup[attr] = "temporal" + elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp): # If timestamp, make the dictionary keys the _repr_ (e.g., TimeStamp('2020-04-05 00.000')--> '2020-04-05') ldf.data_type_lookup[attr] = "temporal" # elif any(var in str(attr).lower() for var in temporal_var_list): @@ -425,8 +431,6 @@ def compute_data_type(self, ldf: LuxDataFrame): ldf.data_type_lookup[ldf.index.name] = "nominal" ldf.data_type = self.mapping(ldf.data_type_lookup) - from pandas.api.types import is_datetime64_any_dtype as is_datetime - non_datetime_attrs = [] for attr in ldf.columns: if ldf.data_type_lookup[attr] == "temporal" and not is_datetime(ldf[attr]): @@ -450,6 +454,29 @@ def compute_data_type(self, ldf: LuxDataFrame): stacklevel=2, ) + def _is_datetime_string(self, series): + if (len(series) > 100): + series = series.sample(100) + + if (series.dtype == object): + + not_numeric = False + try: + pd.to_numeric(series) + except Exception as e: + not_numeric = True + + datetime_col = None + if (not_numeric): + try: + datetime_col = pd.to_datetime(series) + except Exception as e: + return False + + if (datetime_col is not None): + return True + return False + def compute_data_model(self, ldf: LuxDataFrame): ldf.data_model = { "measure": ldf.data_type["quantitative"], diff --git a/tests/test_type.py b/tests/test_type.py index f71766c0..a24019c0 100644 --- a/tests/test_type.py +++ b/tests/test_type.py @@ -97,13 +97,37 @@ def test_check_airbnb(): "price": "quantitative", "minimum_nights": "quantitative", "number_of_reviews": "quantitative", - "last_review": "nominal", + "last_review": "temporal", "reviews_per_month": "quantitative", "calculated_host_listings_count": "quantitative", "availability_365": "quantitative", } +def test_check_datetime(): + df = pd.DataFrame({ + "a": ["2020-01-01"], + "b": ["20-01-01"], + "c": ["20-jan-01"], + "d": ["20-january-01"], + "e": ["2020 January 01"], + "f": ["2020 January 01 00:00:00 pm PT"], + "g": ["2020 January 01 13:00:00"], + "h": ["2020 January 01 23:59:59 GTC-6"] + }) + df.maintain_metadata() + assert df.data_type_lookup == { + "a": "temporal", + "b": "temporal", + "c": "temporal", + "d": "temporal", + "e": "temporal", + "f": "temporal", + "g": "temporal", + "h": "temporal" + } + + def test_check_college(): df = pd.read_csv("lux/data/college.csv") df.maintain_metadata() From cf6d42f235594011dc8b37fac4f215158eda3406 Mon Sep 17 00:00:00 2001 From: jrdzha Date: Thu, 12 Nov 2020 22:38:22 -0800 Subject: [PATCH 2/4] Reformatted code with black --- lux/executor/PandasExecutor.py | 8 ++++---- tests/test_type.py | 24 +++++++++++++----------- 2 files changed, 17 insertions(+), 15 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index 225d0c33..ff801a17 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -455,10 +455,10 @@ def compute_data_type(self, ldf: LuxDataFrame): ) def _is_datetime_string(self, series): - if (len(series) > 100): + if len(series) > 100: series = series.sample(100) - if (series.dtype == object): + if series.dtype == object: not_numeric = False try: @@ -467,13 +467,13 @@ def _is_datetime_string(self, series): not_numeric = True datetime_col = None - if (not_numeric): + if not_numeric: try: datetime_col = pd.to_datetime(series) except Exception as e: return False - if (datetime_col is not None): + if datetime_col is not None: return True return False diff --git a/tests/test_type.py b/tests/test_type.py index a24019c0..a6a8fc15 100644 --- a/tests/test_type.py +++ b/tests/test_type.py @@ -105,16 +105,18 @@ def test_check_airbnb(): def test_check_datetime(): - df = pd.DataFrame({ - "a": ["2020-01-01"], - "b": ["20-01-01"], - "c": ["20-jan-01"], - "d": ["20-january-01"], - "e": ["2020 January 01"], - "f": ["2020 January 01 00:00:00 pm PT"], - "g": ["2020 January 01 13:00:00"], - "h": ["2020 January 01 23:59:59 GTC-6"] - }) + df = pd.DataFrame( + { + "a": ["2020-01-01"], + "b": ["20-01-01"], + "c": ["20-jan-01"], + "d": ["20-january-01"], + "e": ["2020 January 01"], + "f": ["2020 January 01 00:00:00 pm PT"], + "g": ["2020 January 01 13:00:00"], + "h": ["2020 January 01 23:59:59 GTC-6"], + } + ) df.maintain_metadata() assert df.data_type_lookup == { "a": "temporal", @@ -124,7 +126,7 @@ def test_check_datetime(): "e": "temporal", "f": "temporal", "g": "temporal", - "h": "temporal" + "h": "temporal", } From 524203b8e5b4f92e23d58b2b860b20f8996420ab Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Fri, 13 Nov 2020 16:55:38 +0800 Subject: [PATCH 3/4] Update PandasExecutor.py --- lux/executor/PandasExecutor.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/lux/executor/PandasExecutor.py b/lux/executor/PandasExecutor.py index ff801a17..806f47ba 100644 --- a/lux/executor/PandasExecutor.py +++ b/lux/executor/PandasExecutor.py @@ -392,9 +392,7 @@ def compute_data_type(self, ldf: LuxDataFrame): elif self._is_datetime_string(ldf[attr]): ldf.data_type_lookup[attr] = "temporal" elif isinstance(attr, pd._libs.tslibs.timestamps.Timestamp): - # If timestamp, make the dictionary keys the _repr_ (e.g., TimeStamp('2020-04-05 00.000')--> '2020-04-05') ldf.data_type_lookup[attr] = "temporal" - # elif any(var in str(attr).lower() for var in temporal_var_list): elif str(attr).lower() in temporal_var_list: ldf.data_type_lookup[attr] = "temporal" elif pd.api.types.is_float_dtype(ldf.dtypes[attr]): From 4c54e24005c4f849ae647a9e0a761273eca57bac Mon Sep 17 00:00:00 2001 From: Doris Lee Date: Fri, 13 Nov 2020 17:10:58 +0800 Subject: [PATCH 4/4] added stock date test --- tests/test_type.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tests/test_type.py b/tests/test_type.py index a6a8fc15..0738d8bd 100644 --- a/tests/test_type.py +++ b/tests/test_type.py @@ -130,6 +130,18 @@ def test_check_datetime(): } +def test_check_stock(): + df = pd.read_csv( + "https://github.com/lux-org/lux-datasets/blob/master/data/stocks.csv?raw=true" + ) + df.maintain_metadata() + assert df.data_type_lookup == { + "symbol": "nominal", + "monthdate": "temporal", + "price": "quantitative", + }, "Stock dataset type detection error" + + def test_check_college(): df = pd.read_csv("lux/data/college.csv") df.maintain_metadata()